From bc46effe08c1a6f76230a70a524ea0e735126dd8 Mon Sep 17 00:00:00 2001
From: Julien Bisconti
Date: Fri, 27 Feb 2026 23:20:50 +0100
Subject: [PATCH] feat: add HTTP link checker and GitHub GraphQL repo checker

Co-Authored-By: Claude Opus 4.6
---
 go.mod                          |   3 +
 go.sum                          |   6 ++
 internal/checker/github.go      | 177 ++++++++++++++++++++++++++++++++
 internal/checker/github_test.go |  52 ++++++++++
 internal/checker/http.go        | 128 +++++++++++++++++++++++
 internal/checker/http_test.go   |  80 +++++++++++++++
 6 files changed, 446 insertions(+)
 create mode 100644 internal/checker/github.go
 create mode 100644 internal/checker/github_test.go
 create mode 100644 internal/checker/http.go
 create mode 100644 internal/checker/http_test.go

diff --git a/go.mod b/go.mod
index 5e4eb4f..f608100 100644
--- a/go.mod
+++ b/go.mod
@@ -6,6 +6,9 @@ require github.com/spf13/cobra v1.10.2
 
 require (
 	github.com/inconshreveable/mousetrap v1.1.0 // indirect
+	github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed
+	github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf // indirect
 	github.com/spf13/pflag v1.0.9 // indirect
+	golang.org/x/oauth2 v0.35.0
 	gopkg.in/yaml.v3 v3.0.1 // indirect
 )
diff --git a/go.sum b/go.sum
index ff4d6ec..8c787c0 100644
--- a/go.sum
+++ b/go.sum
@@ -2,11 +2,17 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6N
 github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
 github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
 github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
+github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed h1:KT7hI8vYXgU0s2qaMkrfq9tCA1w/iEPgfredVP+4Tzw=
+github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed/go.mod h1:zqMwyHmnN/eDOZOdiTohqIUKUrTFX62PNlu7IJdu0q8=
+github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf h1:o1uxfymjZ7jZ4MsgCErcwWGtVKSiNAXtS59Lhs6uI/g=
+github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf/go.mod h1:9dIRpgIY7hVhoqfe0/FcYp0bpInZaT7dc3BYOprrIUE=
 github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU=
 github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4=
 github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
+golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
diff --git a/internal/checker/github.go b/internal/checker/github.go
new file mode 100644
index 0000000..e1f1dac
--- /dev/null
+++ b/internal/checker/github.go
@@ -0,0 +1,177 @@
+package checker
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"time"
+
+	"github.com/shurcooL/githubv4"
+	"golang.org/x/oauth2"
+)
+
+const (
+	// defaultBatchSize is used by CheckRepos when batchSize is not positive.
+	defaultBatchSize = 50
+	// batchPause is how long CheckRepos waits between batches of queries.
+	batchPause = 1 * time.Second
+)
+
+// RepoInfo holds metadata about a GitHub repository.
+type RepoInfo struct {
+	Owner      string
+	Name       string
+	URL        string
+	IsArchived bool
+	IsDisabled bool
+	IsPrivate  bool
+	PushedAt   time.Time
+	Stars      int
+	Forks      int
+	HasLicense bool
+}
+
+// ExtractGitHubRepo extracts owner/name from a GitHub URL.
+// It reports ok=false for anything that is not exactly
+// https://github.com/<owner>/<name>, such as issues, wiki or apps pages.
+func ExtractGitHubRepo(url string) (owner, name string, ok bool) {
+	const prefix = "https://github.com/"
+	if !strings.HasPrefix(url, prefix) {
+		return "", "", false
+	}
+	path := strings.TrimRight(strings.TrimPrefix(url, prefix), "/")
+	parts := strings.Split(path, "/")
+	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
+		return "", "", false
+	}
+	// Site sections that have the <owner>/<name> shape but are not repos.
+	switch parts[0] {
+	case "apps", "collections", "features", "marketplace", "orgs", "sponsors", "topics":
+		return "", "", false
+	}
+	return parts[0], parts[1], true
+}
+
+// PartitionLinks separates URLs into GitHub repos and external links.
+// A URL counts as a GitHub repo only if ExtractGitHubRepo accepts it.
+func PartitionLinks(urls []string) (github, external []string) {
+	for _, url := range urls {
+		if _, _, ok := ExtractGitHubRepo(url); ok {
+			github = append(github, url)
+		} else {
+			external = append(external, url)
+		}
+	}
+	return github, external
+}
+
+// GitHubChecker uses the GitHub GraphQL API.
+type GitHubChecker struct {
+	client *githubv4.Client
+}
+
+// NewGitHubChecker creates a checker with the given OAuth token.
+func NewGitHubChecker(token string) *GitHubChecker {
+	src := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token})
+	httpClient := oauth2.NewClient(context.Background(), src)
+	return &GitHubChecker{client: githubv4.NewClient(httpClient)}
+}
+
+// CheckRepo queries a single GitHub repository.
+// On failure it returns a zero RepoInfo and an error wrapping the cause.
+func (gc *GitHubChecker) CheckRepo(ctx context.Context, owner, name string) (RepoInfo, error) {
+	var query struct {
+		Repository struct {
+			IsArchived     bool
+			IsDisabled     bool
+			IsPrivate      bool
+			PushedAt       time.Time
+			StargazerCount int
+			ForkCount      int
+			LicenseInfo    *struct {
+				Name string
+			}
+		} `graphql:"repository(owner: $owner, name: $name)"`
+	}
+
+	vars := map[string]interface{}{
+		"owner": githubv4.String(owner),
+		"name":  githubv4.String(name),
+	}
+
+	if err := gc.client.Query(ctx, &query, vars); err != nil {
+		return RepoInfo{}, fmt.Errorf("github query %s/%s: %w", owner, name, err)
+	}
+
+	r := query.Repository
+	return RepoInfo{
+		Owner:      owner,
+		Name:       name,
+		URL:        fmt.Sprintf("https://github.com/%s/%s", owner, name),
+		IsArchived: r.IsArchived,
+		IsDisabled: r.IsDisabled,
+		IsPrivate:  r.IsPrivate,
+		PushedAt:   r.PushedAt,
+		Stars:      r.StargazerCount,
+		Forks:      r.ForkCount,
+		HasLicense: r.LicenseInfo != nil,
+	}, nil
+}
+
+// CheckRepos queries the repositories behind urls one at a time, pausing
+// after every batchSize queries (50 when batchSize <= 0). URLs that
+// ExtractGitHubRepo rejects are skipped. Per-repository failures are
+// collected in the returned error slice and do not stop the run, but the
+// run ends early once ctx is cancelled or expires.
+func (gc *GitHubChecker) CheckRepos(ctx context.Context, urls []string, batchSize int) ([]RepoInfo, []error) {
+	if batchSize <= 0 {
+		batchSize = defaultBatchSize
+	}
+
+	var results []RepoInfo
+	var errs []error
+	queried := 0
+
+	for _, url := range urls {
+		owner, name, ok := ExtractGitHubRepo(url)
+		if !ok {
+			continue
+		}
+
+		// Throttle on queries actually sent, so skipped URLs and failed
+		// lookups cannot cause a pause to be missed.
+		if queried > 0 && queried%batchSize == 0 {
+			if err := waitOrDone(ctx, batchPause); err != nil {
+				errs = append(errs, fmt.Errorf("github check interrupted: %w", err))
+				break
+			}
+		}
+		queried++
+
+		info, err := gc.CheckRepo(ctx, owner, name)
+		if err != nil {
+			errs = append(errs, err)
+			if ctx.Err() != nil {
+				// Every remaining query would fail the same way.
+				break
+			}
+			continue
+		}
+		results = append(results, info)
+	}
+
+	return results, errs
+}
+
+// waitOrDone blocks for d, returning early with ctx.Err() if ctx ends first.
+func waitOrDone(ctx context.Context, d time.Duration) error {
+	timer := time.NewTimer(d)
+	defer timer.Stop()
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case <-timer.C:
+		return nil
+	}
+}
diff --git a/internal/checker/github_test.go b/internal/checker/github_test.go
new file mode 100644
index 0000000..7ac8fbe
--- /dev/null
+++ b/internal/checker/github_test.go
@@ -0,0 +1,52 @@
+package checker
+
+import (
+	"testing"
+)
+
+func TestExtractGitHubRepo(t *testing.T) {
+	tests := []struct {
+		url   string
+		owner string
+		name  string
+		ok    bool
+	}{
+		{"https://github.com/docker/compose", "docker", "compose", true},
+		{"https://github.com/moby/moby", "moby", "moby", true},
+		{"https://github.com/user/repo/", "user", "repo", true},
+		{"https://github.com/user/repo/issues", "", "", false},
+		{"https://github.com/user/repo/wiki", "", "", false},
+		{"https://github.com/apps/dependabot", "", "", false},
+		{"https://example.com/not-github", "", "", false},
+		{"https://github.com/user", "", "", false},
+	}
+
+	for _, tt := range tests {
+		owner, name, ok := ExtractGitHubRepo(tt.url)
+		if ok != tt.ok {
+			t.Errorf("ExtractGitHubRepo(%q): ok = %v, want %v", tt.url, ok, tt.ok)
+			continue
+		}
+		if ok {
+			if owner != tt.owner || name != tt.name {
+				t.Errorf("ExtractGitHubRepo(%q) = (%q, %q), want (%q, %q)", tt.url, owner, name, tt.owner, tt.name)
+			}
+		}
+	}
+}
+
+func TestPartitionLinks(t *testing.T) {
+	urls := []string{
+		"https://github.com/docker/compose",
+		"https://example.com/tool",
+		"https://github.com/moby/moby",
+		"https://github.com/user/repo/issues",
+	}
+	gh, ext := PartitionLinks(urls)
+	if len(gh) != 2 {
+		t.Errorf("github links = %d, want 2", len(gh))
+	}
+	if len(ext) != 2 {
+		t.Errorf("external links = %d, want 2", len(ext))
+	}
+}
diff --git a/internal/checker/http.go b/internal/checker/http.go
new file mode 100644
index 0000000..3e17929
--- /dev/null
+++ b/internal/checker/http.go
@@ -0,0 +1,128 @@
+package checker
+
+import (
+	"context"
+	"fmt"
+	"net/http"
+	"sync"
+	"time"
+
+	"github.com/veggiemonk/awesome-docker/internal/cache"
+)
+
+const (
+	defaultTimeout     = 30 * time.Second
+	defaultConcurrency = 10
+	maxRedirects       = 10
+	userAgent          = "awesome-docker-checker/1.0"
+)
+
+// LinkResult holds the result of checking a single URL.
+type LinkResult struct {
+	URL         string
+	OK          bool
+	StatusCode  int
+	Redirected  bool
+	RedirectURL string
+	Error       string
+}
+
+// CheckLink checks a single URL. It sends HEAD first and retries with GET
+// when HEAD fails or answers with a 4xx/5xx status, because some servers
+// reject or mishandle HEAD. The link is OK when the final status is 2xx or
+// 3xx. Both attempts share one defaultTimeout deadline.
+//
+// The supplied client is never modified, so it may be shared between
+// concurrent calls; its CheckRedirect policy is replaced, for this call only,
+// by one that records the final URL and fails after maxRedirects hops. A nil
+// client is treated as a zero http.Client.
+func CheckLink(url string, client *http.Client) LinkResult {
+	result := LinkResult{URL: url}
+
+	ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
+	defer cancel()
+
+	// Work on a shallow copy: it shares the caller's Transport (and thus the
+	// connection pool) but lets us install our own redirect policy safely.
+	var c http.Client
+	if client != nil {
+		c = *client
+	}
+
+	var finalURL string
+	c.CheckRedirect = func(req *http.Request, via []*http.Request) error {
+		if len(via) >= maxRedirects {
+			return fmt.Errorf("stopped after %d redirects", maxRedirects)
+		}
+		finalURL = req.URL.String()
+		return nil
+	}
+
+	do := func(method string) (*http.Response, error) {
+		req, err := http.NewRequestWithContext(ctx, method, url, nil)
+		if err != nil {
+			return nil, err
+		}
+		req.Header.Set("User-Agent", userAgent)
+		finalURL = "" // redirects are tracked per attempt
+		return c.Do(req)
+	}
+
+	resp, err := do(http.MethodHead)
+	if err != nil || resp.StatusCode >= http.StatusBadRequest {
+		if err == nil {
+			resp.Body.Close()
+		}
+		resp, err = do(http.MethodGet)
+	}
+	if err != nil {
+		result.Error = err.Error()
+		return result
+	}
+	defer resp.Body.Close()
+
+	result.StatusCode = resp.StatusCode
+	result.OK = resp.StatusCode >= 200 && resp.StatusCode < 400
+
+	if finalURL != "" && finalURL != url {
+		result.Redirected = true
+		result.RedirectURL = finalURL
+	}
+
+	return result
+}
+
+// CheckLinks checks multiple URLs concurrently, running at most concurrency
+// checks at a time (defaultConcurrency when concurrency <= 0). Results are
+// returned in the same order as urls; URLs matched by exclude are reported
+// as OK without being requested.
+func CheckLinks(urls []string, concurrency int, exclude *cache.ExcludeList) []LinkResult {
+	if concurrency <= 0 {
+		concurrency = defaultConcurrency
+	}
+
+	results := make([]LinkResult, len(urls))
+	client := &http.Client{Timeout: defaultTimeout}
+	sem := make(chan struct{}, concurrency)
+	var wg sync.WaitGroup
+
+	for i, url := range urls {
+		if exclude != nil && exclude.IsExcluded(url) {
+			results[i] = LinkResult{URL: url, OK: true}
+			continue
+		}
+
+		// Take a slot before spawning so that at most concurrency
+		// goroutines exist at once, however long urls is.
+		sem <- struct{}{}
+		wg.Add(1)
+		go func(idx int, u string) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			results[idx] = CheckLink(u, client)
+		}(i, url)
+	}
+
+	wg.Wait()
+	return results
+}
diff --git a/internal/checker/http_test.go b/internal/checker/http_test.go
new file mode 100644
index 0000000..b8eef45
--- /dev/null
+++ b/internal/checker/http_test.go
@@ -0,0 +1,80 @@
+package checker
+
+import (
+	"net/http"
+	"net/http/httptest"
+	"testing"
+)
+
+func TestCheckLinkOK(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	result := CheckLink(server.URL, &http.Client{})
+	if !result.OK {
+		t.Errorf("expected OK, got status %d, error: %s", result.StatusCode, result.Error)
+	}
+}
+
+func TestCheckLink404(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusNotFound)
+	}))
+	defer server.Close()
+
+	result := CheckLink(server.URL, &http.Client{})
+	if result.OK {
+		t.Error("expected not OK for 404")
+	}
+	if result.StatusCode != 404 {
+		t.Errorf("status = %d, want 404", result.StatusCode)
+	}
+}
+
+func TestCheckLinkRedirect(t *testing.T) {
+	final := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer final.Close()
+
+	redir := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		http.Redirect(w, r, final.URL, http.StatusMovedPermanently)
+	}))
+	defer redir.Close()
+
+	result := CheckLink(redir.URL, &http.Client{})
+	if !result.OK {
+		t.Errorf("expected OK after following redirect, error: %s", result.Error)
+	}
+	if !result.Redirected {
+		t.Error("expected Redirected = true")
+	}
+}
+
+func TestCheckLinks(t *testing.T) {
+	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		if r.URL.Path == "/bad" {
+			w.WriteHeader(http.StatusNotFound)
+			return
+		}
+		w.WriteHeader(http.StatusOK)
+	}))
+	defer server.Close()
+
+	urls := []string{server.URL + "/good", server.URL + "/bad", server.URL + "/also-good"}
+	results := CheckLinks(urls, 2, nil)
+	if len(results) != 3 {
+		t.Fatalf("results = %d, want 3", len(results))
+	}
+
+	for _, r := range results {
+		if r.URL == server.URL+"/bad" && r.OK {
+			t.Error("expected /bad to not be OK")
+		}
+		if r.URL == server.URL+"/good" && !r.OK {
+			t.Error("expected /good to be OK")
+		}
+	}
+}