feat: add HTTP link checker and GitHub GraphQL repo checker

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Julien Bisconti
2026-02-27 23:20:50 +01:00
parent ddc32f45d0
commit bc46effe08
6 changed files with 390 additions and 0 deletions

3
go.mod
View File

@@ -6,6 +6,9 @@ require github.com/spf13/cobra v1.10.2
require ( require (
github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect
github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed
github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf // indirect
github.com/spf13/pflag v1.0.9 // indirect github.com/spf13/pflag v1.0.9 // indirect
golang.org/x/oauth2 v0.35.0
gopkg.in/yaml.v3 v3.0.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect
) )

6
go.sum
View File

@@ -2,11 +2,17 @@ github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6N
github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8= github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2s0bqwp9tc8=
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw= github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM=
github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed h1:KT7hI8vYXgU0s2qaMkrfq9tCA1w/iEPgfredVP+4Tzw=
github.com/shurcooL/githubv4 v0.0.0-20260209031235-2402fdf4a9ed/go.mod h1:zqMwyHmnN/eDOZOdiTohqIUKUrTFX62PNlu7IJdu0q8=
github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf h1:o1uxfymjZ7jZ4MsgCErcwWGtVKSiNAXtS59Lhs6uI/g=
github.com/shurcooL/graphql v0.0.0-20240915155400-7ee5256398cf/go.mod h1:9dIRpgIY7hVhoqfe0/FcYp0bpInZaT7dc3BYOprrIUE=
github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU=
github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4=
github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
golang.org/x/oauth2 v0.35.0 h1:Mv2mzuHuZuY2+bkyWXIHMfhNdJAdwW3FuWeCPYN5GVQ=
golang.org/x/oauth2 v0.35.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

138
internal/checker/github.go Normal file
View File

@@ -0,0 +1,138 @@
package checker
import (
"context"
"fmt"
"strings"
"time"
"github.com/shurcooL/githubv4"
"golang.org/x/oauth2"
)
// RepoInfo holds metadata about a GitHub repository.
//
// CheckRepo fills Owner, Name and URL from its arguments and the remaining
// fields from the repository object returned by the GitHub GraphQL API.
type RepoInfo struct {
// Owner is the owner segment of the repository path.
Owner string
// Name is the repository name segment of the repository path.
Name string
// URL is the repository address in the form https://github.com/<owner>/<name>.
URL string
// IsArchived, IsDisabled and IsPrivate mirror the flags of the same name
// reported by the API.
IsArchived bool
IsDisabled bool
IsPrivate bool
// PushedAt mirrors the API's pushedAt value (presumably the time of the
// most recent push; confirm against the GitHub schema).
PushedAt time.Time
// Stars is the repository's stargazer count.
Stars int
// Forks is the repository's fork count.
Forks int
// HasLicense is true when the API returned a non-null license for the repository.
HasLicense bool
}
// ExtractGitHubRepo extracts the owner and repository name from a GitHub
// repository URL of the form https://github.com/<owner>/<name>.
//
// Trailing slashes, a query string and a fragment are ignored, so
// "https://github.com/user/repo/#readme" yields ("user", "repo", true).
//
// ok is false (and owner and name are empty) for anything that is not a
// repository root: URLs on other hosts or schemes, bare user/organization
// pages, deeper paths (issues, wiki, ...) and two-segment paths that live
// under a reserved GitHub namespace such as apps, orgs or sponsors. The list
// of reserved namespaces is not exhaustive.
func ExtractGitHubRepo(url string) (owner, name string, ok bool) {
	const prefix = "https://github.com/"
	if !strings.HasPrefix(url, prefix) {
		return "", "", false
	}
	path := url[len(prefix):]

	// The query string and fragment are not part of the repository path;
	// leaving them in would produce names like "repo#readme".
	if i := strings.IndexAny(path, "?#"); i >= 0 {
		path = path[:i]
	}
	path = strings.TrimRight(path, "/")

	parts := strings.Split(path, "/")
	if len(parts) != 2 || parts[0] == "" || parts[1] == "" {
		return "", "", false
	}

	// Skip GitHub namespaces that look like <owner>/<name> but are not repositories.
	switch parts[0] {
	case "apps", "features", "topics", "orgs", "sponsors", "marketplace", "collections":
		return "", "", false
	}
	return parts[0], parts[1], true
}
// PartitionLinks splits urls into two lists, preserving input order within
// each: github receives every URL that ExtractGitHubRepo recognises as a
// repository, external receives everything else. Either result is nil when
// no URL falls into it.
func PartitionLinks(urls []string) (github, external []string) {
	for i := range urls {
		link := urls[i]
		_, _, isRepo := ExtractGitHubRepo(link)
		if !isRepo {
			external = append(external, link)
			continue
		}
		github = append(github, link)
	}
	return github, external
}
// GitHubChecker uses the GitHub GraphQL API.
//
// Construct it with NewGitHubChecker; the zero value has a nil client.
type GitHubChecker struct {
// client is the githubv4 GraphQL client used for every repository query.
client *githubv4.Client
}
// NewGitHubChecker returns a GitHubChecker whose GraphQL client sends token
// as a static OAuth2 access token on every request.
func NewGitHubChecker(token string) *GitHubChecker {
	tokenSource := oauth2.StaticTokenSource(&oauth2.Token{AccessToken: token})
	return &GitHubChecker{
		client: githubv4.NewClient(oauth2.NewClient(context.Background(), tokenSource)),
	}
}
// CheckRepo fetches metadata for the repository owner/name with a single
// GraphQL query.
//
// On success the returned RepoInfo carries the given owner and name, the
// corresponding https://github.com URL and the fields read from the API.
// On failure it returns a zero RepoInfo and the query error wrapped with the
// repository path.
func (gc *GitHubChecker) CheckRepo(ctx context.Context, owner, name string) (RepoInfo, error) {
	// The field names below are part of the query: githubv4 derives the
	// GraphQL selection from them.
	var q struct {
		Repository struct {
			IsArchived     bool
			IsDisabled     bool
			IsPrivate      bool
			PushedAt       time.Time
			StargazerCount int
			ForkCount      int
			LicenseInfo    *struct {
				Name string
			}
		} `graphql:"repository(owner: $owner, name: $name)"`
	}

	variables := map[string]interface{}{
		"owner": githubv4.String(owner),
		"name":  githubv4.String(name),
	}

	err := gc.client.Query(ctx, &q, variables)
	if err != nil {
		return RepoInfo{}, fmt.Errorf("github query %s/%s: %w", owner, name, err)
	}

	repo := &q.Repository
	info := RepoInfo{
		Owner: owner,
		Name:  name,
		URL:   "https://github.com/" + owner + "/" + name,
	}
	info.IsArchived = repo.IsArchived
	info.IsDisabled = repo.IsDisabled
	info.IsPrivate = repo.IsPrivate
	info.PushedAt = repo.PushedAt
	info.Stars = repo.StargazerCount
	info.Forks = repo.ForkCount
	// A nil LicenseInfo means the API returned no license for the repository.
	info.HasLicense = repo.LicenseInfo != nil
	return info, nil
}
// CheckRepos queries the repositories named by urls one after another and
// pauses for one second after every batchSize queries as crude rate limiting.
//
// URLs that ExtractGitHubRepo does not recognise as repositories are skipped
// and do not count towards a batch. A batchSize <= 0 selects the default of
// 50. Successful lookups are returned in input order; each failed lookup
// contributes one entry to the error slice and does not stop the run.
//
// When ctx is cancelled, between queries or during a pause, the function
// stops early and appends ctx.Err() to the returned errors.
func (gc *GitHubChecker) CheckRepos(ctx context.Context, urls []string, batchSize int) ([]RepoInfo, []error) {
	if batchSize <= 0 {
		batchSize = 50
	}

	var results []RepoInfo
	var errs []error
	queried := 0 // number of API queries issued so far
	for _, url := range urls {
		owner, name, ok := ExtractGitHubRepo(url)
		if !ok {
			continue
		}

		// Stop instead of issuing one doomed query per remaining URL.
		if err := ctx.Err(); err != nil {
			return results, append(errs, err)
		}

		// Pause before starting a new batch. Counting issued queries, not
		// slice positions, keeps the limit accurate when URLs are skipped
		// or fail, and avoids a pointless pause after the final query.
		if queried > 0 && queried%batchSize == 0 {
			timer := time.NewTimer(1 * time.Second)
			select {
			case <-ctx.Done():
				timer.Stop()
				return results, append(errs, ctx.Err())
			case <-timer.C:
			}
		}
		queried++

		info, err := gc.CheckRepo(ctx, owner, name)
		if err != nil {
			errs = append(errs, err)
			continue
		}
		results = append(results, info)
	}
	return results, errs
}

View File

@@ -0,0 +1,52 @@
package checker
import (
"testing"
)
// TestExtractGitHubRepo runs ExtractGitHubRepo over a table of repository
// and non-repository URLs and checks the ok flag and, for accepted URLs, the
// extracted owner and name.
func TestExtractGitHubRepo(t *testing.T) {
	type testCase struct {
		url   string
		owner string
		name  string
		ok    bool
	}
	cases := []testCase{
		{url: "https://github.com/docker/compose", owner: "docker", name: "compose", ok: true},
		{url: "https://github.com/moby/moby", owner: "moby", name: "moby", ok: true},
		{url: "https://github.com/user/repo/", owner: "user", name: "repo", ok: true},
		{url: "https://github.com/user/repo/issues"},
		{url: "https://github.com/user/repo/wiki"},
		{url: "https://github.com/apps/dependabot"},
		{url: "https://example.com/not-github"},
		{url: "https://github.com/user"},
	}

	for _, tc := range cases {
		gotOwner, gotName, gotOK := ExtractGitHubRepo(tc.url)
		switch {
		case gotOK != tc.ok:
			t.Errorf("ExtractGitHubRepo(%q): ok = %v, want %v", tc.url, gotOK, tc.ok)
		case gotOK && (gotOwner != tc.owner || gotName != tc.name):
			// Owner and name are only meaningful for accepted URLs.
			t.Errorf("ExtractGitHubRepo(%q) = (%q, %q), want (%q, %q)", tc.url, gotOwner, gotName, tc.owner, tc.name)
		}
	}
}
// TestPartitionLinks checks that a mixed list is split into two repository
// URLs and two external URLs (an issues page counts as external).
func TestPartitionLinks(t *testing.T) {
	input := []string{
		"https://github.com/docker/compose",
		"https://example.com/tool",
		"https://github.com/moby/moby",
		"https://github.com/user/repo/issues",
	}

	repos, others := PartitionLinks(input)

	if got := len(repos); got != 2 {
		t.Errorf("github links = %d, want 2", got)
	}
	if got := len(others); got != 2 {
		t.Errorf("external links = %d, want 2", got)
	}
}

111
internal/checker/http.go Normal file
View File

@@ -0,0 +1,111 @@
package checker
import (
"context"
"net/http"
"sync"
"time"
"github.com/veggiemonk/awesome-docker/internal/cache"
)
// Defaults shared by the HTTP link checker.
const (
// defaultTimeout bounds a single link check: CheckLink uses it as the
// request context deadline and CheckLinks as the http.Client timeout.
defaultTimeout = 30 * time.Second
// defaultConcurrency is the number of simultaneous checks CheckLinks uses
// when the caller passes a concurrency <= 0.
defaultConcurrency = 10
// userAgent is sent as the User-Agent header on every request.
userAgent = "awesome-docker-checker/1.0"
)
// LinkResult holds the result of checking a single URL.
type LinkResult struct {
// URL is the address that was checked, exactly as passed in.
URL string
// OK is true when the final status code is in the range [200, 400).
// CheckLinks also sets it to true for excluded URLs, which are not requested.
OK bool
// StatusCode is the HTTP status of the final response; it stays 0 when no
// response was received.
StatusCode int
// Redirected is true when a redirect led to a URL different from URL.
Redirected bool
// RedirectURL is the last redirect target seen; set only when Redirected is true.
RedirectURL string
// Error is the text of the error that prevented a response, empty otherwise.
Error string
}
// CheckLink checks a single URL. It issues a HEAD request first and falls
// back to GET when HEAD fails at the transport level or is answered with an
// error status (>= 400), since some servers reject or mishandle HEAD.
//
// Each attempt gets its own defaultTimeout deadline, so a HEAD that timed
// out does not leave the GET fallback with an already-expired context.
//
// client supplies the transport, cookie jar and timeout. It is copied, never
// modified, so one client may be shared by concurrent calls. Its
// CheckRedirect policy is replaced (on the copy) by one that records the
// redirect target and stops following after 10 redirects. A nil client is
// treated as a zero http.Client.
//
// The result has OK set when the final status is in [200, 400). Error is set
// only when no response could be obtained from either attempt.
func CheckLink(url string, client *http.Client) LinkResult {
	result := LinkResult{URL: url}

	// Shallow copy: installing the redirect hook on the caller's client
	// would race when that client is used from several goroutines.
	var c http.Client
	if client != nil {
		c = *client
	}

	// Track redirects.
	var finalURL string
	c.CheckRedirect = func(req *http.Request, via []*http.Request) error {
		finalURL = req.URL.String()
		if len(via) >= 10 {
			return http.ErrUseLastResponse
		}
		return nil
	}

	// attempt performs one request and reports the final status code and
	// the last redirect target seen during that request.
	attempt := func(method string) (int, string, error) {
		ctx, cancel := context.WithTimeout(context.Background(), defaultTimeout)
		defer cancel()

		req, err := http.NewRequestWithContext(ctx, method, url, nil)
		if err != nil {
			return 0, "", err
		}
		req.Header.Set("User-Agent", userAgent)

		finalURL = "" // forget redirects recorded by a previous attempt
		resp, err := c.Do(req)
		if err != nil {
			return 0, "", err
		}
		defer resp.Body.Close()
		return resp.StatusCode, finalURL, nil
	}

	status, redirect, err := attempt(http.MethodHead)
	if err != nil || status >= http.StatusBadRequest {
		getStatus, getRedirect, getErr := attempt(http.MethodGet)
		switch {
		case getErr == nil:
			status, redirect, err = getStatus, getRedirect, nil
		case err != nil:
			// Neither attempt produced a response; report the GET failure.
			err = getErr
		}
		// Otherwise HEAD produced a status and GET failed outright: keep
		// the HEAD answer, which is the only response available.
	}
	if err != nil {
		result.Error = err.Error()
		return result
	}

	result.StatusCode = status
	result.OK = status >= 200 && status < 400
	if redirect != "" && redirect != url {
		result.Redirected = true
		result.RedirectURL = redirect
	}
	return result
}
// CheckLinks checks multiple URLs concurrently and returns one LinkResult
// per input URL, at the same index as the URL in urls.
//
// At most concurrency checks run at a time; a value <= 0 selects
// defaultConcurrency. URLs for which exclude.IsExcluded reports true are not
// requested and are returned as OK. exclude may be nil.
func CheckLinks(urls []string, concurrency int, exclude *cache.ExcludeList) []LinkResult {
	if concurrency <= 0 {
		concurrency = defaultConcurrency
	}

	results := make([]LinkResult, len(urls))
	sem := make(chan struct{}, concurrency)
	var wg sync.WaitGroup

	for i, url := range urls {
		if exclude != nil && exclude.IsExcluded(url) {
			results[i] = LinkResult{URL: url, OK: true}
			continue
		}

		// Take a slot before spawning so that no more than concurrency
		// goroutines exist at once, however long urls is.
		sem <- struct{}{}
		wg.Add(1)
		go func(idx int, u string) {
			defer wg.Done()
			defer func() { <-sem }()
			// A fresh client per check keeps the checks independent of
			// one another.
			client := &http.Client{Timeout: defaultTimeout}
			// Each goroutine writes only its own slot, so no lock is needed.
			results[idx] = CheckLink(u, client)
		}(i, url)
	}

	wg.Wait()
	return results
}

View File

@@ -0,0 +1,80 @@
package checker
import (
"net/http"
"net/http/httptest"
"testing"
)
// TestCheckLinkOK verifies that a URL answered with 200 is reported as OK.
func TestCheckLinkOK(t *testing.T) {
	okHandler := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}
	srv := httptest.NewServer(http.HandlerFunc(okHandler))
	defer srv.Close()

	if got := CheckLink(srv.URL, &http.Client{}); !got.OK {
		t.Errorf("expected OK, got status %d, error: %s", got.StatusCode, got.Error)
	}
}
// TestCheckLink404 verifies that a URL answered with 404 is reported as not
// OK and that the status code is passed through.
func TestCheckLink404(t *testing.T) {
	notFound := func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusNotFound)
	}
	srv := httptest.NewServer(http.HandlerFunc(notFound))
	defer srv.Close()

	got := CheckLink(srv.URL, &http.Client{})

	if got.OK {
		t.Error("expected not OK for 404")
	}
	if got.StatusCode != http.StatusNotFound {
		t.Errorf("status = %d, want 404", got.StatusCode)
	}
}
// TestCheckLinkRedirect verifies that a permanent redirect to a healthy
// server is followed, reported as OK and flagged as redirected.
func TestCheckLinkRedirect(t *testing.T) {
	// target is where the redirect ends up.
	target := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		w.WriteHeader(http.StatusOK)
	}))
	defer target.Close()

	// origin answers every request with a 301 pointing at target.
	origin := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		http.Redirect(w, r, target.URL, http.StatusMovedPermanently)
	}))
	defer origin.Close()

	got := CheckLink(origin.URL, &http.Client{})

	if !got.OK {
		t.Errorf("expected OK after following redirect, error: %s", got.Error)
	}
	if !got.Redirected {
		t.Error("expected Redirected = true")
	}
}
// TestCheckLinks verifies that CheckLinks returns one result per URL, in
// input order, and that each result carries the right URL and OK flag.
//
// Results are checked by index, not by matching on the URL, so that empty or
// misplaced results fail the test instead of slipping through.
func TestCheckLinks(t *testing.T) {
	server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		if r.URL.Path == "/bad" {
			w.WriteHeader(http.StatusNotFound)
			return
		}
		w.WriteHeader(http.StatusOK)
	}))
	defer server.Close()

	urls := []string{server.URL + "/good", server.URL + "/bad", server.URL + "/also-good"}
	wantOK := []bool{true, false, true}

	results := CheckLinks(urls, 2, nil)
	if len(results) != len(urls) {
		t.Fatalf("results = %d, want %d", len(results), len(urls))
	}

	for i, r := range results {
		if r.URL != urls[i] {
			t.Errorf("results[%d].URL = %q, want %q", i, r.URL, urls[i])
		}
		if r.OK != wantOK[i] {
			t.Errorf("results[%d] (%s): OK = %v, want %v (status %d, error %q)",
				i, urls[i], r.OK, wantOK[i], r.StatusCode, r.Error)
		}
	}
}