From f2680c62214d3eeba8dfb201e883fb2f121a70ab Mon Sep 17 00:00:00 2001
From: Julien Bisconti
Date: Fri, 27 Feb 2026 23:17:50 +0100
Subject: [PATCH] feat: add README parser with entry extraction and section tree building

Co-Authored-By: Claude Opus 4.6
---
 internal/parser/parser.go      | 175 +++++++++++++++++++++++++++++++++++
 internal/parser/parser_test.go | 204 ++++++++++++++++++++++++++++++++++++++++
 internal/parser/types.go       |  37 +++++++
 3 files changed, 416 insertions(+)
 create mode 100644 internal/parser/parser.go
 create mode 100644 internal/parser/parser_test.go
 create mode 100644 internal/parser/types.go

diff --git a/internal/parser/parser.go b/internal/parser/parser.go
new file mode 100644
index 0000000..fdc0854
--- /dev/null
+++ b/internal/parser/parser.go
@@ -0,0 +1,175 @@
+package parser
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"regexp"
+	"sort"
+	"strings"
+)
+
+// maxLineBytes is the longest input line Parse accepts. bufio.Scanner gives up
+// with bufio.ErrTooLong at 64 KiB by default, which would abort the whole parse
+// on one unusually long README line.
+const maxLineBytes = 1024 * 1024
+
+// entryRe matches: - [Name](URL) - Description
+var entryRe = regexp.MustCompile(`^[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)$`)
+
+// headingRe matches markdown headings: # Title, ## Title, etc.
+var headingRe = regexp.MustCompile(`^(#{1,6})\s+(.+?)\s*$`)
+
+// markerMap maps the emoji shortcode used in a description to its Marker.
+var markerMap = map[string]Marker{
+	":skull:":             MarkerAbandoned,
+	":heavy_dollar_sign:": MarkerPaid,
+	":construction:":      MarkerWIP,
+}
+
+// ParseEntry parses a single markdown list line into an Entry.
+//
+// The line is matched, after trimming surrounding whitespace, against
+// "- [Name](URL) - Description". Marker shortcodes listed in markerMap are
+// removed from the description and reported in Entry.Markers, each at most
+// once and in ascending Marker order. lineNum is stored in Entry.Line and used
+// in the error message. An error is returned when line is not a link entry.
+func ParseEntry(line string, lineNum int) (Entry, error) {
+	m := entryRe.FindStringSubmatch(strings.TrimSpace(line))
+	if m == nil {
+		return Entry{}, fmt.Errorf("line %d: not a valid entry: %q", lineNum, line)
+	}
+
+	desc := m[3]
+	var markers []Marker
+	for text, marker := range markerMap {
+		if strings.Contains(desc, text) {
+			markers = append(markers, marker)
+			desc = strings.ReplaceAll(desc, text, "")
+		}
+	}
+	// Map iteration order is random; sort so callers get a stable result.
+	sort.Slice(markers, func(i, j int) bool { return markers[i] < markers[j] })
+
+	// Removing a marker from the middle of the text leaves a double space
+	// behind, so collapse whitespace runs rather than only trimming the ends.
+	desc = strings.Join(strings.Fields(desc), " ")
+
+	return Entry{
+		Name:        m[1],
+		URL:         m[2],
+		Description: desc,
+		Markers:     markers,
+		Line:        lineNum,
+		Raw:         line,
+	}, nil
+}
+
+// fenceMarker reports whether line opens or closes a fenced code block. It
+// returns the leading run of three or more backticks or tildes and the text
+// that follows it, or two empty strings when line is not a fence line.
+func fenceMarker(line string) (marker, rest string) {
+	s := strings.TrimSpace(line)
+	if s == "" || (s[0] != '`' && s[0] != '~') {
+		return "", ""
+	}
+	n := len(s) - len(strings.TrimLeft(s, s[:1]))
+	if n < 3 {
+		return "", ""
+	}
+	return s[:n], s[n:]
+}
+
+// Parse reads a full README and returns a Document.
+//
+// Headings open sections and link entries are attached to the most recent
+// heading. Every line before the first heading, entries included, is kept
+// verbatim in Document.Preamble; other lines after the first heading are
+// dropped. Lines inside fenced code blocks are never treated as headings or
+// entries. On a read error the zero Document and a wrapped error are returned.
+func Parse(r io.Reader) (Document, error) {
+	scanner := bufio.NewScanner(r)
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineBytes)
+
+	var (
+		doc     Document
+		flat    []Section // headings in document order, not yet nested
+		fence   string    // marker that opened the current code fence, or ""
+		lineNum int
+	)
+
+	for scanner.Scan() {
+		lineNum++
+		line := scanner.Text()
+
+		// Track code fences so that "# comment" lines in shell snippets are
+		// not mistaken for headings. The closing fence line still counts as
+		// code, hence inCode is computed before the state is updated.
+		marker, rest := fenceMarker(line)
+		inCode := fence != ""
+		switch {
+		case inCode:
+			// Closed by a run of the same character, at least as long as the
+			// opener, with nothing after it.
+			if strings.HasPrefix(marker, fence) && rest == "" {
+				fence = ""
+			}
+		case marker != "" && !(marker[0] == '`' && strings.Contains(rest, "`")):
+			// A backtick fence may not have backticks in its info string;
+			// such a line is inline code, not a fence.
+			fence, inCode = marker, true
+		}
+
+		if !inCode {
+			if hm := headingRe.FindStringSubmatch(line); hm != nil {
+				flat = append(flat, Section{
+					Title: strings.TrimSpace(hm[2]),
+					Level: len(hm[1]),
+					Line:  lineNum,
+				})
+				continue
+			}
+		}
+
+		if len(flat) == 0 {
+			// No heading yet: keep the line verbatim, even when it looks like
+			// an entry, so nothing before the first section is silently lost.
+			doc.Preamble = append(doc.Preamble, line)
+			continue
+		}
+		if inCode {
+			continue
+		}
+		if entry, err := ParseEntry(line, lineNum); err == nil {
+			last := &flat[len(flat)-1]
+			last.Entries = append(last.Entries, entry)
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return Document{}, fmt.Errorf("reading line %d: %w", lineNum+1, err)
+	}
+
+	doc.Sections = buildTree(flat)
+	return doc, nil
+}
+
+// buildTree nests a flat, document-ordered list of headings by level: each
+// heading adopts the headings that follow it for as long as they are deeper
+// than itself. It returns nil for an empty input.
+func buildTree(flat []Section) []Section {
+	var result []Section
+	for i := 0; i < len(flat); {
+		current := flat[i]
+
+		// Everything after this heading at a deeper level belongs to it.
+		j := i + 1
+		for j < len(flat) && flat[j].Level > current.Level {
+			j++
+		}
+		current.Children = buildTree(flat[i+1 : j])
+		result = append(result, current)
+		i = j
+	}
+	return result
+}
diff --git a/internal/parser/parser_test.go b/internal/parser/parser_test.go
new file mode 100644
index 0000000..d414747
--- /dev/null
+++ b/internal/parser/parser_test.go
@@ -0,0 +1,204 @@
+package parser
+
+import (
+	"os"
+	"strings"
+	"testing"
+)
+
+func TestParseEntry(t *testing.T) {
+	line := `- [Docker Desktop](https://www.docker.com/products/docker-desktop/) - Official native app. Only for Windows and MacOS.`
+	entry, err := ParseEntry(line, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if entry.Name != "Docker Desktop" {
+		t.Errorf("name = %q, want %q", entry.Name, "Docker Desktop")
+	}
+	if entry.URL != "https://www.docker.com/products/docker-desktop/" {
+		t.Errorf("url = %q, want %q", entry.URL, "https://www.docker.com/products/docker-desktop/")
+	}
+	if entry.Description != "Official native app. Only for Windows and MacOS." {
+		t.Errorf("description = %q, want %q", entry.Description, "Official native app. Only for Windows and MacOS.")
+	}
+	if len(entry.Markers) != 0 {
+		t.Errorf("markers = %v, want empty", entry.Markers)
+	}
+}
+
+func TestParseEntryWithMarkers(t *testing.T) {
+	line := `- [Docker Swarm](https://github.com/docker/swarm) - Swarm clustering system. :skull:`
+	entry, err := ParseEntry(line, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if entry.Name != "Docker Swarm" {
+		t.Errorf("name = %q, want %q", entry.Name, "Docker Swarm")
+	}
+	if len(entry.Markers) != 1 || entry.Markers[0] != MarkerAbandoned {
+		t.Errorf("markers = %v, want [MarkerAbandoned]", entry.Markers)
+	}
+	if strings.Contains(entry.Description, ":skull:") {
+		t.Errorf("description should not contain marker text, got %q", entry.Description)
+	}
+}
+
+func TestParseEntryMultipleMarkers(t *testing.T) {
+	// The markers are written out of order on purpose: the result must come
+	// back sorted however the marker map happens to be iterated.
+	line := `- [SomeProject](https://example.com) - A project. :construction: :heavy_dollar_sign:`
+	want := []Marker{MarkerPaid, MarkerWIP}
+	for i := 0; i < 20; i++ {
+		entry, err := ParseEntry(line, 1)
+		if err != nil {
+			t.Fatalf("unexpected error: %v", err)
+		}
+		if len(entry.Markers) != len(want) {
+			t.Fatalf("markers count = %d, want %d", len(entry.Markers), len(want))
+		}
+		for k := range want {
+			if entry.Markers[k] != want[k] {
+				t.Fatalf("markers = %v, want %v", entry.Markers, want)
+			}
+		}
+	}
+}
+
+func TestParseEntryMarkerMidDescription(t *testing.T) {
+	entry, err := ParseEntry(`- [Old](https://example.com/old) - An :skull: old tool.`, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if entry.Description != "An old tool." {
+		t.Errorf("description = %q, want %q", entry.Description, "An old tool.")
+	}
+}
+
+func TestParseDocument(t *testing.T) {
+	input := `# Awesome Docker
+
+> A curated list
+
+# Contents
+
+- [Projects](#projects)
+
+# Legend
+
+- Abandoned :skull:
+
+# Projects
+
+## Tools
+
+- [ToolA](https://github.com/a/a) - Does A.
+- [ToolB](https://github.com/b/b) - Does B. :skull:
+
+## Services
+
+- [ServiceC](https://example.com/c) - Does C. :heavy_dollar_sign:
+`
+	doc, err := Parse(strings.NewReader(input))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(doc.Sections) == 0 {
+		t.Fatal("expected at least one section")
+	}
+	// Find the "Projects" section
+	var projects *Section
+	for i := range doc.Sections {
+		if doc.Sections[i].Title == "Projects" {
+			projects = &doc.Sections[i]
+			break
+		}
+	}
+	if projects == nil {
+		t.Fatal("expected a Projects section")
+	}
+	// Fatalf, not Errorf: the checks below index into Children.
+	if len(projects.Children) != 2 {
+		t.Fatalf("projects children = %d, want 2", len(projects.Children))
+	}
+	if projects.Children[0].Title != "Tools" {
+		t.Errorf("first child = %q, want %q", projects.Children[0].Title, "Tools")
+	}
+	if len(projects.Children[0].Entries) != 2 {
+		t.Errorf("Tools entries = %d, want 2", len(projects.Children[0].Entries))
+	}
+}
+
+func TestParseKeepsEntriesBeforeFirstHeading(t *testing.T) {
+	entryLine := "- [Early](https://example.com/early) - Listed before any heading."
+	doc, err := Parse(strings.NewReader(entryLine + "\n\n# Projects\n"))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(doc.Preamble) != 2 || doc.Preamble[0] != entryLine || doc.Preamble[1] != "" {
+		t.Errorf("preamble = %q, want the entry line and a blank line", doc.Preamble)
+	}
+}
+
+func TestParseIgnoresFencedCode(t *testing.T) {
+	input := strings.Join([]string{
+		"# Usage",
+		"",
+		"```sh",
+		"# not a heading",
+		"- [NotAnEntry](https://example.com) - Inside a code block.",
+		"```",
+		"",
+		"- [Real](https://example.com/real) - A real entry.",
+	}, "\n")
+	doc, err := Parse(strings.NewReader(input))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(doc.Sections) != 1 {
+		t.Fatalf("sections = %d, want 1", len(doc.Sections))
+	}
+	entries := doc.Sections[0].Entries
+	if len(entries) != 1 || entries[0].Name != "Real" {
+		t.Errorf("entries = %+v, want only the entry named Real", entries)
+	}
+}
+
+func TestParseNotAnEntry(t *testing.T) {
+	_, err := ParseEntry("- Abandoned :skull:", 1)
+	if err == nil {
+		t.Error("expected error for non-entry list item")
+	}
+}
+
+func TestParseRealREADME(t *testing.T) {
+	f, err := os.Open("../../README.md")
+	if err != nil {
+		t.Skip("README.md not found, skipping integration test")
+	}
+	defer f.Close()
+
+	doc, err := Parse(f)
+	if err != nil {
+		t.Fatalf("failed to parse README: %v", err)
+	}
+
+	if len(doc.Sections) == 0 {
+		t.Error("expected sections")
+	}
+
+	total := countEntries(doc.Sections)
+	if total < 100 {
+		t.Errorf("expected at least 100 entries, got %d", total)
+	}
+	t.Logf("Parsed %d sections, %d total entries", len(doc.Sections), total)
+}
+
+// countEntries returns the number of entries in sections and all their descendants.
+func countEntries(sections []Section) int {
+	n := 0
+	for _, s := range sections {
+		n += len(s.Entries)
+		n += countEntries(s.Children)
+	}
+	return n
+}
diff --git a/internal/parser/types.go b/internal/parser/types.go
new file mode 100644
index 0000000..43a1ccf
--- /dev/null
+++ b/internal/parser/types.go
@@ -0,0 +1,37 @@
+package parser
+
+// Marker represents a status emoji on an entry.
+type Marker int
+
+// Known markers. Numbering starts at zero, so the zero Marker is
+// MarkerAbandoned; an entry without markers has an empty Markers slice.
+const (
+	MarkerAbandoned Marker = iota // :skull:
+	MarkerPaid                    // :heavy_dollar_sign:
+	MarkerWIP                     // :construction:
+)
+
+// Entry is a single link entry in the README.
+type Entry struct {
+	Name        string   // link text
+	URL         string   // link target
+	Description string   // text after the link, with marker shortcodes removed
+	Markers     []Marker // markers found in the description, in ascending order
+	Line        int      // 1-based line number in source
+	Raw         string   // original line text
+}
+
+// Section is a heading with optional entries and child sections.
+type Section struct {
+	Title    string    // heading text without the leading #s
+	Level    int       // heading level: 1 = #, 2 = ##, etc.
+	Entries  []Entry   // link entries listed directly under this heading
+	Children []Section // deeper headings nested under this one
+	Line     int       // 1-based line number of the heading
+}
+
+// Document is the parsed representation of the full README.
+type Document struct {
+	Preamble []string  // lines before the first section
+	Sections []Section // top-level sections in document order
+}