From f2680c62214d3eeba8dfb201e883fb2f121a70ab Mon Sep 17 00:00:00 2001
From: Julien Bisconti <veggiemonk@users.noreply.github.com>
Date: Fri, 27 Feb 2026 23:17:50 +0100
Subject: [PATCH] feat: add README parser with entry extraction and section
 tree building

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 internal/parser/parser.go      | 128 ++++++++++++++++++++++++++++
 internal/parser/parser_test.go | 147 +++++++++++++++++++++++++++++++++
 internal/parser/types.go       |  35 ++++++++
 3 files changed, 310 insertions(+)
 create mode 100644 internal/parser/parser.go
 create mode 100644 internal/parser/parser_test.go
 create mode 100644 internal/parser/types.go

diff --git a/internal/parser/parser.go b/internal/parser/parser.go
new file mode 100644
index 0000000..fdc0854
--- /dev/null
+++ b/internal/parser/parser.go
@@ -0,0 +1,128 @@
+package parser
+
+import (
+	"bufio"
+	"fmt"
+	"io"
+	"regexp"
+	"strings"
+)
+
+// entryRe matches a list entry "- [Name](URL) - Description" ("*" bullets too); groups: name, URL, description.
+var entryRe = regexp.MustCompile(`^[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)$`)
+
+// headingRe matches markdown headings (# to ######); a trailing <!-- ... --> comment is left out of the title group.
+var headingRe = regexp.MustCompile(`^(#{1,6})\s+(.+?)(?:\s*<!--.*-->)?$`)
+
+var markerMap = map[string]Marker{ // marker shortcode as written in a description -> Marker value
+	":skull:":             MarkerAbandoned,
+	":heavy_dollar_sign:": MarkerPaid,
+	":construction:":      MarkerWIP,
+}
+
+// ParseEntry parses a single markdown list line into an Entry. Known markers
+// are stripped from the description and returned in ascending Marker order.
+func ParseEntry(line string, lineNum int) (Entry, error) {
+	m := entryRe.FindStringSubmatch(strings.TrimSpace(line))
+	if m == nil {
+		return Entry{}, fmt.Errorf("line %d: not a valid entry: %q", lineNum, line)
+	}
+
+	desc := m[3]
+	var markers []Marker
+	for text, marker := range markerMap {
+		if !strings.Contains(desc, text) {
+			continue
+		}
+		desc = strings.ReplaceAll(desc, text, "")
+		// Map iteration order is random: insert in sorted position for stable output.
+		markers = append(markers, marker)
+		for i := len(markers) - 1; i > 0 && markers[i-1] > markers[i]; i-- {
+			markers[i-1], markers[i] = markers[i], markers[i-1]
+		}
+	}
+
+	return Entry{
+		Name: m[1], URL: m[2], Description: strings.TrimSpace(desc),
+		Markers: markers, Line: lineNum, Raw: line,
+	}, nil
+}
+
+// Parse reads a full README from r and returns its Document.
+//
+// Heading lines open sections and entry lines attach to the latest heading.
+// Every line before the first heading is kept verbatim in Document.Preamble;
+// other non-entry lines are ignored. A read error yields the zero Document.
+func Parse(r io.Reader) (Document, error) {
+	// Alias (not a new type) so the slice stays assignable to buildTree's parameter.
+	type flatSection = struct {
+		section Section
+		level   int
+	}
+
+	scanner := bufio.NewScanner(r)
+	// Raise the 64 KiB default line limit so one long line cannot abort parsing.
+	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), 1<<20)
+
+	var doc Document
+	var flat []flatSection
+	lineNum := 0
+	for scanner.Scan() {
+		lineNum++
+		line := scanner.Text()
+
+		if hm := headingRe.FindStringSubmatch(line); hm != nil {
+			level := len(hm[1])
+			flat = append(flat, flatSection{
+				section: Section{Title: strings.TrimSpace(hm[2]), Level: level, Line: lineNum},
+				level:   level,
+			})
+			continue
+		}
+
+		// Before the first heading every line, entry-shaped or not, is preamble;
+		// entry lines found there used to be silently dropped.
+		if len(flat) == 0 {
+			doc.Preamble = append(doc.Preamble, line)
+			continue
+		}
+
+		if entry, err := ParseEntry(line, lineNum); err == nil {
+			last := &flat[len(flat)-1].section
+			last.Entries = append(last.Entries, entry)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return Document{}, err
+	}
+
+	doc.Sections = buildTree(flat) // nest the flat list by heading level
+	return doc, nil
+}
+
+// buildTree nests a flat, document-ordered heading list into a tree: each
+// heading adopts the run of strictly deeper headings that directly follows it.
+func buildTree(flat []struct {
+	section Section
+	level   int
+}) []Section {
+	if len(flat) == 0 {
+		return nil
+	}
+
+	var roots []Section
+	for start := 0; start < len(flat); {
+		node := flat[start].section
+		// Advance end past every following heading deeper than this one.
+		end := start + 1
+		for end < len(flat) && flat[end].level > flat[start].level {
+			end++
+		}
+		if end-start > 1 {
+			node.Children = buildTree(flat[start+1 : end])
+		}
+		roots = append(roots, node)
+		start = end // continue with the first heading that is not a descendant
+	}
+	return roots
+}
diff --git a/internal/parser/parser_test.go b/internal/parser/parser_test.go
new file mode 100644
index 0000000..d414747
--- /dev/null
+++ b/internal/parser/parser_test.go
@@ -0,0 +1,147 @@
+package parser
+
+import (
+	"os"
+	"strings"
+	"testing"
+)
+
+// TestParseEntry checks the fields extracted from an entry line that has no markers.
+func TestParseEntry(t *testing.T) {
+	got, err := ParseEntry(`- [Docker Desktop](https://www.docker.com/products/docker-desktop/) - Official native app. Only for Windows and MacOS.`, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	for _, c := range []struct{ field, got, want string }{
+		{"name", got.Name, "Docker Desktop"},
+		{"url", got.URL, "https://www.docker.com/products/docker-desktop/"},
+		{"description", got.Description, "Official native app. Only for Windows and MacOS."},
+	} {
+		if c.got != c.want {
+			t.Errorf("%s = %q, want %q", c.field, c.got, c.want)
+		}
+	}
+	if len(got.Markers) != 0 {
+		t.Errorf("markers = %v, want empty", got.Markers)
+	}
+}
+
+// TestParseEntryWithMarkers checks that a trailing :skull: becomes MarkerAbandoned and leaves the description.
+func TestParseEntryWithMarkers(t *testing.T) {
+	got, err := ParseEntry(`- [Docker Swarm](https://github.com/docker/swarm) - Swarm clustering system. :skull:`, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if want := "Docker Swarm"; got.Name != want {
+		t.Errorf("name = %q, want %q", got.Name, want)
+	}
+	if ms := got.Markers; !(len(ms) == 1 && ms[0] == MarkerAbandoned) {
+		t.Errorf("markers = %v, want [MarkerAbandoned]", ms)
+	}
+	if desc := got.Description; strings.Contains(desc, ":skull:") {
+		t.Errorf("description should not contain marker text, got %q", desc)
+	}
+}
+
+// TestParseEntryMultipleMarkers checks that both markers on one line are collected.
+func TestParseEntryMultipleMarkers(t *testing.T) {
+	got, err := ParseEntry(`- [SomeProject](https://example.com) - A project. :heavy_dollar_sign: :construction:`, 1)
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if n := len(got.Markers); n != 2 {
+		t.Fatalf("markers count = %d, want 2", n)
+	}
+}
+
+// TestParseDocument checks that Parse nests the ## headings under "Projects" and attaches entries to "Tools".
+func TestParseDocument(t *testing.T) {
+	input := `# Awesome Docker
+
+> A curated list
+
+# Contents
+
+- [Projects](#projects)
+
+# Legend
+
+- Abandoned :skull:
+
+# Projects
+
+## Tools
+
+- [ToolA](https://github.com/a/a) - Does A.
+- [ToolB](https://github.com/b/b) - Does B. :skull:
+
+## Services
+
+- [ServiceC](https://example.com/c) - Does C. :heavy_dollar_sign:
+`
+	doc, err := Parse(strings.NewReader(input))
+	if err != nil {
+		t.Fatalf("unexpected error: %v", err)
+	}
+	if len(doc.Sections) == 0 {
+		t.Fatal("expected at least one section")
+	}
+	var projects *Section
+	for i := range doc.Sections {
+		if doc.Sections[i].Title == "Projects" {
+			projects = &doc.Sections[i]
+			break
+		}
+	}
+	if projects == nil {
+		t.Fatal("expected a Projects section")
+	}
+	if len(projects.Children) != 2 {
+		t.Fatalf("projects children = %d, want 2", len(projects.Children)) // Children[0] is indexed below
+	}
+	if projects.Children[0].Title != "Tools" {
+		t.Errorf("first child = %q, want %q", projects.Children[0].Title, "Tools")
+	}
+	if len(projects.Children[0].Entries) != 2 {
+		t.Errorf("Tools entries = %d, want 2", len(projects.Children[0].Entries))
+	}
+}
+
+// TestParseNotAnEntry checks that a list item without a link is rejected.
+func TestParseNotAnEntry(t *testing.T) {
+	if _, err := ParseEntry("- Abandoned :skull:", 1); err == nil {
+		t.Error("expected error for non-entry list item")
+	}
+}
+
+// TestParseRealREADME parses the repository README; it is skipped if the file cannot be opened.
+func TestParseRealREADME(t *testing.T) {
+	readme, openErr := os.Open("../../README.md")
+	if openErr != nil {
+		t.Skip("README.md not found, skipping integration test")
+	}
+	defer readme.Close()
+
+	doc, err := Parse(readme)
+	if err != nil {
+		t.Fatalf("failed to parse README: %v", err)
+	}
+	sections := len(doc.Sections)
+	if sections == 0 {
+		t.Error("expected sections")
+	}
+	entries := countEntries(doc.Sections)
+	if entries < 100 {
+		t.Errorf("expected at least 100 entries, got %d", entries)
+	}
+	t.Logf("Parsed %d sections, %d total entries", sections, entries)
+}
+
+// countEntries returns the number of entries in sections and all their descendants.
+func countEntries(sections []Section) int {
+	total := 0
+	for i := range sections {
+		total += len(sections[i].Entries) + countEntries(sections[i].Children)
+	}
+	return total
+}
diff --git a/internal/parser/types.go b/internal/parser/types.go
new file mode 100644
index 0000000..43a1ccf
--- /dev/null
+++ b/internal/parser/types.go
@@ -0,0 +1,35 @@
+package parser
+
+// Marker is a status flag for an entry, written as an emoji shortcode in its description.
+type Marker int
+
+const (
+	MarkerAbandoned Marker = iota // ":skull:" shortcode
+	MarkerPaid                    // ":heavy_dollar_sign:" shortcode
+	MarkerWIP                     // ":construction:" shortcode
+)
+
+// Entry is a single link entry in the README, as produced by ParseEntry.
+type Entry struct {
+	Name        string   // link text inside [...]
+	URL         string   // link target inside (...)
+	Description string   // text after the " - " separator, marker shortcodes removed, trimmed
+	Markers     []Marker // markers found in the description; nil when there are none
+	Line        int      // 1-based line number in source
+	Raw         string   // original line text
+}
+
+// Section is a heading with optional entries and child sections.
+type Section struct {
+	Title    string    // heading text without the leading #s
+	Level    int       // heading level: 1 = #, 2 = ##, etc.
+	Entries  []Entry   // entries found after this heading and before the next heading
+	Children []Section // deeper-level headings nested under this one
+	Line     int       // 1-based line number of the heading
+}
+
+// Document is the parsed representation of the full README.
+type Document struct {
+	Preamble []string  // lines before the first section
+	Sections []Section // root sections in document order; deeper headings sit in Section.Children
+}