feat: add README parser with entry extraction and section tree building

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Julien Bisconti
2026-02-27 23:17:50 +01:00
parent 08da394e71
commit f2680c6221
3 changed files with 310 additions and 0 deletions

128
internal/parser/parser.go Normal file
View File

@@ -0,0 +1,128 @@
package parser
import (
	"bufio"
	"fmt"
	"io"
	"regexp"
	"sort"
	"strings"
)
// entryRe matches a list entry of the form: - [Name](URL) - Description
//
// The bullet may be "-" or "*". Capture group 1 is the link text, group 2 the
// link target and group 3 the description after the " - " separator. The
// pattern is anchored at both ends, so callers must trim surrounding
// whitespace before matching.
var entryRe = regexp.MustCompile(`^[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)$`)
// headingRe matches markdown headings: # Title, ## Title, etc.
//
// Capture group 1 is the run of one to six "#" characters (its length is the
// heading level) and group 2 is the title. An HTML comment at the end of the
// line is matched outside group 2, so it is not part of the captured title.
var headingRe = regexp.MustCompile(`^(#{1,6})\s+(.+?)(?:\s*<!--.*-->)?$`)
// markerMap maps the emoji shortcode text that can appear in an entry
// description to the Marker it stands for. ParseEntry looks up every key in
// the description.
var markerMap = map[string]Marker{
":skull:": MarkerAbandoned,
":heavy_dollar_sign:": MarkerPaid,
":construction:": MarkerWIP,
}
// ParseEntry parses a single markdown list line of the form
// "- [Name](URL) - Description" into an Entry.
//
// line is matched against entryRe after trimming surrounding whitespace, but
// Entry.Raw keeps it exactly as passed in. lineNum is stored in Entry.Line and
// used in the error message. Every shortcode listed in markerMap that occurs
// in the description is recorded in Entry.Markers (sorted by Marker value) and
// removed from Entry.Description, which is then trimmed.
//
// It returns an error when line does not match entryRe.
func ParseEntry(line string, lineNum int) (Entry, error) {
	m := entryRe.FindStringSubmatch(strings.TrimSpace(line))
	if m == nil {
		return Entry{}, fmt.Errorf("line %d: not a valid entry: %q", lineNum, line)
	}

	desc := m[3]
	var markers []Marker
	for text, marker := range markerMap {
		// Detect against the untouched description so that which markers are
		// reported never depends on the order the map happens to be walked in.
		if !strings.Contains(m[3], text) {
			continue
		}
		markers = append(markers, marker)
		desc = strings.ReplaceAll(desc, text, "")
	}
	// Map iteration order is randomized; sort so the same line always yields
	// the same Markers slice.
	sort.Slice(markers, func(i, j int) bool { return markers[i] < markers[j] })

	return Entry{
		Name:        m[1],
		URL:         m[2],
		Description: strings.TrimSpace(desc),
		Markers:     markers,
		Line:        lineNum,
		Raw:         line,
	}, nil
}
// Parse reads a full README from r and returns it as a Document.
//
// Lines are classified in order:
//   - a line matching headingRe starts a new section;
//   - before the first heading, every other line is kept in Document.Preamble,
//     including lines that look like entries, since there is no section to
//     attach them to;
//   - after the first heading, a line accepted by ParseEntry is appended to
//     the most recent section and any other line is ignored.
//
// Once all lines are read, the flat heading list is nested by level via
// buildTree.
//
// On a read failure Parse returns an empty Document and an error wrapping the
// scanner's error (bufio.ErrTooLong for a line of roughly 1 MiB or more).
//
// NOTE(review): fenced code blocks are not special-cased, so a "# comment"
// line inside a code sample is still treated as a heading.
func Parse(r io.Reader) (Document, error) {
	// flatSection pairs a section with its heading level. It is an alias, not
	// a new named type, so the slice stays identical to buildTree's parameter
	// type.
	type flatSection = struct {
		section Section
		level   int
	}

	scanner := bufio.NewScanner(r)
	// The default token limit is 64 KiB; allow much longer lines (badge rows,
	// long descriptions) before giving up with bufio.ErrTooLong.
	scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024)

	var (
		doc  Document
		flat []flatSection
	)
	lineNum := 0
	for scanner.Scan() {
		lineNum++
		line := scanner.Text()

		if hm := headingRe.FindStringSubmatch(line); hm != nil {
			level := len(hm[1])
			flat = append(flat, flatSection{
				section: Section{Title: strings.TrimSpace(hm[2]), Level: level, Line: lineNum},
				level:   level,
			})
			continue
		}

		if len(flat) == 0 {
			// No heading yet: keep the line verbatim instead of dropping
			// entry-shaped lines that have no section to live in.
			doc.Preamble = append(doc.Preamble, line)
			continue
		}

		if entry, err := ParseEntry(line, lineNum); err == nil {
			last := &flat[len(flat)-1].section
			last.Entries = append(last.Entries, entry)
		}
	}
	if err := scanner.Err(); err != nil {
		return Document{}, fmt.Errorf("reading README: %w", err)
	}

	doc.Sections = buildTree(flat)
	return doc, nil
}
// buildTree nests a flat, document-ordered list of headings into a tree.
//
// Each heading adopts the contiguous run of headings that follow it with a
// strictly greater level; that run is nested recursively the same way. The
// headings that remain at this call's level are returned in their original
// order. An empty input yields nil.
func buildTree(flat []struct {
	section Section
	level   int
}) []Section {
	if len(flat) == 0 {
		return nil
	}

	var roots []Section
	for start := 0; start < len(flat); {
		node := flat[start].section

		// Advance end past every following heading that is deeper than node.
		end := start + 1
		for end < len(flat) && flat[end].level > flat[start].level {
			end++
		}

		if descendants := flat[start+1 : end]; len(descendants) > 0 {
			node.Children = buildTree(descendants)
		}
		roots = append(roots, node)

		// Resume at the first heading that was not absorbed as a descendant.
		start = end
	}
	return roots
}

View File

@@ -0,0 +1,147 @@
package parser
import (
"os"
"strings"
"testing"
)
// TestParseEntry checks that a plain entry line is split into name, URL and
// description, and that no markers are reported when none are present.
func TestParseEntry(t *testing.T) {
	const (
		wantName = "Docker Desktop"
		wantURL  = "https://www.docker.com/products/docker-desktop/"
		wantDesc = "Official native app. Only for Windows and MacOS."
	)
	input := `- [Docker Desktop](https://www.docker.com/products/docker-desktop/) - Official native app. Only for Windows and MacOS.`

	got, err := ParseEntry(input, 1)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if got.Name != wantName {
		t.Errorf("name = %q, want %q", got.Name, wantName)
	}
	if got.URL != wantURL {
		t.Errorf("url = %q, want %q", got.URL, wantURL)
	}
	if got.Description != wantDesc {
		t.Errorf("description = %q, want %q", got.Description, wantDesc)
	}
	if len(got.Markers) > 0 {
		t.Errorf("markers = %v, want empty", got.Markers)
	}
}
// TestParseEntryWithMarkers checks that a trailing :skull: shortcode is
// reported as MarkerAbandoned and stripped from the description.
func TestParseEntryWithMarkers(t *testing.T) {
	input := `- [Docker Swarm](https://github.com/docker/swarm) - Swarm clustering system. :skull:`

	got, err := ParseEntry(input, 1)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	if want := "Docker Swarm"; got.Name != want {
		t.Errorf("name = %q, want %q", got.Name, want)
	}
	if !(len(got.Markers) == 1 && got.Markers[0] == MarkerAbandoned) {
		t.Errorf("markers = %v, want [MarkerAbandoned]", got.Markers)
	}
	if strings.Contains(got.Description, ":skull:") {
		t.Errorf("description should not contain marker text, got %q", got.Description)
	}
}
// TestParseEntryMultipleMarkers checks that every shortcode in a description
// is reported and removed. Marker membership is compared without relying on
// slice order.
func TestParseEntryMultipleMarkers(t *testing.T) {
	line := `- [SomeProject](https://example.com) - A project. :heavy_dollar_sign: :construction:`
	entry, err := ParseEntry(line, 1)
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(entry.Markers) != 2 {
		t.Fatalf("markers count = %d, want 2", len(entry.Markers))
	}

	// A count alone would also pass for the wrong pair of markers.
	seen := make(map[Marker]bool, len(entry.Markers))
	for _, m := range entry.Markers {
		seen[m] = true
	}
	for _, want := range []Marker{MarkerPaid, MarkerWIP} {
		if !seen[want] {
			t.Errorf("markers = %v, missing %v", entry.Markers, want)
		}
	}

	if want := "A project."; entry.Description != want {
		t.Errorf("description = %q, want %q", entry.Description, want)
	}
}
// TestParseDocument parses a small README and checks that level-2 headings
// are nested under the preceding level-1 heading with their entries attached.
func TestParseDocument(t *testing.T) {
	input := `# Awesome Docker
> A curated list
# Contents
- [Projects](#projects)
# Legend
- Abandoned :skull:
# Projects
## Tools
- [ToolA](https://github.com/a/a) - Does A.
- [ToolB](https://github.com/b/b) - Does B. :skull:
## Services
- [ServiceC](https://example.com/c) - Does C. :heavy_dollar_sign:
`
	doc, err := Parse(strings.NewReader(input))
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if len(doc.Sections) == 0 {
		t.Fatal("expected at least one section")
	}

	// Find the "Projects" section
	var projects *Section
	for i := range doc.Sections {
		if doc.Sections[i].Title == "Projects" {
			projects = &doc.Sections[i]
			break
		}
	}
	if projects == nil {
		t.Fatal("expected a Projects section")
	}

	// Fatal, not Error: the checks below index into Children and would panic
	// on a short slice instead of reporting a failure.
	if len(projects.Children) != 2 {
		t.Fatalf("projects children = %d, want 2", len(projects.Children))
	}

	tools := projects.Children[0]
	if tools.Title != "Tools" {
		t.Errorf("first child = %q, want %q", tools.Title, "Tools")
	}
	if len(tools.Entries) != 2 {
		t.Errorf("Tools entries = %d, want 2", len(tools.Entries))
	}

	services := projects.Children[1]
	if services.Title != "Services" {
		t.Errorf("second child = %q, want %q", services.Title, "Services")
	}
	if len(services.Entries) != 1 {
		t.Errorf("Services entries = %d, want 1", len(services.Entries))
	}
}
// TestParseNotAnEntry checks that a list item without a link is rejected.
func TestParseNotAnEntry(t *testing.T) {
	if _, err := ParseEntry("- Abandoned :skull:", 1); err == nil {
		t.Error("expected error for non-entry list item")
	}
}
// TestParseRealREADME is an integration test against ../../README.md. It is
// skipped when that file does not exist; any other failure to open it is
// reported as a test failure rather than hidden behind a skip.
func TestParseRealREADME(t *testing.T) {
	f, err := os.Open("../../README.md")
	if os.IsNotExist(err) {
		t.Skip("README.md not found, skipping integration test")
	}
	if err != nil {
		t.Fatalf("opening README: %v", err)
	}
	defer f.Close()

	doc, err := Parse(f)
	if err != nil {
		t.Fatalf("failed to parse README: %v", err)
	}
	if len(doc.Sections) == 0 {
		t.Error("expected sections")
	}
	total := countEntries(doc.Sections)
	if total < 100 {
		t.Errorf("expected at least 100 entries, got %d", total)
	}
	t.Logf("Parsed %d sections, %d total entries", len(doc.Sections), total)
}
// countEntries returns the number of entries in sections, including those of
// all nested child sections.
func countEntries(sections []Section) int {
	total := 0
	for i := range sections {
		total += len(sections[i].Entries) + countEntries(sections[i].Children)
	}
	return total
}

35
internal/parser/types.go Normal file
View File

@@ -0,0 +1,35 @@
package parser
// Marker identifies a status emoji attached to an entry.
type Marker int

// The known markers, numbered from zero in declaration order.
const (
	// MarkerAbandoned corresponds to the :skull: shortcode.
	MarkerAbandoned = Marker(iota)
	// MarkerPaid corresponds to the :heavy_dollar_sign: shortcode.
	MarkerPaid
	// MarkerWIP corresponds to the :construction: shortcode.
	MarkerWIP
)
// Entry is a single link entry in the README, as produced by ParseEntry from
// a list line of the form "- [Name](URL) - Description".
type Entry struct {
// Name is the link text between the square brackets.
Name string
// URL is the link target between the parentheses.
URL string
// Description is the text after the " - " separator, with marker
// shortcodes removed and surrounding whitespace trimmed.
Description string
// Markers lists the markers whose shortcode was found in the description;
// it is nil when there were none.
Markers []Marker
// Line is the line number the caller passed to ParseEntry.
Line int // 1-based line number in source
// Raw is the line exactly as passed to ParseEntry, before trimming.
Raw string // original line text
}
// Section is a heading with optional entries and child sections.
type Section struct {
// Title is the heading text with surrounding whitespace trimmed.
Title string
// Level is the number of leading "#" characters of the heading.
Level int // heading level: 1 = #, 2 = ##, etc.
// Entries holds the entries parsed between this heading and the next
// heading of any level.
Entries []Entry
// Children holds the sections that follow this one at a deeper level, up
// to the next heading at the same or a shallower level.
Children []Section
// Line is the 1-based line number of the heading in the parsed input.
Line int
}
// Document is the parsed representation of the full README, as returned by
// Parse.
type Document struct {
// Preamble is filled by Parse from lines that precede the first heading.
Preamble []string // lines before the first section
// Sections holds the sections in document order, with deeper headings
// nested under Section.Children.
Sections []Section
}