feat: add README parser with entry extraction and section tree building
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
128
internal/parser/parser.go
Normal file
128
internal/parser/parser.go
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"bufio"
|
||||||
|
"fmt"
|
||||||
|
"io"
|
||||||
|
"regexp"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
var (
	// entryRe recognises a list item of the form
	//
	//	- [Name](URL) - Description
	//
	// The bullet may be "-" or "*". Capture groups: 1 = name, 2 = URL,
	// 3 = description (everything after the " - " separator).
	entryRe = regexp.MustCompile(`^[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)$`)

	// headingRe recognises an ATX markdown heading ("# Title" through
	// "###### Title"). Capture groups: 1 = the run of "#" characters,
	// 2 = the title, excluding an optional trailing HTML comment.
	headingRe = regexp.MustCompile(`^(#{1,6})\s+(.+?)(?:\s*<!--.*-->)?$`)
)
|
||||||
|
|
||||||
|
var markerMap = map[string]Marker{
|
||||||
|
":skull:": MarkerAbandoned,
|
||||||
|
":heavy_dollar_sign:": MarkerPaid,
|
||||||
|
":construction:": MarkerWIP,
|
||||||
|
}
|
||||||
|
|
||||||
|
// ParseEntry parses a single markdown list line into an Entry.
|
||||||
|
func ParseEntry(line string, lineNum int) (Entry, error) {
|
||||||
|
m := entryRe.FindStringSubmatch(strings.TrimSpace(line))
|
||||||
|
if m == nil {
|
||||||
|
return Entry{}, fmt.Errorf("line %d: not a valid entry: %q", lineNum, line)
|
||||||
|
}
|
||||||
|
|
||||||
|
desc := m[3]
|
||||||
|
var markers []Marker
|
||||||
|
|
||||||
|
for text, marker := range markerMap {
|
||||||
|
if strings.Contains(desc, text) {
|
||||||
|
markers = append(markers, marker)
|
||||||
|
desc = strings.ReplaceAll(desc, text, "")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
desc = strings.TrimSpace(desc)
|
||||||
|
|
||||||
|
return Entry{
|
||||||
|
Name: m[1],
|
||||||
|
URL: m[2],
|
||||||
|
Description: desc,
|
||||||
|
Markers: markers,
|
||||||
|
Line: lineNum,
|
||||||
|
Raw: line,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// Parse reads a full README and returns a Document.
|
||||||
|
func Parse(r io.Reader) (Document, error) {
|
||||||
|
scanner := bufio.NewScanner(r)
|
||||||
|
var doc Document
|
||||||
|
var allSections []struct {
|
||||||
|
section Section
|
||||||
|
level int
|
||||||
|
}
|
||||||
|
|
||||||
|
lineNum := 0
|
||||||
|
for scanner.Scan() {
|
||||||
|
lineNum++
|
||||||
|
line := scanner.Text()
|
||||||
|
|
||||||
|
// Check for heading
|
||||||
|
if hm := headingRe.FindStringSubmatch(line); hm != nil {
|
||||||
|
level := len(hm[1])
|
||||||
|
title := strings.TrimSpace(hm[2])
|
||||||
|
allSections = append(allSections, struct {
|
||||||
|
section Section
|
||||||
|
level int
|
||||||
|
}{
|
||||||
|
section: Section{Title: title, Level: level, Line: lineNum},
|
||||||
|
level: level,
|
||||||
|
})
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for entry (list item with link)
|
||||||
|
if entry, err := ParseEntry(line, lineNum); err == nil {
|
||||||
|
if len(allSections) > 0 {
|
||||||
|
allSections[len(allSections)-1].section.Entries = append(
|
||||||
|
allSections[len(allSections)-1].section.Entries, entry)
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
|
||||||
|
// Everything else: preamble if no sections yet
|
||||||
|
if len(allSections) == 0 {
|
||||||
|
doc.Preamble = append(doc.Preamble, line)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := scanner.Err(); err != nil {
|
||||||
|
return doc, err
|
||||||
|
}
|
||||||
|
|
||||||
|
// Build section tree by nesting based on heading level
|
||||||
|
doc.Sections = buildTree(allSections)
|
||||||
|
return doc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func buildTree(flat []struct {
|
||||||
|
section Section
|
||||||
|
level int
|
||||||
|
}) []Section {
|
||||||
|
if len(flat) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
var result []Section
|
||||||
|
for i := 0; i < len(flat); i++ {
|
||||||
|
current := flat[i].section
|
||||||
|
currentLevel := flat[i].level
|
||||||
|
|
||||||
|
// Collect children: everything after this heading at a deeper level
|
||||||
|
j := i + 1
|
||||||
|
for j < len(flat) && flat[j].level > currentLevel {
|
||||||
|
j++
|
||||||
|
}
|
||||||
|
if j > i+1 {
|
||||||
|
current.Children = buildTree(flat[i+1 : j])
|
||||||
|
}
|
||||||
|
result = append(result, current)
|
||||||
|
i = j - 1
|
||||||
|
}
|
||||||
|
return result
|
||||||
|
}
|
||||||
147
internal/parser/parser_test.go
Normal file
147
internal/parser/parser_test.go
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
package parser
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseEntry(t *testing.T) {
|
||||||
|
line := `- [Docker Desktop](https://www.docker.com/products/docker-desktop/) - Official native app. Only for Windows and MacOS.`
|
||||||
|
entry, err := ParseEntry(line, 1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if entry.Name != "Docker Desktop" {
|
||||||
|
t.Errorf("name = %q, want %q", entry.Name, "Docker Desktop")
|
||||||
|
}
|
||||||
|
if entry.URL != "https://www.docker.com/products/docker-desktop/" {
|
||||||
|
t.Errorf("url = %q, want %q", entry.URL, "https://www.docker.com/products/docker-desktop/")
|
||||||
|
}
|
||||||
|
if entry.Description != "Official native app. Only for Windows and MacOS." {
|
||||||
|
t.Errorf("description = %q, want %q", entry.Description, "Official native app. Only for Windows and MacOS.")
|
||||||
|
}
|
||||||
|
if len(entry.Markers) != 0 {
|
||||||
|
t.Errorf("markers = %v, want empty", entry.Markers)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseEntryWithMarkers(t *testing.T) {
|
||||||
|
line := `- [Docker Swarm](https://github.com/docker/swarm) - Swarm clustering system. :skull:`
|
||||||
|
entry, err := ParseEntry(line, 1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if entry.Name != "Docker Swarm" {
|
||||||
|
t.Errorf("name = %q, want %q", entry.Name, "Docker Swarm")
|
||||||
|
}
|
||||||
|
if len(entry.Markers) != 1 || entry.Markers[0] != MarkerAbandoned {
|
||||||
|
t.Errorf("markers = %v, want [MarkerAbandoned]", entry.Markers)
|
||||||
|
}
|
||||||
|
if strings.Contains(entry.Description, ":skull:") {
|
||||||
|
t.Errorf("description should not contain marker text, got %q", entry.Description)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseEntryMultipleMarkers(t *testing.T) {
|
||||||
|
line := `- [SomeProject](https://example.com) - A project. :heavy_dollar_sign: :construction:`
|
||||||
|
entry, err := ParseEntry(line, 1)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if len(entry.Markers) != 2 {
|
||||||
|
t.Fatalf("markers count = %d, want 2", len(entry.Markers))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseDocument(t *testing.T) {
|
||||||
|
input := `# Awesome Docker
|
||||||
|
|
||||||
|
> A curated list
|
||||||
|
|
||||||
|
# Contents
|
||||||
|
|
||||||
|
- [Projects](#projects)
|
||||||
|
|
||||||
|
# Legend
|
||||||
|
|
||||||
|
- Abandoned :skull:
|
||||||
|
|
||||||
|
# Projects
|
||||||
|
|
||||||
|
## Tools
|
||||||
|
|
||||||
|
- [ToolA](https://github.com/a/a) - Does A.
|
||||||
|
- [ToolB](https://github.com/b/b) - Does B. :skull:
|
||||||
|
|
||||||
|
## Services
|
||||||
|
|
||||||
|
- [ServiceC](https://example.com/c) - Does C. :heavy_dollar_sign:
|
||||||
|
`
|
||||||
|
doc, err := Parse(strings.NewReader(input))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("unexpected error: %v", err)
|
||||||
|
}
|
||||||
|
if len(doc.Sections) == 0 {
|
||||||
|
t.Fatal("expected at least one section")
|
||||||
|
}
|
||||||
|
// Find the "Projects" section
|
||||||
|
var projects *Section
|
||||||
|
for i := range doc.Sections {
|
||||||
|
if doc.Sections[i].Title == "Projects" {
|
||||||
|
projects = &doc.Sections[i]
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if projects == nil {
|
||||||
|
t.Fatal("expected a Projects section")
|
||||||
|
}
|
||||||
|
if len(projects.Children) != 2 {
|
||||||
|
t.Errorf("projects children = %d, want 2", len(projects.Children))
|
||||||
|
}
|
||||||
|
if projects.Children[0].Title != "Tools" {
|
||||||
|
t.Errorf("first child = %q, want %q", projects.Children[0].Title, "Tools")
|
||||||
|
}
|
||||||
|
if len(projects.Children[0].Entries) != 2 {
|
||||||
|
t.Errorf("Tools entries = %d, want 2", len(projects.Children[0].Entries))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseNotAnEntry(t *testing.T) {
|
||||||
|
_, err := ParseEntry("- Abandoned :skull:", 1)
|
||||||
|
if err == nil {
|
||||||
|
t.Error("expected error for non-entry list item")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestParseRealREADME(t *testing.T) {
|
||||||
|
f, err := os.Open("../../README.md")
|
||||||
|
if err != nil {
|
||||||
|
t.Skip("README.md not found, skipping integration test")
|
||||||
|
}
|
||||||
|
defer f.Close()
|
||||||
|
|
||||||
|
doc, err := Parse(f)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("failed to parse README: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(doc.Sections) == 0 {
|
||||||
|
t.Error("expected sections")
|
||||||
|
}
|
||||||
|
|
||||||
|
total := countEntries(doc.Sections)
|
||||||
|
if total < 100 {
|
||||||
|
t.Errorf("expected at least 100 entries, got %d", total)
|
||||||
|
}
|
||||||
|
t.Logf("Parsed %d sections, %d total entries", len(doc.Sections), total)
|
||||||
|
}
|
||||||
|
|
||||||
|
func countEntries(sections []Section) int {
|
||||||
|
n := 0
|
||||||
|
for _, s := range sections {
|
||||||
|
n += len(s.Entries)
|
||||||
|
n += countEntries(s.Children)
|
||||||
|
}
|
||||||
|
return n
|
||||||
|
}
|
||||||
35
internal/parser/types.go
Normal file
35
internal/parser/types.go
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
package parser
|
||||||
|
|
||||||
|
// Marker identifies a status emoji attached to an entry.
type Marker int

// Known markers, each named after the emoji shortcode it stands for.
const (
	// MarkerAbandoned corresponds to the :skull: shortcode.
	MarkerAbandoned Marker = iota
	// MarkerPaid corresponds to the :heavy_dollar_sign: shortcode.
	MarkerPaid
	// MarkerWIP corresponds to the :construction: shortcode.
	MarkerWIP
)
|
||||||
|
|
||||||
|
// Entry is a single link entry in the README.
|
||||||
|
type Entry struct {
|
||||||
|
Name string
|
||||||
|
URL string
|
||||||
|
Description string
|
||||||
|
Markers []Marker
|
||||||
|
Line int // 1-based line number in source
|
||||||
|
Raw string // original line text
|
||||||
|
}
|
||||||
|
|
||||||
|
// Section is a heading with optional entries and child sections.
|
||||||
|
type Section struct {
|
||||||
|
Title string
|
||||||
|
Level int // heading level: 1 = #, 2 = ##, etc.
|
||||||
|
Entries []Entry
|
||||||
|
Children []Section
|
||||||
|
Line int
|
||||||
|
}
|
||||||
|
|
||||||
|
// Document is the parsed representation of the full README.
|
||||||
|
type Document struct {
|
||||||
|
Preamble []string // lines before the first section
|
||||||
|
Sections []Section
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user