feat: add README parser with entry extraction and section tree building
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
128
internal/parser/parser.go
Normal file
128
internal/parser/parser.go
Normal file
@@ -0,0 +1,128 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"io"
|
||||
"regexp"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
	// entryRe recognises a list item that links to a project:
	//
	//	- [Name](URL) - Description
	//
	// The bullet may be "-" or "*". Capture groups: 1 = name, 2 = URL,
	// 3 = description (everything after the " - " separator).
	entryRe = regexp.MustCompile(`^[-*]\s+\[([^\]]+)\]\(([^)]+)\)\s+-\s+(.+)$`)

	// headingRe recognises ATX headings ("# Title" through "###### Title").
	// Capture groups: 1 = the run of "#" characters, 2 = the title. A trailing
	// HTML comment on the heading line is left out of the title.
	headingRe = regexp.MustCompile(`^(#{1,6})\s+(.+?)(?:\s*<!--.*-->)?$`)
)
|
||||
|
||||
var markerMap = map[string]Marker{
|
||||
":skull:": MarkerAbandoned,
|
||||
":heavy_dollar_sign:": MarkerPaid,
|
||||
":construction:": MarkerWIP,
|
||||
}
|
||||
|
||||
// ParseEntry parses a single markdown list line into an Entry.
|
||||
func ParseEntry(line string, lineNum int) (Entry, error) {
|
||||
m := entryRe.FindStringSubmatch(strings.TrimSpace(line))
|
||||
if m == nil {
|
||||
return Entry{}, fmt.Errorf("line %d: not a valid entry: %q", lineNum, line)
|
||||
}
|
||||
|
||||
desc := m[3]
|
||||
var markers []Marker
|
||||
|
||||
for text, marker := range markerMap {
|
||||
if strings.Contains(desc, text) {
|
||||
markers = append(markers, marker)
|
||||
desc = strings.ReplaceAll(desc, text, "")
|
||||
}
|
||||
}
|
||||
desc = strings.TrimSpace(desc)
|
||||
|
||||
return Entry{
|
||||
Name: m[1],
|
||||
URL: m[2],
|
||||
Description: desc,
|
||||
Markers: markers,
|
||||
Line: lineNum,
|
||||
Raw: line,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Parse reads a full README and returns a Document.
|
||||
func Parse(r io.Reader) (Document, error) {
|
||||
scanner := bufio.NewScanner(r)
|
||||
var doc Document
|
||||
var allSections []struct {
|
||||
section Section
|
||||
level int
|
||||
}
|
||||
|
||||
lineNum := 0
|
||||
for scanner.Scan() {
|
||||
lineNum++
|
||||
line := scanner.Text()
|
||||
|
||||
// Check for heading
|
||||
if hm := headingRe.FindStringSubmatch(line); hm != nil {
|
||||
level := len(hm[1])
|
||||
title := strings.TrimSpace(hm[2])
|
||||
allSections = append(allSections, struct {
|
||||
section Section
|
||||
level int
|
||||
}{
|
||||
section: Section{Title: title, Level: level, Line: lineNum},
|
||||
level: level,
|
||||
})
|
||||
continue
|
||||
}
|
||||
|
||||
// Check for entry (list item with link)
|
||||
if entry, err := ParseEntry(line, lineNum); err == nil {
|
||||
if len(allSections) > 0 {
|
||||
allSections[len(allSections)-1].section.Entries = append(
|
||||
allSections[len(allSections)-1].section.Entries, entry)
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
// Everything else: preamble if no sections yet
|
||||
if len(allSections) == 0 {
|
||||
doc.Preamble = append(doc.Preamble, line)
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return doc, err
|
||||
}
|
||||
|
||||
// Build section tree by nesting based on heading level
|
||||
doc.Sections = buildTree(allSections)
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
func buildTree(flat []struct {
|
||||
section Section
|
||||
level int
|
||||
}) []Section {
|
||||
if len(flat) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
var result []Section
|
||||
for i := 0; i < len(flat); i++ {
|
||||
current := flat[i].section
|
||||
currentLevel := flat[i].level
|
||||
|
||||
// Collect children: everything after this heading at a deeper level
|
||||
j := i + 1
|
||||
for j < len(flat) && flat[j].level > currentLevel {
|
||||
j++
|
||||
}
|
||||
if j > i+1 {
|
||||
current.Children = buildTree(flat[i+1 : j])
|
||||
}
|
||||
result = append(result, current)
|
||||
i = j - 1
|
||||
}
|
||||
return result
|
||||
}
|
||||
147
internal/parser/parser_test.go
Normal file
147
internal/parser/parser_test.go
Normal file
@@ -0,0 +1,147 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestParseEntry(t *testing.T) {
|
||||
line := `- [Docker Desktop](https://www.docker.com/products/docker-desktop/) - Official native app. Only for Windows and MacOS.`
|
||||
entry, err := ParseEntry(line, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if entry.Name != "Docker Desktop" {
|
||||
t.Errorf("name = %q, want %q", entry.Name, "Docker Desktop")
|
||||
}
|
||||
if entry.URL != "https://www.docker.com/products/docker-desktop/" {
|
||||
t.Errorf("url = %q, want %q", entry.URL, "https://www.docker.com/products/docker-desktop/")
|
||||
}
|
||||
if entry.Description != "Official native app. Only for Windows and MacOS." {
|
||||
t.Errorf("description = %q, want %q", entry.Description, "Official native app. Only for Windows and MacOS.")
|
||||
}
|
||||
if len(entry.Markers) != 0 {
|
||||
t.Errorf("markers = %v, want empty", entry.Markers)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseEntryWithMarkers(t *testing.T) {
|
||||
line := `- [Docker Swarm](https://github.com/docker/swarm) - Swarm clustering system. :skull:`
|
||||
entry, err := ParseEntry(line, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if entry.Name != "Docker Swarm" {
|
||||
t.Errorf("name = %q, want %q", entry.Name, "Docker Swarm")
|
||||
}
|
||||
if len(entry.Markers) != 1 || entry.Markers[0] != MarkerAbandoned {
|
||||
t.Errorf("markers = %v, want [MarkerAbandoned]", entry.Markers)
|
||||
}
|
||||
if strings.Contains(entry.Description, ":skull:") {
|
||||
t.Errorf("description should not contain marker text, got %q", entry.Description)
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseEntryMultipleMarkers(t *testing.T) {
|
||||
line := `- [SomeProject](https://example.com) - A project. :heavy_dollar_sign: :construction:`
|
||||
entry, err := ParseEntry(line, 1)
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(entry.Markers) != 2 {
|
||||
t.Fatalf("markers count = %d, want 2", len(entry.Markers))
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseDocument(t *testing.T) {
|
||||
input := `# Awesome Docker
|
||||
|
||||
> A curated list
|
||||
|
||||
# Contents
|
||||
|
||||
- [Projects](#projects)
|
||||
|
||||
# Legend
|
||||
|
||||
- Abandoned :skull:
|
||||
|
||||
# Projects
|
||||
|
||||
## Tools
|
||||
|
||||
- [ToolA](https://github.com/a/a) - Does A.
|
||||
- [ToolB](https://github.com/b/b) - Does B. :skull:
|
||||
|
||||
## Services
|
||||
|
||||
- [ServiceC](https://example.com/c) - Does C. :heavy_dollar_sign:
|
||||
`
|
||||
doc, err := Parse(strings.NewReader(input))
|
||||
if err != nil {
|
||||
t.Fatalf("unexpected error: %v", err)
|
||||
}
|
||||
if len(doc.Sections) == 0 {
|
||||
t.Fatal("expected at least one section")
|
||||
}
|
||||
// Find the "Projects" section
|
||||
var projects *Section
|
||||
for i := range doc.Sections {
|
||||
if doc.Sections[i].Title == "Projects" {
|
||||
projects = &doc.Sections[i]
|
||||
break
|
||||
}
|
||||
}
|
||||
if projects == nil {
|
||||
t.Fatal("expected a Projects section")
|
||||
}
|
||||
if len(projects.Children) != 2 {
|
||||
t.Errorf("projects children = %d, want 2", len(projects.Children))
|
||||
}
|
||||
if projects.Children[0].Title != "Tools" {
|
||||
t.Errorf("first child = %q, want %q", projects.Children[0].Title, "Tools")
|
||||
}
|
||||
if len(projects.Children[0].Entries) != 2 {
|
||||
t.Errorf("Tools entries = %d, want 2", len(projects.Children[0].Entries))
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseNotAnEntry(t *testing.T) {
|
||||
_, err := ParseEntry("- Abandoned :skull:", 1)
|
||||
if err == nil {
|
||||
t.Error("expected error for non-entry list item")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseRealREADME(t *testing.T) {
|
||||
f, err := os.Open("../../README.md")
|
||||
if err != nil {
|
||||
t.Skip("README.md not found, skipping integration test")
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
doc, err := Parse(f)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to parse README: %v", err)
|
||||
}
|
||||
|
||||
if len(doc.Sections) == 0 {
|
||||
t.Error("expected sections")
|
||||
}
|
||||
|
||||
total := countEntries(doc.Sections)
|
||||
if total < 100 {
|
||||
t.Errorf("expected at least 100 entries, got %d", total)
|
||||
}
|
||||
t.Logf("Parsed %d sections, %d total entries", len(doc.Sections), total)
|
||||
}
|
||||
|
||||
func countEntries(sections []Section) int {
|
||||
n := 0
|
||||
for _, s := range sections {
|
||||
n += len(s.Entries)
|
||||
n += countEntries(s.Children)
|
||||
}
|
||||
return n
|
||||
}
|
||||
35
internal/parser/types.go
Normal file
35
internal/parser/types.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package parser
|
||||
|
||||
// Marker is a status flag attached to an entry by an emoji shortcode in its
// description.
type Marker int

const (
	// MarkerAbandoned corresponds to the ":skull:" shortcode.
	MarkerAbandoned Marker = iota

	// MarkerPaid corresponds to the ":heavy_dollar_sign:" shortcode.
	MarkerPaid

	// MarkerWIP corresponds to the ":construction:" shortcode.
	MarkerWIP
)

// Entry describes one link entry found in the README.
type Entry struct {
	// Name is the link text of the entry.
	Name string

	// URL is the link target of the entry.
	URL string

	// Description is the text that follows the link.
	Description string

	// Markers lists the status markers attached to the entry.
	Markers []Marker

	// Line is the 1-based line number of the entry in the source.
	Line int

	// Raw is the original text of the line.
	Raw string
}

// Section is a heading together with the entries and child sections that
// belong to it; both may be empty.
type Section struct {
	// Title is the heading text.
	Title string

	// Level is the heading level: 1 for "#", 2 for "##", and so on.
	Level int

	// Entries holds the entries that belong to this section.
	Entries []Entry

	// Children holds the sections nested under this one.
	Children []Section

	// Line is the line number recorded for the section.
	Line int
}

// Document is the parsed form of a complete README.
type Document struct {
	// Preamble holds the lines that come before the first section.
	Preamble []string

	// Sections holds the parsed sections of the document.
	Sections []Section
}
|
||||
Reference in New Issue
Block a user