summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
author kaa <kaa@disroot.org> 2025-07-13 04:10:38 -0700
committer kaa <kaa@disroot.org> 2025-07-13 04:10:38 -0700
commit 8f4bb7e9ebc39418d9ef341a9e913d63eff0535d (patch)
tree 3026c461c486be977ddb4cce83bf4e8792edc2c5 /lib
parent 30ec71ec68698fe35b0f637f1a2c2dc147b461d5 (diff)
HTML parsing
Diffstat (limited to 'lib')
-rw-r--r-- lib/parse/parse.go 118
-rw-r--r-- lib/parse/parse_test.go 98
2 files changed, 198 insertions, 18 deletions
diff --git a/lib/parse/parse.go b/lib/parse/parse.go
index 332644d..15e563c 100644
--- a/lib/parse/parse.go
+++ b/lib/parse/parse.go
@@ -1,35 +1,117 @@
package parse
import (
- //"fmt"
+ "fmt"
"io"
+ "errors"
)
-type Element struct {
- name string
- attributes map[string]string
- contents string
- embedded *Element
-}
-
-func ReadTag() {
-}
-func Parse(r io.Reader) (Element, error) {
+type Tag struct {
+ name string
+ attributes map[string]string
+}
+
+func ReadUntil(r io.Reader, sentinels []byte) (string, byte, error) {
b := make([]byte, 1)
+ var buf []byte
+ var foundSentinel byte
+
for {
_, err := r.Read(b)
if err != nil {
- return Element{}, err
+ return "", 0, errors.New(fmt.Sprintf("Missing '%s'", string(sentinels)))
}
- switch (b[0]) {
- case '<':
- fmt.Println("tag")
+
+ found := false
+ for _, sentinel := range sentinels {
+ if b[0] == sentinel {
+ found = true
+ foundSentinel = sentinel
+ }
}
+
+ if found {
+ break
+ }
+
+ buf = append(buf, b[0])
+ }
+
+ return string(buf), foundSentinel, nil
+}
+
+func ReadTag(r io.Reader) (*Tag, error) {
+ e := new(Tag)
+ e.attributes = make(map[string]string)
+ var err error
+ var foundSentinel byte
+
+ e.name, foundSentinel, err = ReadUntil(r, []byte{' ', '>'})
+ if err != nil {
+ return nil, err
}
- var e Element
- e.name = "bob"
+ for {
+ if foundSentinel == '>' {
+ break
+ }
+
+ key, _, err := ReadUntil(r, []byte{'='})
+ if err != nil {
+ return nil, err
+ }
+
+ // Single and double quotation marks are significant.
+ peek := make([]byte, 1)
+ _, err = r.Read(peek)
+ if err != nil {
+ return nil, err
+ }
+
+ var value string
+ if peek[0] == '\'' || peek[0] == '"' {
+ value, _, err = ReadUntil(r, peek)
+ b := make([]byte, 1)
+ _, err = r.Read(b)
+ foundSentinel = b[0]
+ } else {
+ value, foundSentinel, err = ReadUntil(r, []byte{' ', '>'})
+ value = string(peek) + value
+ }
+ e.attributes[key] = value
+ }
+
+ return e, err
+}
+
+/*
+ Some elements are empty. Here's a list taken
+ from https://www.geeksforgeeks.org/html/what-are-empty-elements-in-html/
+ var empty = []string{"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
+*/
+
+func Parse(r io.Reader) ([]any, error) {
+ var document []any
+ for {
+ s, _, err := ReadUntil(r, []byte{'<'})
+ if err != nil {
+ if err.Error() == "Missing '<'" {
+ break
+ } else {
+ return nil, err
+ }
+ }
+ if s != "" {
+ document = append(document, s)
+ }
+
+ e, err := ReadTag(r)
+ if err != nil {
+ return nil, err
+ }
+ document = append(document, e)
+ }
- return e, nil
+ return document, nil
}
diff --git a/lib/parse/parse_test.go b/lib/parse/parse_test.go
new file mode 100644
index 0000000..3bc837b
--- /dev/null
+++ b/lib/parse/parse_test.go
@@ -0,0 +1,98 @@
+package parse
+
+import (
+ "testing"
+ "strings"
+)
+
+func TestReadUntil(t *testing.T) {
+ s := "until="
+ want := "until"
+ msg, _, err := ReadUntil(strings.NewReader(s), []byte{'='})
+ if err != nil {
+ t.Fatal(err)
+ }
+ if want != msg {
+ t.Errorf(`ReadUntil(strings.NewReader(s), []byte{'='}) = %q, %v want "", error`, msg, err)
+ }
+}
+
+func TestReadTag(t *testing.T) {
+ tag := "<hello>"
+ want := new(Tag)
+ want.name = "hello"
+ r := strings.NewReader(tag)
+ b := make([]byte, 1)
+ // Consume '<'
+ _, _ = r.Read(b)
+ msg, err := ReadTag(r)
+
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if want.name != msg.name {
+ t.Errorf(`ReadTag(strings.NewReader("<hello>")) = %q, %v, want "", error`, msg.name, err)
+ }
+}
+
+func TestReadTagAttributes(t *testing.T) {
+ tag := `<hello attribute=value and="another one">`
+ want := new(Tag)
+ want.attributes = make(map[string]string)
+ want.attributes["attribute"] = "value"
+ want.attributes["and"] = "another one"
+ r := strings.NewReader(tag)
+ b := make([]byte, 1)
+ // Consume '<'
+ _, _ = r.Read(b)
+ msg, err := ReadTag(r)
+
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ if want.attributes["attribute"] != msg.attributes["attribute"] {
+ t.Errorf(`ReadTag(strings.NewReader(<hello attribute=value and="another one">)) = %q, %v, want "", error`, msg.attributes["attribute"], err)
+ }
+}
+
+func TestParseTagContents(t *testing.T) {
+ elementString := `<p>Contents</p>`
+ want := make([]any, 3)
+ {
+ e := new(Tag)
+ e.name = "p"
+ want[0] = e
+ }
+ want[1] = "Contents"
+ {
+ e := new(Tag)
+ e.name = "/p"
+ want[2] = e
+ }
+
+ msg, err := Parse(strings.NewReader(elementString))
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ for e := range msg {
+ same := true
+ switch msg[e].(type) {
+ case Tag:
+ same = msg[e].(Tag).name == want[e].(Tag).name
+ for k := range msg[e].(Tag).attributes {
+ if msg[e].(Tag).attributes[k] != want[e].(Tag).attributes[k] {
+ same = false
+ break
+ }
+ }
+ case string:
+ same = msg[e] == want[e]
+ }
+ if !same {
+ t.Errorf(`Parse(strings.NewReader(elementString)) = %q, %v, want "", error`, msg[e], err)
+ }
+ }
+}