// Package parse reads angle-bracket markup from an io.Reader into a flat
// sequence of text runs and tags.
package parse
import (
	"errors"
	"fmt"
	"io"
	"sort"
	"strings"
)
// Tag is a single markup tag: a name plus its attributes.
type Tag struct {
	// name is the text that follows '<', up to the first ' ' or '>'.
	name string
	// attributes maps each attribute name to its value (quotes removed).
	attributes map[string]string
}

// String renders the tag as `<name key="value" ...>`.
//
// Attributes are written in ascending key order so that the result is the
// same on every call; ranging over the map directly would yield a different
// order from one call to the next. Values are written verbatim between double
// quotes and are not escaped.
func (t *Tag) String() string {
	keys := make([]string, 0, len(t.attributes))
	for k := range t.attributes {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var sb strings.Builder
	sb.WriteString("<")
	sb.WriteString(t.name)
	for _, k := range keys {
		sb.WriteString(" " + k + `="` + t.attributes[k] + `"`)
	}
	sb.WriteString(">")
	return sb.String()
}
// ReadUntil consumes bytes from r until it meets one of sentinels.
//
// It returns the bytes read before the sentinel (the sentinel itself is
// consumed but not included) and the sentinel that was found.
//
// If the input ends first, the error text is "Missing '<sentinels>'" and the
// bytes read so far are discarded. Any other read failure is returned wrapped,
// so it can be inspected with errors.Is / errors.As and is not mistaken for a
// plain end of input.
func ReadUntil(r io.Reader, sentinels []byte) (string, byte, error) {
	var (
		b   [1]byte
		buf []byte
	)
	for {
		// Read exactly one byte at a time so that nothing beyond the sentinel
		// is taken from r. io.ReadFull retries (0, nil) reads and keeps a byte
		// that arrives together with io.EOF.
		if _, err := io.ReadFull(r, b[:]); err != nil {
			if errors.Is(err, io.EOF) {
				// Parse recognises the end of the document by this exact
				// message, so its wording must not change.
				return "", 0, fmt.Errorf("Missing '%s'", string(sentinels))
			}
			return "", 0, fmt.Errorf("reading until '%s': %w", string(sentinels), err)
		}
		for _, sentinel := range sentinels {
			if b[0] == sentinel {
				return string(buf), sentinel, nil
			}
		}
		buf = append(buf, b[0])
	}
}
// ReadTag reads one tag from r, starting just after the opening '<' (Parse
// consumes that byte before calling ReadTag) and ending with the closing '>'.
//
// The tag name runs up to the first ' ' or '>'. Each attribute must have the
// form key=value. A value may be wrapped in single or double quotes, in which
// case it extends to the matching quote; otherwise it extends to the next ' '
// or '>'. An unquoted value may be empty (`k=>`).
//
// Limitations: the byte that follows a closing quote is taken as the
// separator without being validated, and an attribute without '=' makes the
// key search run on to the next '=' in the input.
//
// On any error the returned *Tag is nil. Errors from ReadUntil are returned
// unchanged; failures of single-byte reads are wrapped with the name of the
// attribute being read.
func ReadTag(r io.Reader) (*Tag, error) {
	name, sep, err := ReadUntil(r, []byte{' ', '>'})
	if err != nil {
		return nil, err
	}
	tag := &Tag{name: name, attributes: make(map[string]string)}

	var b [1]byte
	for sep != '>' {
		var key, value string
		if key, _, err = ReadUntil(r, []byte{'='}); err != nil {
			return nil, err
		}
		// The first byte after '=' decides how the value is delimited.
		if _, err = io.ReadFull(r, b[:]); err != nil {
			return nil, fmt.Errorf("reading value of attribute %q: %w", key, err)
		}
		switch first := b[0]; first {
		case '\'', '"':
			// Quoted: the value ends at the matching quote character.
			if value, _, err = ReadUntil(r, []byte{first}); err != nil {
				return nil, err
			}
			// The byte after the closing quote is the separator.
			if _, err = io.ReadFull(r, b[:]); err != nil {
				return nil, fmt.Errorf("reading past value of attribute %q: %w", key, err)
			}
			sep = b[0]
		case ' ', '>':
			// Empty unquoted value: the byte just read is already the
			// separator and must not become part of the value.
			sep = first
		default:
			// Unquoted: the byte just read is the first byte of the value.
			// Convert the one-byte slice (not the byte) so that non-ASCII
			// bytes are kept as-is rather than re-encoded as a rune.
			head := string(b[:])
			if value, sep, err = ReadUntil(r, []byte{' ', '>'}); err != nil {
				return nil, err
			}
			value = head + value
		}
		tag.attributes[key] = value
	}
	return tag, nil
}
/*
Some elements are empty. Here's a list taken
from https://www.geeksforgeeks.org/html/what-are-empty-elements-in-html/
var empty = []string{"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
*/

// Parse reads r to the end and returns its content as a flat sequence.
//
// Each element of the returned slice is either a string (a non-empty run of
// text between tags) or a *Tag (as returned by ReadTag), in document order.
// Text that follows the last tag is included. Tags are not nested or matched;
// closing tags appear as ordinary *Tag values.
//
// An error from ReadTag, or any ReadUntil error other than the end-of-input
// one, aborts parsing and is returned with a nil slice.
func Parse(r io.Reader) ([]any, error) {
	var document []any
	for {
		// ReadUntil discards what it has read when the input ends before the
		// next '<', so mirror the bytes it consumes to keep trailing text.
		var text strings.Builder
		s, _, err := ReadUntil(io.TeeReader(r, &text), []byte{'<'})
		if err != nil {
			// ReadUntil signals end of input only through this message.
			if err.Error() != "Missing '<'" {
				return nil, err
			}
			if text.Len() > 0 {
				document = append(document, text.String())
			}
			return document, nil
		}
		if s != "" {
			document = append(document, s)
		}
		e, err := ReadTag(r)
		if err != nil {
			return nil, err
		}
		document = append(document, e)
	}
}