package parse import ( "errors" "fmt" "io" ) type Tag struct { name string attributes map[string]string } func (t *Tag) String() string { s := "<" + t.name for k, v := range t.attributes { s += " " + k + `="` + v + `"` } s += ">" return s } func ReadUntil(r io.Reader, sentinels []byte) (string, byte, error) { b := make([]byte, 1) var buf []byte var foundSentinel byte for { _, err := r.Read(b) if err != nil { return "", 0, errors.New(fmt.Sprintf("Missing '%s'", string(sentinels))) } found := false for _, sentinel := range sentinels { if b[0] == sentinel { found = true foundSentinel = sentinel } } if found { break } buf = append(buf, b[0]) } return string(buf), foundSentinel, nil } func ReadTag(r io.Reader) (*Tag, error) { e := new(Tag) e.attributes = make(map[string]string) var err error var foundSentinel byte e.name, foundSentinel, err = ReadUntil(r, []byte{' ', '>'}) if err != nil { return nil, err } for { if foundSentinel == '>' { break } key, _, err := ReadUntil(r, []byte{'='}) if err != nil { return nil, err } // Single and double quotation marks are significant. peek := make([]byte, 1) _, err = r.Read(peek) if err != nil { return nil, err } var value string if peek[0] == '\'' || peek[0] == '"' { value, _, err = ReadUntil(r, peek) b := make([]byte, 1) _, err = r.Read(b) foundSentinel = b[0] } else { value, foundSentinel, err = ReadUntil(r, []byte{' ', '>'}) value = string(peek) + value } e.attributes[key] = value } return e, err } /* Some elements are empty. Here's a list taken from https://www.geeksforgeeks.org/html/what-are-empty-elements-in-html/ var empty = []string{"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"} */ func Parse(r io.Reader) ([]any, error) { var document []any for { s, _, err := ReadUntil(r, []byte{'<'}) if err != nil { if err.Error() == "Missing '<'" { break } else { return nil, err } } if s != "" { document = append(document, s) } e, err := ReadTag(r) if err != nil { return nil, err } document = append(document, e) } return document, nil }