// Source: root/lib/parse/parse.go
// Blob: a86e1669967c9f2879c155234c2564bc1678d7cb

package parse

import (
	"errors"
	"fmt"
	"io"
	"sort"
	"strings"
)

// Tag is a single markup tag. name holds the tag name and attributes maps
// each attribute key to its value.
type Tag struct {
	name       string
	attributes map[string]string
}

// String renders the tag in markup form, for example <a href="x">.
//
// Attributes are written in ascending key order so that the result is
// deterministic; ranging over the map directly would produce a different
// order from one call to the next. Values are emitted verbatim between
// double quotes, without any escaping.
func (t *Tag) String() string {
	keys := make([]string, 0, len(t.attributes))
	for k := range t.attributes {
		keys = append(keys, k)
	}
	sort.Strings(keys)

	var sb strings.Builder
	sb.WriteByte('<')
	sb.WriteString(t.name)
	for _, k := range keys {
		sb.WriteString(" " + k + `="` + t.attributes[k] + `"`)
	}
	sb.WriteByte('>')
	return sb.String()
}

// ReadUntil consumes r one byte at a time until it meets any byte listed in
// sentinels. It returns the text read before that byte, the sentinel that
// ended the scan (consumed, but not part of the returned text) and a nil
// error.
//
// If r fails or runs out of input before a sentinel is seen, ReadUntil
// returns "", 0 and an error whose message is "Missing '<sentinels>'"; the
// text gathered so far is discarded.
func ReadUntil(r io.Reader, sentinels []byte) (string, byte, error) {
	var buf []byte
	b := make([]byte, 1)

	for {
		// io.ReadFull, unlike a bare r.Read, neither loses a byte that
		// arrives together with io.EOF nor treats a (0, nil) read as if
		// it had delivered fresh data.
		if _, err := io.ReadFull(r, b); err != nil {
			// The message text is kept unchanged because callers may
			// match on it.
			return "", 0, errors.New(fmt.Sprintf("Missing '%s'", string(sentinels)))
		}

		for _, sentinel := range sentinels {
			if b[0] == sentinel {
				return string(buf), sentinel, nil
			}
		}

		buf = append(buf, b[0])
	}
}

// ReadTag parses one tag from r and stops after the closing '>'. The text is
// read starting at the tag name, so the caller must already have consumed the
// opening '<'.
//
// The tag name runs up to the first space or '>'. Each attribute is then read
// as key=value, where the value is either wrapped in single or double quotes
// or, when unquoted, runs up to the next space or '>'. Every key is scanned
// up to the next '=', so attributes written without '=' are not recognised.
//
// ReadTag returns a nil Tag and the error as soon as any read fails. Errors
// from ReadUntil are passed through unchanged; a failure to read the single
// byte after '=' or after a closing quote is returned as reported by
// io.ReadFull (io.EOF at end of input).
func ReadTag(r io.Reader) (*Tag, error) {
	tag := &Tag{attributes: make(map[string]string)}

	name, sentinel, err := ReadUntil(r, []byte{' ', '>'})
	if err != nil {
		return nil, err
	}
	tag.name = name

	next := make([]byte, 1)
	for sentinel != '>' {
		key, _, err := ReadUntil(r, []byte{'='})
		if err != nil {
			return nil, err
		}

		// The first byte after '=' decides how the value is delimited.
		if _, err = io.ReadFull(r, next); err != nil {
			return nil, err
		}

		var value string
		switch first := next[0]; first {
		case '\'', '"':
			// Quoted value: it ends at the matching quotation mark.
			value, _, err = ReadUntil(r, []byte{first})
			if err != nil {
				return nil, err
			}
			// The byte after the closing quote tells whether the tag
			// ends here ('>') or more attributes follow.
			if _, err = io.ReadFull(r, next); err != nil {
				return nil, err
			}
			sentinel = next[0]
		case ' ', '>':
			// Empty unquoted value: the byte just read is already the
			// terminator, so nothing further must be consumed.
			sentinel = first
		default:
			// Unquoted value: the byte just read is its first character.
			var rest string
			rest, sentinel, err = ReadUntil(r, []byte{' ', '>'})
			if err != nil {
				return nil, err
			}
			value = string(first) + rest
		}
		tag.attributes[key] = value
	}

	return tag, nil
}

/*
	Some HTML elements are empty (void): they have no content and take no
	closing tag. Here is a list taken from
	https://www.geeksforgeeks.org/html/what-are-empty-elements-in-html/
	var empty = []string{"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"}
*/

func Parse(r io.Reader) ([]any, error) {
	var document []any
	for {
		s, _, err := ReadUntil(r, []byte{'<'})
		if err != nil {
			if err.Error() == "Missing '<'" {
				break
			} else {
				return nil, err
			}
		}
		if s != "" {
			document = append(document, s)
		}

		e, err := ReadTag(r)
		if err != nil {
			return nil, err
		}
		document = append(document, e)
	}

	return document, nil
}