diff options
| author | kaa <kaa@disroot.org> | 2025-07-13 04:10:38 -0700 | 
|---|---|---|
| committer | kaa <kaa@disroot.org> | 2025-07-13 04:10:38 -0700 | 
| commit | 8f4bb7e9ebc39418d9ef341a9e913d63eff0535d (patch) | |
| tree | 3026c461c486be977ddb4cce83bf4e8792edc2c5 | |
| parent | 30ec71ec68698fe35b0f637f1a2c2dc147b461d5 (diff) | |
HTML parsing
| -rw-r--r-- | lib/parse/parse.go | 118 | ||||
| -rw-r--r-- | lib/parse/parse_test.go | 98 | 
2 files changed, 198 insertions, 18 deletions
diff --git a/lib/parse/parse.go b/lib/parse/parse.go index 332644d..15e563c 100644 --- a/lib/parse/parse.go +++ b/lib/parse/parse.go @@ -1,35 +1,117 @@  package parse  import ( -	//"fmt" +	"fmt"  	"io" +	"errors"  ) -type Element struct {                      -        name    string                     -        attributes      map[string]string  -        contents        string                                   -        embedded        *Element                                 -}                                                                    -                                                                     -func ReadTag() {                                                     -}                                                                    -func Parse(r io.Reader) (Element, error) { +type Tag struct { +        name	string +        attributes	map[string]string +} + +func ReadUntil(r io.Reader, sentinels []byte) (string, byte, error) {  	b := make([]byte, 1) +	var buf []byte +	var foundSentinel byte +  	for {  		_, err := r.Read(b)  		if err != nil { -			return Element{}, err +			return "", 0, errors.New(fmt.Sprintf("Missing '%s'", string(sentinels)))  		} -		switch (b[0]) { -		case '<': -			fmt.Println("tag") + +		found := false +		for _, sentinel := range sentinels { +			if b[0] == sentinel { +				found = true +				foundSentinel = sentinel +			}  		} + +		if found { +			break +		} + +		buf = append(buf, b[0]) +	} + +	return string(buf), foundSentinel, nil +} + +func ReadTag(r io.Reader) (*Tag, error) { +	e := new(Tag) +	e.attributes = make(map[string]string) +	var err error +	var foundSentinel byte + +	e.name, foundSentinel, err = ReadUntil(r, []byte{' ', '>'}) +	if err != nil { +		return nil, err  	} -	var e Element -	e.name = "bob" +	for { +		if foundSentinel == '>' { +			break +		} + +		key, _, err := ReadUntil(r, []byte{'='}) +		if err != nil { +			return nil, err +		} + +		// Single and double quotation marks are significant. +		peek := make([]byte, 1) +		_, err = r.Read(peek) +		if err != nil { +			return nil, err +		} + +		var value string +		if peek[0] == '\'' || peek[0] == '"' { +			value, _, err = ReadUntil(r, peek) +			b := make([]byte, 1) +			_, err = r.Read(b) +			foundSentinel = b[0] +		} else { +			value, foundSentinel, err = ReadUntil(r, []byte{' ', '>'}) +			value = string(peek) + value +		} +		e.attributes[key] = value +	} + +	return e, err +} + +/* +	Some elements are empty. Here's a list taken +	from https://www.geeksforgeeks.org/html/what-are-empty-elements-in-html/ +	var empty = []string{"area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "param", "source", "track", "wbr"} +*/ + +func Parse(r io.Reader) ([]any, error) { +	var document []any +	for { +		s, _, err := ReadUntil(r, []byte{'<'}) +		if err != nil { +			if err.Error() == "Missing '<'" { +				break +			} else { +				return nil, err +			} +		} +		if s != "" { +			document = append(document, s) +		} + +		e, err := ReadTag(r) +		if err != nil { +			return nil, err +		} +		document = append(document, e) +	} -	return e, nil +	return document, nil  } diff --git a/lib/parse/parse_test.go b/lib/parse/parse_test.go new file mode 100644 index 0000000..3bc837b --- /dev/null +++ b/lib/parse/parse_test.go @@ -0,0 +1,98 @@ +package parse + +import ( +	"testing" +	"strings" +) + +func TestReadUntil(t *testing.T) { +	s := "until=" +	want := "until" +	msg, _, err := ReadUntil(strings.NewReader(s), []byte{'='}) +	if err != nil { +		t.Fatal(err) +	} +	if want != msg { +		t.Errorf(`ReadUntil(strings.NewReader(s), []byte{'='}) = %q, %v want "", error`, msg, err) +	} +} + +func TestReadTag(t *testing.T) { +	tag := "<hello>" +	want := new(Tag) +	want.name = "hello" +	r := strings.NewReader(tag) +	b := make([]byte, 1) +	// Consume '<' +	_, _ = r.Read(b) +	msg, err := ReadTag(r) + +	if err != nil { +		t.Fatal(err) +	} + +	if want.name != msg.name { +		t.Errorf(`ReadTag(strings.NewReader("<hello>")) = %q, %v, want "", error`, msg.name, err) +	} +} + +func TestReadTagAttributes(t *testing.T) { +	tag := `<hello attribute=value and="another one">` +	want := new(Tag) +	want.attributes = make(map[string]string) +	want.attributes["attribute"] = "value" +	want.attributes["and"] = "another one" +	r := strings.NewReader(tag) +	b := make([]byte, 1) +	// Consume '<' +	_, _ = r.Read(b) +	msg, err := ReadTag(r) + +	if err != nil { +		t.Fatal(err) +	} + +	if want.attributes["attribute"] != msg.attributes["attribute"] { +		t.Errorf(`ReadTag(strings.NewReader(<hello attribute=value and="another one">)) = %q, %v, want "", error`, msg.attributes["attribute"], err) +	} +} + +func TestParseTagContents(t *testing.T) { +	elementString := `<p>Contents</p>` +	want := make([]any, 3) +	{ +		e := new(Tag) +		e.name = "p" +		want[0] = e +	} +	want[1] = "Contents" +	{ +		e := new(Tag) +		e.name = "/p" +		want[2] = e +	} + +	msg, err := Parse(strings.NewReader(elementString)) +	if err != nil { +		t.Fatal(err) +	} + +	for e := range msg { +		same := true +		switch msg[e].(type) { +		case Tag: +			same = msg[e].(Tag).name == want[e].(Tag).name +			for k := range msg[e].(Tag).attributes { +				if msg[e].(Tag).attributes[k] != want[e].(Tag).attributes[k] { +					same = false +					break +				} +			} +		case string: +			same = msg[e] == want[e] +		} +		if !same { +			t.Errorf(`Parse(strings.NewReader(elementString)) = %q, %v, want "", error`, msg[e], err) +		} +	} +}  | 
