diff options
| -rw-r--r-- | html.go | 88 | ||||
| -rw-r--r-- | html_test.go | 97 |
2 files changed, 185 insertions, 0 deletions
// Reconstructed from the diff of html.go and html_test.go (package human).
// The code below needs "net/url" and "strings"; the tests additionally need
// "testing".

// GetURLFromHTML returns the [url.URL] referenced by the first
// <link rel="human-json" href="..."> tag found in the raw HTML.
// Base is the URL containing the HTML; the href is resolved against it
// following RFC 3986, so absolute, host-relative ("//host/x"), root-relative
// ("/x") and document-relative ("x", "../x") references are all supported.
// If base is nil the href is returned as parsed, without resolution.
//
// Returns nil (and a nil error) if nothing is found. An error is returned
// only when a matching tag carries an href that cannot be parsed as a URL.
func GetURLFromHTML(b []byte, base *url.URL) (*url.URL, error) {
	const tag = "<link"
	rest := string(b)
	for {
		i := strings.Index(rest, tag)
		if i < 0 {
			return nil, nil
		}
		// Drop everything up to and including this match so that every
		// iteration makes progress through the document.
		rest = rest[i+len(tag):]
		if rest == "" || !isHTMLSpace(rest[0]) {
			// Not a link tag with attributes (e.g. "<linked-item>").
			continue
		}
		args := parseArgs(rest)
		if args["rel"] != "human-json" {
			continue
		}
		href, ok := args["href"]
		if !ok {
			continue
		}
		u, err := url.Parse(href)
		if err != nil {
			return nil, err
		}
		if base == nil {
			return u, nil
		}
		return base.ResolveReference(u), nil
	}
}

// isHTMLSpace reports whether c is one of the ASCII whitespace bytes that
// may separate attributes inside an HTML tag.
func isHTMLSpace(c byte) bool {
	switch c {
	case ' ', '\t', '\n', '\r', '\f':
		return true
	}
	return false
}

// atTagEnd reports whether index i of s is at the end of the tag: past the
// end of the input, on '>', or on the "/>" of a self-closing tag.
func atTagEnd(s string, i int) bool {
	if i >= len(s) {
		return true
	}
	return s[i] == '>' || (s[i] == '/' && i+1 < len(s) && s[i+1] == '>')
}

// parseArgs parses the attributes of an HTML tag. Base is the text that
// follows the tag name; parsing stops at the first unquoted '>' or "/>", or
// at the end of the input.
//
// Attributes are separated by ASCII whitespace. A value may be unquoted
// (it then ends at whitespace or at the end of the tag) or wrapped in single
// or double quotes (it may then contain whitespace and '>'). An attribute
// without a value maps to the empty string. Attributes with an empty name
// are dropped.
func parseArgs(base string) map[string]string {
	res := map[string]string{}
	i := 0
	for {
		for i < len(base) && isHTMLSpace(base[i]) {
			i++
		}
		if atTagEnd(base, i) {
			return res
		}

		// Attribute name. All delimiters are ASCII, so scanning bytes is
		// safe on UTF-8 input.
		start := i
		for !atTagEnd(base, i) && base[i] != '=' && !isHTMLSpace(base[i]) {
			i++
		}
		key := base[start:i]
		if i >= len(base) || base[i] != '=' {
			// Bare attribute; key is non-empty here because at least one
			// byte was consumed above.
			res[key] = ""
			continue
		}
		i++ // consume '='

		var value string
		if i < len(base) && (base[i] == '"' || base[i] == '\'') {
			quote := base[i]
			i++
			start = i
			if end := strings.IndexByte(base[i:], quote); end >= 0 {
				i += end
				value = base[start:i]
				i++ // consume the closing quote
			} else {
				// Unterminated quote: fall back to the end of the tag.
				for !atTagEnd(base, i) {
					i++
				}
				value = base[start:i]
			}
		} else {
			start = i
			for !atTagEnd(base, i) && !isHTMLSpace(base[i]) {
				i++
			}
			value = base[start:i]
		}
		if key != "" {
			res[key] = value
		}
	}
}

// Tests from html_test.go.

func TestGetURLFromHTML(t *testing.T) {
	base, err := url.Parse(`https://example.org/foo/`)
	if err != nil {
		t.Fatal(err)
	}
	cases := []struct {
		html string
		path string
	}{
		{`<link rel=human-json href=/human.json>`, "/human.json"},
		{`<link rel="human-json" href="/human.json">`, "/human.json"},
		{`<link rel="human-json" href="human.json">`, "/foo/human.json"},
		{`<link rel="human-json" href="../human.json">`, "/human.json"},
		// An unrelated link before the right one must be skipped.
		{`<link rel="stylesheet" href="a.css"><link rel="human-json" href="/human.json">`, "/human.json"},
		// Non-ASCII text before the tag and a multi-line tag.
		{"<title>é</title>\n<link\n rel='human-json'\n href='/human.json'\n/>", "/human.json"},
	}
	for _, c := range cases {
		u, err := GetURLFromHTML([]byte(c.html), base)
		if err != nil {
			t.Fatal(err)
		}
		if u == nil {
			t.Fatalf("%q: not found", c.html)
		}
		if u.Path != c.path {
			t.Errorf("%q: invalid path: %s", c.html, u.Path)
		}
		if u.Host != "example.org" {
			t.Errorf("%q: invalid host: %s", c.html, u.Host)
		}
	}

	u, err := GetURLFromHTML([]byte(`<link rel="stylesheet" href="a.css">`), base)
	if err != nil {
		t.Fatal(err)
	}
	if u != nil {
		t.Errorf("unexpected URL: %s", u)
	}
}

func TestParseArgs(t *testing.T) {
	cases := []struct {
		in   string
		want map[string]string
	}{
		{`key="hello world">`, map[string]string{"key": "hello world"}},
		{`key=hello>`, map[string]string{"key": "hello"}},
		{`key=hello world>`, map[string]string{"key": "hello"}},
		{`key=hello/>`, map[string]string{"key": "hello"}},
		{`key=hello`, map[string]string{"key": "hello"}},
		{`key=word foo=bar`, map[string]string{"key": "word", "foo": "bar"}},
		{`key="" foo="bar">`, map[string]string{"key": "", "foo": "bar"}},
		{`key='a > b' foo=bar>`, map[string]string{"key": "a > b", "foo": "bar"}},
	}
	for _, c := range cases {
		args := parseArgs(c.in)
		for k, want := range c.want {
			if got, ok := args[k]; !ok || got != want {
				t.Errorf("%q: invalid arg %q: %v", c.in, k, args)
			}
		}
	}
}
