851 lines
24 KiB
Go
851 lines
24 KiB
Go
// Copyright (c) 2015 Andy Leap, Google
|
|
// SPDX-License-Identifier: MIT
|
|
|
|
// Package microformats provides a microformats parser, supporting both v1 and
|
|
// v2 syntax.
|
|
//
|
|
// Usage:
|
|
//
|
|
// import "willnorris.com/go/microformats"
|
|
//
|
|
// Retrieve the HTML contents of a page, and call Parse or ParseNode, depending
|
|
// on what input you have (an io.Reader or an html.Node).
|
|
//
|
|
// To parse only a section of an HTML document, use a package like goquery to
|
|
// select the root node to parse from. For example, see cmd/gomf/main.go.
|
|
//
|
|
// See also: http://microformats.org/wiki/microformats2
|
|
package microformats // import "willnorris.com/go/microformats"
|
|
|
|
import (
|
|
"bytes"
|
|
"fmt"
|
|
"io"
|
|
"net/url"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
|
|
"golang.org/x/net/html"
|
|
"golang.org/x/net/html/atom"
|
|
)
|
|
|
|
// rootClassNames matches microformats2 root class names: the "h-" prefix,
// an optional vendor segment made of lowercase letters and digits, then one
// or more hyphen-separated lowercase words.
var rootClassNames = regexp.MustCompile(`^h-([a-z0-9]+-)?[a-z]+(-[a-z]+)*$`)

// propertyClassNames matches microformats2 property class names. It has the
// same shape as rootClassNames, but the prefix is one of "p", "u", "dt" or
// "e".
var propertyClassNames = regexp.MustCompile(`^(p|u|dt|e)-([a-z0-9]+-)?[a-z]+(-[a-z]+)*$`)
|
|
|
|
// Microformat is one parsed microformat object together with its
// properties, for example a person, an address or a blog post.
type Microformat struct {
	ID         string           `json:"id,omitempty"`
	Value      string           `json:"value,omitempty"`
	HTML       string           `json:"html,omitempty"`
	Type       []string         `json:"type"`
	Properties map[string][]any `json:"properties"`
	Shape      string           `json:"shape,omitempty"`
	Coords     string           `json:"coords,omitempty"`
	Children   []*Microformat   `json:"children,omitempty"`

	// Bookkeeping consulted when deciding whether implied property values
	// apply: whether nested microformats or p-, e- and u- properties were
	// seen while parsing this item.
	hasNestedMicroformats bool
	hasPProperties        bool
	hasEProperties        bool
	hasUProperties        bool

	// backcompat is true for a v1 microformat that was parsed in backwards
	// compatible mode.
	backcompat bool
}
|
|
|
|
// Data specifies all of the microformats and data parsed from a single HTML
|
|
// page.
|
|
type Data struct {
|
|
// Items includes all top-level microformats found on the page.
|
|
Items []*Microformat `json:"items"`
|
|
|
|
// Rels includes all related URLs found on the page (<a> or <link>
|
|
// elements with a "rel" value). Map keys are the rel value, mapped to
|
|
// a slice of URLs with that relation. For example:
|
|
//
|
|
// map[string][]string{
|
|
// "author": {"http://example.com/a", "http://example.com/b"},
|
|
// "alternate": {"http://example.com/fr"},
|
|
// }
|
|
//
|
|
// Relative URL values are resolved to absolute URLs using the base URL
|
|
// of the page.
|
|
Rels map[string][]string `json:"rels"`
|
|
|
|
// RelURLs maps related URLs found on the page to additional metadata
|
|
// about that relationship. If a URL is linked to more than once, only
|
|
// the metadata for the first link is included here. Relative URL
|
|
// values are resolved to absolute URLs using the base URL of the page.
|
|
RelURLs map[string]*RelURL `json:"rel-urls"`
|
|
}
|
|
|
|
// RelURL describes the attributes of one related URL. The URL itself is not
// stored here; it is the key under which this value sits in Data.RelURLs.
type RelURL struct {
	Rels     []string `json:"rels,omitempty"`
	Text     string   `json:"text,omitempty"`
	Media    string   `json:"media,omitempty"`
	HrefLang string   `json:"hreflang,omitempty"`
	Title    string   `json:"title,omitempty"`
	Type     string   `json:"type,omitempty"`
}
|
|
|
|
// parser parses a single HTML page for microformats. parser is not thread
|
|
// safe, and should only be used to parse a single document.
|
|
type parser struct {
|
|
curData *Data
|
|
curItem *Microformat
|
|
base *url.URL
|
|
baseFound bool
|
|
|
|
// root node of the parsed document
|
|
root *html.Node
|
|
}
|
|
|
|
// Parse the microformats found in the HTML document read from r. baseURL is
|
|
// the URL this document was retrieved from and is used to expand any
|
|
// relative URLs. If baseURL is nil and the base URL is not referenced in the
|
|
// document, relative URLs are not expanded.
|
|
func Parse(r io.Reader, baseURL *url.URL) *Data {
|
|
doc, _ := html.Parse(r)
|
|
return ParseNode(doc, baseURL)
|
|
}
|
|
|
|
// ParseNode parses the microformats found in doc. baseURL is the URL this
|
|
// document was retrieved from and is used to expand any relative URLs. If
|
|
// baseURL is nil and the base URL is not referenced in the document,
|
|
// relative URLs are not expanded.
|
|
func ParseNode(doc *html.Node, baseURL *url.URL) *Data {
|
|
if doc == nil { // makes no sense to go further
|
|
return nil
|
|
}
|
|
p := new(parser)
|
|
p.curData = &Data{
|
|
Items: make([]*Microformat, 0),
|
|
Rels: make(map[string][]string),
|
|
RelURLs: make(map[string]*RelURL),
|
|
}
|
|
p.base = baseURL
|
|
if p.base == nil { // can make sense if base can be inferred from contents
|
|
p.base = &url.URL{}
|
|
}
|
|
p.baseFound = false
|
|
p.root = doc
|
|
p.walk(doc)
|
|
return p.curData
|
|
}
|
|
|
|
// expandAttrURLs expands relative URLs in attributes to be absolute URLs.
|
|
// Attributes are taken from https://html.spec.whatwg.org/multipage/indices.html#attributes-3.
|
|
func (p *parser) expandAttrURLs(node *html.Node) {
|
|
var attr []string
|
|
if isAtom(node, atom.Form) {
|
|
attr = append(attr, "action")
|
|
}
|
|
if isAtom(node, atom.Blockquote, atom.Del, atom.Ins, atom.Q) {
|
|
attr = append(attr, "cite")
|
|
}
|
|
if isAtom(node, atom.Object) {
|
|
attr = append(attr, "data")
|
|
}
|
|
if isAtom(node, atom.Button, atom.Input) {
|
|
attr = append(attr, "formaction")
|
|
}
|
|
if isAtom(node, atom.A, atom.Area, atom.Base, atom.Link) {
|
|
attr = append(attr, "href")
|
|
}
|
|
if isAtom(node, atom.A, atom.Area) {
|
|
attr = append(attr, "ping")
|
|
}
|
|
if isAtom(node, atom.Audio, atom.Embed, atom.Iframe, atom.Img, atom.Input, atom.Script, atom.Source, atom.Track, atom.Video) {
|
|
attr = append(attr, "src")
|
|
}
|
|
if isAtom(node, atom.Video) {
|
|
attr = append(attr, "poster")
|
|
}
|
|
|
|
for _, a := range attr {
|
|
value := getAttrPtr(node, a)
|
|
if value != nil {
|
|
*value = expandURL(*value, p.base)
|
|
}
|
|
}
|
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
p.expandAttrURLs(c)
|
|
}
|
|
}
|
|
|
|
// expandURL resolves r against base and returns the resulting URL string.
// When base is nil or r cannot be parsed as a URL, r is returned unchanged.
func expandURL(r string, base *url.URL) string {
	if base == nil {
		return r
	}
	ref, err := url.Parse(r)
	if err != nil {
		return r
	}
	return base.ResolveReference(ref).String()
}
|
|
|
|
// walk the DOM rooted at node, storing parsed microformats in p.
|
|
//
|
|
//nolint:gocyclo,funlen // maybe we'll refactor it one day
|
|
func (p *parser) walk(node *html.Node) {
|
|
if isAtom(node, atom.Template) {
|
|
return
|
|
}
|
|
|
|
var curItem *Microformat
|
|
var priorItem *Microformat
|
|
var rootclasses []string
|
|
|
|
classes := getClasses(node)
|
|
for _, class := range classes {
|
|
if rootClassNames.MatchString(class) {
|
|
rootclasses = append(rootclasses, class)
|
|
}
|
|
}
|
|
|
|
var backcompat bool
|
|
if len(rootclasses) == 0 {
|
|
rootclasses = backcompatRootClasses(classes, p.curItem)
|
|
if len(rootclasses) > 0 {
|
|
backcompat = true
|
|
}
|
|
}
|
|
|
|
if len(rootclasses) > 0 {
|
|
sort.Strings(rootclasses)
|
|
curItem = &Microformat{
|
|
Type: rootclasses,
|
|
Properties: make(map[string][]any),
|
|
backcompat: backcompat,
|
|
}
|
|
if !backcompat {
|
|
curItem.ID = getAttr(node, "id")
|
|
}
|
|
if p.curItem == nil {
|
|
p.curData.Items = append(p.curData.Items, curItem)
|
|
} else {
|
|
p.curItem.hasNestedMicroformats = true
|
|
}
|
|
priorItem = p.curItem
|
|
p.curItem = curItem
|
|
}
|
|
|
|
// handle backcompat include pattern
|
|
if p.curItem != nil && p.curItem.backcompat {
|
|
refs, replace := p.backcompatIncludeRefs(node)
|
|
if len(refs) != 0 {
|
|
node = p.backcompatIncludeNode(node, refs, replace)
|
|
}
|
|
}
|
|
|
|
if !p.baseFound && isAtom(node, atom.Base) {
|
|
if href := getAttr(node, "href"); href != "" {
|
|
if newbase, err := url.Parse(href); err == nil {
|
|
newbase = p.base.ResolveReference(newbase)
|
|
p.base = newbase
|
|
p.baseFound = true
|
|
}
|
|
}
|
|
}
|
|
|
|
var rels []string
|
|
if isAtom(node, atom.A, atom.Link) {
|
|
if rel := getAttr(node, "rel"); rel != "" {
|
|
urlVal := getAttr(node, "href")
|
|
urlVal = expandURL(urlVal, p.base)
|
|
|
|
rels = strings.Fields(rel)
|
|
for _, relval := range rels {
|
|
var seen bool // whether we've already stored this url for this rel
|
|
for _, u := range p.curData.Rels[relval] {
|
|
if u == urlVal {
|
|
seen = true
|
|
}
|
|
}
|
|
if !seen {
|
|
p.curData.Rels[relval] = append(p.curData.Rels[relval], urlVal)
|
|
}
|
|
}
|
|
|
|
if _, ok := p.curData.RelURLs[urlVal]; !ok {
|
|
sort.Strings(rels)
|
|
p.curData.RelURLs[urlVal] = &RelURL{
|
|
Text: getTextContent(node, nil),
|
|
Rels: rels,
|
|
Media: getAttr(node, "media"),
|
|
HrefLang: getAttr(node, "hreflang"),
|
|
Title: getAttr(node, "title"),
|
|
Type: getAttr(node, "type"),
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
p.walk(c)
|
|
}
|
|
|
|
if curItem != nil {
|
|
// all child elements of node have been processed, and all explicit
|
|
// properties on curItem have been set.
|
|
|
|
// Process implied date for 'end' property.
|
|
implyEndDate(curItem)
|
|
|
|
if p.curItem == nil || !p.curItem.backcompat {
|
|
// Now process implied property values.
|
|
if _, ok := curItem.Properties["name"]; !ok {
|
|
if !curItem.hasNestedMicroformats && !curItem.hasPProperties && !curItem.hasEProperties {
|
|
name := getImpliedName(node)
|
|
if name != "" {
|
|
curItem.Properties["name"] = append(curItem.Properties["name"], name)
|
|
}
|
|
}
|
|
}
|
|
if _, ok := curItem.Properties["photo"]; !ok {
|
|
if !curItem.hasNestedMicroformats && !curItem.hasUProperties {
|
|
photo, alt := getImpliedPhoto(node, p.base)
|
|
if alt != "" {
|
|
curItem.Properties["photo"] = append(curItem.Properties["photo"], map[string]string{
|
|
"alt": alt,
|
|
"value": photo,
|
|
})
|
|
} else if photo != "" {
|
|
curItem.Properties["photo"] = append(curItem.Properties["photo"], photo)
|
|
}
|
|
}
|
|
}
|
|
if _, ok := curItem.Properties["url"]; !ok {
|
|
if !curItem.hasNestedMicroformats && !curItem.hasUProperties {
|
|
url := getImpliedURL(node, p.base)
|
|
if url != "" {
|
|
curItem.Properties["url"] = append(curItem.Properties["url"], url)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
p.curItem = priorItem
|
|
}
|
|
|
|
var propertyclasses []string
|
|
if p.curItem != nil && p.curItem.backcompat {
|
|
var itemType []string
|
|
if p.curItem != nil {
|
|
itemType = p.curItem.Type
|
|
}
|
|
propertyclasses = backcompatPropertyClasses(classes, rels, itemType)
|
|
} else {
|
|
for _, class := range classes {
|
|
match := propertyClassNames.FindStringSubmatch(class)
|
|
if match != nil {
|
|
propertyclasses = append(propertyclasses, match[0])
|
|
}
|
|
}
|
|
}
|
|
if len(propertyclasses) > 0 {
|
|
for _, prop := range propertyclasses {
|
|
parts := strings.SplitN(prop, "-", 2)
|
|
prefix, name := parts[0], parts[1]
|
|
|
|
var value, embedValue *string
|
|
var propData = make(map[string]string)
|
|
switch prefix {
|
|
case "p":
|
|
if p.curItem != nil {
|
|
p.curItem.hasPProperties = true
|
|
}
|
|
value = getValueClassPattern(node)
|
|
if value == nil && isAtom(node, atom.Abbr, atom.Link) {
|
|
value = getAttrPtr(node, "title")
|
|
}
|
|
if value == nil && isAtom(node, atom.Data, atom.Input) {
|
|
value = getAttrPtr(node, "value")
|
|
}
|
|
if value == nil && isAtom(node, atom.Img, atom.Area) {
|
|
value = getAttrPtr(node, "alt")
|
|
}
|
|
if value == nil {
|
|
value = new(string)
|
|
*value = strings.TrimSpace(getTextContent(node, p.imageAltSrcValue))
|
|
}
|
|
if curItem != nil && p.curItem != nil {
|
|
embedValue = getFirstPropValue(curItem, "name")
|
|
}
|
|
case "u":
|
|
if p.curItem != nil {
|
|
p.curItem.hasUProperties = true
|
|
}
|
|
if value == nil && isAtom(node, atom.A, atom.Area, atom.Link) {
|
|
value = getAttrPtr(node, "href")
|
|
}
|
|
if value == nil && isAtom(node, atom.Img) {
|
|
value = getAttrPtr(node, "src")
|
|
if p.curItem != nil && !p.curItem.backcompat {
|
|
if alt := imageAltValue(node); alt != "" {
|
|
propData["alt"] = alt
|
|
}
|
|
}
|
|
}
|
|
if value == nil && isAtom(node, atom.Audio, atom.Video, atom.Source) {
|
|
value = getAttrPtr(node, "src")
|
|
}
|
|
if value == nil && isAtom(node, atom.Object) {
|
|
value = getAttrPtr(node, "data")
|
|
}
|
|
if value == nil && isAtom(node, atom.Video) {
|
|
value = getAttrPtr(node, "poster")
|
|
}
|
|
if value == nil {
|
|
value = getValueClassPattern(node)
|
|
}
|
|
if value == nil && isAtom(node, atom.Abbr) {
|
|
value = getAttrPtr(node, "title")
|
|
}
|
|
if value == nil && isAtom(node, atom.Data, atom.Input) {
|
|
value = getAttrPtr(node, "value")
|
|
}
|
|
if value == nil {
|
|
value = new(string)
|
|
*value = strings.TrimSpace(getTextContent(node, nil))
|
|
}
|
|
if value != nil {
|
|
*value = strings.TrimSpace(expandURL(*value, p.base))
|
|
}
|
|
if curItem != nil && p.curItem != nil {
|
|
embedValue = getFirstPropValue(curItem, "url")
|
|
}
|
|
|
|
// for category URLs in backcompat mode, strip to the last path segment
|
|
if p.curItem != nil && p.curItem.backcompat && name == "category" {
|
|
*value = backcompatURLCategory(*value)
|
|
}
|
|
case "e":
|
|
if p.curItem != nil {
|
|
p.curItem.hasEProperties = true
|
|
}
|
|
value = new(string)
|
|
*value = strings.TrimSpace(getTextContent(node, p.imageAltSrcValue))
|
|
var buf bytes.Buffer
|
|
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
p.expandAttrURLs(c) // microformats/microformats2-parsing#38
|
|
|
|
// ignore errors from html.Render which nearly always result from being unable
|
|
// to write to the underlying io.Writer, which never happens with bytes.Buffer.
|
|
_ = html.Render(&buf, c)
|
|
}
|
|
htmlbody := strings.TrimSpace(buf.String())
|
|
|
|
// HTML spec: Serializing HTML Fragments algorithm does not include
|
|
// a trailing slash, so remove it. Nor should apostrophes be
|
|
// encoded, which golang.org/x/net/html is doing.
|
|
htmlbody = strings.ReplaceAll(htmlbody, `/>`, `>`)
|
|
htmlbody = strings.ReplaceAll(htmlbody, `'`, `'`)
|
|
propData["html"] = htmlbody
|
|
case "dt":
|
|
if value == nil {
|
|
value = getDateTimeValue(node)
|
|
}
|
|
if value == nil && isAtom(node, atom.Time, atom.Ins, atom.Del) {
|
|
value = getAttrPtr(node, "datetime")
|
|
}
|
|
if value == nil && isAtom(node, atom.Abbr) {
|
|
value = getAttrPtr(node, "title")
|
|
}
|
|
if value == nil && isAtom(node, atom.Data, atom.Input) {
|
|
value = getAttrPtr(node, "value")
|
|
}
|
|
if value == nil {
|
|
value = new(string)
|
|
*value = strings.TrimSpace(getTextContent(node, nil))
|
|
}
|
|
}
|
|
if curItem != nil && p.curItem != nil {
|
|
if embedValue == nil {
|
|
embedValue = value
|
|
}
|
|
p.curItem.Properties[name] = append(p.curItem.Properties[name], &Microformat{
|
|
ID: curItem.ID,
|
|
Type: curItem.Type,
|
|
Properties: curItem.Properties,
|
|
Coords: curItem.Coords,
|
|
Shape: curItem.Shape,
|
|
Value: *embedValue,
|
|
HTML: propData["html"],
|
|
})
|
|
} else if value != nil && p.curItem != nil {
|
|
if len(propData) > 0 {
|
|
propData["value"] = *value
|
|
p.curItem.Properties[name] = append(p.curItem.Properties[name], propData)
|
|
} else {
|
|
p.curItem.Properties[name] = append(p.curItem.Properties[name], *value)
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
if curItem != nil && p.curItem != nil {
|
|
p.curItem.Children = append(p.curItem.Children, curItem)
|
|
p.curItem.hasNestedMicroformats = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// getClasses returns all of the classes on node.
|
|
func getClasses(node *html.Node) []string {
|
|
if c := getAttrPtr(node, "class"); c != nil {
|
|
return strings.Fields(*c)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// hasMatchingClass whether node contains a class that matches regex.
|
|
func hasMatchingClass(node *html.Node, regex *regexp.Regexp) bool {
|
|
classes := getClasses(node)
|
|
for _, class := range classes {
|
|
if regex.MatchString(class) {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// getAttr returns the value of the specified attribute on node.
|
|
func getAttr(node *html.Node, name string) string {
|
|
if v := getAttrPtr(node, name); v != nil {
|
|
return *v
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// getAttr returns pointer to value of the specified attribute on node. If
|
|
// node does not contain the specified attribute, nil will be returned.
|
|
func getAttrPtr(node *html.Node, name string) *string {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
for i, attr := range node.Attr {
|
|
if strings.EqualFold(attr.Key, name) {
|
|
return &node.Attr[i].Val
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// hasAttr returns whether node has an attribute with the specified name.
|
|
func hasAttr(node *html.Node, name string) bool {
|
|
return getAttrPtr(node, name) != nil
|
|
}
|
|
|
|
// isAtom returns whether node's atom is one of atoms.
|
|
func isAtom(node *html.Node, atoms ...atom.Atom) bool {
|
|
if node == nil {
|
|
return false
|
|
}
|
|
for _, atom := range atoms {
|
|
if atom == node.DataAtom {
|
|
return true
|
|
}
|
|
}
|
|
return false
|
|
}
|
|
|
|
// getTextContent returns the text content of node, following the common
|
|
// microformats v2 algorithm. Nested script and style elements are ignored,
|
|
// and img elements are run through imgFn. If imgFn is nil, img elements are
|
|
// ignored as well.
|
|
func getTextContent(node *html.Node, imgFn func(*html.Node) string) string {
|
|
if node == nil {
|
|
return ""
|
|
}
|
|
if isAtom(node, atom.Script, atom.Style, atom.Template) {
|
|
return ""
|
|
}
|
|
if isAtom(node, atom.Img) && imgFn != nil {
|
|
return imgFn(node)
|
|
}
|
|
if node.Type == html.TextNode {
|
|
return node.Data
|
|
}
|
|
var buf bytes.Buffer
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
buf.WriteString(getTextContent(c, imgFn))
|
|
}
|
|
return buf.String()
|
|
}
|
|
|
|
// imageAltValue returns the value of node's alt attribute.
|
|
func imageAltValue(node *html.Node) string {
|
|
return getAttr(node, "alt")
|
|
}
|
|
|
|
// imageAltSrcValue returns the value of node's alt attribute. If node doesn't
|
|
// have an alt attribute, the value of node's src attribute is expanded to an
|
|
// absolute URL and returned.
|
|
func (p *parser) imageAltSrcValue(node *html.Node) string {
|
|
if v := getAttrPtr(node, "alt"); v != nil {
|
|
return *v
|
|
}
|
|
if v := getAttrPtr(node, "src"); v != nil {
|
|
return fmt.Sprintf(" %v ", expandURL(*v, p.base))
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// getOnlyChild returns the sole child of node. Returns nil if node has zero
|
|
// or more than one child.
|
|
func getOnlyChild(node *html.Node) *html.Node {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
var n *html.Node
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode {
|
|
if n == nil {
|
|
n = c
|
|
} else {
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
return n
|
|
}
|
|
|
|
// getOnlyChild returns the sole child of node with the specified atom.
|
|
// Returns nil if node has zero or more than one child with that atom.
|
|
func getOnlyChildAtom(node *html.Node, atom atom.Atom) *html.Node {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
var n *html.Node
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
if c.Type == html.ElementNode && c.DataAtom == atom {
|
|
if n == nil {
|
|
n = c
|
|
} else {
|
|
return nil
|
|
}
|
|
}
|
|
}
|
|
return n
|
|
}
|
|
|
|
// getImpliedName gets the implied name value for node.
|
|
//
|
|
// See http://microformats.org/wiki/microformats2-parsing
|
|
func getImpliedName(node *html.Node) string {
|
|
var name *string
|
|
|
|
switch {
|
|
case isAtom(node, atom.Img, atom.Area):
|
|
name = getAttrPtr(node, "alt")
|
|
case isAtom(node, atom.Abbr):
|
|
name = getAttrPtr(node, "title")
|
|
}
|
|
|
|
if name == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
switch {
|
|
case isAtom(subnode, atom.Img, atom.Area):
|
|
name = getAttrPtr(subnode, "alt")
|
|
case isAtom(subnode, atom.Abbr):
|
|
name = getAttrPtr(subnode, "title")
|
|
}
|
|
}
|
|
}
|
|
|
|
if name == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
subsubnode := getOnlyChild(subnode)
|
|
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
|
|
switch {
|
|
case isAtom(subsubnode, atom.Img, atom.Area):
|
|
name = getAttrPtr(subsubnode, "alt")
|
|
case isAtom(subsubnode, atom.Abbr):
|
|
name = getAttrPtr(subsubnode, "title")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if name == nil {
|
|
name = new(string)
|
|
*name = getTextContent(node, imageAltValue)
|
|
}
|
|
|
|
return strings.TrimSpace(*name)
|
|
}
|
|
|
|
// getImpliedPhoto gets the implied photo value for node.
|
|
//
|
|
// See http://microformats.org/wiki/microformats2-parsing
|
|
func getImpliedPhoto(node *html.Node, baseURL *url.URL) (src, alt string) {
|
|
var photo *string
|
|
|
|
switch {
|
|
case isAtom(node, atom.Img):
|
|
photo = getAttrPtr(node, "src")
|
|
alt = getAttr(node, "alt")
|
|
case isAtom(node, atom.Object):
|
|
photo = getAttrPtr(node, "data")
|
|
}
|
|
|
|
if photo == nil {
|
|
subnode := getOnlyChildAtom(node, atom.Img)
|
|
if subnode != nil && hasAttr(subnode, "src") && !hasMatchingClass(subnode, rootClassNames) {
|
|
photo = getAttrPtr(subnode, "src")
|
|
alt = getAttr(subnode, "alt")
|
|
}
|
|
}
|
|
if photo == nil {
|
|
subnode := getOnlyChildAtom(node, atom.Object)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
photo = getAttrPtr(subnode, "data")
|
|
}
|
|
}
|
|
|
|
if photo == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
subsubnode := getOnlyChildAtom(subnode, atom.Img)
|
|
if subsubnode != nil && hasAttr(subsubnode, "src") && !hasMatchingClass(subsubnode, rootClassNames) {
|
|
photo = getAttrPtr(subsubnode, "src")
|
|
alt = getAttr(subsubnode, "alt")
|
|
}
|
|
}
|
|
}
|
|
if photo == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
subsubnode := getOnlyChildAtom(subnode, atom.Object)
|
|
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
|
|
photo = getAttrPtr(subsubnode, "data")
|
|
}
|
|
}
|
|
}
|
|
|
|
if photo == nil {
|
|
return "", alt
|
|
}
|
|
return expandURL(*photo, baseURL), alt
|
|
}
|
|
|
|
// getImpliedURL gets the implied url value for node.
|
|
//
|
|
// See http://microformats.org/wiki/microformats2-parsing
|
|
func getImpliedURL(node *html.Node, baseURL *url.URL) string {
|
|
var value *string
|
|
if value == nil && isAtom(node, atom.A, atom.Area) {
|
|
value = getAttrPtr(node, "href")
|
|
}
|
|
|
|
if value == nil {
|
|
subnode := getOnlyChildAtom(node, atom.A)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
value = getAttrPtr(subnode, "href")
|
|
}
|
|
}
|
|
if value == nil {
|
|
subnode := getOnlyChildAtom(node, atom.Area)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
value = getAttrPtr(subnode, "href")
|
|
}
|
|
}
|
|
|
|
if value == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
subsubnode := getOnlyChildAtom(subnode, atom.A)
|
|
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
|
|
value = getAttrPtr(subsubnode, "href")
|
|
}
|
|
}
|
|
}
|
|
if value == nil {
|
|
subnode := getOnlyChild(node)
|
|
if subnode != nil && !hasMatchingClass(subnode, rootClassNames) {
|
|
subsubnode := getOnlyChildAtom(subnode, atom.Area)
|
|
if subsubnode != nil && !hasMatchingClass(subsubnode, rootClassNames) {
|
|
value = getAttrPtr(subsubnode, "href")
|
|
}
|
|
}
|
|
}
|
|
|
|
if value == nil {
|
|
return ""
|
|
}
|
|
return expandURL(*value, baseURL)
|
|
}
|
|
|
|
// getValueClassPattern gets the value of node using the value class pattern.
|
|
//
|
|
// See http://microformats.org/wiki/value-class-pattern
|
|
func getValueClassPattern(node *html.Node) *string {
|
|
values := parseValueClassPattern(node, false)
|
|
if len(values) > 0 {
|
|
val := strings.Join(values, "")
|
|
return &val
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// parseValueClassPattern parses node for values using the value class pattern.
|
|
// If dt is true, the rules for date and time parsing will be used.
|
|
func parseValueClassPattern(node *html.Node, dt bool) []string {
|
|
if node == nil {
|
|
return nil
|
|
}
|
|
var values []string
|
|
for c := node.FirstChild; c != nil; c = c.NextSibling {
|
|
classes := getClasses(c)
|
|
var valueClass, valueTitleClass bool
|
|
for _, class := range classes {
|
|
if class == "value" {
|
|
valueClass = true
|
|
}
|
|
if class == "value-title" {
|
|
valueTitleClass = true
|
|
}
|
|
}
|
|
if valueTitleClass {
|
|
values = append(values, getAttr(c, "title"))
|
|
} else if valueClass {
|
|
switch {
|
|
case isAtom(c, atom.Img, atom.Area) && hasAttr(c, "alt"):
|
|
values = append(values, getAttr(c, "alt"))
|
|
case isAtom(c, atom.Data) && hasAttr(c, "value"):
|
|
values = append(values, getAttr(c, "value"))
|
|
case isAtom(c, atom.Abbr) && hasAttr(c, "title"):
|
|
values = append(values, getAttr(c, "title"))
|
|
case dt && isAtom(c, atom.Del, atom.Ins, atom.Time) && hasAttr(c, "datetime"):
|
|
values = append(values, getAttr(c, "datetime"))
|
|
default:
|
|
values = append(values, strings.TrimSpace(getTextContent(c, nil)))
|
|
}
|
|
}
|
|
}
|
|
|
|
return values
|
|
}
|
|
|
|
// getFirstPropValue returns the first property value for prop in item.
|
|
func getFirstPropValue(item *Microformat, prop string) *string {
|
|
values := item.Properties[prop]
|
|
if len(values) > 0 {
|
|
if v, ok := values[0].(string); ok {
|
|
return &v
|
|
}
|
|
}
|
|
return nil
|
|
}
|