2016-12-24 17:09:28 +00:00
|
|
|
package telegraph
|
|
|
|
|
|
|
|
import (
|
|
|
|
"bytes"
|
|
|
|
"errors"
|
2017-12-13 10:05:27 +00:00
|
|
|
"io"
|
2016-12-24 17:09:28 +00:00
|
|
|
"strings"
|
2017-01-10 10:28:23 +00:00
|
|
|
|
2017-10-03 13:46:11 +00:00
|
|
|
"golang.org/x/net/html"
|
2017-06-13 03:04:39 +00:00
|
|
|
)
|
2016-12-24 17:09:28 +00:00
|
|
|
|
2017-12-13 09:55:41 +00:00
|
|
|
// Node is an abstract value standing for a single DOM node. It is either a string, which
// stands for a DOM text node, or a NodeElement.
type Node interface{}

// NodeElement describes a DOM element node.
type NodeElement struct {
	// Tag is the name of the DOM element.
	// Available tags: a, aside, b, blockquote, br, code, em, figcaption, figure, h3,
	// h4, hr, i, iframe, img, li, ol, p, pre, s, strong, u, ul, video.
	Tag string `json:"tag"`

	// Attrs holds the attributes of the DOM element: each key is an attribute name and
	// each value is that attribute's value.
	// Available attributes: href, src.
	Attrs map[string]string `json:"attrs,omitempty"` // optional

	// Children lists the child nodes of the DOM element.
	Children []Node `json:"children,omitempty"` // optional
}
|
|
|
|
|
2017-12-13 10:06:44 +00:00
|
|
|
// ErrInvalidDataType is the error ContentFormat returns when its data argument has a type it
// does not accept.
var ErrInvalidDataType = errors.New("invalid data type")
|
|
|
|
|
2017-12-13 09:55:41 +00:00
|
|
|
// ContentFormat transforms data to a DOM-based format to represent the content of the page.
|
2016-12-24 17:09:28 +00:00
|
|
|
func ContentFormat(data interface{}) ([]Node, error) {
|
|
|
|
var doc html.Node
|
|
|
|
switch dst := data.(type) {
|
|
|
|
case string:
|
|
|
|
dom, err := html.Parse(strings.NewReader(dst))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
doc = *dom
|
|
|
|
case []byte:
|
|
|
|
dom, err := html.Parse(bytes.NewReader(dst))
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
doc = *dom
|
2017-12-13 10:05:27 +00:00
|
|
|
case io.Reader:
|
|
|
|
dom, err := html.Parse(dst)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
doc = *dom
|
2016-12-24 17:09:28 +00:00
|
|
|
default:
|
2017-12-13 10:06:44 +00:00
|
|
|
return nil, ErrInvalidDataType
|
2016-12-24 17:09:28 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
var content []Node
|
|
|
|
content = append(content, domToNode(doc.FirstChild))
|
|
|
|
|
|
|
|
return content, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func domToNode(domNode *html.Node) interface{} {
|
|
|
|
if domNode.Type == html.TextNode {
|
|
|
|
return domNode.Data
|
|
|
|
}
|
|
|
|
|
|
|
|
if domNode.Type != html.ElementNode {
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
var nodeElement NodeElement
|
2017-10-03 13:46:11 +00:00
|
|
|
switch strings.ToLower(domNode.Data) {
|
2017-12-13 10:15:18 +00:00
|
|
|
case "a", "aside", "b", "blockquote", "br", "code", "em", "figcaption", "figure", "h3", "h4",
|
|
|
|
"hr", "i", "iframe", "img", "li", "ol", "p", "pre", "s", "strong", "u", "ul", "video":
|
2017-08-06 18:26:38 +00:00
|
|
|
nodeElement.Tag = domNode.Data
|
2017-09-04 21:09:59 +00:00
|
|
|
|
2017-10-03 13:46:11 +00:00
|
|
|
for i := range domNode.Attr {
|
|
|
|
switch strings.ToLower(domNode.Attr[i].Key) {
|
|
|
|
case "href", "src":
|
|
|
|
nodeElement.Attrs = map[string]string{
|
|
|
|
domNode.Attr[i].Key: domNode.Attr[i].Val,
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
continue
|
2016-12-24 17:09:28 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for child := domNode.FirstChild; child != nil; child = child.NextSibling {
|
|
|
|
nodeElement.Children = append(nodeElement.Children, domToNode(child))
|
|
|
|
}
|
|
|
|
|
|
|
|
return nodeElement
|
|
|
|
}
|