Go library that cleans an HTML page for better readability.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

292 lines
7.2 KiB

package readability
import (
"bytes"
"strings"
"golang.org/x/net/html"
)
// getElementsByTagName collects every element node below doc whose tag
// name equals tagName, in depth-first pre-order (document order).
// The special tag "*" matches every element. doc itself is never part
// of the result; only its descendants are inspected.
func getElementsByTagName(doc *html.Node, tagName string) []*html.Node {
	matchAll := tagName == "*"

	var matches []*html.Node
	var visit func(n *html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode {
			if matchAll || n.Data == tagName {
				matches = append(matches, n)
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}

	// Start from the children so that doc is excluded from the result.
	for c := doc.FirstChild; c != nil; c = c.NextSibling {
		visit(c)
	}
	return matches
}
// createElement builds a detached element node whose tag is tagName.
// Only Type and Data are populated; every other field keeps its zero value.
func createElement(tagName string) *html.Node {
	element := new(html.Node)
	element.Type = html.ElementNode
	element.Data = tagName
	return element
}
// createTextNode builds a detached text node holding data.
func createTextNode(data string) *html.Node {
	text := new(html.Node)
	text.Type = html.TextNode
	text.Data = data
	return text
}
// tagName reports the tag name of node when it is an element node.
// For any other node type it returns the empty string.
func tagName(node *html.Node) string {
	if node.Type == html.ElementNode {
		return node.Data
	}
	return ""
}
// getAttribute looks up attrName on node and returns the value of the
// first attribute whose key matches exactly. When no such attribute
// exists the empty string is returned.
func getAttribute(node *html.Node, attrName string) string {
	for _, attr := range node.Attr {
		if attr.Key == attrName {
			return attr.Val
		}
	}
	return ""
}
// setAttribute assigns attrValue to the attribute named attrName.
// The first attribute with a matching key is overwritten in place;
// when none exists a new attribute is appended to node.Attr.
func setAttribute(node *html.Node, attrName string, attrValue string) {
	for i := range node.Attr {
		if node.Attr[i].Key == attrName {
			node.Attr[i].Val = attrValue
			return
		}
	}

	node.Attr = append(node.Attr, html.Attribute{Key: attrName, Val: attrValue})
}
// removeAttribute deletes the first attribute of node whose key equals
// attrName. Later attributes are shifted down within the same backing
// array. Nothing happens when the attribute is absent.
func removeAttribute(node *html.Node, attrName string) {
	for i := range node.Attr {
		if node.Attr[i].Key != attrName {
			continue
		}
		// Splice the matching entry out and stop: only one entry is removed.
		node.Attr = append(node.Attr[:i], node.Attr[i+1:]...)
		return
	}
}
// hasAttribute reports whether node carries an attribute whose key
// equals attrName.
func hasAttribute(node *html.Node, attrName string) bool {
	for _, attr := range node.Attr {
		if attr.Key == attrName {
			return true
		}
	}
	return false
}
// textContent concatenates the data of every text node found in node
// and its descendants, visiting them depth-first in document order.
func textContent(node *html.Node) string {
	var text bytes.Buffer

	var collect func(n *html.Node)
	collect = func(n *html.Node) {
		if n.Type == html.TextNode {
			text.WriteString(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			collect(c)
		}
	}
	collect(node)

	return text.String()
}
// outerHTML serializes node, including the node itself and all of its
// descendants, to HTML. If rendering fails the empty string is returned.
func outerHTML(node *html.Node) string {
	var out bytes.Buffer
	if err := html.Render(&out, node); err != nil {
		return ""
	}
	return out.String()
}
// innerHTML serializes the children of node (but not node itself) to
// HTML and trims surrounding whitespace from the result. If rendering
// any child fails the empty string is returned.
func innerHTML(node *html.Node) string {
	var out bytes.Buffer
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if err := html.Render(&out, c); err != nil {
			return ""
		}
	}
	return strings.TrimSpace(out.String())
}
// documentElement returns the root element of the document, which for
// an HTML document is the first <html> element found below doc.
// It returns nil when doc contains no <html> element.
func documentElement(doc *html.Node) *html.Node {
	roots := getElementsByTagName(doc, "html")
	if len(roots) == 0 {
		return nil
	}
	return roots[0]
}
// id returns the value of node's "id" attribute with leading and
// trailing whitespace removed, or the empty string if it has none.
func id(node *html.Node) string {
	return strings.TrimSpace(getAttribute(node, "id"))
}
// className returns the value of node's "class" attribute after
// trimming surrounding whitespace and replacing every match of
// rxNormalize with a single space.
// NOTE(review): rxNormalize is declared elsewhere in the package;
// presumably it matches runs of whitespace — confirm at its definition.
func className(node *html.Node) string {
	trimmed := strings.TrimSpace(getAttribute(node, "class"))
	return rxNormalize.ReplaceAllString(trimmed, " ")
}
// children lists the direct children of node that are element nodes,
// in document order. Text, comment and other non-element children are
// skipped. A nil node yields a nil slice.
func children(node *html.Node) []*html.Node {
	if node == nil {
		return nil
	}

	var elements []*html.Node
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if c.Type != html.ElementNode {
			continue
		}
		elements = append(elements, c)
	}
	return elements
}
// childNodes lists every direct child of node, regardless of node
// type, in document order. The result is nil when node has no children.
func childNodes(node *html.Node) []*html.Node {
	var nodes []*html.Node
	c := node.FirstChild
	for c != nil {
		nodes = append(nodes, c)
		c = c.NextSibling
	}
	return nodes
}
// firstElementChild returns the first direct child of node that is an
// element node, or nil when node has no element children.
func firstElementChild(node *html.Node) *html.Node {
	c := node.FirstChild
	// Skip leading non-element children (text, comments, ...).
	for c != nil && c.Type != html.ElementNode {
		c = c.NextSibling
	}
	return c
}
// nextElementSibling returns the closest following sibling of node
// that is an element node, or nil when no element follows node in its
// parent's child list.
func nextElementSibling(node *html.Node) *html.Node {
	s := node.NextSibling
	// Skip intervening non-element siblings (text, comments, ...).
	for s != nil && s.Type != html.ElementNode {
		s = s.NextSibling
	}
	return s
}
// appendChild adds child to the end of node's list of children.
//
// If child is currently attached to a parent, it is first detached from
// that parent and then appended, so the very same node (not a copy) is
// moved to its new position. Pointers to child held by the caller
// therefore keep referring to the node that now lives under node.
//
// child must not be node itself or one of node's ancestors; moving such
// a node below node would create a cycle in the tree.
func appendChild(node *html.Node, child *html.Node) {
	// html.Node.AppendChild panics when the child still has a parent or
	// siblings, so unlink it first. RemoveChild clears those pointers.
	if parent := child.Parent; parent != nil {
		parent.RemoveChild(child)
	}
	node.AppendChild(child)
}
// replaceNode puts newNode at oldNode's position in the tree and
// detaches oldNode from its parent.
//
// Nothing happens when oldNode has no parent or when oldNode and
// newNode are the same node. If newNode is currently attached somewhere
// (for example as a child or sibling of oldNode) it is properly unlinked
// from its old parent first, so that parent's child list stays
// consistent.
//
// newNode must not be an ancestor of oldNode; that would create a cycle.
func replaceNode(oldNode *html.Node, newNode *html.Node) {
	if oldNode.Parent == nil || oldNode == newNode {
		return
	}

	if parent := newNode.Parent; parent != nil {
		// Unlink through the parent so its FirstChild/LastChild and the
		// neighbouring siblings are updated; RemoveChild also clears
		// newNode's Parent, PrevSibling and NextSibling.
		parent.RemoveChild(newNode)
	} else {
		// Already parentless: clear any stale sibling links, because
		// InsertBefore panics on a node that still has siblings.
		newNode.PrevSibling = nil
		newNode.NextSibling = nil
	}

	oldNode.Parent.InsertBefore(newNode, oldNode)
	oldNode.Parent.RemoveChild(oldNode)
}
// includeNode reports whether node is present in nodeList. Nodes are
// compared by pointer identity, not by content.
func includeNode(nodeList []*html.Node, node *html.Node) bool {
	for _, candidate := range nodeList {
		if candidate == node {
			return true
		}
	}
	return false
}
// cloneNode returns a deep copy of src and all of its descendants.
//
// The copy is detached: it has no parent and no siblings. Type,
// DataAtom, Data, Namespace and a private copy of the attribute list
// are carried over, so changing the clone's attributes does not affect
// src. A nil src yields nil.
func cloneNode(src *html.Node) *html.Node {
	if src == nil {
		return nil
	}

	clone := &html.Node{
		Type:     src.Type,
		DataAtom: src.DataAtom,
		Data:     src.Data,
		// Keep the namespace so foreign (SVG/MathML) elements do not
		// turn into plain HTML elements when cloned.
		Namespace: src.Namespace,
		Attr:      make([]html.Attribute, len(src.Attr)),
	}
	copy(clone.Attr, src.Attr)

	for child := src.FirstChild; child != nil; child = child.NextSibling {
		clone.AppendChild(cloneNode(child))
	}
	return clone
}