readability/parser.go
2020-04-07 14:40:28 +08:00

1971 lines
62 KiB
Go

package readability
import (
"fmt"
shtml "html"
"io"
"math"
nurl "net/url"
"regexp"
"sort"
"strconv"
"strings"
"golang.org/x/net/html"
)
// All of the regular expressions in use within readability.
// They are compiled once at package level so that they are not
// re-instantiated repeatedly inside loops. Every pattern is compiled
// with the (?i) flag, i.e. matching is case-insensitive.
var (
	// Matched against an element's "class id" string while scoring
	// candidates: rxUnlikelyCandidates marks elements to strip, unless
	// rxOkMaybeItsACandidate also matches.
	rxUnlikelyCandidates = regexp.MustCompile(`(?i)-ad-|ai2html|banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|gdpr|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
	rxOkMaybeItsACandidate = regexp.MustCompile(`(?i)and|article|body|column|main|shadow`)
	// NOTE(review): rxPositive / rxNegative / rxExtraneous are not used in
	// the visible part of the file; presumably they feed class weighting —
	// confirm against getClassWeight.
	rxPositive = regexp.MustCompile(`(?i)article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
	rxNegative = regexp.MustCompile(`(?i)hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|gdpr|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget`)
	rxExtraneous = regexp.MustCompile(`(?i)print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility`)
	// Matched against an element's "class id" string by checkByline.
	rxByline = regexp.MustCompile(`(?i)byline|author|dateline|writtenby|p-author`)
	rxReplaceFonts = regexp.MustCompile(`(?i)<(/?)font[^>]*>`)
	// Runs of two or more whitespace characters; getArticleTitle collapses
	// each run into a single space.
	rxNormalize = regexp.MustCompile(`(?i)\s{2,}`)
	rxVideos = regexp.MustCompile(`(?i)//(www\.)?((dailymotion|youtube|youtube-nocookie|player\.vimeo|v\.qq)\.com|(archive|upload\.wikimedia)\.org|player\.twitch\.tv)`)
	rxNextLink = regexp.MustCompile(`(?i)(next|weiter|continue|>([^\|]|$)|»([^\|]|$))`)
	rxPrevLink = regexp.MustCompile(`(?i)(prev|earl|old|new|<|«)`)
	// Empty or whitespace-only text; used by nextElement to skip over
	// insignificant nodes.
	rxWhitespace = regexp.MustCompile(`(?i)^\s*$`)
	rxHasContent = regexp.MustCompile(`(?i)\S$`)
	rxPropertyPattern = regexp.MustCompile(`(?i)\s*(dc|dcterm|og|twitter)\s*:\s*(author|creator|description|title|site_name|image\S*)\s*`)
	rxNamePattern = regexp.MustCompile(`(?i)^\s*(?:(dc|dcterm|og|twitter|weibo:(article|webpage))\s*[\.:]\s*)?(author|creator|description|title|site_name|image)\s*$`)
	// Title handling (see getArticleTitle): separators are one of
	// | - \ / > » and the "hierarchical" subset is \ / > ».
	rxTitleSeparator = regexp.MustCompile(`(?i) [\|\-\\/>»] `)
	rxTitleHierarchySep = regexp.MustCompile(`(?i) [\\/>»] `)
	rxTitleRemoveFinalPart = regexp.MustCompile(`(?i)(.*)[\|\-\\/>»] .*`)
	rxTitleRemove1stPart = regexp.MustCompile(`(?i)[^\|\-\\/>»]*[\|\-\\/>»](.*)`)
	rxTitleAnySeparator = regexp.MustCompile(`(?i)[\|\-\\/>»]+`)
	rxDisplayNone = regexp.MustCompile(`(?i)display\s*:\s*none`)
	// A period followed by a space or the end of the text.
	rxSentencePeriod = regexp.MustCompile(`(?i)\.( |$)`)
	// "share" / "sharedaddy" as a whole word or underscore-delimited token;
	// used by prepArticle to drop small share widgets.
	rxShareElements = regexp.MustCompile(`(?i)(\b|_)(share|sharedaddy)(\b|_)`)
	rxFaviconSize = regexp.MustCompile(`(?i)(\d+)x(\d+)`)
	rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
	rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
)
// Fixed lists of tag and attribute names that are used by readability.
// They are declared as variables because Go has no constant slices;
// nothing in the visible code mutates them.
var (
	// NOTE(review): divToPElems, presentationalAttributes,
	// deprecatedSizeAttributeElems and phrasingElems are consumed by
	// helpers outside the visible part of the file — confirm their exact
	// use there.
	divToPElems = []string{"a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul", "select"}
	// Tags that grabArticle leaves untouched when appending a sibling to
	// the article; any other tag is renamed to "div".
	alterToDivExceptions = []string{"div", "article", "section", "p"}
	presentationalAttributes = []string{"align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace"}
	deprecatedSizeAttributeElems = []string{"table", "th", "td", "hr", "pre"}
	phrasingElems = []string{
		"abbr", "audio", "b", "bdo", "br", "button", "cite", "code", "data",
		"datalist", "dfn", "em", "embed", "i", "img", "input", "kbd", "label",
		"mark", "math", "meter", "noscript", "object", "output", "progress", "q",
		"ruby", "samp", "script", "select", "small", "span", "strong", "sub",
		"sup", "textarea", "time", "var", "wbr"}
)
// flags holds the switches that control how aggressively the parser
// cleans the document during a parse attempt.
type flags struct {
	// stripUnlikelys enables removal of nodes whose class/id matches
	// rxUnlikelyCandidates while grabArticle walks the document.
	stripUnlikelys bool
	// NOTE(review): presumably toggles class/id based score weighting —
	// confirm against getClassWeight.
	useWeightClasses bool
	// NOTE(review): presumably toggles the cleanConditionally pass —
	// confirm against the cleanConditionally method.
	cleanConditionally bool
}
// parseAttempt is a container for the result of a previous parse attempt.
type parseAttempt struct {
	// articleContent is the content node produced by the attempt.
	articleContent *html.Node
	// textLength is the length of the text of articleContent.
	// NOTE(review): the unit (bytes vs. characters) depends on the code
	// that records the attempt — confirm there.
	textLength int
}
// Article is the final readable content.
//
// NOTE(review): the code that fills this struct is outside the visible
// part of the file; the per-field notes below are inferred from the
// field names and types and should be confirmed against it.
type Article struct {
	// Title and Byline: presumably the extracted title and author line.
	Title string
	Byline string
	// Node: presumably the root node of the extracted content.
	Node *html.Node
	// Content / TextContent: presumably the rendered HTML and the plain
	// text of Node; Length presumably measures TextContent.
	Content string
	TextContent string
	Length int
	// Excerpt, SiteName, Image, Favicon: presumably page metadata.
	Excerpt string
	SiteName string
	Image string
	Favicon string
}
// Parser is the parser that parses the page to get the readable content.
// The exported fields are options; the unexported fields carry the state
// of the current parse. Use NewParser to obtain a Parser with defaults.
type Parser struct {
	// MaxElemsToParse is the max number of nodes supported by this
	// parser. Default: 0 (no limit)
	MaxElemsToParse int
	// NTopCandidates is the number of top candidates to consider when
	// analysing how tight the competition is among candidates.
	NTopCandidates int
	// CharThresholds is the default number of chars an article must
	// have in order to return a result. prepArticle also uses it as the
	// text-length limit below which "share" elements are removed.
	CharThresholds int
	// ClassesToPreserve are the class names that cleanClasses keeps when
	// it strips class attributes from the article.
	ClassesToPreserve []string
	// KeepClasses specify whether the classes should be stripped or not.
	// When false, postProcessContent runs cleanClasses on the article.
	KeepClasses bool
	// TagsToScore is element tags to score by default.
	TagsToScore []string
	// Debug determines if the log should be printed or not. Default: false.
	Debug bool
	// doc is the document being parsed; grabArticle works on a clone of it.
	doc *html.Node
	// documentURI is the base that fixRelativeURIs resolves relative
	// links and image sources against.
	documentURI *nurl.URL
	// articleTitle is compared against a lone <h2> in prepArticle.
	articleTitle string
	// articleByline is set by checkByline on the first valid byline found.
	articleByline string
	// NOTE(review): articleDir, articleSiteName and attempts are not
	// touched in the visible part of the file — confirm their use.
	articleDir string
	articleSiteName string
	attempts []parseAttempt
	// flags are the cleaning switches in effect for the current attempt.
	flags flags
}
// NewParser builds a Parser pre-populated with the default options:
// no element limit, 5 top candidates, a 500-character threshold, the
// "page" class preserved, classes stripped, the default set of tags to
// score, and debugging switched off.
func NewParser() Parser {
	var ps Parser

	ps.MaxElemsToParse = 0
	ps.NTopCandidates = 5
	ps.CharThresholds = 500
	ps.ClassesToPreserve = []string{"page"}
	ps.KeepClasses = false
	ps.TagsToScore = []string{"section", "h2", "h3", "h4", "h5", "h6", "p", "td", "pre"}
	ps.Debug = false

	return ps
}
// postProcessContent applies the final touch-ups to the extracted
// article, in order: make URIs absolute, optionally strip class
// attributes, and drop the attributes readability added for itself.
func (ps *Parser) postProcessContent(articleContent *html.Node) {
	// Relative URIs cannot be followed from the extracted article, so
	// resolve them against the document URI first.
	ps.fixRelativeURIs(articleContent)

	// Classes are stripped unless the caller asked to keep them.
	if stripClasses := !ps.KeepClasses; stripClasses {
		ps.cleanClasses(articleContent)
	}

	// Finally discard readability's own bookkeeping attributes.
	ps.clearReadabilityAttr(articleContent)
}
// removeNodes walks nodeList from last to first and detaches every node
// for which filterFn reports true. A nil filterFn removes all nodes.
// Nodes that have no parent are skipped (and filterFn is not called for
// them).
func (ps *Parser) removeNodes(nodeList []*html.Node, filterFn func(*html.Node) bool) {
	for idx := len(nodeList); idx > 0; idx-- {
		target := nodeList[idx-1]
		owner := target.Parent
		if owner == nil {
			continue
		}
		if filterFn != nil && !filterFn(target) {
			continue
		}
		owner.RemoveChild(target)
	}
}
// replaceNodeTags renames every node in nodeList to newTagName by
// calling setNodeTag on each of them, last node first.
func (ps *Parser) replaceNodeTags(nodeList []*html.Node, newTagName string) {
	for idx := len(nodeList); idx > 0; idx-- {
		ps.setNodeTag(nodeList[idx-1], newTagName)
	}
}
// forEachNode calls fn once per node in nodeList, in order, passing the
// node together with its index in the list.
func (ps *Parser) forEachNode(nodeList []*html.Node, fn func(*html.Node, int)) {
	for idx, current := range nodeList {
		fn(current, idx)
	}
}
// someNode reports whether fn returns true for at least one node in
// nodeList. Iteration stops at the first match; an empty list yields
// false.
func (ps *Parser) someNode(nodeList []*html.Node, fn func(*html.Node) bool) bool {
	for _, current := range nodeList {
		if fn(current) {
			return true
		}
	}
	return false
}
// everyNode reports whether fn returns true for all nodes in nodeList.
// Iteration stops at the first node that fails; an empty list yields
// true.
func (ps *Parser) everyNode(nodeList []*html.Node, fn func(*html.Node) bool) bool {
	for _, current := range nodeList {
		if !fn(current) {
			return false
		}
	}
	return true
}
// concatNodeLists joins the given node lists, in argument order, into a
// single list. The result is nil when there is nothing to join.
func (ps *Parser) concatNodeLists(nodeLists ...[]*html.Node) []*html.Node {
	var merged []*html.Node
	for _, list := range nodeLists {
		merged = append(merged, list...)
	}
	return merged
}
// getAllNodesWithTag collects the results of getElementsByTagName for
// each of tagNames under node. Results are grouped by tag, in the order
// the tag names were given.
func (ps *Parser) getAllNodesWithTag(node *html.Node, tagNames ...string) []*html.Node {
	var found []*html.Node
	for _, tag := range tagNames {
		found = append(found, getElementsByTagName(node, tag)...)
	}
	return found
}
// cleanClasses strips class names from node and, recursively, from all
// of its element descendants. Only names listed in ps.ClassesToPreserve
// survive; when none survive the class attribute is removed entirely.
func (ps *Parser) cleanClasses(node *html.Node) {
	// Filter this node's class list down to the preserved names.
	kept := []string{}
	for _, name := range strings.Fields(className(node)) {
		if indexOf(ps.ClassesToPreserve, name) == -1 {
			continue
		}
		kept = append(kept, name)
	}

	if len(kept) == 0 {
		removeAttribute(node, "class")
	} else {
		setAttribute(node, "class", strings.Join(kept, " "))
	}

	// Repeat for every element child.
	child := firstElementChild(node)
	for child != nil {
		ps.cleanClasses(child)
		child = nextElementSibling(child)
	}
}
// fixRelativeURIs rewrites the href of every <a> and the src of every
// <img> under articleContent through toAbsoluteURI, using
// ps.documentURI as the base. Elements with an empty attribute are left
// alone; an attribute for which toAbsoluteURI returns "" is removed.
// Links whose href starts with "javascript:" are replaced by a text
// node holding the link's text content.
func (ps *Parser) fixRelativeURIs(articleContent *html.Node) {
	for _, anchor := range ps.getAllNodesWithTag(articleContent, "a") {
		href := getAttribute(anchor, "href")
		switch {
		case href == "":
			// Nothing to resolve.
		case strings.HasPrefix(href, "javascript:"):
			// Script links are dead once scripts are gone from the page;
			// keep just their text.
			plain := createTextNode(textContent(anchor))
			replaceNode(anchor, plain)
		default:
			if absolute := toAbsoluteURI(href, ps.documentURI); absolute != "" {
				setAttribute(anchor, "href", absolute)
			} else {
				removeAttribute(anchor, "href")
			}
		}
	}

	for _, image := range ps.getAllNodesWithTag(articleContent, "img") {
		src := getAttribute(image, "src")
		if src == "" {
			continue
		}
		if absolute := toAbsoluteURI(src, ps.documentURI); absolute != "" {
			setAttribute(image, "src", absolute)
		} else {
			removeAttribute(image, "src")
		}
	}
}
// getArticleTitle attempts to get the article title.
//
// It starts from the text of the first <title> element of ps.doc and
// then tries to strip site names and section prefixes:
//   - if the title contains a separator surrounded by spaces (one of
//     | - \ / > »), the part after the last separator is dropped, or
//     the part before the first one if that leaves fewer than 3 words;
//   - else if it contains ": " and no <h1>/<h2> has exactly that text,
//     the part after the last (or first) colon is used;
//   - else if it is longer than 150 or shorter than 15 characters and
//     the document has exactly one <h1>, the <h1> text is used.
//
// If the shortened title ends up with 4 words or fewer and that is not
// explained by a single hierarchical separator, the original title is
// returned instead. Returns "" when the document has no <title>.
func (ps *Parser) getArticleTitle() string {
	doc := ps.doc
	curTitle := ""
	origTitle := ""
	titleHadHierarchicalSeparators := false

	// If they had an element with tag "title" in their HTML
	if nodes := getElementsByTagName(doc, "title"); len(nodes) > 0 {
		origTitle = ps.getInnerText(nodes[0], true)
		curTitle = origTitle
	}

	// If there's a separator in the title, first remove the final part
	if rxTitleSeparator.MatchString(curTitle) {
		titleHadHierarchicalSeparators = rxTitleHierarchySep.MatchString(curTitle)
		curTitle = rxTitleRemoveFinalPart.ReplaceAllString(origTitle, "$1")

		// If the resulting title is too short (fewer than 3 words),
		// remove the first part instead:
		if wordCount(curTitle) < 3 {
			curTitle = rxTitleRemove1stPart.ReplaceAllString(origTitle, "$1")
		}
	} else if strings.Contains(curTitle, ": ") {
		// Check if we have an heading containing this exact string, so
		// we could assume it's the full title.
		headings := ps.concatNodeLists(
			getElementsByTagName(doc, "h1"),
			getElementsByTagName(doc, "h2"),
		)

		trimmedTitle := strings.TrimSpace(curTitle)
		match := ps.someNode(headings, func(heading *html.Node) bool {
			return strings.TrimSpace(textContent(heading)) == trimmedTitle
		})

		// If we don't, let's extract the title out of the original
		// title string.
		if !match {
			curTitle = origTitle[strings.LastIndex(origTitle, ":")+1:]

			// If the title is now too short, try the first colon instead:
			if wordCount(curTitle) < 3 {
				curTitle = origTitle[strings.Index(origTitle, ":")+1:]
				// But if we have too many words before the colon there's
				// something weird with the titles and the H tags so let's
				// just use the original title instead
			} else if wordCount(origTitle[:strings.Index(origTitle, ":")]) > 5 {
				curTitle = origTitle
			}
		}
	} else if titleLen := len([]rune(curTitle)); titleLen > 150 || titleLen < 15 {
		// The limits are expressed in characters, so count runes rather
		// than bytes: len() on a string would over-count every non-ASCII
		// character and misjudge e.g. CJK or Cyrillic titles.
		if hOnes := getElementsByTagName(doc, "h1"); len(hOnes) == 1 {
			curTitle = ps.getInnerText(hOnes[0], true)
		}
	}

	curTitle = strings.TrimSpace(curTitle)
	curTitle = rxNormalize.ReplaceAllString(curTitle, " ")

	// If we now have 4 words or fewer as our title, and either no
	// 'hierarchical' separators (\, /, > or ») were found in the original
	// title or the word count did not drop by exactly one compared to
	// the original title with its separators removed, use the original
	// title.
	curTitleWordCount := wordCount(curTitle)
	tmpOrigTitle := rxTitleAnySeparator.ReplaceAllString(origTitle, "")

	if curTitleWordCount <= 4 &&
		(!titleHadHierarchicalSeparators ||
			curTitleWordCount != wordCount(tmpOrigTitle)-1) {
		curTitle = origTitle
	}

	return curTitle
}
// prepDocument readies ps.doc for scraping. It removes every <style>
// element, runs replaceBrs on the first <body> (when there is one), and
// renames all <font> elements to <span>.
func (ps *Parser) prepDocument() {
	root := ps.doc

	// Drop all style elements.
	styles := getElementsByTagName(root, "style")
	ps.removeNodes(styles, nil)

	// Collapse <br> chains inside the body into paragraphs.
	bodies := getElementsByTagName(root, "body")
	if len(bodies) > 0 {
		if body := bodies[0]; body != nil {
			ps.replaceBrs(body)
		}
	}

	// <font> carries no meaning for readability; treat it as <span>.
	fonts := getElementsByTagName(root, "font")
	ps.replaceNodeTags(fonts, "span")
}
// nextElement walks forward through the siblings starting at node and
// returns the first one that is either an element or has text content
// that is not purely whitespace. node itself is returned when it already
// qualifies; nil is returned when the sibling chain runs out.
func (ps *Parser) nextElement(node *html.Node) *html.Node {
	for cursor := node; ; cursor = cursor.NextSibling {
		if cursor == nil {
			return nil
		}
		if cursor.Type == html.ElementNode {
			return cursor
		}
		if !rxWhitespace.MatchString(textContent(cursor)) {
			return cursor
		}
	}
}
// replaceBrs replaces 2 or more successive <br> with a single <p>.
// Whitespace between <br> elements is ignored. For example:
//   <div>foo<br>bar<br> <br><br>abc</div>
// will become:
//   <div>foo<br>bar<p>abc</p></div>
//
// The list of <br> elements under elem is collected once, before any
// modification, so <br>s that get removed as part of an earlier chain
// are still visited by the loop afterwards.
func (ps *Parser) replaceBrs(elem *html.Node) {
	ps.forEachNode(ps.getAllNodesWithTag(elem, "br"), func(br *html.Node, _ int) {
		next := br.NextSibling
		// Whether 2 or more <br> elements have been found and replaced
		// with a <p> block.
		replaced := false
		// If we find a <br> chain, remove the <br>s until we hit another
		// element or non-whitespace. This leaves behind the first <br>
		// in the chain (which will be replaced with a <p> later).
		for {
			next = ps.nextElement(next)
			if next == nil || tagName(next) != "br" {
				break
			}
			replaced = true
			// Remember the successor before detaching the <br>.
			brSibling := next.NextSibling
			next.Parent.RemoveChild(next)
			next = brSibling
		}
		// If we removed a <br> chain, replace the remaining <br> with a <p>. Add
		// all sibling nodes as children of the <p> until we hit another <br>
		// chain.
		if replaced {
			p := createElement("p")
			replaceNode(br, p)
			next = p.NextSibling
			for next != nil {
				// If we've hit another <br><br>, we're done adding children to this <p>.
				if tagName(next) == "br" {
					nextElem := ps.nextElement(next.NextSibling)
					if nextElem != nil && tagName(nextElem) == "br" {
						break
					}
				}
				// Only phrasing content may be moved into the paragraph;
				// anything else ends it.
				if !ps.isPhrasingContent(next) {
					break
				}
				// Otherwise, make this node a child of the new <p>.
				sibling := next.NextSibling
				appendChild(p, next)
				next = sibling
			}
			// Trim trailing whitespace nodes from the new paragraph.
			for p.LastChild != nil && ps.isWhitespace(p.LastChild) {
				p.RemoveChild(p.LastChild)
			}
			// The new <p> must not sit inside another <p>, so turn such
			// a parent into a <div>.
			if tagName(p.Parent) == "p" {
				ps.setNodeTag(p.Parent, "div")
			}
		}
	})
}
// setNodeTag renames an element node to newTagName by overwriting its
// Data field. Nodes of any other type are left untouched.
func (ps *Parser) setNodeTag(node *html.Node, newTagName string) {
	if node.Type != html.ElementNode {
		return
	}
	node.Data = newTagName
}
// prepArticle prepares the article node for display. Clean out any
// inline styles, iframes, forms, strip extraneous <p> tags, etc.
//
// The passes run in a fixed order because later passes depend on what
// earlier ones removed: styles, data-table marking and lazy images
// first, then junk elements, share widgets, a title-duplicating <h2>,
// form controls and headers, then the conditional clean-up of tables,
// lists and divs, and finally empty paragraphs, stray <br>s and
// single-cell tables.
//
// Text lengths are measured in characters (runes), not bytes, so that
// the thresholds behave the same for ASCII and non-ASCII content.
func (ps *Parser) prepArticle(articleContent *html.Node) {
	ps.cleanStyles(articleContent)

	// Check for data tables before we continue, to avoid removing
	// items in those tables, which will often be isolated even
	// though they're visually linked to other content-ful elements
	// (text, images, etc.).
	ps.markDataTables(articleContent)

	ps.fixLazyImages(articleContent)

	// Clean out junk from the article content
	ps.cleanConditionally(articleContent, "form")
	ps.cleanConditionally(articleContent, "fieldset")
	ps.clean(articleContent, "object")
	ps.clean(articleContent, "embed")
	ps.clean(articleContent, "h1")
	ps.clean(articleContent, "footer")
	ps.clean(articleContent, "link")
	ps.clean(articleContent, "aside")

	// Clean out elements that have "share" in their id/class combinations
	// from final top candidates, which means we don't remove the top
	// candidates even if they have "share".
	shareElementThreshold := ps.CharThresholds

	ps.forEachNode(children(articleContent), func(topCandidate *html.Node, _ int) {
		ps.cleanMatchedNodes(topCandidate, func(node *html.Node, nodeClassID string) bool {
			// Count characters, not bytes, against the char threshold.
			return rxShareElements.MatchString(nodeClassID) &&
				len([]rune(textContent(node))) < shareElementThreshold
		})
	})

	// If there is only one h2 and its text content substantially
	// equals article title, they are probably using it as a header
	// and not a subheader, so remove it since we already extract
	// the title separately.
	if h2s := getElementsByTagName(articleContent, "h2"); len(h2s) == 1 {
		h2Text := textContent(h2s[0])
		h2Len := len([]rune(h2Text))
		titleLen := len([]rune(ps.articleTitle))

		// Without a title there is nothing to compare against (and the
		// rate below would divide by zero), so the h2 is kept.
		if titleLen > 0 {
			lengthSimilarRate := float64(h2Len-titleLen) / float64(titleLen)
			if math.Abs(lengthSimilarRate) < 0.5 {
				// The longer of the two must contain the shorter one.
				titlesMatch := false
				if lengthSimilarRate > 0 {
					titlesMatch = strings.Contains(h2Text, ps.articleTitle)
				} else {
					titlesMatch = strings.Contains(ps.articleTitle, h2Text)
				}

				if titlesMatch {
					ps.clean(articleContent, "h2")
				}
			}
		}
	}

	ps.clean(articleContent, "iframe")
	ps.clean(articleContent, "input")
	ps.clean(articleContent, "textarea")
	ps.clean(articleContent, "select")
	ps.clean(articleContent, "button")
	ps.cleanHeaders(articleContent)

	// Do these last as the previous stuff may have removed junk
	// that will affect these
	ps.cleanConditionally(articleContent, "table")
	ps.cleanConditionally(articleContent, "ul")
	ps.cleanConditionally(articleContent, "div")

	// Remove extra paragraphs: those with no inner text and no
	// img/embed/object/iframe descendant.
	ps.removeNodes(getElementsByTagName(articleContent, "p"), func(p *html.Node) bool {
		imgCount := len(getElementsByTagName(p, "img"))
		embedCount := len(getElementsByTagName(p, "embed"))
		objectCount := len(getElementsByTagName(p, "object"))
		// At this point, nasty iframes have been removed, only
		// remain embedded video ones.
		iframeCount := len(getElementsByTagName(p, "iframe"))
		totalCount := imgCount + embedCount + objectCount + iframeCount

		return totalCount == 0 && ps.getInnerText(p, false) == ""
	})

	// Remove any <br> that is directly followed by a paragraph.
	ps.forEachNode(getElementsByTagName(articleContent, "br"), func(br *html.Node, _ int) {
		next := ps.nextElement(br.NextSibling)
		if next != nil && tagName(next) == "p" {
			br.Parent.RemoveChild(br)
		}
	})

	// Remove single-cell tables: a table whose only content is one
	// (optional) tbody > one tr > one td is replaced by that cell,
	// renamed to <p> when it holds only phrasing content and to <div>
	// otherwise.
	ps.forEachNode(getElementsByTagName(articleContent, "table"), func(table *html.Node, _ int) {
		tbody := table
		if ps.hasSingleTagInsideElement(table, "tbody") {
			tbody = firstElementChild(table)
		}

		if ps.hasSingleTagInsideElement(tbody, "tr") {
			row := firstElementChild(tbody)
			if ps.hasSingleTagInsideElement(row, "td") {
				cell := firstElementChild(row)

				newTag := "div"
				if ps.everyNode(childNodes(cell), ps.isPhrasingContent) {
					newTag = "p"
				}

				ps.setNodeTag(cell, newTag)
				replaceNode(table, cell)
			}
		}
	})
}
// initializeNode gives node its starting readability score: the class
// weight reported by getClassWeight plus a fixed adjustment that
// depends on the node's tag (+5 for div, +3 for pre/td/blockquote, -3
// for list-like and form elements, -5 for headings and th). The result
// is stored through setContentScore.
func (ps *Parser) initializeNode(node *html.Node) {
	classWeight := float64(ps.getClassWeight(node))

	var tagAdjustment float64
	switch tagName(node) {
	case "div":
		tagAdjustment = 5
	case "pre", "td", "blockquote":
		tagAdjustment = 3
	case "address", "ol", "ul", "dl", "dd", "dt", "li", "form":
		tagAdjustment = -3
	case "h1", "h2", "h3", "h4", "h5", "h6", "th":
		tagAdjustment = -5
	}

	ps.setContentScore(node, classWeight+tagAdjustment)
}
// removeAndGetNext detaches node from its parent (if it has one) and
// returns the node that follows it in the traversal, skipping node's
// own subtree. The successor is looked up before the removal, while
// node is still linked into the tree.
func (ps *Parser) removeAndGetNext(node *html.Node) *html.Node {
	successor := ps.getNextNode(node, true)
	if owner := node.Parent; owner != nil {
		owner.RemoveChild(node)
	}
	return successor
}
// getNextNode returns the element that follows node in a depth-first
// walk of the DOM, so calling it in a loop visits every element.
// Pass true for ignoreSelfAndKids when node and its subtree are going
// away and the walk should continue with the next node over.
// In Readability.js, ignoreSelfAndKids defaults to false.
// Returns nil when the walk is finished.
func (ps *Parser) getNextNode(node *html.Node, ignoreSelfAndKids bool) *html.Node {
	// Descend into the first child unless the subtree is being skipped.
	child := firstElementChild(node)
	if child != nil && !ignoreSelfAndKids {
		return child
	}

	// Otherwise move sideways...
	if sibling := nextElementSibling(node); sibling != nil {
		return sibling
	}

	// ...or climb until some ancestor has a following sibling. The
	// ancestors themselves were already visited on the way down.
	for ancestor := node.Parent; ancestor != nil; ancestor = ancestor.Parent {
		if sibling := nextElementSibling(ancestor); sibling != nil {
			return sibling
		}
	}
	return nil
}
// checkByline reports whether node is the article byline and, if so,
// records its whitespace-normalised text in ps.articleByline. A node
// qualifies when rel is "author", itemprop contains "author" or
// matchString matches rxByline, and its text passes isValidByline.
// Once a byline has been recorded every further call returns false.
func (ps *Parser) checkByline(node *html.Node, matchString string) bool {
	if ps.articleByline != "" {
		return false
	}

	rel := getAttribute(node, "rel")
	itemprop := getAttribute(node, "itemprop")
	text := textContent(node)

	looksLikeByline := rel == "author" ||
		strings.Contains(itemprop, "author") ||
		rxByline.MatchString(matchString)
	if !looksLikeByline || !ps.isValidByline(text) {
		return false
	}

	// Trim and collapse inner whitespace runs to single spaces.
	trimmed := strings.TrimSpace(text)
	ps.articleByline = strings.Join(strings.Fields(trimmed), " ")
	return true
}
// getNodeAncestors lists node's ancestors, nearest first (parent,
// grandparent, ...). A positive maxDepth limits the list to that many
// entries; zero or a negative value collects the whole chain.
// In Readability.js, maxDepth defaults to 0.
func (ps *Parser) getNodeAncestors(node *html.Node, maxDepth int) []*html.Node {
	var chain []*html.Node
	for ancestor := node.Parent; ancestor != nil; ancestor = ancestor.Parent {
		chain = append(chain, ancestor)
		if maxDepth > 0 && len(chain) == maxDepth {
			break
		}
	}
	return chain
}
// grabArticle uses a variety of metrics (content score, classname,
// element types), find the content that is most likely to be the
// stuff a user wants to read. Then return it wrapped up in a div.
func (ps *Parser) grabArticle() *html.Node {
for {
doc := cloneNode(ps.doc)
var page *html.Node
if nodes := getElementsByTagName(doc, "body"); len(nodes) > 0 {
page = nodes[0]
}
// We can't grab an article if we don't have a page!
if page == nil {
return nil
}
// First, node prepping. Trash nodes that look cruddy (like ones
// with the class name "comment", etc), and turn divs into P
// tags where they have been used inappropriately (as in, where
// they contain no other block level elements.)
var elementsToScore []*html.Node
var node = documentElement(doc)
for node != nil {
matchString := className(node) + " " + id(node)
if !ps.isProbablyVisible(node) {
node = ps.removeAndGetNext(node)
continue
}
// Check to see if this node is a byline, and remove it if
// it is true.
if ps.checkByline(node, matchString) {
node = ps.removeAndGetNext(node)
continue
}
// Remove unlikely candidates
nodeTagName := tagName(node)
if ps.flags.stripUnlikelys {
if rxUnlikelyCandidates.MatchString(matchString) &&
!rxOkMaybeItsACandidate.MatchString(matchString) &&
!ps.hasAncestorTag(node, "table", 3, nil) &&
nodeTagName != "body" && nodeTagName != "a" {
node = ps.removeAndGetNext(node)
continue
}
}
// Remove DIV, SECTION, and HEADER nodes without any
// content(e.g. text, image, video, or iframe).
switch nodeTagName {
case "div", "section", "header",
"h1", "h2", "h3", "h4", "h5", "h6":
if ps.isElementWithoutContent(node) {
node = ps.removeAndGetNext(node)
continue
}
}
if indexOf(ps.TagsToScore, nodeTagName) != -1 {
elementsToScore = append(elementsToScore, node)
}
// Turn all divs that don't have children block level
// elements into p's
if nodeTagName == "div" {
// Put phrasing content into paragraphs.
var p *html.Node
childNode := node.FirstChild
for childNode != nil {
nextSibling := childNode.NextSibling
if ps.isPhrasingContent(childNode) {
if p != nil {
appendChild(p, childNode)
} else if !ps.isWhitespace(childNode) {
p = createElement("p")
appendChild(p, cloneNode(childNode))
replaceNode(childNode, p)
}
} else if p != nil {
for p.LastChild != nil && ps.isWhitespace(p.LastChild) {
p.RemoveChild(p.LastChild)
}
p = nil
}
childNode = nextSibling
}
// Sites like http://mobile.slate.com encloses each
// paragraph with a DIV element. DIVs with only a P
// element inside and no text content can be safely
// converted into plain P elements to avoid confusing
// the scoring algorithm with DIVs with are, in
// practice, paragraphs.
if ps.hasSingleTagInsideElement(node, "p") && ps.getLinkDensity(node) < 0.25 {
newNode := children(node)[0]
replaceNode(node, newNode)
node = newNode
elementsToScore = append(elementsToScore, node)
} else if !ps.hasChildBlockElement(node) {
ps.setNodeTag(node, "p")
elementsToScore = append(elementsToScore, node)
}
}
node = ps.getNextNode(node, false)
}
// Loop through all paragraphs, and assign a score to them based
// on how content-y they look. Then add their score to their
// parent node. A score is determined by things like number of
// commas, class names, etc. Maybe eventually link density.
var candidates []*html.Node
ps.forEachNode(elementsToScore, func(elementToScore *html.Node, _ int) {
if elementToScore.Parent == nil || tagName(elementToScore.Parent) == "" {
return
}
// If this paragraph is less than 25 characters, don't even count it.
innerText := ps.getInnerText(elementToScore, true)
if len(innerText) < 25 {
return
}
// Exclude nodes with no ancestor.
ancestors := ps.getNodeAncestors(elementToScore, 3)
if len(ancestors) == 0 {
return
}
// Add a point for the paragraph itself as a base.
contentScore := 1
// Add points for any commas within this paragraph.
contentScore += strings.Count(innerText, ",")
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += int(math.Min(math.Floor(float64(len(innerText))/100.0), 3.0))
// Initialize and score ancestors.
ps.forEachNode(ancestors, func(ancestor *html.Node, level int) {
if tagName(ancestor) == "" || ancestor.Parent == nil || ancestor.Parent.Type != html.ElementNode {
return
}
if !ps.hasContentScore(ancestor) {
ps.initializeNode(ancestor)
candidates = append(candidates, ancestor)
}
// Node score divider:
// - parent: 1 (no division)
// - grandparent: 2
// - great grandparent+: ancestor level * 3
scoreDivider := 1
switch level {
case 0:
scoreDivider = 1
case 1:
scoreDivider = 2
default:
scoreDivider = level * 3
}
ancestorScore := ps.getContentScore(ancestor)
ancestorScore += float64(contentScore) / float64(scoreDivider)
ps.setContentScore(ancestor, ancestorScore)
})
})
// These lines are a bit different compared to Readability.js.
// In Readability.js, they fetch NTopCandidates utilising array
// method like `splice` and `pop`. In Go, array method like that
// is not as simple, especially since we are working with pointer.
// So, here we simply sort top candidates, and limit it to
// max NTopCandidates.
// Scale the final candidates score based on link density. Good
// content should have a relatively small link density (5% or
// less) and be mostly unaffected by this operation.
for i := 0; i < len(candidates); i++ {
candidate := candidates[i]
candidateScore := ps.getContentScore(candidate) * (1 - ps.getLinkDensity(candidate))
ps.setContentScore(candidate, candidateScore)
}
// After we've calculated scores, sort through all of the possible
// candidate nodes we found and find the one with the highest score.
sort.Slice(candidates, func(i int, j int) bool {
return ps.getContentScore(candidates[i]) > ps.getContentScore(candidates[j])
})
var topCandidates []*html.Node
if len(candidates) > ps.NTopCandidates {
topCandidates = candidates[:ps.NTopCandidates]
} else {
topCandidates = candidates
}
var topCandidate, parentOfTopCandidate *html.Node
neededToCreateTopCandidate := false
if len(topCandidates) > 0 {
topCandidate = topCandidates[0]
}
// If we still have no top candidate, just use the body as a last
// resort. We also have to copy the body node so it is something
// we can modify.
if topCandidate == nil || tagName(topCandidate) == "body" {
// Move all of the page's children into topCandidate
topCandidate = createElement("div")
neededToCreateTopCandidate = true
// Move everything (not just elements, also text nodes etc.)
// into the container so we even include text directly in the body:
kids := childNodes(page)
for i := 0; i < len(kids); i++ {
appendChild(topCandidate, kids[i])
}
appendChild(page, topCandidate)
ps.initializeNode(topCandidate)
} else if topCandidate != nil {
// Find a better top candidate node if it contains (at least three)
// nodes which belong to `topCandidates` array and whose scores are
// quite closed with current `topCandidate` node.
topCandidateScore := ps.getContentScore(topCandidate)
var alternativeCandidateAncestors [][]*html.Node
for i := 1; i < len(topCandidates); i++ {
if ps.getContentScore(topCandidates[i])/topCandidateScore >= 0.75 {
topCandidateAncestors := ps.getNodeAncestors(topCandidates[i], 0)
alternativeCandidateAncestors = append(alternativeCandidateAncestors, topCandidateAncestors)
}
}
minimumTopCandidates := 3
if len(alternativeCandidateAncestors) >= minimumTopCandidates {
parentOfTopCandidate = topCandidate.Parent
for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" {
listContainingThisAncestor := 0
for ancestorIndex := 0; ancestorIndex < len(alternativeCandidateAncestors) && listContainingThisAncestor < minimumTopCandidates; ancestorIndex++ {
if includeNode(alternativeCandidateAncestors[ancestorIndex], parentOfTopCandidate) {
listContainingThisAncestor++
}
}
if listContainingThisAncestor >= minimumTopCandidates {
topCandidate = parentOfTopCandidate
break
}
parentOfTopCandidate = parentOfTopCandidate.Parent
}
}
if !ps.hasContentScore(topCandidate) {
ps.initializeNode(topCandidate)
}
// Because of our bonus system, parents of candidates might
// have scores themselves. They get half of the node. There
// won't be nodes with higher scores than our topCandidate,
// but if we see the score going *up* in the first few steps *
// up the tree, that's a decent sign that there might be more
// content lurking in other places that we want to unify in.
// The sibling stuff below does some of that - but only if
// we've looked high enough up the DOM tree.
parentOfTopCandidate = topCandidate.Parent
lastScore := ps.getContentScore(topCandidate)
// The scores shouldn't get too lops.
scoreThreshold := lastScore / 3.0
for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" {
if !ps.hasContentScore(parentOfTopCandidate) {
parentOfTopCandidate = parentOfTopCandidate.Parent
continue
}
parentScore := ps.getContentScore(parentOfTopCandidate)
if parentScore < scoreThreshold {
break
}
if parentScore > lastScore {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate
break
}
lastScore = parentScore
parentOfTopCandidate = parentOfTopCandidate.Parent
}
// If the top candidate is the only child, use parent
// instead. This will help sibling joining logic when
// adjacent content is actually located in parent's
// sibling node.
parentOfTopCandidate = topCandidate.Parent
for parentOfTopCandidate != nil && tagName(parentOfTopCandidate) != "body" && len(children(parentOfTopCandidate)) == 1 {
topCandidate = parentOfTopCandidate
parentOfTopCandidate = topCandidate.Parent
}
if !ps.hasContentScore(topCandidate) {
ps.initializeNode(topCandidate)
}
}
// Now that we have the top candidate, look through its siblings
// for content that might also be related. Things like preambles,
// content split by ads that we removed, etc.
articleContent := createElement("div")
siblingScoreThreshold := math.Max(10, ps.getContentScore(topCandidate)*0.2)
// Keep potential top candidate's parent node to try to get text direction of it later.
topCandidateScore := ps.getContentScore(topCandidate)
topCandidateClassName := className(topCandidate)
parentOfTopCandidate = topCandidate.Parent
siblings := children(parentOfTopCandidate)
for s := 0; s < len(siblings); s++ {
sibling := siblings[s]
appendNode := false
if sibling == topCandidate {
appendNode = true
} else {
contentBonus := float64(0)
// Give a bonus if sibling nodes and top candidates have the exact same classname
if className(sibling) == topCandidateClassName && topCandidateClassName != "" {
contentBonus += topCandidateScore * 0.2
}
if ps.hasContentScore(sibling) && ps.getContentScore(sibling)+contentBonus >= siblingScoreThreshold {
appendNode = true
} else if tagName(sibling) == "p" {
linkDensity := ps.getLinkDensity(sibling)
nodeContent := ps.getInnerText(sibling, true)
nodeLength := len(nodeContent)
if nodeLength > 80 && linkDensity < 0.25 {
appendNode = true
} else if nodeLength < 80 && nodeLength > 0 && linkDensity == 0 &&
rxSentencePeriod.MatchString(nodeContent) {
appendNode = true
}
}
}
if appendNode {
// We have a node that isn't a common block level
// element, like a form or td tag. Turn it into a div
// so it doesn't get filtered out later by accident.
if indexOf(alterToDivExceptions, tagName(sibling)) == -1 {
ps.setNodeTag(sibling, "div")
}
appendChild(articleContent, sibling)
}
}
// So we have all of the content that we need. Now we clean
// it up for presentation.
ps.prepArticle(articleContent)
if neededToCreateTopCandidate {
// We already created a fake div thing, and there wouldn't
// have been any siblings left for the previous loop, so
// there's no point trying to create a new div, and then
// move all the children over. Just assign IDs and class
// names here. No need to append because that already
// happened anyway.
//
// By the way, this line is different with Readability.js.
// In Readability.js, when using `appendChild`, the node is
// still referenced. Meanwhile here, our `appendChild` will
// clone the node, put it in the new place, then delete
// the original.
firstChild := firstElementChild(articleContent)
if firstChild != nil && tagName(firstChild) == "div" {
setAttribute(firstChild, "id", "readability-page-1")
setAttribute(firstChild, "class", "page")
}
} else {
div := createElement("div")
setAttribute(div, "id", "readability-page-1")
setAttribute(div, "class", "page")
childs := childNodes(articleContent)
for i := 0; i < len(childs); i++ {
appendChild(div, childs[i])
}
appendChild(articleContent, div)
}
parseSuccessful := true
// Now that we've gone through the full algorithm, check to
// see if we got any meaningful content. If we didn't, we may
// need to re-run grabArticle with different flags set. This
// gives us a higher likelihood of finding the content, and
// the sieve approach gives us a higher likelihood of
// finding the -right- content.
textLength := len(ps.getInnerText(articleContent, true))
if textLength < ps.CharThresholds {
parseSuccessful = false
if ps.flags.stripUnlikelys {
ps.flags.stripUnlikelys = false
ps.attempts = append(ps.attempts, parseAttempt{
articleContent: articleContent,
textLength: textLength,
})
} else if ps.flags.useWeightClasses {
ps.flags.useWeightClasses = false
ps.attempts = append(ps.attempts, parseAttempt{
articleContent: articleContent,
textLength: textLength,
})
} else if ps.flags.cleanConditionally {
ps.flags.cleanConditionally = false
ps.attempts = append(ps.attempts, parseAttempt{
articleContent: articleContent,
textLength: textLength,
})
} else {
ps.attempts = append(ps.attempts, parseAttempt{
articleContent: articleContent,
textLength: textLength,
})
// No luck after removing flags, just return the
// longest text we found during the different loops.
sort.Slice(ps.attempts, func(i, j int) bool {
return ps.attempts[i].textLength > ps.attempts[j].textLength
})
// But first check if we actually have something
if ps.attempts[0].textLength == 0 {
return nil
}
articleContent = ps.attempts[0].articleContent
parseSuccessful = true
}
}
if parseSuccessful {
return articleContent
}
}
}
// isValidByline checks whether the input string could be a byline:
// after trimming surrounding whitespace it must be non-empty and
// shorter than 100 characters.
//
// The length is measured in Unicode code points rather than bytes, so
// a byline written in a multi-byte script (CJK, Cyrillic, ...) is not
// rejected merely because its UTF-8 encoding is 100 bytes or longer.
func (ps *Parser) isValidByline(byline string) bool {
	// len([]rune(s)) counts code points without needing an extra import.
	nChars := len([]rune(strings.TrimSpace(byline)))
	return nChars > 0 && nChars < 100
}
// getArticleMetadata gathers the article's title, byline, excerpt,
// site name, lead image and favicon.
//
// Every <meta> element with a non-empty `content` attribute is
// inspected. Its `property` attribute is searched with
// rxPropertyPattern; only when that yields no match is its `name`
// attribute tested against rxNamePattern. Matching keys are normalized
// (lowercased, all whitespace removed and, for `name`, dots replaced
// by colons) and stored together with the trimmed content. The final
// values are then chosen from prioritized lists of keys.
//
// The returned map always holds the keys "title", "byline", "excerpt",
// "siteName", "image" and "favicon"; values that could not be
// determined are empty strings.
func (ps *Parser) getArticleMetadata() map[string]string {
	found := make(map[string]string)

	// squash lowercases a key and strips every whitespace character.
	squash := func(key string) string {
		return strings.Join(strings.Fields(strings.ToLower(key)), "")
	}

	// Collect all recognised meta tags.
	ps.forEachNode(getElementsByTagName(ps.doc, "meta"), func(meta *html.Node, _ int) {
		nameAttr := getAttribute(meta, "name")
		propertyAttr := getAttribute(meta, "property")
		content := getAttribute(meta, "content")
		if content == "" {
			return
		}

		var propertyMatches []string
		if propertyAttr != "" {
			// A single property attribute may carry several keys
			// (e.g. multiple authors); each one gets the content.
			propertyMatches = rxPropertyPattern.FindAllString(propertyAttr, -1)
			for i := len(propertyMatches) - 1; i >= 0; i-- {
				found[squash(propertyMatches[i])] = strings.TrimSpace(content)
			}
		}

		if len(propertyMatches) != 0 || nameAttr == "" {
			return
		}
		if rxNamePattern.MatchString(nameAttr) {
			// Dots become colons so the key can be looked up below.
			key := strings.Replace(squash(nameAttr), ".", ":", -1)
			found[key] = strings.TrimSpace(content)
		}
	})

	// lookup returns the value stored under the first of keys that is
	// present in found, and whether any of them was present at all.
	lookup := func(keys ...string) (string, bool) {
		for _, key := range keys {
			if value, ok := found[key]; ok {
				return value, true
			}
		}
		return "", false
	}

	// Title, falling back to the document's own title.
	title, _ := lookup(
		"dc:title", "dcterm:title", "og:title", "weibo:article:title",
		"weibo:webpage:title", "title", "twitter:title")
	if title == "" {
		title = ps.getArticleTitle()
	}

	// Author.
	byline, _ := lookup("dc:creator", "dcterm:creator", "author")

	// Description.
	excerpt, _ := lookup(
		"dc:description", "dcterm:description", "og:description",
		"weibo:article:description", "weibo:webpage:description",
		"description", "twitter:description")

	// Site name.
	siteName := found["og:site_name"]

	// Thumbnail image, resolved against the document URI.
	image := ""
	if rawImage, ok := lookup("og:image", "image", "twitter:image"); ok {
		image = toAbsoluteURI(rawImage, ps.documentURI)
	}

	// Favicon.
	favicon := ps.getArticleFavicon()

	// Some sites HTML-encode the excerpt, so unescape it here.
	excerpt = shtml.UnescapeString(excerpt)

	return map[string]string{
		"title":    title,
		"byline":   byline,
		"excerpt":  excerpt,
		"siteName": siteName,
		"image":    image,
		"favicon":  favicon,
	}
}
// removeScripts deletes every <script> and <noscript> element found
// under doc. Both node lists are collected before anything is removed.
func (ps *Parser) removeScripts(doc *html.Node) {
	targets := [][]*html.Node{
		getElementsByTagName(doc, "script"),
		getElementsByTagName(doc, "noscript"),
	}
	for _, nodes := range targets {
		ps.removeNodes(nodes, nil)
	}
}
// hasSingleTagInsideElement reports whether element has exactly one
// element child, that child has the given tag, and none of element's
// direct text nodes matches rxHasContent (i.e. carries real content).
func (ps *Parser) hasSingleTagInsideElement(element *html.Node, tag string) bool {
	// Exactly one element child, and it must carry the wanted tag.
	elementChildren := children(element)
	if len(elementChildren) != 1 {
		return false
	}
	if tagName(elementChildren[0]) != tag {
		return false
	}

	// No direct text node may hold real content.
	hasRealText := ps.someNode(childNodes(element), func(child *html.Node) bool {
		if child.Type != html.TextNode {
			return false
		}
		return rxHasContent.MatchString(textContent(child))
	})
	return !hasRealText
}
// isElementWithoutContent reports whether node is an element whose
// text is blank and which either has no element children at all or
// whose number of element children equals the number of <br> and <hr>
// elements found beneath it.
func (ps *Parser) isElementWithoutContent(node *html.Node) bool {
	if node.Type != html.ElementNode {
		return false
	}
	if strings.TrimSpace(textContent(node)) != "" {
		return false
	}

	nChildren := len(children(node))
	if nChildren == 0 {
		return true
	}
	nBreaks := len(getElementsByTagName(node, "br")) + len(getElementsByTagName(node, "hr"))
	return nChildren == nBreaks
}
// hasChildBlockElement reports whether any descendant of element has
// a tag listed in divToPElems. The search recurses through all child
// nodes.
func (ps *Parser) hasChildBlockElement(element *html.Node) bool {
	isOrContainsBlock := func(child *html.Node) bool {
		if indexOf(divToPElems, tagName(child)) != -1 {
			return true
		}
		return ps.hasChildBlockElement(child)
	}
	return ps.someNode(childNodes(element), isOrContainsBlock)
}
// isPhrasingContent reports whether node qualifies as phrasing
// content: a text node, an element listed in phrasingElems, or an
// <a>, <del> or <ins> element whose child nodes are all phrasing
// content themselves.
func (ps *Parser) isPhrasingContent(node *html.Node) bool {
	if node.Type == html.TextNode {
		return true
	}

	name := tagName(node)
	switch {
	case indexOf(phrasingElems, name) != -1:
		return true
	case name == "a", name == "del", name == "ins":
		return ps.everyNode(childNodes(node), ps.isPhrasingContent)
	default:
		return false
	}
}
// isWhitespace reports whether node only serves as whitespace: either
// a text node whose content is blank, or a <br> element.
func (ps *Parser) isWhitespace(node *html.Node) bool {
	switch node.Type {
	case html.TextNode:
		return strings.TrimSpace(textContent(node)) == ""
	case html.ElementNode:
		return tagName(node) == "br"
	default:
		return false
	}
}
// getInnerText returns the text content of node with surrounding
// whitespace trimmed. When normalizeSpaces is true, every match of
// rxNormalize (a run of two or more whitespace characters) is also
// replaced by a single space.
// In Readability.js, normalizeSpaces defaults to true.
func (ps *Parser) getInnerText(node *html.Node, normalizeSpaces bool) string {
	text := strings.TrimSpace(textContent(node))
	if !normalizeSpaces {
		return text
	}
	return rxNormalize.ReplaceAllString(text, " ")
}
// getCharCount returns how many non-overlapping times the string s
// occurs in the normalized inner text of node.
func (ps *Parser) getCharCount(node *html.Node, s string) int {
	return strings.Count(ps.getInnerText(node, true), s)
}
// cleanStyles removes the presentational attributes listed in
// presentationalAttributes from node and, recursively, from all of
// its element descendants. Elements whose tag is listed in
// deprecatedSizeAttributeElems additionally lose their `width` and
// `height` attributes.
//
// A nil node is ignored, and <svg> subtrees are left untouched.
func (ps *Parser) cleanStyles(node *html.Node) {
	// The nil check has to come before anything inspects the node.
	if node == nil {
		return
	}

	nodeTagName := tagName(node)
	if nodeTagName == "svg" {
		return
	}

	// Remove `style` and deprecated presentational attributes.
	for _, attrName := range presentationalAttributes {
		removeAttribute(node, attrName)
	}

	if indexOf(deprecatedSizeAttributeElems, nodeTagName) != -1 {
		removeAttribute(node, "width")
		removeAttribute(node, "height")
	}

	for child := firstElementChild(node); child != nil; child = nextElementSibling(child) {
		ps.cleanStyles(child)
	}
}
// getLinkDensity returns the share of element's text that sits inside
// <a> elements: the summed length of the normalized inner text of
// every link below element divided by the length of element's own
// normalized inner text. Lengths are byte counts. An element without
// any text has a density of 0.
func (ps *Parser) getLinkDensity(element *html.Node) float64 {
	totalLength := len(ps.getInnerText(element, true))
	if totalLength == 0 {
		return 0
	}

	linkLength := 0
	for _, anchor := range getElementsByTagName(element, "a") {
		linkLength += len(ps.getInnerText(anchor, true))
	}
	return float64(linkLength) / float64(totalLength)
}
// getClassWeight scores a node by its class name and ID. Each of the
// two, when non-empty, contributes -25 if it matches rxNegative and
// +25 if it matches rxPositive (both may apply and cancel out).
// The weight is always 0 when the useWeightClasses flag is off.
func (ps *Parser) getClassWeight(node *html.Node) int {
	if !ps.flags.useWeightClasses {
		return 0
	}

	// weigh scores a single class or ID string.
	weigh := func(s string) int {
		if s == "" {
			return 0
		}
		w := 0
		if rxNegative.MatchString(s) {
			w -= 25
		}
		if rxPositive.MatchString(s) {
			w += 25
		}
		return w
	}

	return weigh(className(node)) + weigh(id(node))
}
// clean removes every element with the given tag found under node.
// For the embed-like tags (object, embed, iframe) an element is kept
// when any of its attribute values matches rxVideos, or, for <object>
// elements, when its inner HTML matches rxVideos.
func (ps *Parser) clean(node *html.Node, tag string) {
	isEmbed := false
	switch tag {
	case "object", "embed", "iframe":
		isEmbed = true
	}

	shouldRemove := func(element *html.Node) bool {
		if !isEmbed {
			return true
		}

		// Let known video embeds through: first look at the
		// element's attributes...
		for _, attr := range element.Attr {
			if rxVideos.MatchString(attr.Val) {
				return false
			}
		}

		// ...then, for <object>, at its inner HTML as well.
		if tagName(element) == "object" && rxVideos.MatchString(innerHTML(element)) {
			return false
		}
		return true
	}

	ps.removeNodes(getElementsByTagName(node, tag), shouldRemove)
}
// hasAncestorTag reports whether node has an ancestor whose tag name
// equals tag and which, when filterFn is not nil, is also accepted by
// filterFn.
//
// The parent is at depth 0. When maxDepth is positive the search gives
// up as soon as the depth exceeds maxDepth; a zero or negative
// maxDepth means the whole ancestor chain is searched.
// In Readability.js, the default value for maxDepth is 3.
func (ps *Parser) hasAncestorTag(node *html.Node, tag string, maxDepth int, filterFn func(*html.Node) bool) bool {
	limited := maxDepth > 0
	for depth, ancestor := 0, node.Parent; ancestor != nil; depth, ancestor = depth+1, ancestor.Parent {
		if limited && depth > maxDepth {
			return false
		}
		if tagName(ancestor) != tag {
			continue
		}
		if filterFn == nil || filterFn(ancestor) {
			return true
		}
	}
	return false
}
// getRowAndColumnCount returns the number of rows and columns of a
// table. Rows are the sum of the `rowspan` values of all <tr>
// elements below table; columns are the largest per-row sum of the
// `colspan` values of the row's <td> elements.
func (ps *Parser) getRowAndColumnCount(table *html.Node) (int, int) {
	// spanOf reads an integer span attribute. The conversion error is
	// ignored on purpose: a result of 0 (which a missing or
	// non-numeric attribute produces) counts as a span of 1.
	spanOf := func(node *html.Node, attrName string) int {
		span, _ := strconv.Atoi(getAttribute(node, attrName))
		if span == 0 {
			return 1
		}
		return span
	}

	rows, columns := 0, 0
	for _, tr := range getElementsByTagName(table, "tr") {
		rows += spanOf(tr, "rowspan")

		// Now look for column-related info.
		rowWidth := 0
		for _, td := range getElementsByTagName(tr, "td") {
			rowWidth += spanOf(td, "colspan")
		}
		if rowWidth > columns {
			columns = rowWidth
		}
	}
	return rows, columns
}
// markDataTables looks for 'data' (as opposed to 'layout') tables
// under root and marks them through setReadabilityDataTable.
//
// For each <table> the first matching rule decides:
//   - role="presentation" or datatable="0": layout table;
//   - a `summary` attribute, a <caption> with child nodes, or a
//     <col>, <colgroup>, <tfoot>, <thead> or <th> descendant: data table;
//   - a nested <table>: layout table;
//   - at least 10 rows, more than 4 columns, or rows*columns > 10:
//     data table.
//
// A table matching none of the rules is left as it is.
func (ps *Parser) markDataTables(root *html.Node) {
	// hasFilledCaption reports whether the first <caption> below the
	// table exists and has child nodes.
	hasFilledCaption := func(table *html.Node) bool {
		captions := getElementsByTagName(table, "caption")
		if len(captions) == 0 {
			return false
		}
		first := captions[0]
		return first != nil && len(childNodes(first)) > 0
	}

	// hasDataDescendant reports whether the table contains any of the
	// tags that only make sense in a data table.
	hasDataDescendant := func(table *html.Node) bool {
		for _, descendantTag := range []string{"col", "colgroup", "tfoot", "thead", "th"} {
			descendants := getElementsByTagName(table, descendantTag)
			if len(descendants) > 0 && descendants[0] != nil {
				return true
			}
		}
		return false
	}

	for _, table := range getElementsByTagName(root, "table") {
		switch {
		case getAttribute(table, "role") == "presentation",
			getAttribute(table, "datatable") == "0":
			ps.setReadabilityDataTable(table, false)

		case hasAttribute(table, "summary"),
			hasFilledCaption(table),
			hasDataDescendant(table):
			ps.setReadabilityDataTable(table, true)

		case len(getElementsByTagName(table, "table")) > 0:
			// Nested tables indicate a layout table.
			ps.setReadabilityDataTable(table, false)

		default:
			// Now just go by size entirely.
			rows, columns := ps.getRowAndColumnCount(table)
			if rows >= 10 || columns > 4 || rows*columns > 10 {
				ps.setReadabilityDataTable(table, true)
			}
		}
	}
}
// fixLazyImages converts images and figures that carry their real
// source in attributes such as data-src into images that can be loaded
// without JS.
//
// An <img>, <picture> or <figure> is processed when it has neither
// `src` nor `srcset`, or when its class name contains "lazy"
// (case-insensitive). Every other attribute whose value matches
// rxLazyImageSrcset or rxLazyImageSrc is then copied to `srcset` or
// `src` respectively.
func (ps *Parser) fixLazyImages(root *html.Node) {
	candidates := ps.getAllNodesWithTag(root, "img", "picture", "figure")
	ps.forEachNode(candidates, func(elem *html.Node, _ int) {
		hasSource := getAttribute(elem, "src") != "" || getAttribute(elem, "srcset") != ""
		looksLazy := strings.Contains(strings.ToLower(className(elem)), "lazy")
		if hasSource && !looksLazy {
			return
		}

		nodeTag := tagName(elem)

		// The length is re-read on every pass because setAttribute
		// may be called on elem while its attributes are iterated.
		for i := 0; i < len(elem.Attr); i++ {
			attr := elem.Attr[i]
			if attr.Key == "src" || attr.Key == "srcset" {
				continue
			}

			var target string
			switch {
			case rxLazyImageSrcset.MatchString(attr.Val):
				target = "srcset"
			case rxLazyImageSrc.MatchString(attr.Val):
				target = "src"
			default:
				continue
			}

			switch nodeTag {
			case "img", "picture":
				// Images and pictures get the attribute directly.
				setAttribute(elem, target, attr.Val)
			case "figure":
				// A <figure> without any image or picture gets a
				// fresh <img> placed inside it; see the nytimes-3
				// testcase for an example.
				if len(ps.getAllNodesWithTag(elem, "img", "picture")) == 0 {
					img := createElement("img")
					setAttribute(img, target, attr.Val)
					appendChild(elem, img)
				}
			}
		}
	})
}
// cleanConditionally removes the elements of type tag found under
// element when they look fishy. "Fishy" is judged from content
// length, class names, link density, number of images and embeds, etc.
// It does nothing when the cleanConditionally flag is off.
//
// Data tables, and anything nested inside one, are always kept. An
// element with a negative class weight is always removed. The
// remaining heuristics only apply to elements containing fewer than
// 10 commas; elements with more commas are kept, as are elements
// holding an embed that matches rxVideos.
func (ps *Parser) cleanConditionally(element *html.Node, tag string) {
	if !ps.flags.cleanConditionally {
		return
	}

	isList := tag == "ul" || tag == "ol"

	// countOf returns how many elements named name exist below node.
	countOf := func(node *html.Node, name string) float64 {
		return float64(len(getElementsByTagName(node, name)))
	}

	looksFishy := func(node *html.Node) bool {
		// Data tables and their contents are never touched.
		if tag == "table" && ps.isReadabilityDataTable(node) {
			return false
		}
		if ps.hasAncestorTag(node, "table", -1, ps.isReadabilityDataTable) {
			return false
		}

		weight := ps.getClassWeight(node)
		if weight < 0 {
			return true
		}

		// Plenty of commas suggests real prose: keep the node.
		if ps.getCharCount(node, ",") >= 10 {
			return false
		}

		// Not very many commas: gather counts for other typical
		// elements embedded within the node.
		p := countOf(node, "p")
		img := countOf(node, "img")
		li := countOf(node, "li") - 100
		input := countOf(node, "input")

		embedCount := 0
		embeds := ps.concatNodeLists(
			getElementsByTagName(node, "object"),
			getElementsByTagName(node, "embed"),
			getElementsByTagName(node, "iframe"))
		for _, embed := range embeds {
			// An embed with an attribute matching the video regex
			// protects the whole node from deletion.
			for _, attr := range embed.Attr {
				if rxVideos.MatchString(attr.Val) {
					return false
				}
			}
			// For <object> embeds, check the inner HTML as well.
			if tagName(embed) == "object" && rxVideos.MatchString(innerHTML(embed)) {
				return false
			}
			embedCount++
		}

		linkDensity := ps.getLinkDensity(node)
		contentLength := len(ps.getInnerText(node, true))
		insideFigure := func() bool {
			return ps.hasAncestorTag(node, "figure", 3, nil)
		}

		// Remove the node when non-paragraph elements outnumber
		// paragraphs or any other ominous sign shows up.
		switch {
		case img > 1 && p/img < 0.5 && !insideFigure(),
			!isList && li > p,
			input > math.Floor(p/3),
			!isList && contentLength < 25 && (img == 0 || img > 2) && !insideFigure(),
			!isList && weight < 25 && linkDensity > 0.2,
			weight >= 25 && linkDensity > 0.5,
			embedCount == 1 && contentLength < 75,
			embedCount > 1:
			return true
		}
		return false
	}

	ps.removeNodes(getElementsByTagName(element, tag), looksFishy)
}
// cleanMatchedNodes walks the nodes following e, up to the node
// returned by getNextNode(e, true), and removes every node for which
// filter returns true. filter receives the node together with its
// class name and ID joined by a single space. A nil filter removes
// nothing.
func (ps *Parser) cleanMatchedNodes(e *html.Node, filter func(*html.Node, string) bool) {
	stopAt := ps.getNextNode(e, true)
	current := ps.getNextNode(e, false)

	for current != nil && current != stopAt {
		matched := filter != nil && filter(current, className(current)+" "+id(current))
		if !matched {
			current = ps.getNextNode(current, false)
			continue
		}
		current = ps.removeAndGetNext(current)
	}
}
// cleanHeaders removes spurious headers from an element: every <h1>
// and <h2> below e whose class weight is negative is dropped.
func (ps *Parser) cleanHeaders(e *html.Node) {
	hasNegativeWeight := func(header *html.Node) bool {
		return ps.getClassWeight(header) < 0
	}
	for _, headerTag := range []string{"h1", "h2"} {
		ps.removeNodes(getElementsByTagName(e, headerTag), hasNegativeWeight)
	}
}
// isProbablyVisible reports whether node is likely visible: its
// `style` attribute must not match rxDisplayNone and it must not
// carry a `hidden` attribute.
func (ps *Parser) isProbablyVisible(node *html.Node) bool {
	if style := getAttribute(node, "style"); style != "" && rxDisplayNone.MatchString(style) {
		return false
	}
	return !hasAttribute(node, "hidden")
}
// Parse reads an HTML document from input and extracts its main
// readable content. pageURL is the address of the document; it must
// be accepted by url.ParseRequestURI.
//
// An error is returned when pageURL or the input cannot be parsed, or
// when MaxElemsToParse is positive and the document holds more
// elements than that. When no article content is found the returned
// Article still carries the metadata, with empty content fields.
func (ps *Parser) Parse(input io.Reader, pageURL string) (Article, error) {
	// Start from a clean state.
	ps.articleTitle, ps.articleByline = "", ""
	ps.articleDir, ps.articleSiteName = "", ""
	ps.attempts = []parseAttempt{}
	ps.flags = flags{
		stripUnlikelys:     true,
		useWeightClasses:   true,
		cleanConditionally: true,
	}

	// Parse the page URL, then the document itself.
	var err error
	if ps.documentURI, err = nurl.ParseRequestURI(pageURL); err != nil {
		return Article{}, fmt.Errorf("failed to parse URL: %v", err)
	}
	if ps.doc, err = html.Parse(input); err != nil {
		return Article{}, fmt.Errorf("failed to parse input: %v", err)
	}

	// Refuse documents that are too large, as per configuration option.
	if limit := ps.MaxElemsToParse; limit > 0 {
		if numTags := len(getElementsByTagName(ps.doc, "*")); numTags > limit {
			return Article{}, fmt.Errorf("documents too large: %d elements", numTags)
		}
	}

	// Strip scripts, prepare the document and fetch its metadata.
	ps.removeScripts(ps.doc)
	ps.prepDocument()
	metadata := ps.getArticleMetadata()
	ps.articleTitle = metadata["title"]

	// Try to grab the article content.
	var (
		readableNode *html.Node
		htmlContent  string
		plainContent string
	)
	excerpt := metadata["excerpt"]
	if articleContent := ps.grabArticle(); articleContent != nil {
		ps.postProcessContent(articleContent)

		// Without an excerpt in the metadata, the article's first
		// paragraph serves as a preview of its content.
		if excerpt == "" {
			if paragraphs := getElementsByTagName(articleContent, "p"); len(paragraphs) > 0 {
				excerpt = strings.TrimSpace(textContent(paragraphs[0]))
			}
		}

		readableNode = firstElementChild(articleContent)
		htmlContent = innerHTML(articleContent)
		plainContent = strings.TrimSpace(textContent(articleContent))
	}

	// Prefer the byline from the metadata over the one found while
	// grabbing the article.
	byline := metadata["byline"]
	if byline == "" {
		byline = ps.articleByline
	}

	// An excerpt is supposed to be short and concise, so collapse
	// every run of whitespace (new lines included) into one space.
	excerpt = strings.Join(strings.Fields(strings.TrimSpace(excerpt)), " ")

	// go-readability special:
	// The Internet is dangerous and weird, and sometimes metadata is
	// not encoded as valid UTF-8, hence the toValidUtf8 calls.
	return Article{
		Title:       toValidUtf8(ps.articleTitle, pageURL),
		Byline:      toValidUtf8(byline, ""),
		Node:        readableNode,
		Content:     htmlContent,
		TextContent: plainContent,
		Length:      len(plainContent),
		Excerpt:     toValidUtf8(excerpt, ""),
		SiteName:    metadata["siteName"],
		Image:       metadata["image"],
		Favicon:     metadata["favicon"],
	}, nil
}
// IsReadable decides whether or not the document is reader-able
// without parsing the whole thing. In `mozilla/readability`,
// this method is located in `Readability-readable.js`.
//
// It returns false when the input cannot be parsed as HTML.
func (ps *Parser) IsReadable(input io.Reader) bool {
	doc, err := html.Parse(input)
	if err != nil {
		return false
	}

	// Collect <p> and <pre> nodes, plus every <div> that directly
	// contains a <br>. Some articles are structured like this:
	//
	//     <div>
	//         Sentences<br>
	//         <br>
	//         Sentences<br>
	//     </div>
	//
	// so the same <div> may be reached through several <br> nodes.
	// The `seen` set makes sure each node is collected only once.
	var candidates []*html.Node
	seen := make(map[*html.Node]struct{})
	collect := func(node *html.Node) {
		if _, dup := seen[node]; dup {
			return
		}
		seen[node] = struct{}{}
		candidates = append(candidates, node)
	}

	var walk func(*html.Node)
	walk = func(node *html.Node) {
		if node.Type == html.ElementNode {
			switch tagName(node) {
			case "p", "pre":
				collect(node)
			case "br":
				if parent := node.Parent; parent != nil && tagName(parent) == "div" {
					collect(parent)
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(doc)

	// This is a little cheeky: the callback accumulates `total`
	// across nodes and reports success once it exceeds 20.
	total := float64(0)
	return ps.someNode(candidates, func(node *html.Node) bool {
		if !ps.isProbablyVisible(node) {
			return false
		}

		classAndID := className(node) + " " + id(node)
		if rxUnlikelyCandidates.MatchString(classAndID) &&
			!rxOkMaybeItsACandidate.MatchString(classAndID) {
			return false
		}

		// Paragraphs inside list items do not count.
		if tagName(node) == "p" && ps.hasAncestorTag(node, "li", -1, nil) {
			return false
		}

		// Only text of at least 140 bytes contributes to the score.
		textLength := len(strings.TrimSpace(textContent(node)))
		if textLength < 140 {
			return false
		}

		total += math.Sqrt(float64(textLength - 140))
		return total > 20
	})
}
// ====================== INFORMATION ======================
// The methods below this point do not exist in Readability.js.
// They are only needed as workarounds, since Readability.js is
// written in JS, which is a dynamic language, while this
// package is written in Go, which is statically typed.
// =========================================================
// getArticleFavicon attempts to find a high quality favicon for the
// article. Only PNG icons are considered (type "image/png" or an href
// containing ".png"), so small favicons that use an ico file are never
// picked. Among the candidates, the one with the largest square size
// wins, the size being read from the `sizes` attribute or, failing
// that, from the href. The result is resolved against the document
// URI. Using algorithm by philippe_b.
func (ps *Parser) getArticleFavicon() string {
	best, bestSize := "", -1

	// trimmedAttr returns an attribute value without surrounding
	// whitespace.
	trimmedAttr := func(node *html.Node, name string) string {
		return strings.TrimSpace(getAttribute(node, name))
	}

	ps.forEachNode(getElementsByTagName(ps.doc, "link"), func(link *html.Node, _ int) {
		rel := trimmedAttr(link, "rel")
		mimeType := trimmedAttr(link, "type")
		href := trimmedAttr(link, "href")
		sizes := trimmedAttr(link, "sizes")

		if href == "" || !strings.Contains(rel, "icon") {
			return
		}
		if mimeType != "image/png" && !strings.Contains(href, ".png") {
			return
		}

		// Take the first square size found; 0 when there is none.
		size := 0
		for _, source := range []string{sizes, href} {
			parts := rxFaviconSize.FindStringSubmatch(source)
			if len(parts) == 3 && parts[1] == parts[2] {
				size, _ = strconv.Atoi(parts[1])
				break
			}
		}

		if size > bestSize {
			best, bestSize = href, size
		}
	})

	return toAbsoluteURI(best, ps.documentURI)
}
// In a dynamic language like JavaScript, we can easily add a new
// property to an existing object by simply writing:
//
// obj.newProperty = newValue
//
// This is used extensively in Readability.js to save the readability
// content score, and to mark whether a table is a data container or
// only used for layout.
//
// However, since Go is statically typed, we can't do it that way.
// As a workaround, we save that data as attributes on the
// HTML nodes. That is why these methods exist.
// setReadabilityDataTable records whether node is a data table: the
// `data-readability-table` attribute is set to "true" for a data
// table and removed otherwise.
func (ps *Parser) setReadabilityDataTable(node *html.Node, isDataTable bool) {
	const marker = "data-readability-table"
	if !isDataTable {
		removeAttribute(node, marker)
		return
	}
	setAttribute(node, marker, "true")
}
// isReadabilityDataTable reports whether node has been marked as a
// data table, i.e. carries the `data-readability-table` attribute.
func (ps *Parser) isReadabilityDataTable(node *html.Node) bool {
	const marker = "data-readability-table"
	return hasAttribute(node, marker)
}
// setContentScore stores the readability score of node in its
// `data-readability-score` attribute, formatted with four decimals.
func (ps *Parser) setContentScore(node *html.Node, score float64) {
	formatted := fmt.Sprintf("%.4f", score)
	setAttribute(node, "data-readability-score", formatted)
}
// hasContentScore reports whether node carries a readability score,
// i.e. has the `data-readability-score` attribute.
func (ps *Parser) hasContentScore(node *html.Node) bool {
	const scoreAttr = "data-readability-score"
	return hasAttribute(node, scoreAttr)
}
// getContentScore returns the readability score stored in node's
// `data-readability-score` attribute, or 0 when the attribute is
// missing or blank.
func (ps *Parser) getContentScore(node *html.Node) float64 {
	raw := strings.TrimSpace(getAttribute(node, "data-readability-score"))
	if raw == "" {
		return 0
	}

	// The parse error is deliberately ignored: whatever value
	// ParseFloat reports is used as the score.
	score, _ := strconv.ParseFloat(raw, 64)
	return score
}
// clearReadabilityAttr removes the attributes this package uses for
// bookkeeping (`data-readability-score` and `data-readability-table`)
// from node and, recursively, from all of its element descendants.
// Used in `postProcessContent`.
func (ps *Parser) clearReadabilityAttr(node *html.Node) {
	for _, attrName := range []string{"data-readability-score", "data-readability-table"} {
		removeAttribute(node, attrName)
	}

	child := firstElementChild(node)
	for child != nil {
		ps.clearReadabilityAttr(child)
		child = nextElementSibling(child)
	}
}