warc/internal/processor/html.go
2020-04-05 21:21:09 +08:00

349 lines
10 KiB
Go

package processor
import (
"fmt"
nurl "net/url"
"regexp"
"strings"
"gitea.com/huiyifyj/warc/internal/dom"
"golang.org/x/net/html"
)
var (
rxLazyImageSrcset = regexp.MustCompile(`(?i)\.(jpg|jpeg|png|webp)\s+\d`)
rxLazyImageSrc = regexp.MustCompile(`(?i)^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$`)
rxImageMeta = regexp.MustCompile(`(?i)image|thumbnail`)
)
// ProcessHTMLFile process HTML file.
func ProcessHTMLFile(req Request) (Resource, []Resource, error) {
// Parse URL
pageURL, err := nurl.ParseRequestURI(req.URL)
if err != nil || pageURL.Scheme == "" || pageURL.Hostname() == "" {
return Resource{}, nil, fmt.Errorf("url %s is not valid", req.URL)
}
// Parse HTML document
doc, err := html.Parse(req.Reader)
if err != nil {
return Resource{}, nil, fmt.Errorf("failed to parse HTML for %s: %v", req.URL, err)
}
// TODO: I'm still not really sure, but IMHO it's safer to
// disable Javascript. Ideally, we only want to remove XHR request
// using disableXHR(). Unfortunately, the result is not that good for now.
dom.RemoveNodes(dom.GetElementsByTagName(doc, "script"), nil)
// Convert lazy loaded image to normal
fixLazyImages(doc)
// Convert hyperlinks with relative URL
fixRelativeURIs(doc, pageURL)
// Extract subresources from each nodes
subResources := []Resource{}
for _, node := range dom.GetElementsByTagName(doc, "*") {
// First extract resources from inline style
cssResources := processInlineCSS(node, pageURL)
subResources = append(subResources, cssResources...)
// Next extract resources from tag's specific attribute
nodeResources := []Resource{}
switch dom.TagName(node) {
case "style":
nodeResources = processStyleTag(node, pageURL)
case "script":
nodeResources = processScriptTag(node, pageURL)
case "meta":
nodeResources = processMetaTag(node, pageURL)
case "img", "picture", "figure", "video", "audio", "source":
nodeResources = processMediaTag(node, pageURL)
case "link":
nodeResources = processGenericTag(node, "href", pageURL)
case "iframe":
nodeResources = processGenericTag(node, "src", pageURL)
case "object":
nodeResources = processGenericTag(node, "data", pageURL)
default:
continue
}
subResources = append(subResources, nodeResources...)
}
// Return outer HTML of the doc
outerHTML := dom.OuterHTML(doc)
resource, err := createResource([]byte(outerHTML), req.URL, nil)
return resource, subResources, err
}
func disableXHR(doc *html.Node) {
var head *html.Node
heads := dom.GetElementsByTagName(doc, "head")
if len(heads) > 0 {
head = heads[0]
} else {
head = dom.CreateElement("head")
dom.PrependChild(doc, head)
}
xhrDisabler := `
fetch = new Promise();
XMLHttpRequest = function() {};
XMLHttpRequest.prototype = {
open: function(){},
send: function(){},
abort: function(){},
setRequestHeader: function(){},
overrideMimeType: function(){},
getResponseHeaders(): function(){},
getAllResponseHeaders(): function(){},
};`
script := dom.CreateElement("script")
scriptContent := dom.CreateTextNode(xhrDisabler)
dom.PrependChild(script, scriptContent)
dom.PrependChild(head, script)
}
// fixRelativeURIs converts each <a> in the given element
// to an absolute URI, ignoring #ref URIs.
func fixRelativeURIs(doc *html.Node, pageURL *nurl.URL) {
links := dom.GetAllNodesWithTag(doc, "a")
dom.ForEachNode(links, func(link *html.Node, _ int) {
href := dom.GetAttribute(link, "href")
if href == "" {
return
}
// Replace links with javascript: URIs with text content,
// since they won't work after scripts have been removed
// from the page.
if strings.HasPrefix(href, "javascript:") {
text := dom.CreateTextNode(dom.TextContent(link))
dom.ReplaceChild(link.Parent, text, link)
} else {
newHref := createAbsoluteURL(href, pageURL)
if newHref == "" {
dom.RemoveAttribute(link, "href")
} else {
dom.SetAttribute(link, "href", newHref)
}
}
})
}
// fixLazyImages convert images and figures that have properties like
// data-src into images that can be loaded without JS.
func fixLazyImages(root *html.Node) {
imageNodes := dom.GetAllNodesWithTag(root, "img", "picture", "figure")
dom.ForEachNode(imageNodes, func(elem *html.Node, _ int) {
src := dom.GetAttribute(elem, "src")
srcset := dom.GetAttribute(elem, "srcset")
nodeTag := dom.TagName(elem)
nodeClass := dom.ClassName(elem)
if (src == "" && srcset == "") || strings.Contains(strings.ToLower(nodeClass), "lazy") {
for i := 0; i < len(elem.Attr); i++ {
attr := elem.Attr[i]
if attr.Key == "src" || attr.Key == "srcset" {
continue
}
copyTo := ""
if rxLazyImageSrcset.MatchString(attr.Val) {
copyTo = "srcset"
} else if rxLazyImageSrc.MatchString(attr.Val) {
copyTo = "src"
}
if copyTo == "" {
continue
}
if nodeTag == "img" || nodeTag == "picture" {
// if this is an img or picture, set the attribute directly
dom.SetAttribute(elem, copyTo, attr.Val)
} else if nodeTag == "figure" && len(dom.GetAllNodesWithTag(elem, "img", "picture")) == 0 {
// if the item is a <figure> that does not contain an image or picture,
// create one and place it inside the figure see the nytimes-3
// testcase for an example
img := dom.CreateElement("img")
dom.SetAttribute(img, copyTo, attr.Val)
dom.AppendChild(elem, img)
}
}
}
})
}
// processInlineCSS extract subresources from the CSS rules inside
// style attribute. Once finished, all CSS URLs in the style attribute
// will be updated to use the resource name.
func processInlineCSS(node *html.Node, pageURL *nurl.URL) []Resource {
// Make sure this node has inline style
styleAttr := dom.GetAttribute(node, "style")
styleAttr = strings.TrimSpace(styleAttr)
if styleAttr == "" {
return nil
}
// Extract resource URLs from the inline style
// and update the CSS rules accordingly.
reader := strings.NewReader(styleAttr)
newStyleAttr, subResources := processCSS(reader, pageURL)
dom.SetAttribute(node, "style", newStyleAttr)
return subResources
}
// processStyleTag extract subresources from inside a <style> tag.
// Once finished, all CSS URLs will be updated to use the resource name.
func processStyleTag(styleNode *html.Node, pageURL *nurl.URL) []Resource {
// Extract CSS rules from <style>
rules := dom.TextContent(styleNode)
rules = strings.TrimSpace(rules)
if rules == "" {
return nil
}
// Extract resource URLs from the rules and update it accordingly.
reader := strings.NewReader(rules)
newRules, subResources := processCSS(reader, pageURL)
dom.SetTextContent(styleNode, newRules)
return subResources
}
// processScriptTag extract archive's resource from inside a <script> tag.
// Once finished, all URLs inside it will be updated to use the resource name.
func processScriptTag(node *html.Node, pageURL *nurl.URL) []Resource {
// Also get the URL from `src` attribute
subResources := processGenericTag(node, "src", pageURL)
// Extract JS code from the <script> itself
script := dom.TextContent(node)
script = strings.TrimSpace(script)
if script == "" {
return subResources
}
reader := strings.NewReader(script)
newScript, scriptResources := processJS(reader, pageURL)
dom.SetTextContent(node, newScript)
// Merge script resources
subResources = append(subResources, scriptResources...)
return subResources
}
// extractMetaTag extract archive's resource from inside a <meta>.
// Normally, <meta> doesn't have any resource URLs. However, as
// social media come and grow, a new metadata is added to contain
// the hero image for a web page, e.g. og:image, twitter:image, etc.
// Once finished, all URLs in <meta> for image will be updated
// to use the resource name.
func processMetaTag(node *html.Node, pageURL *nurl.URL) []Resource {
// Get the needed attributes
name := dom.GetAttribute(node, "name")
property := dom.GetAttribute(node, "property")
content := dom.GetAttribute(node, "content")
// If this <meta> is not for image, don't process it
if !rxImageMeta.MatchString(name + " " + property) {
return nil
}
// If URL is not valid, skip
tmp, err := nurl.ParseRequestURI(content)
if err != nil || tmp.Scheme == "" || tmp.Hostname() == "" {
return nil
}
// Create subresource and update the URL
subResource, err := createResource(nil, content, pageURL)
if err != nil {
return nil
}
dom.SetAttribute(node, "content", subResource.Name)
return []Resource{subResource}
}
// processMediaTag extract resource from inside a media tag e.g.
// <img>, <video>, <audio>, <source>. Once finished, all URLs will be
// updated to use the resource name.
func processMediaTag(node *html.Node, pageURL *nurl.URL) []Resource {
// Create initial subresources
subResources := []Resource{}
// Save `src` and `poster` of media to subresources
for _, attrName := range []string{"src", "poster"} {
attrValue := dom.GetAttribute(node, attrName)
if attrValue == "" {
continue
}
subResource, err := createResource(nil, attrValue, pageURL)
if err != nil {
continue
}
dom.SetAttribute(node, attrName, subResource.Name)
subResources = append(subResources, subResource)
}
// Get `srcset`, split it by comma, then process it like any URLs
strSrcSets := dom.GetAttribute(node, "srcset")
srcSets := strings.Split(strSrcSets, ",")
for i, srcSet := range srcSets {
srcSet = strings.TrimSpace(srcSet)
parts := strings.SplitN(srcSet, " ", 2)
if parts[0] == "" {
continue
}
subResource, err := createResource(nil, parts[0], pageURL)
if err != nil {
continue
}
srcSets[i] = strings.Replace(srcSets[i], parts[0], subResource.Name, 1)
subResources = append(subResources, subResource)
}
if len(srcSets) > 0 {
dom.SetAttribute(node, "srcset", strings.Join(srcSets, ","))
}
return subResources
}
// processGenericTag extract resource from specified attribute.
// This method is used for tags where the URL is obviously exist in
// the tag, without any additional process needed to extract it.
// For example is <link> with its href, <object> with its data, etc.
// Once finished, the URL attribute will be updated to use the
// resource name.
func processGenericTag(node *html.Node, attrName string, pageURL *nurl.URL) []Resource {
// Get the needed attributes
attrValue := dom.GetAttribute(node, attrName)
if attrValue == "" {
return nil
}
subResource, err := createResource(nil, attrValue, pageURL)
if err != nil {
return nil
}
if dom.TagName(node) == "iframe" {
subResource.IsEmbed = true
}
dom.SetAttribute(node, attrName, subResource.Name)
return []Resource{subResource}
}