Go library that cleans an HTML page for better readability.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

93 lines
2.0 KiB

package readability
import (
nurl "net/url"
"os"
"strings"
"unicode/utf8"
"github.com/sirupsen/logrus"
"golang.org/x/net/html"
)
// indexOf reports the index of the first element of array that is equal
// to key, or -1 when no element matches (including when array is empty
// or nil).
func indexOf(array []string, key string) int {
	for pos, item := range array {
		if item == key {
			return pos
		}
	}

	return -1
}
// wordCount returns the number of words in str, where a word is a maximal
// run of non-white-space characters as split by strings.Fields.
func wordCount(str string) int {
	words := strings.Fields(str)
	return len(words)
}
// toAbsoluteURI resolves uri against base and returns the resulting
// absolute URL as a string.
//
// uri is returned unchanged when there is nothing to resolve or resolving
// is not possible:
//   - uri is empty, or base is nil;
//   - uri starts with a hash (#), i.e. it is a fragment-only reference;
//   - uri is already absolute (it parses with both a scheme and a host);
//   - uri cannot be parsed as a URL reference.
func toAbsoluteURI(uri string, base *nurl.URL) string {
	// Without a base there is nothing to resolve against. Keep the caller's
	// value rather than discarding it, matching the other fall-through paths.
	if uri == "" || base == nil {
		return uri
	}

	// Fragment-only references are left as they are.
	if strings.HasPrefix(uri, "#") {
		return uri
	}

	// Already absolute: has both a scheme and a host.
	if abs, err := nurl.ParseRequestURI(uri); err == nil && abs.Scheme != "" && abs.Hostname() != "" {
		return uri
	}

	// Otherwise treat it as a reference and resolve it against base.
	ref, err := nurl.Parse(uri)
	if err != nil {
		return uri
	}

	return base.ResolveReference(ref).String()
}
// renderToFile renders element as HTML and writes the result to the file
// named filename, which is created or truncated by os.Create.
//
// The function returns nothing, so failures are reported through logrus:
//   - if the destination file cannot be created, logrus.Fatalln is called
//     (which logs and then exits the process; it does not panic);
//   - if rendering or closing the file fails, the error is logged with
//     logrus.Errorln and the function returns normally.
func renderToFile(element *html.Node, filename string) {
	dstFile, err := os.Create(filename)
	if err != nil {
		logrus.Fatalln("failed to create file:", err)
	}
	defer func() {
		// A failed Close can mean buffered data never reached the disk,
		// so report it instead of dropping the error.
		if err := dstFile.Close(); err != nil {
			logrus.Errorln("failed to close file:", err)
		}
	}()

	if err := html.Render(dstFile, element); err != nil {
		logrus.Errorln("failed to render element:", err)
	}
}
// toValidUtf8 returns src as a valid UTF-8 string.
//
// If src is already valid UTF-8 it is returned unchanged: it is not trimmed
// and fallback is not consulted, even when src is empty. Otherwise every
// invalid byte sequence is removed and surrounding white space is trimmed;
// if nothing is left after that, fallback is returned instead.
//
// Only invalid bytes are removed. A correctly encoded U+FFFD replacement
// character in src is valid UTF-8 and is preserved.
func toValidUtf8(src, fallback string) string {
	if utf8.ValidString(src) {
		return src
	}

	// strings.ToValidUTF8 drops only the bytes that fail to decode. Filtering
	// decoded runes on utf8.RuneError would also strip legitimate U+FFFD
	// characters, because both decode to the same rune value.
	cleaned := strings.TrimSpace(strings.ToValidUTF8(src, ""))
	if cleaned == "" {
		return fallback
	}

	return cleaned
}
// utf8RuneChecker maps utf8.RuneError to -1 and returns every other rune
// unchanged. It has the shape of a mapping function for strings.Map, which
// drops characters whose mapping is negative.
func utf8RuneChecker(r rune) rune {
	if r != utf8.RuneError {
		return r
	}

	return -1
}