readability/parser_test.go
2020-04-07 14:40:28 +08:00

162 lines
4.2 KiB
Go

package readability
import (
"fmt"
"io/ioutil"
"os"
fp "path/filepath"
"strings"
"testing"
"github.com/sergi/go-diff/diffmatchpatch"
"golang.org/x/net/html"
)
// getNodeExcerpt returns a short, single-line excerpt of node for use in
// failure messages. The string produced by outerHTML has every run of
// whitespace collapsed to a single space and is then cut to at most 120
// bytes. The cut is moved back to the nearest rune boundary so that a
// multi-byte character is never split, which would otherwise put invalid
// UTF-8 into the test output.
func getNodeExcerpt(node *html.Node) string {
	const maxExcerptLen = 120

	outer := outerHTML(node)
	outer = strings.Join(strings.Fields(outer), " ")
	if len(outer) <= maxExcerptLen {
		return outer
	}

	// Ranging over a string yields the byte offset at which each rune
	// starts, so the last offset not beyond the limit is a safe cut point.
	cut := 0
	for i := range outer {
		if i > maxExcerptLen {
			break
		}
		cut = i
	}
	return outer[:cut]
}
// compareArticleContent checks that the result tree matches the expected
// tree. It returns an error describing the first difference found, or nil
// when no difference is found.
//
// The number of children of the two roots is compared first. Both trees are
// then walked in lockstep with Parser.getNextNode, and for every pair of
// nodes the tag name, the number of attributes, each attribute value and the
// whitespace-normalized text content are compared. A single trailing "/" is
// ignored when comparing "href" and "src" values. Once the walk ends, both
// trees must be exhausted; a leftover node on either side is reported as an
// error.
func compareArticleContent(result, expected *html.Node) error {
	// Make sure number of nodes is same
	resultNodesCount := len(children(result))
	expectedNodesCount := len(children(expected))
	if resultNodesCount != expectedNodesCount {
		return fmt.Errorf("number of nodes is different, want %d got %d",
			expectedNodesCount, resultNodesCount)
	}

	// Only the traversal helper of the parser is used here, so a single
	// zero-value Parser is shared by every step of the walk.
	ps := Parser{}

	resultNode := result
	expectedNode := expected
	for resultNode != nil && expectedNode != nil {
		// Get node excerpt
		resultExcerpt := getNodeExcerpt(resultNode)
		expectedExcerpt := getNodeExcerpt(expectedNode)

		// Compare tag name
		resultTagName := tagName(resultNode)
		expectedTagName := tagName(expectedNode)
		if resultTagName != expectedTagName {
			return fmt.Errorf("tag name is different\n"+
				"want : %s (%s)\n"+
				"got : %s (%s)",
				expectedTagName, expectedExcerpt,
				resultTagName, resultExcerpt)
		}

		// Compare attributes
		resultAttrCount := len(resultNode.Attr)
		expectedAttrCount := len(expectedNode.Attr)
		if resultAttrCount != expectedAttrCount {
			return fmt.Errorf("number of attributes is different\n"+
				"want : %d (%s)\n"+
				"got : %d (%s)",
				expectedAttrCount, expectedExcerpt,
				resultAttrCount, resultExcerpt)
		}

		for _, resultAttr := range resultNode.Attr {
			resultAttrVal := resultAttr.Val
			expectedAttrVal := getAttribute(expectedNode, resultAttr.Key)

			// A trailing slash in a URL attribute is not treated as a
			// difference.
			switch resultAttr.Key {
			case "href", "src":
				resultAttrVal = strings.TrimSuffix(resultAttrVal, "/")
				expectedAttrVal = strings.TrimSuffix(expectedAttrVal, "/")
			}

			if resultAttrVal != expectedAttrVal {
				return fmt.Errorf("attribute %s is different\n"+
					"want : %s (%s)\n"+
					"got : %s (%s)",
					resultAttr.Key, expectedAttrVal, expectedExcerpt,
					resultAttrVal, resultExcerpt)
			}
		}

		// Compare text content. strings.Fields already discards leading and
		// trailing whitespace, so no separate trim is needed.
		resultText := strings.Join(strings.Fields(textContent(resultNode)), " ")
		expectedText := strings.Join(strings.Fields(textContent(expectedNode)), " ")

		// Compare the strings directly instead of counting diff segments:
		// the diff of an empty string against a non-empty one has a single
		// segment, just like the diff of two equal strings. The diff is
		// computed only to render the failure message.
		if resultText != expectedText {
			comparator := diffmatchpatch.New()
			diffs := comparator.DiffMain(resultText, expectedText, false)
			return fmt.Errorf("text content is different\n"+
				"want : %s\n"+
				"got : %s\n"+
				"diffs : %s",
				expectedExcerpt, resultExcerpt,
				comparator.DiffPrettyText(diffs))
		}

		// Move to next node
		resultNode = ps.getNextNode(resultNode, false)
		expectedNode = ps.getNextNode(expectedNode, false)
	}

	// Both walks must finish together; a node left over on one side means
	// the trees differ even though every visited pair matched.
	if resultNode != nil {
		return fmt.Errorf("result has unexpected extra node\n"+
			"got : %s",
			getNodeExcerpt(resultNode))
	}
	if expectedNode != nil {
		return fmt.Errorf("result is missing expected node\n"+
			"want : %s",
			getNodeExcerpt(expectedNode))
	}

	return nil
}
// Test_parser runs the parser against every directory inside "test-pages".
// Each directory becomes a subtest that extracts the article from
// "source.html" with FromReader, parses the extracted content as HTML and
// compares it with the parsed "expected.html" using compareArticleContent.
//
// Setup failures stop the test or subtest immediately with Fatalf, because
// every later step depends on the value the failed step should have
// produced. Only the final comparison uses Errorf.
func Test_parser(t *testing.T) {
	testDir := "test-pages"
	testItems, err := ioutil.ReadDir(testDir)
	if err != nil {
		t.Fatalf("\nfailed to read test directory: %v", err)
	}

	for _, item := range testItems {
		if !item.IsDir() {
			continue
		}

		// Capture the name per iteration so the closure does not depend on
		// the loop variable.
		name := item.Name()

		t.Run(name, func(t1 *testing.T) {
			// Open test file
			testFilePath := fp.Join(testDir, name, "source.html")
			testFile, err := os.Open(testFilePath)
			if err != nil {
				t1.Fatalf("\nfailed to open test file: %v", err)
			}
			defer testFile.Close()

			// Open expected result file
			expectedFilePath := fp.Join(testDir, name, "expected.html")
			expectedFile, err := os.Open(expectedFilePath)
			if err != nil {
				t1.Fatalf("\nfailed to open expected result file: %v", err)
			}
			defer expectedFile.Close()

			// Parse expected result
			expectedHTML, err := html.Parse(expectedFile)
			if err != nil {
				t1.Fatalf("\nfailed to parse expected result file: %v", err)
			}

			// Get article from test file
			resultArticle, err := FromReader(testFile, "http://fakehost/test/page.html")
			if err != nil {
				t1.Fatalf("\nfailed to parse test file: %v", err)
			}

			// Parse article into HTML
			resultHTML, err := html.Parse(strings.NewReader(resultArticle.Content))
			if err != nil {
				t1.Fatalf("\nfailed to parse test article into HTML: %v", err)
			}

			// Compare article
			err = compareArticleContent(resultHTML, expectedHTML)
			if err != nil {
				t1.Errorf("\n%v", err)
			}
		})
	}
}