gitea/services/gitdiff/highlightdiff.go
wxiaoguang 3310dd1d19
Improve code diff highlight, fix incorrect rendered diff result (#19958)
Use Unicode placeholders to replace HTML tags and HTML entities first, then do diff, then recover the HTML tags and HTML entities. Now the code diff with highlight has stable behavior, and won't emit broken tags.
2022-07-23 19:28:02 +08:00

224 lines
7.9 KiB
Go

// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package gitdiff
import (
"strings"
"code.gitea.io/gitea/modules/highlight"
"github.com/sergi/go-diff/diffmatchpatch"
)
// token is a html tag or entity, eg: "<span ...>", "</span>", "&lt;"
func extractHTMLToken(s string) (before, token, after string, valid bool) {
for pos1 := 0; pos1 < len(s); pos1++ {
if s[pos1] == '<' {
pos2 := strings.IndexByte(s[pos1:], '>')
if pos2 == -1 {
return "", "", s, false
}
return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
} else if s[pos1] == '&' {
pos2 := strings.IndexByte(s[pos1:], ';')
if pos2 == -1 {
return "", "", s, false
}
return s[:pos1], s[pos1 : pos1+pos2+1], s[pos1+pos2+1:], true
}
}
return "", "", s, true
}
// highlightCodeDiff is used to do diff with highlighted HTML code.
// It totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
// The HTML tags and entities will be replaced by Unicode placeholders: "<span>{TEXT}</span>" => "\uE000{TEXT}\uE001"
// These Unicode placeholders are friendly to the diff.
// Then after diff, the placeholders in diff result will be recovered to the HTML tags and entities.
// It's guaranteed that the tags in final diff result are paired correctly.
type highlightCodeDiff struct {
placeholderBegin rune
placeholderMaxCount int
placeholderIndex int
placeholderTokenMap map[rune]string
tokenPlaceholderMap map[string]rune
placeholderOverflowCount int
lineWrapperTags []string
}
func newHighlightCodeDiff() *highlightCodeDiff {
return &highlightCodeDiff{
placeholderBegin: rune(0x100000), // Plane 16: Supplementary Private Use Area B (U+100000..U+10FFFD)
placeholderMaxCount: 64000,
placeholderTokenMap: map[rune]string{},
tokenPlaceholderMap: map[string]rune{},
}
}
// nextPlaceholder returns 0 if no more placeholder can be used
// the diff is done line by line, usually there are only a few (no more than 10) placeholders in one line
// so the placeholderMaxCount is impossible to be exhausted in real cases.
func (hcd *highlightCodeDiff) nextPlaceholder() rune {
for hcd.placeholderIndex < hcd.placeholderMaxCount {
r := hcd.placeholderBegin + rune(hcd.placeholderIndex)
hcd.placeholderIndex++
// only use non-existing (not used by code) rune as placeholders
if _, ok := hcd.placeholderTokenMap[r]; !ok {
return r
}
}
return 0 // no more available placeholder
}
func (hcd *highlightCodeDiff) isInPlaceholderRange(r rune) bool {
return hcd.placeholderBegin <= r && r < hcd.placeholderBegin+rune(hcd.placeholderMaxCount)
}
func (hcd *highlightCodeDiff) collectUsedRunes(code string) {
for _, r := range code {
if hcd.isInPlaceholderRange(r) {
// put the existing rune (used by code) in map, then this rune won't be used a placeholder anymore.
hcd.placeholderTokenMap[r] = ""
}
}
}
func (hcd *highlightCodeDiff) diffWithHighlight(filename, language, codeA, codeB string) []diffmatchpatch.Diff {
hcd.collectUsedRunes(codeA)
hcd.collectUsedRunes(codeB)
highlightCodeA := highlight.Code(filename, language, codeA)
highlightCodeB := highlight.Code(filename, language, codeB)
highlightCodeA = hcd.convertToPlaceholders(highlightCodeA)
highlightCodeB = hcd.convertToPlaceholders(highlightCodeB)
diffs := diffMatchPatch.DiffMain(highlightCodeA, highlightCodeB, true)
diffs = diffMatchPatch.DiffCleanupEfficiency(diffs)
for i := range diffs {
hcd.recoverOneDiff(&diffs[i])
}
return diffs
}
// convertToPlaceholders totally depends on Chroma's valid HTML output and its structure, do not use these functions for other purposes.
func (hcd *highlightCodeDiff) convertToPlaceholders(htmlCode string) string {
var tagStack []string
res := strings.Builder{}
firstRunForLineTags := hcd.lineWrapperTags == nil
var beforeToken, token string
var valid bool
// the standard chroma highlight HTML is "<span class="line [hl]"><span class="cl"> ... </span></span>"
for {
beforeToken, token, htmlCode, valid = extractHTMLToken(htmlCode)
if !valid || token == "" {
break
}
// write the content before the token into result string, and consume the token in the string
res.WriteString(beforeToken)
// the line wrapper tags should be removed before diff
if strings.HasPrefix(token, `<span class="line`) || strings.HasPrefix(token, `<span class="cl"`) {
if firstRunForLineTags {
// if this is the first run for converting, save the line wrapper tags for later use, they should be added back
hcd.lineWrapperTags = append(hcd.lineWrapperTags, token)
}
htmlCode = strings.TrimSuffix(htmlCode, "</span>")
continue
}
var tokenInMap string
if strings.HasSuffix(token, "</") { // for closing tag
if len(tagStack) == 0 {
break // invalid diff result, no opening tag but see closing tag
}
// make sure the closing tag in map is related to the open tag, to make the diff algorithm can match the opening/closing tags
// the closing tag will be recorded in the map by key "</span><!-- <span the-opening> -->" for "<span the-opening>"
tokenInMap = token + "<!-- " + tagStack[len(tagStack)-1] + "-->"
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' { // for opening tag
tokenInMap = token
tagStack = append(tagStack, token)
} else if token[0] == '&' { // for html entity
tokenInMap = token
} // else: impossible
// remember the placeholder and token in the map
placeholder, ok := hcd.tokenPlaceholderMap[tokenInMap]
if !ok {
placeholder = hcd.nextPlaceholder()
if placeholder != 0 {
hcd.tokenPlaceholderMap[tokenInMap] = placeholder
hcd.placeholderTokenMap[placeholder] = tokenInMap
}
}
if placeholder != 0 {
res.WriteRune(placeholder) // use the placeholder to replace the token
} else {
// unfortunately, all private use runes has been exhausted, no more placeholder could be used, no more converting
// usually, the exhausting won't occur in real cases, the magnitude of used placeholders is not larger than that of the CSS classes outputted by chroma.
hcd.placeholderOverflowCount++
if strings.HasPrefix(token, "&") {
// when the token is a html entity, something must be outputted even if there is no placeholder.
res.WriteRune(0xFFFD) // replacement character TODO: how to handle this case more gracefully?
res.WriteString(token[1:]) // still output the entity code part, otherwise there will be no diff result.
}
}
}
// write the remaining string
res.WriteString(htmlCode)
return res.String()
}
func (hcd *highlightCodeDiff) recoverOneDiff(diff *diffmatchpatch.Diff) {
sb := strings.Builder{}
var tagStack []string
for _, r := range diff.Text {
token, ok := hcd.placeholderTokenMap[r]
if !ok || token == "" {
sb.WriteRune(r) // if the rune is not a placeholder, write it as it is
continue
}
var tokenToRecover string
if strings.HasPrefix(token, "</") { // for closing tag
// only get the tag itself, ignore the trailing comment (for how the comment is generated, see the code in `convert` function)
tokenToRecover = token[:strings.IndexByte(token, '>')+1]
if len(tagStack) == 0 {
continue // if no opening tag in stack yet, skip the closing tag
}
tagStack = tagStack[:len(tagStack)-1]
} else if token[0] == '<' { // for opening tag
tokenToRecover = token
tagStack = append(tagStack, token)
} else if token[0] == '&' { // for html entity
tokenToRecover = token
} // else: impossible
sb.WriteString(tokenToRecover)
}
if len(tagStack) > 0 {
// close all opening tags
for i := len(tagStack) - 1; i >= 0; i-- {
tagToClose := tagStack[i]
// get the closing tag "</span>" from "<span class=...>" or "<span>"
pos := strings.IndexAny(tagToClose, " >")
if pos != -1 {
sb.WriteString("</" + tagToClose[1:pos] + ">")
} // else: impossible. every tag was pushed into the stack by the code above and is valid HTML opening tag
}
}
diff.Text = sb.String()
}