Browse Source

MB-33617: Support unicode strings with combining characters

Context: reverse-token-filter
tags/v0.8.0
abhinavdangeti 4 months ago
parent
commit
7ca4ea2cd8
2 changed files with 45 additions and 15 deletions
  1. 22
    13
      analysis/token/reverse/reverse.go
  2. 23
    2
      analysis/token/reverse/reverse_test.go

+ 22
- 13
analysis/token/reverse/reverse.go View File

@@ -15,6 +15,7 @@
package reverse

import (
"unicode"
"unicode/utf8"

"github.com/blevesearch/bleve/analysis"
@@ -47,20 +48,28 @@ func init() {
}

// reverse returns a newly allocated copy of the UTF-8 encoded byte slice s
// with its characters in reverse order. s itself is not modified.
//
// A base rune together with any combining marks that immediately follow it
// (Unicode categories Mn, Mc and Me) is moved as a single unit, so an accent
// encoded as a separate code point stays attached to the letter it modifies.
// This is not full grapheme-cluster segmentation: only marks are grouped.
//
// Bytes that are not valid UTF-8 are treated as one-byte units and copied
// through unchanged, so the result always has the same length as s.
func reverse(s []byte) []byte {
	out := make([]byte, len(s))
	end := len(s) // exclusive end of the next cluster's slot in out
	for start := 0; start < len(s); {
		// Decode straight from the bytes so wid is always the number of
		// input bytes consumed. Converting to []rune first would report a
		// width of 3 (U+FFFD) for a 1-byte invalid sequence and overrun s.
		_, wid := utf8.DecodeRune(s[start:])

		// Pull any trailing combining marks into the same cluster.
		for start+wid < len(s) {
			r, w := utf8.DecodeRune(s[start+wid:])
			if !unicode.IsMark(r) {
				break
			}
			wid += w
		}

		// Clusters are written back to front; bytes within a cluster keep
		// their original order.
		copy(out[end-wid:end], s[start:start+wid])
		start += wid
		end -= wid
	}
	return out
}

+ 23
- 2
analysis/token/reverse/reverse_test.go View File

@@ -23,6 +23,7 @@ import (

func TestReverseFilter(t *testing.T) {
inputTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("one"),
},
@@ -47,10 +48,19 @@ func TestReverseFilter(t *testing.T) {
&analysis.Token{
Term: []byte("!@#$%^&*()"),
},
&analysis.Token{},
&analysis.Token{
Term: []byte("cafés"),
},
&analysis.Token{
Term: []byte("¿Dónde estás?"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}

expectedTokenStream := analysis.TokenStream{
&analysis.Token{},
&analysis.Token{
Term: []byte("eno"),
},
@@ -75,7 +85,15 @@ func TestReverseFilter(t *testing.T) {
&analysis.Token{
Term: []byte(")(*&^%$#@!"),
},
&analysis.Token{},
&analysis.Token{
Term: []byte("séfac"),
},
&analysis.Token{
Term: []byte("?sátse ednóD¿"),
},
&analysis.Token{
Term: []byte(".azevrec anu aíratsug eM"),
},
}

filter := NewReverseFilter()
@@ -153,6 +171,9 @@ func BenchmarkReverseFilter(b *testing.B) {
&analysis.Token{
Term: []byte("İȺȾCAT"),
},
&analysis.Token{
Term: []byte("Me gustaría una cerveza."),
},
}
filter := NewReverseFilter()


Loading…
Cancel
Save