Template
1
0
Fork 0
mirror of https://codeberg.org/forgejo/forgejo synced 2024-11-21 17:34:24 +01:00

fix(code search): use alphanumeric tokenizer instead of letter tokenizer

The letter tokenizer fails to consider numbers; this results in searches for "func1" matching "func2" even with exact search.

Furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely.
This commit is contained in:
Shiny Nematoda 2024-11-12 13:13:22 +00:00
parent 16c0361764
commit 4c9ab4e41d
5 changed files with 33 additions and 6 deletions

View file

@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo"
alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum"
"code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@ -30,7 +31,6 @@ import (
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query"
"github.com/go-enry/go-enry/v2"
@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name,
"char_filters": []string{},
"tokenizer": letter.Name,
"tokenizer": alphanum_tokenizer.Name,
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
}); err != nil {
return nil, err

View file

@ -0,0 +1,23 @@
package alphanum
import (
"unicode"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the identifier under which this package's tokenizer is registered
// with the bleve registry (see init); analyzers select the tokenizer by it.
const Name = "alphanum"
// alphaNumeric reports whether r is a Unicode letter or a Unicode number.
// Any other rune (punctuation, whitespace, symbols) yields false.
func alphaNumeric(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}
// TokenizerConstructor returns a character tokenizer that uses alphaNumeric
// as its rune predicate, so letters and numbers are treated as token runes.
//
// config and cache are accepted to satisfy the constructor signature expected
// by registry.RegisterTokenizer; neither is read here. The returned error is
// always nil.
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
	// The predicate in this package is declared as alphaNumeric; referring to
	// it by any other name leaves the package uncompilable.
	return character.NewCharacterTokenizer(alphaNumeric), nil
}
// init registers TokenizerConstructor with the bleve registry under Name so
// that an analyzer definition can refer to this tokenizer by that name.
func init() {
registry.RegisterTokenizer(Name, TokenizerConstructor)
}

View file

@ -63,13 +63,13 @@ const (
"content_analyzer": {
"tokenizer": "content_tokenizer",
"filter" : ["lowercase"]
},
}
},
"tokenizer": {
"content_tokenizer": {
"type": "simple_pattern_split",
"pattern": "[^a-zA-Z0-9]"
},
}
}
}
},

View file

@ -80,10 +80,10 @@ func guessFuzzinessByKeyword(s string) int {
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
// Likewise, queries whose terms contain characters that are *not* letters or digits should not use fuzziness
for _, r := range s {
if r >= 128 || !unicode.IsLetter(r) {
if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
return 0
}
}

View file

@ -37,6 +37,10 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
},
{
Input: "repo1",
Fuzziness: 1,
},
{
Input: "repo_one",
Fuzziness: 0,
},
{