From 4c9ab4e41d5e99b031c1a19c545ef67098aaf766 Mon Sep 17 00:00:00 2001 From: Shiny Nematoda Date: Tue, 12 Nov 2024 13:13:22 +0000 Subject: [PATCH] fix(code search): use alphanumeric tokenizer instead of letter tokenizer letter tokenizer fails to consider numbers, this results in searches for "func1" to match "func2" even with exact search. furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely --- modules/indexer/code/bleve/bleve.go | 4 ++-- .../code/bleve/tokenizer/alphanum/alphanum.go | 23 +++++++++++++++++++ .../code/elasticsearch/elasticsearch.go | 4 ++-- modules/indexer/internal/bleve/util.go | 4 ++-- modules/indexer/internal/bleve/util_test.go | 4 ++++ 5 files changed, 33 insertions(+), 6 deletions(-) create mode 100644 modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 2b58a4bdc4..f43b8bef17 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/gitrepo" + alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" @@ -30,7 +31,6 @@ import ( "github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" - "github.com/blevesearch/bleve/v2/analysis/tokenizer/letter" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search/query" "github.com/go-enry/go-enry/v2" @@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { } else if err := 
mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ "type": analyzer_custom.Name, "char_filters": []string{}, - "tokenizer": letter.Name, + "tokenizer": alphanum_tokenizer.Name, "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, }); err != nil { return nil, err diff --git a/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go b/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go new file mode 100644 index 0000000000..0e831f8820 --- /dev/null +++ b/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go @@ -0,0 +1,23 @@ +package alphanum + +import ( + "unicode" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/character" + "github.com/blevesearch/bleve/v2/registry" +) + +const Name = "alphanum" + +func IsAlphaNumeric(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) +} + +func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) { + return character.NewCharacterTokenizer(IsAlphaNumeric), nil +} + +func init() { + registry.RegisterTokenizer(Name, TokenizerConstructor) +} diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go index a1e590a26f..0df3271e80 100644 --- a/modules/indexer/code/elasticsearch/elasticsearch.go +++ b/modules/indexer/code/elasticsearch/elasticsearch.go @@ -63,13 +63,13 @@ const ( "content_analyzer": { "tokenizer": "content_tokenizer", "filter" : ["lowercase"] - }, + } }, "tokenizer": { "content_tokenizer": { "type": "simple_pattern_split", "pattern": "[^a-zA-Z0-9]" - }, + } } } }, diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index cac49415b8..4d25aa4b78 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -80,10 +80,10 @@ func guessFuzzinessByKeyword(s string) int { // according to https://github.com/blevesearch/bleve/issues/1563, 
the supported max fuzziness is 2 // magic number 4 was chosen to determine the levenshtein distance per each character of a keyword // BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot. - // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness + // Likewise, queries whose terms contain characters that are *not* letters or digits should not use fuzziness for _, r := range s { - if r >= 128 || !unicode.IsLetter(r) { + if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) { return 0 } } diff --git a/modules/indexer/internal/bleve/util_test.go b/modules/indexer/internal/bleve/util_test.go index 8f7844464e..006f817741 100644 --- a/modules/indexer/internal/bleve/util_test.go +++ b/modules/indexer/internal/bleve/util_test.go @@ -37,6 +37,10 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) { }, { Input: "repo1", + Fuzziness: 1, + }, + { + Input: "repo_one", Fuzziness: 0, }, {