From 5e5531bbc3ba5c2f2957c052ecc1785df347c504 Mon Sep 17 00:00:00 2001 From: Shiny Nematoda Date: Tue, 12 Nov 2024 13:13:22 +0000 Subject: [PATCH] fix(code search): use alphanumeric tokenizer instead of letter tokenizer The letter tokenizer fails to consider numbers; this results in a search for "func1" matching "func2" even with exact search. Furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely. --- modules/indexer/code/bleve/bleve.go | 4 ++-- .../code/bleve/tokenizer/alphanum/alphanum.go | 23 +++++++++++++++++++ modules/indexer/internal/bleve/util.go | 2 +- 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go index 2b58a4bdc4..f43b8bef17 100644 --- a/modules/indexer/code/bleve/bleve.go +++ b/modules/indexer/code/bleve/bleve.go @@ -17,6 +17,7 @@ import ( "code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/gitrepo" + alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum" "code.gitea.io/gitea/modules/indexer/code/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal" inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" @@ -30,7 +31,6 @@ import ( "github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" - "github.com/blevesearch/bleve/v2/analysis/tokenizer/letter" "github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/search/query" "github.com/go-enry/go-enry/v2" @@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) { } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ "type": analyzer_custom.Name, "char_filters": []string{}, "tokenizer": 
letter.Name, + "tokenizer": alphanum_tokenizer.Name, "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, }); err != nil { return nil, err diff --git a/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go b/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go new file mode 100644 index 0000000000..439daa07cb --- /dev/null +++ b/modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go @@ -0,0 +1,23 @@ +package alphanum + +import ( + "unicode" + + "github.com/blevesearch/bleve/v2/analysis" + "github.com/blevesearch/bleve/v2/analysis/tokenizer/character" + "github.com/blevesearch/bleve/v2/registry" +) + +const Name = "alphanum" + +func IsAlphaNumeric(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) +} + +func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) { + return character.NewCharacterTokenizer(IsAlphaNumeric), nil +} + +func init() { + registry.RegisterTokenizer(Name, TokenizerConstructor) +} diff --git a/modules/indexer/internal/bleve/util.go b/modules/indexer/internal/bleve/util.go index cac49415b8..0366f844e0 100644 --- a/modules/indexer/internal/bleve/util.go +++ b/modules/indexer/internal/bleve/util.go @@ -83,7 +83,7 @@ func guessFuzzinessByKeyword(s string) int { // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness for _, r := range s { - if r >= 128 || !unicode.IsLetter(r) { + if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) { return 0 } }