Template
1
0
Fork 0
mirror of https://codeberg.org/forgejo/forgejo synced 2024-11-22 09:54:24 +01:00

fix(code search): use alphanumeric tokenizer instead of letter tokenizer

The letter tokenizer fails to consider numbers; as a result, a search for "func1" matches "func2" even with exact search.

Furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely.
This commit is contained in:
Shiny Nematoda 2024-11-12 13:13:22 +00:00
parent 16c0361764
commit 5e5531bbc3
3 changed files with 26 additions and 3 deletions

View file

@ -17,6 +17,7 @@ import (
"code.gitea.io/gitea/modules/charset" "code.gitea.io/gitea/modules/charset"
"code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/gitrepo" "code.gitea.io/gitea/modules/gitrepo"
alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum"
"code.gitea.io/gitea/modules/indexer/code/internal" "code.gitea.io/gitea/modules/indexer/code/internal"
indexer_internal "code.gitea.io/gitea/modules/indexer/internal" indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve" inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@ -30,7 +31,6 @@ import (
"github.com/blevesearch/bleve/v2/analysis/token/camelcase" "github.com/blevesearch/bleve/v2/analysis/token/camelcase"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase" "github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm" "github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/mapping" "github.com/blevesearch/bleve/v2/mapping"
"github.com/blevesearch/bleve/v2/search/query" "github.com/blevesearch/bleve/v2/search/query"
"github.com/go-enry/go-enry/v2" "github.com/go-enry/go-enry/v2"
@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{ } else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
"type": analyzer_custom.Name, "type": analyzer_custom.Name,
"char_filters": []string{}, "char_filters": []string{},
"tokenizer": letter.Name, "tokenizer": alphanum_tokenizer.Name,
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name}, "token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
}); err != nil { }); err != nil {
return nil, err return nil, err

View file

@ -0,0 +1,23 @@
package alphanum
import (
"unicode"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
"github.com/blevesearch/bleve/v2/registry"
)
// Name is the identifier under which this package's tokenizer is registered
// with the bleve registry (see init).
const Name = "alphanum"
// IsAlphaNumeric reports whether r is a Unicode letter or a Unicode number.
// It is the rune predicate handed to the character tokenizer.
func IsAlphaNumeric(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}
// TokenizerConstructor builds the alphanum tokenizer: a bleve character
// tokenizer driven by IsAlphaNumeric. Neither config nor cache is consulted,
// and the returned error is always nil.
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
	var tokenizer analysis.Tokenizer = character.NewCharacterTokenizer(IsAlphaNumeric)
	return tokenizer, nil
}
// init registers TokenizerConstructor with the bleve registry under Name at
// package load time, so importing this package makes the tokenizer available
// by name.
// NOTE(review): any value returned by RegisterTokenizer is discarded here —
// confirm whether the bleve version in use reports duplicate registration via
// a return value that should be checked.
func init() {
registry.RegisterTokenizer(Name, TokenizerConstructor)
}

View file

@ -83,7 +83,7 @@ func guessFuzzinessByKeyword(s string) int {
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness // Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
for _, r := range s { for _, r := range s {
if r >= 128 || !unicode.IsLetter(r) { if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
return 0 return 0
} }
} }