mirror of
https://codeberg.org/forgejo/forgejo
synced 2024-11-25 11:16:11 +01:00
fix(code search): use alphanumeric tokenizer instead of letter tokenizer
The letter tokenizer fails to consider numbers; this results in searches for "func1" matching "func2" even with exact search. Furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely.
This commit is contained in:
parent
16c0361764
commit
5e5531bbc3
|
@ -17,6 +17,7 @@ import (
|
|||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/gitrepo"
|
||||
alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
||||
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
|
||||
|
@ -30,7 +31,6 @@ import (
|
|||
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
"github.com/blevesearch/bleve/v2/search/query"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
|
@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
|
|||
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
|
||||
"type": analyzer_custom.Name,
|
||||
"char_filters": []string{},
|
||||
"tokenizer": letter.Name,
|
||||
"tokenizer": alphanum_tokenizer.Name,
|
||||
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
|
|
23
modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go
Normal file
23
modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go
Normal file
|
@ -0,0 +1,23 @@
|
|||
package alphanum
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which the alphanumeric tokenizer is registered.
const Name = "alphanum"
|
||||
|
||||
// IsAlphaNumeric reports whether r is a Unicode letter or a Unicode number.
// Any rune for which it returns false (punctuation, whitespace, symbols, ...)
// is neither, so a tokenizer driven by this predicate keeps identifiers such
// as "func1" together as a single run of accepted runes.
func IsAlphaNumeric(r rune) bool {
	return unicode.IsLetter(r) || unicode.IsNumber(r)
}
|
||||
|
||||
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return character.NewCharacterTokenizer(IsAlphaNumeric), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenizer(Name, TokenizerConstructor)
|
||||
}
|
|
@ -83,7 +83,7 @@ func guessFuzzinessByKeyword(s string) int {
|
|||
// Likewise, queries whose terms contain characters that are *not* letters or numbers should not use fuzziness
|
||||
|
||||
for _, r := range s {
|
||||
if r >= 128 || !unicode.IsLetter(r) {
|
||||
if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue