mirror of
https://codeberg.org/forgejo/forgejo
synced 2024-11-24 18:56:11 +01:00
fix(code search): use alphanumeric tokenizer instead of letter tokenizer
The letter tokenizer fails to consider numbers; this results in searches for "func1" matching "func2" even with exact search. Furthermore, since the tokenizer did not consider numbers, searches for numerical values such as "2.42" would fail entirely.
This commit is contained in:
parent
16c0361764
commit
4c9ab4e41d
|
@ -17,6 +17,7 @@ import (
|
|||
"code.gitea.io/gitea/modules/charset"
|
||||
"code.gitea.io/gitea/modules/git"
|
||||
"code.gitea.io/gitea/modules/gitrepo"
|
||||
alphanum_tokenizer "code.gitea.io/gitea/modules/indexer/code/bleve/tokenizer/alphanum"
|
||||
"code.gitea.io/gitea/modules/indexer/code/internal"
|
||||
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
|
||||
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
|
||||
|
@ -30,7 +31,6 @@ import (
|
|||
"github.com/blevesearch/bleve/v2/analysis/token/camelcase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
|
||||
"github.com/blevesearch/bleve/v2/analysis/token/unicodenorm"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
|
||||
"github.com/blevesearch/bleve/v2/mapping"
|
||||
"github.com/blevesearch/bleve/v2/search/query"
|
||||
"github.com/go-enry/go-enry/v2"
|
||||
|
@ -95,7 +95,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
|
|||
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
|
||||
"type": analyzer_custom.Name,
|
||||
"char_filters": []string{},
|
||||
"tokenizer": letter.Name,
|
||||
"tokenizer": alphanum_tokenizer.Name,
|
||||
"token_filters": []string{unicodeNormalizeName, camelcase.Name, lowercase.Name},
|
||||
}); err != nil {
|
||||
return nil, err
|
||||
|
|
23
modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go
Normal file
23
modules/indexer/code/bleve/tokenizer/alphanum/alphanum.go
Normal file
|
@ -0,0 +1,23 @@
|
|||
package alphanum
|
||||
|
||||
import (
|
||||
"unicode"
|
||||
|
||||
"github.com/blevesearch/bleve/v2/analysis"
|
||||
"github.com/blevesearch/bleve/v2/analysis/tokenizer/character"
|
||||
"github.com/blevesearch/bleve/v2/registry"
|
||||
)
|
||||
|
||||
// Name is the identifier under which this package registers its tokenizer.
const Name = "alphanum"
|
||||
|
||||
// alphaNumeric reports whether r is a Unicode letter or a Unicode number.
// Every other rune (punctuation, whitespace, symbols, ...) yields false.
func alphaNumeric(r rune) bool {
	if unicode.IsLetter(r) {
		return true
	}
	return unicode.IsNumber(r)
}
|
||||
|
||||
func TokenizerConstructor(config map[string]any, cache *registry.Cache) (analysis.Tokenizer, error) {
|
||||
return character.NewCharacterTokenizer(IsAlphaNumeric), nil
|
||||
}
|
||||
|
||||
func init() {
|
||||
registry.RegisterTokenizer(Name, TokenizerConstructor)
|
||||
}
|
|
@ -63,13 +63,13 @@ const (
|
|||
"content_analyzer": {
|
||||
"tokenizer": "content_tokenizer",
|
||||
"filter" : ["lowercase"]
|
||||
},
|
||||
}
|
||||
},
|
||||
"tokenizer": {
|
||||
"content_tokenizer": {
|
||||
"type": "simple_pattern_split",
|
||||
"pattern": "[^a-zA-Z0-9]"
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
|
|
|
@ -80,10 +80,10 @@ func guessFuzzinessByKeyword(s string) int {
|
|||
// according to https://github.com/blevesearch/bleve/issues/1563, the supported max fuzziness is 2
|
||||
// magic number 4 was chosen to determine the levenshtein distance per each character of a keyword
|
||||
// BUT, when using CJK (eg: `갃갃갃` `啊啊啊`), it mismatches a lot.
|
||||
// Likewise, queries whose terms contains characters that are *not* letters should not use fuzziness
|
||||
// Likewise, queries whose terms contain characters that are *not* letters or digits should not use fuzziness
|
||||
|
||||
for _, r := range s {
|
||||
if r >= 128 || !unicode.IsLetter(r) {
|
||||
if r >= 128 || !(unicode.IsLetter(r) || unicode.IsNumber(r)) {
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
|
|
@ -37,6 +37,10 @@ func TestBleveGuessFuzzinessByKeyword(t *testing.T) {
|
|||
},
|
||||
{
|
||||
Input: "repo1",
|
||||
Fuzziness: 1,
|
||||
},
|
||||
{
|
||||
Input: "repo_one",
|
||||
Fuzziness: 0,
|
||||
},
|
||||
{
|
||||
|
|
Loading…
Reference in a new issue