Template
1
0
Fork 0
mirror of https://codeberg.org/forgejo/forgejo synced 2024-11-25 03:06:10 +01:00

port(partial): partial port of 'Allow code search by filename (gitea#32210)'

NOTE: This is feature is not available. Only a few select changes, such as the tests and its' required repositories were ported.

Original Commit Message
-----------------------

This is a large and complex PR, so let me explain in detail its changes.

First, I had to create new index mappings for Bleve and ElasticSerach as
the current ones do not support search by filename. This requires Gitea
to recreate the code search indexes (I do not know if this is a breaking
change, but I feel it deserves a heads-up).

I've used [this
approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html)
to model the filename index. It allows us to efficiently search for both
the full path and the name of a file. Bleve, however, does not support
this out-of-box, so I had to code a brand new [token
filter](https://blevesearch.com/docs/Token-Filters/) to generate the
search terms.

I also did an overhaul in the `indexer_test.go` file. It now asserts the
order of the expected results (this is important since matches based on
the name of a file are more relevant than those based on its content).
I've added new test scenarios that deal with searching by filename. They
use a new repo included in the Gitea fixture.

The screenshot below depicts how Gitea shows the search results. It
shows results based on content in the same way as the current version
does. In matches based on the filename, the first seven lines of the
file contents are shown (BTW, this is how GitHub does it).

![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858)

Resolves #32096

---------

Signed-off-by: Bruno Sofiato <bruno.sofiato@gmail.com>
This commit is contained in:
Shiny Nematoda 2024-11-11 13:31:25 +00:00
parent 2903e3c527
commit 16c0361764
27 changed files with 203 additions and 126 deletions

View file

@ -795,3 +795,24 @@
type: 10
config: "{}"
created_unix: 946684810
-
id: 115
repo_id: 63
type: 1
config: "{}"
created_unix: 946684810
-
id: 116
repo_id: 63
type: 2
config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
created_unix: 946684810
-
id: 117
repo_id: 63
type: 3
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
created_unix: 946684810

View file

@ -1825,3 +1825,34 @@
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false
-
id: 63
owner_id: 42
owner_name: org42
lower_name: search-by-path
name: search-by-path
default_branch: master
num_watches: 0
num_stars: 0
num_forks: 0
num_issues: 0
num_closed_issues: 0
num_pulls: 0
num_closed_pulls: 0
num_milestones: 0
num_closed_milestones: 0
num_projects: 0
num_closed_projects: 0
is_private: false
is_empty: false
is_archived: false
is_mirror: false
status: 0
is_fork: false
fork_id: 0
is_template: false
template_id: 0
size: 0
is_fsck_enabled: true
close_issues_via_commit_in_any_branch: false

View file

@ -1517,3 +1517,40 @@
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false
-
id: 42
lower_name: org42
name: org42
full_name: Org42
email: org42@example.com
keep_email_private: false
email_notifications_preference: onmention
passwd: ZogKvWdyEx:password
passwd_hash_algo: dummy
must_change_password: false
login_source: 0
login_name: org42
type: 1
salt: ZogKvWdyEx
max_repo_creation: -1
is_active: false
is_admin: false
is_restricted: false
allow_git_hook: false
allow_import_local: false
allow_create_organization: true
prohibit_login: false
avatar: avatar42
avatar_email: org42@example.com
use_custom_avatar: false
num_followers: 0
num_following: 0
num_stars: 0
num_repos: 1
num_teams: 0
num_members: 0
visibility: 0
repo_admin_change_team_access: false
theme: ""
keep_activity_private: false

View file

@ -139,12 +139,12 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
count: 35,
count: 36,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
count: 40,
count: 41,
},
{
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
@ -159,7 +159,7 @@ func getTestCases() []struct {
{
name: "AllPublic/PublicRepositoriesOfOrganization",
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
count: 35,
count: 36,
},
{
name: "AllTemplates",

View file

@ -184,7 +184,10 @@ func TestSearchUsers(t *testing.T) {
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
[]int64{26, 41})
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
[]int64{42})
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
[]int64{})
// test users

View file

@ -6,6 +6,7 @@ package code
import (
"context"
"os"
"slices"
"testing"
"code.gitea.io/gitea/models/db"
@ -22,29 +23,31 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
_ "github.com/mattn/go-sqlite3"
)
type codeSearchResult struct {
Filename string
Content string
}
func TestMain(m *testing.M) {
unittest.MainTest(m)
}
type codeSearchResult struct {
Filename string
Content string
}
func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
t.Run(name, func(t *testing.T) {
var repoID int64 = 1
err := index(git.DefaultContext, indexer, repoID)
require.NoError(t, err)
require.NoError(t, setupRepositoryIndexes(git.DefaultContext, indexer))
keywords := []struct {
RepoIDs []int64
Keyword string
IDs []int64
Langs int
Results []codeSearchResult
}{
// Search for an exact match on the contents of a file
// This scenario yields a single result (the file README.md on the repo '1')
{
RepoIDs: nil,
Keyword: "Description",
@ -52,10 +55,12 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
}
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "Description",
@ -74,20 +79,22 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
// Search for an exact match on the contents of a file within the repo '2'.
// This scenario yields no results
{
RepoIDs: []int64{2},
Keyword: "repo1",
Langs: 0,
},
// Search for a non-existing term.
// This scenario yields no results
{
RepoIDs: nil,
Keyword: "non-exist",
Langs: 0,
},
// Search for an exact match on the contents of a file within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
RepoIDs: []int64{63},
Keyword: "pineaple",
Langs: 1,
Results: []codeSearchResult{
@ -97,101 +104,10 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
},
},
// Search for an exact match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avocado.md",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for an partial match on the filename within the repo '62'.
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "avo",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on both the contents and the filenames within the repo '62'.
// This scenario yields two results: the first result is baed on the file (cucumber.md) while the second is based on the contents
{
RepoIDs: []int64{62},
RepoIDs: []int64{63},
Keyword: "cucumber",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "cucumber.md",
Content: "Salad is good for your health",
},
{
Filename: "avocado.md",
Content: "# repo1\n\npineaple pie of cucumber juice",
},
},
},
// Search for matches on the filenames within the repo '62'.
// This scenario yields two results (both are based on filename, the first one is an exact match)
{
RepoIDs: []int64{62},
Keyword: "ham",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "ham.md",
Content: "This is also not cheese",
},
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
},
},
// Search for matches on the contents of files within the repo '62'.
// This scenario yields two results (both are based on contents, the first one is an exact match where as the second is a 'fuzzy' one)
{
RepoIDs: []int64{62},
Keyword: "This is not cheese",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "potato/ham.md",
Content: "This is not cheese",
},
{
Filename: "ham.md",
Content: "This is also not cheese",
},
},
},
// Search for matches on the contents of files regardless of case.
{
RepoIDs: nil,
Keyword: "dESCRIPTION",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "README.md",
Content: "# repo1\n\nDescription for repo1",
},
},
},
// Search for an exact match on the filename within the repo '62' (case insenstive).
// This scenario yields a single result (the file avocado.md on the repo '62')
{
RepoIDs: []int64{62},
Keyword: "AVOCADO.MD",
Langs: 1,
Results: []codeSearchResult{
{
Filename: "avocado.md",
@ -201,7 +117,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
// Search for matches on the contents of files when the criteria is a expression.
{
RepoIDs: []int64{62},
RepoIDs: []int64{63},
Keyword: "console.log",
Langs: 1,
Results: []codeSearchResult{
@ -213,7 +129,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
},
// Search for matches on the contents of files when the criteria is part of a expression.
{
RepoIDs: []int64{62},
RepoIDs: []int64{63},
Keyword: "log",
Langs: 1,
Results: []codeSearchResult{
@ -267,7 +183,7 @@ func testIndexer(name string, t *testing.T, indexer internal.Indexer) {
})
}
require.NoError(t, indexer.Delete(context.Background(), repoID))
require.NoError(t, tearDownRepositoryIndexes(indexer))
})
}
@ -310,3 +226,25 @@ func TestESIndexAndSearch(t *testing.T) {
testIndexer("elastic_search", t, indexer)
}
func setupRepositoryIndexes(ctx context.Context, indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := index(ctx, indexer, repoID); err != nil {
return err
}
}
return nil
}
func tearDownRepositoryIndexes(indexer internal.Indexer) error {
for _, repoID := range repositoriesToSearch() {
if err := indexer.Delete(context.Background(), repoID); err != nil {
return err
}
}
return nil
}
func repositoriesToSearch() []int64 {
return []int64{1, 63}
}

View file

@ -16,6 +16,13 @@ import (
"github.com/blevesearch/bleve/v2/index/upsidedown"
)
const (
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
maxFuzziness = 2
)
// openIndexer open the index at the specified path, checking for metadata
// updates and bleve version updates. If index needs to be created (or
// re-created), returns (nil, nil)
@ -49,14 +56,6 @@ func openIndexer(path string, latestVersion int) (bleve.Index, int, error) {
return index, 0, nil
}
const (
// fuzzyDenominator determines the levenshtein distance per each character of a keyword
fuzzyDenominator = 4
// see https://github.com/blevesearch/bleve/issues/1563#issuecomment-786822311
maxFuzziness = 2
)
// This method test the GuessFuzzinessByKeyword method. The fuzziness is based on the levenshtein distance and determines how many chars
// may be different on two string and they still be considered equivalent.
// Given a phrasse, its shortest word determines its fuzziness. If a phrase uses CJK (eg: `갃갃갃` `啊啊啊`), the fuzziness is zero.

View file

@ -0,0 +1 @@
ref: refs/heads/master

View file

@ -0,0 +1,4 @@
[core]
repositoryformatversion = 0
filemode = true
bare = true

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/post-receive.d"`; do
sh "$SHELL_FOLDER/post-receive.d/$i"
done

View file

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" post-receive

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/pre-receive.d"`; do
sh "$SHELL_FOLDER/pre-receive.d/$i"
done

View file

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" pre-receive

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/proc-receive.d"`; do
sh "$SHELL_FOLDER/proc-receive.d/$i"
done

View file

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" proc-receive

View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
ORI_DIR=`pwd`
SHELL_FOLDER=$(cd "$(dirname "$0")";pwd)
cd "$ORI_DIR"
for i in `ls "$SHELL_FOLDER/update.d"`; do
sh "$SHELL_FOLDER/update.d/$i" $1 $2 $3
done

View file

@ -0,0 +1,2 @@
#!/usr/bin/env bash
"$GITEA_ROOT/gitea" hook --config="$GITEA_ROOT/$GITEA_CONF" update $1 $2 $3

View file

@ -0,0 +1,6 @@
# git ls-files --others --exclude-from=.git/info/exclude
# Lines that start with '#' are comments.
# For a project mostly in C, the following would be a good set of
# exclude patterns (uncomment them if you want to use them):
# *.[oa]
# *~

View file

@ -170,7 +170,7 @@ func TestAPIGetAll(t *testing.T) {
var apiOrgList []*api.Organization
DecodeJSON(t, resp, &apiOrgList)
assert.Len(t, apiOrgList, 12)
assert.Len(t, apiOrgList, 13)
assert.Equal(t, "Limited Org 36", apiOrgList[1].FullName)
assert.Equal(t, "limited", apiOrgList[1].Visibility)
@ -179,7 +179,7 @@ func TestAPIGetAll(t *testing.T) {
resp = MakeRequest(t, req, http.StatusOK)
DecodeJSON(t, resp, &apiOrgList)
assert.Len(t, apiOrgList, 8)
assert.Len(t, apiOrgList, 9)
assert.Equal(t, "org 17", apiOrgList[0].FullName)
assert.Equal(t, "public", apiOrgList[0].Visibility)
}

View file

@ -96,9 +96,9 @@ func TestAPISearchRepo(t *testing.T) {
}{
{
name: "RepositoriesMax50", requestURL: "/api/v1/repos/search?limit=50&private=false", expectedResults: expectedResults{
nil: {count: 37},
user: {count: 37},
user2: {count: 37},
nil: {count: 38},
user: {count: 38},
user2: {count: 38},
},
},
{