Skip to content

Commit 93e7f36

Browse files
bsofiato authored and matera-bs committed
[Cherry picked from v1.22.4 - Tentative] Allow code search by filename (go-gitea#32210)
This is a large and complex PR, so let me explain its changes in detail. First, I had to create new index mappings for Bleve and Elasticsearch, as the current ones do not support search by filename. This requires Gitea to recreate the code search indexes (I do not know if this is a breaking change, but I feel it deserves a heads-up). I've used [this approach](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/analysis-pathhierarchy-tokenizer.html) to model the filename index. It allows us to efficiently search for both the full path and the name of a file. Bleve, however, does not support this out of the box, so I had to code a brand new [token filter](https://blevesearch.com/docs/Token-Filters/) to generate the search terms. I also overhauled the `indexer_test.go` file. It now asserts the order of the expected results (this is important since matches based on the name of a file are more relevant than those based on its content). I've added new test scenarios that deal with searching by filename. They use a new repo included in the Gitea fixture. The screenshot below depicts how Gitea shows the search results. It shows results based on content in the same way as the current version does. For matches based on the filename, the first seven lines of the file contents are shown (BTW, this is how GitHub does it). ![image](https://github.com/user-attachments/assets/9d938d86-1a8d-4f89-8644-1921a473e858) Resolves go-gitea#32096 --------- Signed-off-by: Bruno Sofiato <[email protected]>
1 parent d7800f6 commit 93e7f36

38 files changed

+720
-49
lines changed

models/fixtures/repo_unit.yml

+21
Original file line numberDiff line numberDiff line change
@@ -712,3 +712,24 @@
712712
type: 3
713713
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
714714
created_unix: 946684810
715+
716+
-
717+
id: 108
718+
repo_id: 62
719+
type: 1
720+
config: "{}"
721+
created_unix: 946684810
722+
723+
-
724+
id: 109
725+
repo_id: 62
726+
type: 2
727+
config: "{\"EnableTimetracker\":true,\"AllowOnlyContributorsToTrackTime\":true}"
728+
created_unix: 946684810
729+
730+
-
731+
id: 110
732+
repo_id: 62
733+
type: 3
734+
config: "{\"IgnoreWhitespaceConflicts\":false,\"AllowMerge\":true,\"AllowRebase\":true,\"AllowRebaseMerge\":true,\"AllowSquash\":true}"
735+
created_unix: 946684810

models/fixtures/repository.yml

+31
Original file line numberDiff line numberDiff line change
@@ -1768,3 +1768,34 @@
17681768
size: 0
17691769
is_fsck_enabled: true
17701770
close_issues_via_commit_in_any_branch: false
1771+
1772+
-
1773+
id: 62
1774+
owner_id: 42
1775+
owner_name: org42
1776+
lower_name: search-by-path
1777+
name: search-by-path
1778+
default_branch: master
1779+
num_watches: 0
1780+
num_stars: 0
1781+
num_forks: 0
1782+
num_issues: 0
1783+
num_closed_issues: 0
1784+
num_pulls: 0
1785+
num_closed_pulls: 0
1786+
num_milestones: 0
1787+
num_closed_milestones: 0
1788+
num_projects: 0
1789+
num_closed_projects: 0
1790+
is_private: false
1791+
is_empty: false
1792+
is_archived: false
1793+
is_mirror: false
1794+
status: 0
1795+
is_fork: false
1796+
fork_id: 0
1797+
is_template: false
1798+
template_id: 0
1799+
size: 0
1800+
is_fsck_enabled: true
1801+
close_issues_via_commit_in_any_branch: false

models/fixtures/user.yml

+37
Original file line numberDiff line numberDiff line change
@@ -1517,3 +1517,40 @@
15171517
repo_admin_change_team_access: false
15181518
theme: ""
15191519
keep_activity_private: false
1520+
1521+
-
1522+
id: 42
1523+
lower_name: org42
1524+
name: org42
1525+
full_name: Org42
1526+
1527+
keep_email_private: false
1528+
email_notifications_preference: onmention
1529+
passwd: ZogKvWdyEx:password
1530+
passwd_hash_algo: dummy
1531+
must_change_password: false
1532+
login_source: 0
1533+
login_name: org42
1534+
type: 1
1535+
salt: ZogKvWdyEx
1536+
max_repo_creation: -1
1537+
is_active: false
1538+
is_admin: false
1539+
is_restricted: false
1540+
allow_git_hook: false
1541+
allow_import_local: false
1542+
allow_create_organization: true
1543+
prohibit_login: false
1544+
avatar: avatar42
1545+
avatar_email: [email protected]
1546+
use_custom_avatar: false
1547+
num_followers: 0
1548+
num_following: 0
1549+
num_stars: 0
1550+
num_repos: 1
1551+
num_teams: 0
1552+
num_members: 0
1553+
visibility: 0
1554+
repo_admin_change_team_access: false
1555+
theme: ""
1556+
keep_activity_private: false

models/repo/repo_list_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -138,12 +138,12 @@ func getTestCases() []struct {
138138
{
139139
name: "AllPublic/PublicRepositoriesOfUserIncludingCollaborative",
140140
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, AllPublic: true, Template: optional.Some(false)},
141-
count: 33,
141+
count: 34,
142142
},
143143
{
144144
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborative",
145145
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 15, Private: true, AllPublic: true, AllLimited: true, Template: optional.Some(false)},
146-
count: 38,
146+
count: 39,
147147
},
148148
{
149149
name: "AllPublic/PublicAndPrivateRepositoriesOfUserIncludingCollaborativeByName",
@@ -158,7 +158,7 @@ func getTestCases() []struct {
158158
{
159159
name: "AllPublic/PublicRepositoriesOfOrganization",
160160
opts: &repo_model.SearchRepoOptions{ListOptions: db.ListOptions{Page: 1, PageSize: 10}, OwnerID: 17, AllPublic: true, Collaborate: optional.Some(false), Template: optional.Some(false)},
161-
count: 33,
161+
count: 34,
162162
},
163163
{
164164
name: "AllTemplates",

models/user/user_test.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,10 @@ func TestSearchUsers(t *testing.T) {
9292
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 4, PageSize: 2}},
9393
[]int64{26, 41})
9494

95-
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
95+
testOrgSuccess(&user_model.SearchUserOptions{OrderBy: "id ASC", ListOptions: db.ListOptions{Page: 5, PageSize: 2}},
96+
[]int64{42})
97+
98+
testOrgSuccess(&user_model.SearchUserOptions{ListOptions: db.ListOptions{Page: 6, PageSize: 2}},
9699
[]int64{})
97100

98101
// test users

modules/indexer/code/bleve/bleve.go

+37-7
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"code.gitea.io/gitea/modules/charset"
1818
"code.gitea.io/gitea/modules/git"
1919
"code.gitea.io/gitea/modules/gitrepo"
20+
path_filter "code.gitea.io/gitea/modules/indexer/code/bleve/token/path"
2021
"code.gitea.io/gitea/modules/indexer/code/internal"
2122
indexer_internal "code.gitea.io/gitea/modules/indexer/internal"
2223
inner_bleve "code.gitea.io/gitea/modules/indexer/internal/bleve"
@@ -53,6 +54,7 @@ type RepoIndexerData struct {
5354
RepoID int64
5455
CommitID string
5556
Content string
57+
Filename string
5658
Language string
5759
UpdatedAt time.Time
5860
}
@@ -64,8 +66,10 @@ func (d *RepoIndexerData) Type() string {
6466

6567
const (
6668
repoIndexerAnalyzer = "repoIndexerAnalyzer"
69+
filenameIndexerAnalyzer = "filenameIndexerAnalyzer"
70+
filenameIndexerTokenizer = "filenameIndexerTokenizer"
6771
repoIndexerDocType = "repoIndexerDocType"
68-
repoIndexerLatestVersion = 6
72+
repoIndexerLatestVersion = 7
6973
)
7074

7175
// generateBleveIndexMapping generates a bleve index mapping for the repo indexer
@@ -79,6 +83,11 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
7983
textFieldMapping.IncludeInAll = false
8084
docMapping.AddFieldMappingsAt("Content", textFieldMapping)
8185

86+
fileNamedMapping := bleve.NewTextFieldMapping()
87+
fileNamedMapping.IncludeInAll = false
88+
fileNamedMapping.Analyzer = filenameIndexerAnalyzer
89+
docMapping.AddFieldMappingsAt("Filename", fileNamedMapping)
90+
8291
termFieldMapping := bleve.NewTextFieldMapping()
8392
termFieldMapping.IncludeInAll = false
8493
termFieldMapping.Analyzer = analyzer_keyword.Name
@@ -90,6 +99,7 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
9099
docMapping.AddFieldMappingsAt("UpdatedAt", timeFieldMapping)
91100

92101
mapping := bleve.NewIndexMapping()
102+
93103
if err := addUnicodeNormalizeTokenFilter(mapping); err != nil {
94104
return nil, err
95105
} else if err := mapping.AddCustomAnalyzer(repoIndexerAnalyzer, map[string]any{
@@ -100,6 +110,16 @@ func generateBleveIndexMapping() (mapping.IndexMapping, error) {
100110
}); err != nil {
101111
return nil, err
102112
}
113+
114+
if err := mapping.AddCustomAnalyzer(filenameIndexerAnalyzer, map[string]any{
115+
"type": analyzer_custom.Name,
116+
"char_filters": []string{},
117+
"tokenizer": unicode.Name,
118+
"token_filters": []string{unicodeNormalizeName, path_filter.Name, lowercase.Name},
119+
}); err != nil {
120+
return nil, err
121+
}
122+
103123
mapping.DefaultAnalyzer = repoIndexerAnalyzer
104124
mapping.AddDocumentMapping(repoIndexerDocType, docMapping)
105125
mapping.AddDocumentMapping("_all", bleve.NewDocumentDisabledMapping())
@@ -174,6 +194,7 @@ func (b *Indexer) addUpdate(ctx context.Context, batchWriter git.WriteCloserErro
174194
return batch.Index(id, &RepoIndexerData{
175195
RepoID: repo.ID,
176196
CommitID: commitSha,
197+
Filename: update.Filename,
177198
Content: string(charset.ToUTF8DropErrors(fileContents, charset.ConvertOpts{})),
178199
Language: analyze.GetCodeLanguage(update.Filename, fileContents),
179200
UpdatedAt: time.Now().UTC(),
@@ -240,14 +261,19 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
240261
keywordQuery query.Query
241262
)
242263

243-
phraseQuery := bleve.NewMatchPhraseQuery(opts.Keyword)
244-
phraseQuery.FieldVal = "Content"
245-
phraseQuery.Analyzer = repoIndexerAnalyzer
246-
keywordQuery = phraseQuery
264+
pathQuery := bleve.NewPrefixQuery(strings.ToLower(opts.Keyword))
265+
pathQuery.FieldVal = "Filename"
266+
pathQuery.SetBoost(10)
267+
268+
contentQuery := bleve.NewMatchQuery(opts.Keyword)
269+
contentQuery.FieldVal = "Content"
270+
247271
if opts.IsKeywordFuzzy {
248-
phraseQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
272+
contentQuery.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(opts.Keyword)
249273
}
250274

275+
keywordQuery = bleve.NewDisjunctionQuery(contentQuery, pathQuery)
276+
251277
if len(opts.RepoIDs) > 0 {
252278
repoQueries := make([]query.Query, 0, len(opts.RepoIDs))
253279
for _, repoID := range opts.RepoIDs {
@@ -277,7 +303,7 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
277303

278304
from, pageSize := opts.GetSkipTake()
279305
searchRequest := bleve.NewSearchRequestOptions(indexerQuery, pageSize, from, false)
280-
searchRequest.Fields = []string{"Content", "RepoID", "Language", "CommitID", "UpdatedAt"}
306+
searchRequest.Fields = []string{"Content", "Filename", "RepoID", "Language", "CommitID", "UpdatedAt"}
281307
searchRequest.IncludeLocations = true
282308

283309
if len(opts.Language) == 0 {
@@ -305,6 +331,10 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
305331
endIndex = locationEnd
306332
}
307333
}
334+
if len(hit.Locations["Filename"]) > 0 {
335+
startIndex, endIndex = internal.FilenameMatchIndexPos(hit.Fields["Content"].(string))
336+
}
337+
308338
language := hit.Fields["Language"].(string)
309339
var updatedUnix timeutil.TimeStamp
310340
if t, err := time.Parse(time.RFC3339, hit.Fields["UpdatedAt"].(string)); err == nil {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
// Copyright 2024 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
package path
5+
6+
import (
7+
"slices"
8+
"strings"
9+
10+
"github.com/blevesearch/bleve/v2/analysis"
11+
"github.com/blevesearch/bleve/v2/registry"
12+
)
13+
14+
const (
15+
// Name is the identifier under which this token filter is registered with bleve's registry.
Name = "gitea/path"
16+
)
17+
18+
// TokenFilter is a field-less token filter that expands path tokens into path hierarchy terms (see Filter).
type TokenFilter struct{}
19+
20+
func NewTokenFilter() *TokenFilter {
21+
return &TokenFilter{}
22+
}
23+
24+
func TokenFilterConstructor(config map[string]any, cache *registry.Cache) (analysis.TokenFilter, error) {
25+
return NewTokenFilter(), nil
26+
}
27+
28+
func (s *TokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
29+
if len(input) == 1 {
30+
// if there is only one token, we dont need to generate the reversed chain
31+
return generatePathTokens(input, false)
32+
}
33+
34+
normal := generatePathTokens(input, false)
35+
reversed := generatePathTokens(input, true)
36+
37+
return append(normal, reversed...)
38+
}
39+
40+
// Generates path tokens from the input tokens.
41+
// This mimics the behavior of the path hierarchy tokenizer in ES. It takes the input tokens and combine them, generating a term for each component
42+
// in tree (e.g., foo/bar/baz.md will generate foo, foo/bar, and foo/bar/baz.md).
43+
//
44+
// If the reverse flag is set, the order of the tokens is reversed (the same input will generate baz.md, baz.md/bar, baz.md/bar/foo). This is useful
45+
// to efficiently search for filenames without supplying the fullpath.
46+
func generatePathTokens(input analysis.TokenStream, reversed bool) analysis.TokenStream {
47+
terms := make([]string, 0, len(input))
48+
longestTerm := 0
49+
50+
if reversed {
51+
slices.Reverse(input)
52+
}
53+
54+
for i := 0; i < len(input); i++ {
55+
var sb strings.Builder
56+
sb.WriteString(string(input[0].Term))
57+
58+
for j := 1; j < i; j++ {
59+
sb.WriteString("/")
60+
sb.WriteString(string(input[j].Term))
61+
}
62+
63+
term := sb.String()
64+
65+
if longestTerm < len(term) {
66+
longestTerm = len(term)
67+
}
68+
69+
terms = append(terms, term)
70+
}
71+
72+
output := make(analysis.TokenStream, 0, len(terms))
73+
74+
for _, term := range terms {
75+
var start, end int
76+
77+
if reversed {
78+
start = 0
79+
end = len(term)
80+
} else {
81+
start = longestTerm - len(term)
82+
end = longestTerm
83+
}
84+
85+
token := analysis.Token{
86+
Position: 1,
87+
Start: start,
88+
End: end,
89+
Type: analysis.AlphaNumeric,
90+
Term: []byte(term),
91+
}
92+
93+
output = append(output, &token)
94+
}
95+
96+
return output
97+
}
98+
99+
// init registers TokenFilterConstructor with bleve's registry under Name.
func init() {
100+
registry.RegisterTokenFilter(Name, TokenFilterConstructor)
101+
}

0 commit comments

Comments
 (0)