Skip to content

Commit b128c46

Browse files
[chore][pkg/stanza] Speed up file deduplication in finder (#34888)
**Description:** For large numbers of files, the logic that deduplicates filenames across glob matches is costly, mainly because the deduplication algorithm is O(n^2). Using a map as a hash set instead makes it ~O(n). This PR speeds up the deduplication logic and adds a benchmark for the case where the filelog receiver polls many files at once.

**Testing:** Running the added benchmark and comparing the results with benchstat shows a large speedup for the many-files case (10000 monitored files), at the cost of a very slight increase in memory usage:

```
goos: darwin
goarch: arm64
pkg: github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/fileconsumer/matcher/internal/finder
cpu: Apple M3 Pro
                │   old.txt     │               new.txt                │
                │    sec/op     │    sec/op     vs base                │
Find10kFiles-12   198.636m ± 6%   8.696m ± 16%  -95.62% (p=0.002 n=6)

                │   old.txt    │              new.txt               │
                │     B/op     │     B/op      vs base              │
Find10kFiles-12   5.416Mi ± 0%   5.581Mi ± 0%  +3.04% (p=0.002 n=6)

                │   old.txt   │              new.txt              │
                │  allocs/op  │  allocs/op   vs base              │
Find10kFiles-12   80.06k ± 0%   80.25k ± 0%  +0.23% (p=0.002 n=6)
```
1 parent c6cda87 commit b128c46

File tree

3 files changed

+40
-12
lines changed

3 files changed

+40
-12
lines changed

pkg/stanza/fileconsumer/matcher/internal/finder/finder.go

+5-9
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import (
88
"fmt"
99

1010
"github.com/bmatcuk/doublestar/v4"
11+
"golang.org/x/exp/maps"
1112
)
1213

1314
func Validate(globs []string) error {
@@ -23,7 +24,8 @@ func Validate(globs []string) error {
2324
// FindFiles gets a list of paths given an array of glob patterns to include and exclude
2425
func FindFiles(includes []string, excludes []string) ([]string, error) {
2526
var errs error
26-
all := make([]string, 0, len(includes))
27+
28+
allSet := make(map[string]struct{}, len(includes))
2729
for _, include := range includes {
2830
matches, err := doublestar.FilepathGlob(include, doublestar.WithFilesOnly(), doublestar.WithFailOnIOErrors())
2931
if err != nil {
@@ -40,15 +42,9 @@ func FindFiles(includes []string, excludes []string) ([]string, error) {
4042
}
4143
}
4244

43-
for _, existing := range all {
44-
if existing == match {
45-
continue INCLUDE
46-
}
47-
}
48-
49-
all = append(all, match)
45+
allSet[match] = struct{}{}
5046
}
5147
}
5248

53-
return all, errs
49+
return maps.Keys(allSet), errs
5450
}

pkg/stanza/fileconsumer/matcher/internal/finder/finder_test.go

+34-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
package finder
55

66
import (
7+
"fmt"
78
"os"
89
"path/filepath"
910
"runtime"
@@ -188,7 +189,7 @@ func TestFindFiles(t *testing.T) {
188189
}
189190
files, err := FindFiles(tc.include, tc.exclude)
190191
assert.NoError(t, err)
191-
assert.Equal(t, tc.expected, files)
192+
assert.ElementsMatch(t, tc.expected, files)
192193
})
193194
}
194195
}
@@ -251,7 +252,38 @@ func TestFindFilesWithIOErrors(t *testing.T) {
251252
t.Run(tc.name, func(t *testing.T) {
252253
files, err := FindFiles(tc.include, []string{})
253254
assert.ErrorContains(t, err, tc.failedMsg)
254-
assert.Equal(t, tc.expected, files)
255+
assert.ElementsMatch(t, tc.expected, files)
255256
})
256257
}
257258
}
259+
260+
// benchResult is a package-level variable that stores the result of the benchmark.
261+
// It is used to prevent the Go compiler from optimizing out the benchmarked code.
262+
var benchResult []string
263+
264+
func BenchmarkFind10kFiles(b *testing.B) {
265+
numFiles := 10000
266+
tmpDir := b.TempDir()
267+
268+
// Create a bunch of files for benchmarking
269+
for i := range numFiles {
270+
path := filepath.Join(tmpDir, fmt.Sprintf("log-%05d.log", i))
271+
f, err := os.Create(path)
272+
require.NoError(b, err)
273+
require.NoError(b, f.Close())
274+
}
275+
276+
includeGlobs := []string{
277+
filepath.Join(tmpDir, "log-*.log"),
278+
}
279+
280+
excludeGlobs := []string{}
281+
282+
var r []string
283+
b.ResetTimer()
284+
for range b.N {
285+
r, _ = FindFiles(includeGlobs, excludeGlobs)
286+
}
287+
288+
benchResult = r
289+
}

pkg/stanza/fileconsumer/matcher/matcher_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,7 @@ func TestMatcher(t *testing.T) {
785785
} else {
786786
assert.NoError(t, err)
787787
}
788-
assert.Equal(t, tc.expected, files)
788+
assert.ElementsMatch(t, tc.expected, files)
789789
})
790790
}
791791
}

0 commit comments

Comments
 (0)