Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 67 additions & 9 deletions pkg/utils/file_filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,43 @@ import (
"gopkg.in/yaml.v3"
)

type FileFilterStrategy interface {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Potentially more go idiomatic different name (don't care much): Filterable

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something to consider would be something like this:

func Filter[T any](items []T, predicate func(T) bool) []T {
    var result []T
    for _, item := range items {
        if predicate(item) {
            result = append(result, item)
        }
    }
    return result
}

FilterOut(path string) bool
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nitpick: I'd call it Filter

}

type GlobFileFilter struct {
ignores *gitignore.GitIgnore
}

func NewGlobFileFilter(path string, ruleFiles []string, logger *zerolog.Logger) (FileFilterStrategy, error) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd call it GitIgnoreFileFilter as it explicitly uses gitignore syntax, I think.

ff := FileFilter{
path: path,
defaultRules: []string{"**/.git/**"},
logger: logger,
max_threads: int64(runtime.NumCPU()),
}

rules, err := ff.GetRules(ruleFiles)
if err != nil {
return nil, err
}

return &GlobFileFilter{ignores: gitignore.CompileIgnoreLines(rules...)}, nil
}

func (ff *GlobFileFilter) FilterOut(path string) bool {
if ff.ignores == nil {
return false
}
return ff.ignores.MatchesPath(path)
}

type FileFilter struct {
path string
defaultRules []string
logger *zerolog.Logger
max_threads int64
path string
defaultRules []string
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this may not be needed anymore, as it could be a strategy, right?

logger *zerolog.Logger
filterStrategies []FileFilterStrategy
max_threads int64
}

type FileFilterOption func(*FileFilter) error
Expand All @@ -35,6 +67,26 @@ func WithThreadNumber(maxThreadCount int) FileFilterOption {
}
}

func WithFileFilterStrategies(strategies []FileFilterStrategy) FileFilterOption {
return func(filter *FileFilter) error {
filter.filterStrategies = append(filter.filterStrategies, strategies...)
return nil
}
}

func WithSecretsFileFilterStrategies(path string, ruleFiles []string, logger *zerolog.Logger) FileFilterOption {
globFilter, err := NewGlobFileFilter(path, ruleFiles, logger)
if err != nil {
return nil //TODO handle error

}

return func(filter *FileFilter) error {
filter.filterStrategies = append(filter.filterStrategies, globFilter) // TODO add other filters here such as null byte filter
return nil
}
}

func NewFileFilter(path string, logger *zerolog.Logger, options ...FileFilterOption) *FileFilter {
filter := &FileFilter{
path: path,
Expand Down Expand Up @@ -103,11 +155,9 @@ func (fw *FileFilter) GetRules(ruleFiles []string) ([]string, error) {
}

// GetFilteredFiles returns a filtered channel of filepaths from a given channel of filespaths and glob patterns to filter on
func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan string {
func (fw *FileFilter) GetFilteredFiles(filesCh chan string) chan string {
var filteredFilesCh = make(chan string)

// create pattern matcher used to match filesToFilter to glob patterns
globPatternMatcher := gitignore.CompileIgnoreLines(globs...)
go func() {
ctx := context.Background()
availableThreads := semaphore.NewWeighted(fw.max_threads)
Expand All @@ -122,8 +172,16 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan
}
go func(f string) {
defer availableThreads.Release(1)
// filesToFilter that do not match the glob pattern are filtered
if !globPatternMatcher.MatchesPath(f) {
// filesToFilter that do not match the filter list are excluded
keepFile := true
for _, filter := range fw.filterStrategies {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A thought: we could also have a AggregateFileStrategy that delegates to sub-filters and encapsulates the logic.

if filter.FilterOut(f) {
keepFile = false
break
}
}
Comment on lines +178 to +182
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, exactly like I was thinking how it could/would work.


if keepFile {
filteredFilesCh <- f
}
}(file)
Expand Down
35 changes: 20 additions & 15 deletions pkg/utils/file_filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package utils

import (
"fmt"
"github.com/stretchr/testify/require"
"os"
"path/filepath"
"runtime"
Expand Down Expand Up @@ -133,14 +134,13 @@ func TestFileFilter_GetFilteredFiles(t *testing.T) {
t.Run(testCase.name, func(t *testing.T) {
setupTestFileSystem(t, testCase)

fileFilter := NewFileFilter(testCase.repoPath, &log.Logger)
globFileFilter, err := NewGlobFileFilter(testCase.repoPath, testCase.ruleFiles, &log.Logger)
require.NoError(t, err)

fileFilter := NewFileFilter(testCase.repoPath, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter}))
files := fileFilter.GetAllFiles()

globs, err := fileFilter.GetRules(testCase.ruleFiles)
assert.NoError(t, err)

filteredFiles := fileFilter.GetFilteredFiles(files, globs)
filteredFiles := fileFilter.GetFilteredFiles(files)
expectedFilePaths := make([]string, 0)
for _, file := range testCase.expectedFiles {
expectedFilePaths = append(expectedFilePaths, filepath.Join(testCase.repoPath, file))
Expand All @@ -151,7 +151,7 @@ func TestFileFilter_GetFilteredFiles(t *testing.T) {
}

t.Run("2nd call should return the same filesToFilter", func(t *testing.T) {
filteredFiles := fileFilter.GetFilteredFiles(files, globs)
filteredFiles := fileFilter.GetFilteredFiles(files)
for filteredFile := range filteredFiles {
assert.Contains(t, expectedFilePaths, filteredFile)
}
Expand Down Expand Up @@ -184,12 +184,13 @@ func BenchmarkFileFilter_GetFilteredFiles(b *testing.B) {

b.ResetTimer()
for n := 0; n < b.N; n++ {
fileFilter := NewFileFilter(rootDir, &log.Logger, WithThreadNumber(runtime.NumCPU()))
globFileFilter, err := NewGlobFileFilter(rootDir, ruleFiles, &log.Logger)
Copy link
Contributor

@bastiandoetsch bastiandoetsch Nov 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably we should leave that test to ensure that clients that use the NewFileFilter functions right now have a smooth upgrade path.

assert.NoError(b, err)

fileFilter := NewFileFilter(rootDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter}), WithThreadNumber(runtime.NumCPU()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this should probably be a new test


b.StartTimer()
globs, err := fileFilter.GetRules(ruleFiles)
assert.NoError(b, err)
filteredFiles := fileFilter.GetFilteredFiles(fileFilter.GetAllFiles(), globs)
filteredFiles := fileFilter.GetFilteredFiles(fileFilter.GetAllFiles())
b.StopTimer()

var actualFilteredFilesLen int
Expand Down Expand Up @@ -472,7 +473,10 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) {
createFileInPath(t, gitignorePath, []byte("/"))

// Test file filtering
fileFilter := NewFileFilter(tempDir, &log.Logger)
globFileFilter, err := NewGlobFileFilter(tempDir, []string{".gitignore"}, &log.Logger)
assert.NoError(t, err)

fileFilter := NewFileFilter(tempDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter}))
rules, err := fileFilter.GetRules([]string{".gitignore"})
assert.NoError(t, err)

Expand All @@ -481,7 +485,7 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) {

// Get all files and filter them
allFiles := fileFilter.GetAllFiles()
filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules)
filteredFiles := fileFilter.GetFilteredFiles(allFiles)

// Collect filtered files
var filteredFilesList []string
Expand Down Expand Up @@ -517,13 +521,14 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) {
createFileInPath(t, gitignorePath, []byte("/*"))

// Test file filtering
fileFilter := NewFileFilter(tempDir, &log.Logger)
rules, err := fileFilter.GetRules([]string{".gitignore"})
globFileFilter, err := NewGlobFileFilter(tempDir, []string{".gitignore"}, &log.Logger)
assert.NoError(t, err)

fileFilter := NewFileFilter(tempDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter}))

// Get all files and filter them
allFiles := fileFilter.GetAllFiles()
filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules)
filteredFiles := fileFilter.GetFilteredFiles(allFiles)

// Collect filtered files
var filteredFilesList []string
Expand Down