From 4d084c52770ef9b3d674182b1ea70cdb0b839cbb Mon Sep 17 00:00:00 2001 From: Alina Matei Date: Thu, 20 Nov 2025 17:02:35 +0200 Subject: [PATCH] feat: draft file filtering [PS-105] --- pkg/utils/file_filter.go | 76 ++++++++++++++++++++++++++++++----- pkg/utils/file_filter_test.go | 35 +++++++++------- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/pkg/utils/file_filter.go b/pkg/utils/file_filter.go index b4faa7556..93626c19d 100644 --- a/pkg/utils/file_filter.go +++ b/pkg/utils/file_filter.go @@ -15,11 +15,43 @@ import ( "gopkg.in/yaml.v3" ) +type FileFilterStrategy interface { + FilterOut(path string) bool +} + +type GlobFileFilter struct { + ignores *gitignore.GitIgnore +} + +func NewGlobFileFilter(path string, ruleFiles []string, logger *zerolog.Logger) (FileFilterStrategy, error) { + ff := FileFilter{ + path: path, + defaultRules: []string{"**/.git/**"}, + logger: logger, + max_threads: int64(runtime.NumCPU()), + } + + rules, err := ff.GetRules(ruleFiles) + if err != nil { + return nil, err + } + + return &GlobFileFilter{ignores: gitignore.CompileIgnoreLines(rules...)}, nil +} + +func (ff *GlobFileFilter) FilterOut(path string) bool { + if ff.ignores == nil { + return false + } + return ff.ignores.MatchesPath(path) +} + type FileFilter struct { - path string - defaultRules []string - logger *zerolog.Logger - max_threads int64 + path string + defaultRules []string + logger *zerolog.Logger + filterStrategies []FileFilterStrategy + max_threads int64 } type FileFilterOption func(*FileFilter) error @@ -35,6 +67,26 @@ func WithThreadNumber(maxThreadCount int) FileFilterOption { } } +func WithFileFilterStrategies(strategies []FileFilterStrategy) FileFilterOption { + return func(filter *FileFilter) error { + filter.filterStrategies = append(filter.filterStrategies, strategies...) + return nil + } +} + +func WithSecretsFileFilterStrategies(path string, ruleFiles []string, logger *zerolog.Logger) FileFilterOption { + globFilter, err := NewGlobFileFilter(path, ruleFiles, logger) + if err != nil { + return nil //TODO handle error + + } + + return func(filter *FileFilter) error { + filter.filterStrategies = append(filter.filterStrategies, globFilter) // TODO add other filters here such as null byte filter + return nil + } +} + func NewFileFilter(path string, logger *zerolog.Logger, options ...FileFilterOption) *FileFilter { filter := &FileFilter{ path: path, @@ -103,11 +155,9 @@ func (fw *FileFilter) GetRules(ruleFiles []string) ([]string, error) { } // GetFilteredFiles returns a filtered channel of filepaths from a given channel of filespaths and glob patterns to filter on -func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan string { +func (fw *FileFilter) GetFilteredFiles(filesCh chan string) chan string { var filteredFilesCh = make(chan string) - // create pattern matcher used to match filesToFilter to glob patterns - globPatternMatcher := gitignore.CompileIgnoreLines(globs...) go func() { ctx := context.Background() availableThreads := semaphore.NewWeighted(fw.max_threads) @@ -122,8 +172,16 @@ func (fw *FileFilter) GetFilteredFiles(filesCh chan string, globs []string) chan } go func(f string) { defer availableThreads.Release(1) - // filesToFilter that do not match the glob pattern are filtered - if !globPatternMatcher.MatchesPath(f) { + // filesToFilter that do not match the filter list are excluded + keepFile := true + for _, filter := range fw.filterStrategies { + if filter.FilterOut(f) { + keepFile = false + break + } + } + + if keepFile { filteredFilesCh <- f } }(file) diff --git a/pkg/utils/file_filter_test.go b/pkg/utils/file_filter_test.go index 3cc4326a5..d430eedbe 100644 --- a/pkg/utils/file_filter_test.go +++ b/pkg/utils/file_filter_test.go @@ -2,6 +2,7 @@ package utils import ( "fmt" + "github.com/stretchr/testify/require" "os" "path/filepath" "runtime" @@ -133,14 +134,13 @@ func TestFileFilter_GetFilteredFiles(t *testing.T) { t.Run(testCase.name, func(t *testing.T) { setupTestFileSystem(t, testCase) - fileFilter := NewFileFilter(testCase.repoPath, &log.Logger) + globFileFilter, err := NewGlobFileFilter(testCase.repoPath, testCase.ruleFiles, &log.Logger) + require.NoError(t, err) + fileFilter := NewFileFilter(testCase.repoPath, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter})) files := fileFilter.GetAllFiles() - globs, err := fileFilter.GetRules(testCase.ruleFiles) - assert.NoError(t, err) - - filteredFiles := fileFilter.GetFilteredFiles(files, globs) + filteredFiles := fileFilter.GetFilteredFiles(files) expectedFilePaths := make([]string, 0) for _, file := range testCase.expectedFiles { expectedFilePaths = append(expectedFilePaths, filepath.Join(testCase.repoPath, file)) @@ -151,7 +151,7 @@ func TestFileFilter_GetFilteredFiles(t *testing.T) { } t.Run("2nd call should return the same filesToFilter", func(t *testing.T) { - filteredFiles := fileFilter.GetFilteredFiles(files, globs) + filteredFiles := fileFilter.GetFilteredFiles(files) for filteredFile := range filteredFiles { assert.Contains(t, expectedFilePaths, filteredFile) } @@ -184,12 +184,13 @@ func BenchmarkFileFilter_GetFilteredFiles(b *testing.B) { b.ResetTimer() for n := 0; n < b.N; n++ { - fileFilter := NewFileFilter(rootDir, &log.Logger, WithThreadNumber(runtime.NumCPU())) + globFileFilter, err := NewGlobFileFilter(rootDir, ruleFiles, &log.Logger) + assert.NoError(b, err) + + fileFilter := NewFileFilter(rootDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter}), WithThreadNumber(runtime.NumCPU())) b.StartTimer() - globs, err := fileFilter.GetRules(ruleFiles) - assert.NoError(b, err) - filteredFiles := fileFilter.GetFilteredFiles(fileFilter.GetAllFiles(), globs) + filteredFiles := fileFilter.GetFilteredFiles(fileFilter.GetAllFiles()) b.StopTimer() var actualFilteredFilesLen int @@ -472,7 +473,10 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) { createFileInPath(t, gitignorePath, []byte("/")) // Test file filtering - fileFilter := NewFileFilter(tempDir, &log.Logger) + globFileFilter, err := NewGlobFileFilter(tempDir, []string{".gitignore"}, &log.Logger) + assert.NoError(t, err) + + fileFilter := NewFileFilter(tempDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter})) rules, err := fileFilter.GetRules([]string{".gitignore"}) assert.NoError(t, err) @@ -481,7 +485,7 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) { // Get all files and filter them allFiles := fileFilter.GetAllFiles() - filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules) + filteredFiles := fileFilter.GetFilteredFiles(allFiles) // Collect filtered files var filteredFilesList []string @@ -517,13 +521,14 @@ func TestFileFilter_SlashPatternInGitIgnore(t *testing.T) { createFileInPath(t, gitignorePath, []byte("/*")) // Test file filtering - fileFilter := NewFileFilter(tempDir, &log.Logger) - rules, err := fileFilter.GetRules([]string{".gitignore"}) + globFileFilter, err := NewGlobFileFilter(tempDir, []string{".gitignore"}, &log.Logger) assert.NoError(t, err) + fileFilter := NewFileFilter(tempDir, &log.Logger, WithFileFilterStrategies([]FileFilterStrategy{globFileFilter})) + // Get all files and filter them allFiles := fileFilter.GetAllFiles() - filteredFiles := fileFilter.GetFilteredFiles(allFiles, rules) + filteredFiles := fileFilter.GetFilteredFiles(allFiles) // Collect filtered files var filteredFilesList []string