Skip to content

Commit bb95fcb

Browse files
authored
[SQG] Do not follow symlinks (#41778)
### What does this PR do? This is a minor update in experimental SQGs to stop following symlinks and reflecting them in report as such to get precise size estimation.
1 parent 621e4d7 commit bb95fcb

File tree

3 files changed

+182
-45
lines changed

3 files changed

+182
-45
lines changed

tasks/quality_gates.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -424,7 +424,6 @@ def measure_package_local(
424424
config_path="test/static/static_quality_gates.yml",
425425
output_path=None,
426426
build_job_name="local_test",
427-
max_files=20000,
428427
no_checksums=False,
429428
debug=False,
430429
):
@@ -454,7 +453,6 @@ def measure_package_local(
454453
config_path=config_path,
455454
output_path=output_path,
456455
build_job_name=build_job_name,
457-
max_files=max_files,
458456
no_checksums=no_checksums,
459457
debug=debug,
460458
)

tasks/static_quality_gates/experimental_gates.py

Lines changed: 92 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -27,18 +27,27 @@
2727
class FileInfo:
2828
"""
2929
Information about a single file within an artifact.
30+
31+
For regular files, only relative_path, size_bytes, and optionally checksum are set.
32+
For symlinks, is_symlink is True and symlink_target contains the link target.
33+
For broken symlinks, is_broken is True.
3034
"""
3135

3236
relative_path: str
3337
size_bytes: int
3438
checksum: str | None = None
39+
is_symlink: bool | None = None
40+
symlink_target: str | None = None
41+
is_broken: bool | None = None
3542

3643
def __post_init__(self):
3744
"""Validate file info data"""
3845
if not self.relative_path:
3946
raise ValueError("relative_path cannot be empty")
4047
if self.size_bytes < 0:
4148
raise ValueError("size_bytes must be non-negative")
49+
if self.is_symlink and not self.symlink_target:
50+
raise ValueError("symlink_target must be provided when is_symlink is True")
4251

4352
@property
4453
def size_mb(self) -> float:
@@ -124,7 +133,6 @@ def measure_package(
124133
package_path: str,
125134
gate_name: str,
126135
build_job_name: str,
127-
max_files: int = 20000,
128136
generate_checksums: bool = True,
129137
debug: bool = False,
130138
) -> InPlaceArtifactReport:
@@ -153,7 +161,7 @@ def measure_package(
153161
gate_config = create_quality_gate_config(gate_name, self.config[gate_name])
154162

155163
measurement, file_inventory = self._extract_and_analyze_package(
156-
ctx, package_path, gate_config, max_files, generate_checksums, debug
164+
ctx, package_path, gate_config, generate_checksums, debug
157165
)
158166

159167
return InPlaceArtifactReport(
@@ -177,7 +185,6 @@ def _extract_and_analyze_package(
177185
ctx: Context,
178186
package_path: str,
179187
config: QualityGateConfig,
180-
max_files: int = 10000,
181188
generate_checksums: bool = True,
182189
debug: bool = False,
183190
) -> tuple[ArtifactMeasurement, list[FileInfo]]:
@@ -188,7 +195,6 @@ def _extract_and_analyze_package(
188195
ctx: Invoke context for running commands
189196
package_path: Path to the package file
190197
config: Quality gate configuration
191-
max_files: Maximum number of files to process in inventory
192198
generate_checksums: Whether to generate checksums for files
193199
debug: Enable debug logging
194200
@@ -213,7 +219,7 @@ def _extract_and_analyze_package(
213219
artifact_path=package_path, on_wire_size=wire_size, on_disk_size=disk_size
214220
)
215221

216-
file_inventory = self._walk_extracted_files(extract_dir, max_files, generate_checksums, debug)
222+
file_inventory = self._walk_extracted_files(extract_dir, generate_checksums, debug)
217223

218224
if debug:
219225
print("✅ Single extraction completed:")
@@ -226,15 +232,12 @@ def _extract_and_analyze_package(
226232
except Exception as e:
227233
raise RuntimeError(f"Failed to extract and analyze package {package_path}: {e}") from e
228234

229-
def _walk_extracted_files(
230-
self, extract_dir: str, max_files: int, generate_checksums: bool, debug: bool
231-
) -> list[FileInfo]:
235+
def _walk_extracted_files(self, extract_dir: str, generate_checksums: bool, debug: bool) -> list[FileInfo]:
232236
"""
233237
Walk through extracted files and create file inventory.
234238
235239
Args:
236240
extract_dir: Directory containing extracted package files
237-
max_files: Maximum number of files to process
238241
generate_checksums: Whether to generate checksums for files
239242
debug: Enable debug logging
240243
@@ -260,19 +263,55 @@ def _walk_extracted_files(
260263
total_size = 0
261264

262265
for file_path in extract_path.rglob('*'):
263-
if file_path.is_file():
264-
# Respect max_files limit
265-
if files_processed >= max_files:
266-
if debug:
267-
print(f"⚠️ Reached max files limit ({max_files}), stopping inventory")
268-
break
269-
270-
try:
271-
relative_path = str(file_path.relative_to(extract_path))
272-
file_stat = file_path.stat()
266+
# Skip directories
267+
if file_path.is_dir():
268+
continue
269+
270+
try:
271+
relative_path = str(file_path.relative_to(extract_path))
272+
273+
if file_path.is_symlink():
274+
try:
275+
symlink_target = os.readlink(file_path)
276+
logical_size = len(symlink_target)
277+
is_broken = False
278+
279+
try:
280+
resolved_target = file_path.resolve(strict=True)
281+
if resolved_target.is_relative_to(extract_path):
282+
symlink_target_rel = str(resolved_target.relative_to(extract_path))
283+
else:
284+
symlink_target_rel = symlink_target
285+
except (OSError, RuntimeError):
286+
symlink_target_rel = symlink_target
287+
is_broken = True
288+
289+
file_inventory.append(
290+
FileInfo(
291+
relative_path=relative_path,
292+
size_bytes=logical_size,
293+
checksum=None,
294+
is_symlink=True,
295+
symlink_target=symlink_target_rel,
296+
is_broken=is_broken if is_broken else None,
297+
)
298+
)
299+
300+
if debug and files_processed % 1000 == 0:
301+
broken_marker = " [BROKEN]" if is_broken else ""
302+
print(f"🔗 Symlink: {relative_path} -> {symlink_target_rel}{broken_marker}")
303+
304+
except OSError as e:
305+
if debug:
306+
print(f"⚠️ Could not read symlink {file_path}: {e}")
307+
continue
308+
309+
elif file_path.is_file():
310+
# Regular file - use lstat to not follow symlinks
311+
file_stat = file_path.lstat()
273312
size_bytes = file_stat.st_size
274313

275-
checksum = self._generate_checksum(file_path)
314+
checksum = self._generate_checksum(file_path) if generate_checksums else None
276315

277316
file_inventory.append(
278317
FileInfo(
@@ -282,16 +321,17 @@ def _walk_extracted_files(
282321
)
283322
)
284323

285-
files_processed += 1
286324
total_size += size_bytes
287325

288326
if debug and files_processed % 1000 == 0:
289327
print(f"📋 Processed {files_processed} files...")
290328

291-
except (OSError, PermissionError) as e:
292-
if debug:
293-
print(f"⚠️ Skipping file {file_path}: {e}")
294-
continue
329+
files_processed += 1
330+
331+
except (OSError, PermissionError) as e:
332+
if debug:
333+
print(f"⚠️ Skipping file {file_path}: {e}")
334+
continue
295335

296336
# Sort by size (descending) for easier analysis
297337
file_inventory.sort(key=lambda f: f.size_bytes, reverse=True)
@@ -349,14 +389,7 @@ def save_report_to_yaml(self, report: InPlaceArtifactReport, output_path: str) -
349389
"arch": report.arch,
350390
"os": report.os,
351391
"build_job_name": report.build_job_name,
352-
"file_inventory": [
353-
{
354-
"relative_path": file_info.relative_path,
355-
"size_bytes": file_info.size_bytes,
356-
"checksum": file_info.checksum,
357-
}
358-
for file_info in report.file_inventory
359-
],
392+
"file_inventory": [self._serialize_file_info(file_info) for file_info in report.file_inventory],
360393
}
361394

362395
with open(output_path, 'w') as f:
@@ -365,6 +398,32 @@ def save_report_to_yaml(self, report: InPlaceArtifactReport, output_path: str) -
365398
except Exception as e:
366399
raise RuntimeError(f"Failed to save report to {output_path}: {e}") from e
367400

401+
def _serialize_file_info(self, file_info: FileInfo) -> dict[str, Any]:
402+
"""
403+
Serialize a FileInfo object to a dictionary, excluding None/False fields for regular files.
404+
405+
Args:
406+
file_info: The FileInfo object to serialize
407+
408+
Returns:
409+
Dictionary with only relevant fields
410+
"""
411+
result = {
412+
"relative_path": file_info.relative_path,
413+
"size_bytes": file_info.size_bytes,
414+
}
415+
416+
if file_info.checksum is not None:
417+
result["checksum"] = file_info.checksum
418+
419+
if file_info.is_symlink:
420+
result["is_symlink"] = True
421+
result["symlink_target"] = file_info.symlink_target
422+
if file_info.is_broken:
423+
result["is_broken"] = True
424+
425+
return result
426+
368427

369428
def measure_package_local(
370429
ctx,
@@ -373,7 +432,6 @@ def measure_package_local(
373432
config_path="test/static/static_quality_gates.yml",
374433
output_path=None,
375434
build_job_name="local_test",
376-
max_files=20000,
377435
no_checksums=False,
378436
debug=False,
379437
):
@@ -389,7 +447,6 @@ def measure_package_local(
389447
config_path: Path to quality gates configuration (default: test/static/static_quality_gates.yml)
390448
output_path: Path to save the measurement report (default: {gate_name}_report.yml)
391449
build_job_name: Simulated build job name (default: local_test)
392-
max_files: Maximum number of files to process in inventory (default: 20000)
393450
no_checksums: Skip checksum generation for faster processing (default: false)
394451
debug: Enable debug logging for troubleshooting (default: false)
395452
@@ -437,7 +494,6 @@ def measure_package_local(
437494
package_path=package_path,
438495
gate_name=gate_name,
439496
build_job_name=build_job_name,
440-
max_files=max_files,
441497
generate_checksums=not no_checksums,
442498
debug=debug,
443499
)

tasks/unit_tests/experimental_gates_tests.py

Lines changed: 90 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -221,9 +221,7 @@ def test_measure_package_success(self, mock_exists):
221221

222222
# Verify key methods were called
223223
mock_exists.assert_called_once_with("/path/to/package.deb")
224-
mock_extract_analyze.assert_called_once_with(
225-
mock_ctx, "/path/to/package.deb", unittest.mock.ANY, 20000, True, False
226-
)
224+
mock_extract_analyze.assert_called_once_with(mock_ctx, "/path/to/package.deb", unittest.mock.ANY, True, False)
227225

228226
def test_measure_package_missing_file(self):
229227
"""Test measuring package with missing file."""
@@ -279,7 +277,7 @@ def test_generate_checksum_failure(self, mock_file):
279277

280278
def test_save_report_to_yaml(self):
281279
"""Test saving report to YAML file."""
282-
# Create a sample report
280+
# Create a sample report with regular files and symlinks
283281
report = InPlaceArtifactReport(
284282
artifact_path="/path/to/package.deb",
285283
gate_name="static_quality_gate_agent_deb_amd64",
@@ -290,6 +288,8 @@ def test_save_report_to_yaml(self):
290288
file_inventory=[
291289
FileInfo("opt/agent/bin/agent", 400000, "sha256:abc123"),
292290
FileInfo("etc/config.yaml", 100000, None),
291+
FileInfo("opt/bin/python3", 9, None, True, "python3.13", None),
292+
FileInfo("opt/bin/broken_link", 15, None, True, "/nonexistent/path", True),
293293
],
294294
measurement_timestamp="2024-01-15T14:30:22.123456Z",
295295
pipeline_id="12345",
@@ -316,13 +316,96 @@ def test_save_report_to_yaml(self):
316316
self.assertEqual(saved_data['gate_name'], "static_quality_gate_agent_deb_amd64")
317317
self.assertEqual(saved_data['on_wire_size'], 100000)
318318
self.assertEqual(saved_data['on_disk_size'], 500000)
319-
self.assertEqual(len(saved_data['file_inventory']), 2)
320-
self.assertEqual(saved_data['file_inventory'][0]['relative_path'], "opt/agent/bin/agent")
321-
self.assertEqual(saved_data['file_inventory'][0]['checksum'], "sha256:abc123")
319+
self.assertEqual(len(saved_data['file_inventory']), 4)
320+
321+
# Verify regular file doesn't have symlink fields
322+
regular_file = saved_data['file_inventory'][0]
323+
self.assertEqual(regular_file['relative_path'], "opt/agent/bin/agent")
324+
self.assertEqual(regular_file['checksum'], "sha256:abc123")
325+
self.assertNotIn('is_symlink', regular_file)
326+
self.assertNotIn('symlink_target', regular_file)
327+
328+
# Verify file without checksum
329+
no_checksum_file = saved_data['file_inventory'][1]
330+
self.assertEqual(no_checksum_file['relative_path'], "etc/config.yaml")
331+
self.assertNotIn('checksum', no_checksum_file)
332+
self.assertNotIn('is_symlink', no_checksum_file)
333+
334+
# Verify symlink has symlink fields
335+
symlink_file = saved_data['file_inventory'][2]
336+
self.assertEqual(symlink_file['relative_path'], "opt/bin/python3")
337+
self.assertTrue(symlink_file['is_symlink'])
338+
self.assertEqual(symlink_file['symlink_target'], "python3.13")
339+
self.assertNotIn('is_broken', symlink_file)
340+
341+
# Verify broken symlink has is_broken field
342+
broken_symlink = saved_data['file_inventory'][3]
343+
self.assertEqual(broken_symlink['relative_path'], "opt/bin/broken_link")
344+
self.assertTrue(broken_symlink['is_symlink'])
345+
self.assertEqual(broken_symlink['symlink_target'], "/nonexistent/path")
346+
self.assertTrue(broken_symlink['is_broken'])
322347

323348
finally:
324349
os.unlink(output_path)
325350

351+
def test_serialize_file_info_regular_file_with_checksum(self):
352+
"""Test serializing regular file with checksum."""
353+
file_info = FileInfo("opt/agent/bin/agent", 400000, "sha256:abc123")
354+
result = self.measurer._serialize_file_info(file_info)
355+
356+
# Should have relative_path, size_bytes, and checksum
357+
self.assertEqual(result['relative_path'], "opt/agent/bin/agent")
358+
self.assertEqual(result['size_bytes'], 400000)
359+
self.assertEqual(result['checksum'], "sha256:abc123")
360+
361+
# Should NOT have symlink fields
362+
self.assertNotIn('is_symlink', result)
363+
self.assertNotIn('symlink_target', result)
364+
365+
def test_serialize_file_info_regular_file_without_checksum(self):
366+
"""Test serializing regular file without checksum."""
367+
file_info = FileInfo("etc/config.yaml", 1024)
368+
result = self.measurer._serialize_file_info(file_info)
369+
370+
# Should have relative_path and size_bytes only
371+
self.assertEqual(result['relative_path'], "etc/config.yaml")
372+
self.assertEqual(result['size_bytes'], 1024)
373+
374+
# Should NOT have checksum or symlink fields
375+
self.assertNotIn('checksum', result)
376+
self.assertNotIn('is_symlink', result)
377+
self.assertNotIn('symlink_target', result)
378+
379+
def test_serialize_file_info_symlink(self):
380+
"""Test serializing symlink."""
381+
file_info = FileInfo("opt/bin/python3", 9, None, True, "python3.13", None)
382+
result = self.measurer._serialize_file_info(file_info)
383+
384+
# Should have relative_path, size_bytes, is_symlink, and symlink_target
385+
self.assertEqual(result['relative_path'], "opt/bin/python3")
386+
self.assertEqual(result['size_bytes'], 9)
387+
self.assertTrue(result['is_symlink'])
388+
self.assertEqual(result['symlink_target'], "python3.13")
389+
390+
# Should NOT have checksum or is_broken
391+
self.assertNotIn('checksum', result)
392+
self.assertNotIn('is_broken', result)
393+
394+
def test_serialize_file_info_broken_symlink(self):
395+
"""Test serializing broken symlink."""
396+
file_info = FileInfo("opt/bin/broken_link", 15, None, True, "/nonexistent/path", True)
397+
result = self.measurer._serialize_file_info(file_info)
398+
399+
# Should have all symlink fields plus is_broken
400+
self.assertEqual(result['relative_path'], "opt/bin/broken_link")
401+
self.assertEqual(result['size_bytes'], 15)
402+
self.assertTrue(result['is_symlink'])
403+
self.assertEqual(result['symlink_target'], "/nonexistent/path")
404+
self.assertTrue(result['is_broken'])
405+
406+
# Should NOT have checksum
407+
self.assertNotIn('checksum', result)
408+
326409
@patch('builtins.open', side_effect=OSError("Permission denied"))
327410
def test_save_report_to_yaml_failure(self, mock_file):
328411
"""Test handling of YAML save failures."""

0 commit comments

Comments
 (0)