Skip to content

Commit 4b27192

Browse files
committed
gh-152845: Keep EFS flag for a file loaded from the archive
This is a regression introduced by gh-84353/gh-150091, where a file with ASCII filename and UTF-8 comment lost its EFS flag when the central directory was rewritten in append mode, causing an unexpected metadata change and rendering its comment at risk of mis-decoding. Introduce an internal `_metadata_encoding` attribute for `ZipInfo` to make sure that files read from an archive keep the original encoding and EFS flag, while any newly added file enforces EFS when having a non-ASCII filename or comment.
1 parent 6e81d21 commit 4b27192

2 files changed

Lines changed: 79 additions & 17 deletions

File tree

Lib/test/test_zipfile/test_core.py

Lines changed: 59 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1440,15 +1440,7 @@ class ZstdWriterTests(AbstractWriterTests, unittest.TestCase):
14401440

14411441
def comparable_zinfo(zinfo):
14421442
"""Return a dict of public ZipInfo attributes for assertEqual comparison."""
1443-
attrs = {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS}
1444-
1445-
# Since patch gh-84353, the _MASK_UTF_FILENAME (0x800) bit may be
1446-
# changed when writing to the end record depending on whether filename
1447-
# can be encoded with ascii or cp437. Skip checking this bit by
1448-
# pretending it's always set.
1449-
attrs['flag_bits'] |= 0x800
1450-
1451-
return attrs
1443+
return {k: getattr(zinfo, k) for k in _ZINFO_PUBLIC_KEYS}
14521444

14531445
_struct_pack = struct.pack
14541446

@@ -5710,7 +5702,8 @@ def setUp(self):
57105702
with open(TESTFN, "wb") as tf:
57115703
tf.write(data)
57125704

5713-
def _test_read(self, zipfp, expected_names, expected_content):
5705+
def _test_read(self, zipfp, expected_names, expected_content,
5706+
expected_comments=None, expected_efs_flags=None):
57145707
# Check the namelist
57155708
names = zipfp.namelist()
57165709
self.assertEqual(names, expected_names)
@@ -5720,6 +5713,17 @@ def _test_read(self, zipfp, expected_names, expected_content):
57205713
names = [zi.filename for zi in infos]
57215714
self.assertEqual(names, expected_names)
57225715

5716+
if expected_comments is not None:
5717+
comments = [zi.comment for zi in infos]
5718+
self.assertEqual(comments, expected_comments)
5719+
5720+
if expected_efs_flags is not None:
5721+
efs_flags = [
5722+
bool(zi.flag_bits & zipfile._MASK_UTF_FILENAME)
5723+
for zi in infos
5724+
]
5725+
self.assertEqual(efs_flags, expected_efs_flags)
5726+
57235727
# check getinfo
57245728
for name, content in zip(expected_names, expected_content):
57255729
info = zipfp.getinfo(name)
@@ -5774,6 +5778,51 @@ def test_read_after_append(self):
57745778
with zipfile.ZipFile(TESTFN, "r", metadata_encoding='shift_jis') as zipfp:
57755779
self._test_read(zipfp, expected_names, expected_content)
57765780

5781+
def test_append_keep_efs_flag(self):
5782+
"""Files loaded from an archive should keep original EFS flags when
5783+
rewriting metadata in append mode."""
5784+
names = ['file1', 'file2', 'file3', 'file4']
5785+
contents = [b'content1', b'content2', b'content3', b'content4']
5786+
comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('shift_jis'), b'bar']
5787+
efs_flags = [True, True, False, False]
5788+
5789+
def mock_encode(self):
5790+
if efs_flags[i]:
5791+
zinfo.flag_bits |= zipfile._MASK_UTF_FILENAME
5792+
return (self.filename.encode('ascii'), self.flag_bits)
5793+
5794+
with mock.patch('zipfile.ZipInfo._encodeFilenameFlags', mock_encode), \
5795+
zipfile.ZipFile(TESTFN, "w") as zipfp:
5796+
for i, name in enumerate(names):
5797+
zinfo = zipfile.ZipInfo(name)
5798+
zinfo.comment = comments[i]
5799+
zipfp.writestr(zinfo, contents[i])
5800+
5801+
with zipfile.ZipFile(TESTFN, "a") as zipfp:
5802+
# trigger archive rewriting
5803+
zipfp.comment = b'com'
5804+
5805+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
5806+
self.assertEqual(zipfp.comment, b'com')
5807+
self._test_read(zipfp, names, contents, comments, efs_flags)
5808+
5809+
def test_write_enforce_efs_flag(self):
5810+
"""New files should enforce EFS flag if name or comment contains non-ASCII chars"""
5811+
names = ['\u4e00', '\u4e8c', 'file3', 'file4']
5812+
contents = [b'content1', b'content2', b'content3', b'content4']
5813+
comments = ['\u4e00'.encode('utf-8'), b'foo', '\u4e8c'.encode('utf-8'), b'bar']
5814+
efs_flags = [True, True, True, False]
5815+
5816+
with zipfile.ZipFile(TESTFN, "w") as zipfp:
5817+
for i, name in enumerate(names):
5818+
zinfo = zipfile.ZipInfo(name)
5819+
zinfo.comment = comments[i]
5820+
zipfp.writestr(zinfo, contents[i])
5821+
self.assertEqual(zipfp.namelist(), names)
5822+
5823+
with zipfile.ZipFile(TESTFN, "r") as zipfp:
5824+
self._test_read(zipfp, names, contents, comments, efs_flags)
5825+
57775826
def test_write_with_metadata_encoding(self):
57785827
ZF = zipfile.ZipFile
57795828
for mode in ("w", "x", "a"):

Lib/zipfile/__init__.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ class ZipInfo:
453453
'file_size',
454454
'_raw_time',
455455
'_end_offset',
456+
'_metadata_encoding',
456457
)
457458

458459
def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
@@ -488,6 +489,7 @@ def __init__(self, filename="NoName", date_time=(1980,1,1,0,0,0)):
488489
self.compress_size = 0 # Size of the compressed file
489490
self.file_size = 0 # Size of the uncompressed file
490491
self._end_offset = None # Start of the next local header or central directory
492+
self._metadata_encoding = None # Metadata encoding when read
491493
# Other attributes are set by class ZipFile:
492494
# header_offset Byte offset to the file header
493495
# CRC CRC-32 of the uncompressed file
@@ -575,13 +577,20 @@ def FileHeader(self, zip64=None):
575577

576578
def _encodeFilenameFlags(self):
577579
if self.flag_bits & _MASK_UTF_FILENAME:
578-
encoding = 'ascii'
580+
return self.filename.encode('utf-8'), self.flag_bits
581+
582+
encoding = self._metadata_encoding
583+
if encoding:
584+
# Use the original encoding for a legacy file loaded from archive.
585+
return self.filename.encode(encoding), self.flag_bits
579586
else:
580-
encoding = 'cp437'
581-
try:
582-
return self.filename.encode(encoding), self.flag_bits & ~_MASK_UTF_FILENAME
583-
except UnicodeEncodeError:
584-
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
587+
# Enforce EFS if either filename or comment is non-ASCII for a newly
588+
# added file.
589+
try:
590+
self.comment.decode('ascii')
591+
return self.filename.encode('ascii'), self.flag_bits
592+
except (UnicodeEncodeError, UnicodeDecodeError):
593+
return self.filename.encode('utf-8'), self.flag_bits | _MASK_UTF_FILENAME
585594

586595
def _decodeExtra(self, filename_crc):
587596
# Try to decode the extra field.
@@ -2072,6 +2081,10 @@ def _RealGetContents(self):
20722081
t>>11, (t>>5)&0x3F, (t&0x1F) * 2 )
20732082
x._decodeExtra(orig_filename_crc)
20742083
x.header_offset = x.header_offset + concat
2084+
2085+
# Enforce cp437 for a legacy file if not specified
2086+
x._metadata_encoding = self.metadata_encoding or 'cp437'
2087+
20752088
self.filelist.append(x)
20762089
self.NameToInfo[x.filename] = x
20772090

@@ -2286,7 +2299,7 @@ def _open_to_write(self, zinfo, force_zip64=False):
22862299
zinfo.compress_size = 0
22872300
zinfo.CRC = 0
22882301

2289-
zinfo.flag_bits = _MASK_UTF_FILENAME
2302+
zinfo.flag_bits = 0x00
22902303
if zinfo.compress_type == ZIP_LZMA:
22912304
# Compressed data includes an end-of-stream (EOS) marker
22922305
zinfo.flag_bits |= _MASK_COMPRESS_OPTION_1

0 commit comments

Comments
 (0)