Skip to content

Commit e4cd001

Browse files
itamaro authored and facebook-github-bot committed
Support zip64 in zipimport
Summary: upstream PR: python/cpython#94146 upstream issue: python/cpython#89739 Reviewed By: zsol Differential Revision: D54468361 fbshipit-source-id: 36d09b9480fb24a13157e7996f072451f7df9a79
1 parent 16841b7 commit e4cd001

File tree

3 files changed

+141
-38
lines changed

3 files changed

+141
-38
lines changed

Doc/library/zipimport.rst

+3
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,9 @@ Any files may be present in the ZIP archive, but importers are only invoked for
3030
corresponding :file:`.pyc` file, meaning that if a ZIP archive
3131
doesn't contain :file:`.pyc` files, importing may be rather slow.
3232

33+
.. versionchanged:: 3.13
34+
ZIP64 is supported.
35+
3336
.. versionchanged:: 3.8
3437
Previously, ZIP archives with an archive comment were not supported.
3538

Lib/test/test_zipimport.py

+12
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,10 @@ def makeZip(self, files, zipName=TEMP_ZIP, **kw):
128128
f.write(stuff)
129129
f.write(data)
130130

131+
def getZip64Files(self):
    # 65537 entries: producing this many files is the simplest way to make
    # zipfile generate the zip64 EOCD block.
    module_entry = (NOW, test_src)
    module_names = (f"f{n}.py" for n in range(65537))
    return dict.fromkeys(module_names, module_entry)
134+
131135
def doTest(self, expected_ext, files, *modules, **kw):
132136
self.makeZip(files, **kw)
133137

@@ -761,6 +765,14 @@ def testLargestPossibleComment(self):
761765
files = {TESTMOD + ".py": (NOW, test_src)}
762766
self.doTest(".py", files, TESTMOD, comment=b"c" * ((1 << 16) - 1))
763767

768+
def testZip64(self):
    # Build the zip64-sized archive and import one of its modules ("f6").
    self.doTest(".py", self.getZip64Files(), "f6")
771+
772+
def testZip64CruftAndComment(self):
773+
files = self.getZip64Files()
774+
self.doTest(".py", files, "f65536", comment=b"c" * ((1 << 16) - 1))
775+
764776

765777
@support.requires_zlib()
766778
class CompressedZipImportTestCase(UncompressedZipImportTestCase):

Lib/zipimport.py

+126-38
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,14 @@ class ZipImportError(ImportError):
4040
_module_type = type(sys)
4141

4242
END_CENTRAL_DIR_SIZE = 22
43-
STRING_END_ARCHIVE = b'PK\x05\x06'
43+
END_CENTRAL_DIR_SIZE_64 = 56
44+
END_CENTRAL_DIR_LOCATOR_SIZE_64 = 20
45+
STRING_END_ARCHIVE = b'PK\x05\x06' # standard EOCD signature
46+
STRING_END_LOCATOR_64 = b'PK\x06\x07' # Zip64 EOCD Locator signature
47+
STRING_END_ZIP_64 = b'PK\x06\x06' # Zip64 EOCD signature
4448
MAX_COMMENT_LEN = (1 << 16) - 1
49+
MAX_UINT32 = 0xffffffff
50+
ZIP64_EXTRA_TAG = 0x1
4551

4652
class zipimporter(_bootstrap_external._LoaderBasics):
4753
"""zipimporter(archivepath) -> zipimporter object
@@ -352,49 +358,72 @@ def _read_directory(archive):
352358
# to not cause problems when someone runs 'python3 /dev/fd/9 9<some_script'
353359
start_offset = fp.tell()
354360
try:
361+
# Check if there's a comment.
355362
try:
356-
fp.seek(-END_CENTRAL_DIR_SIZE, 2)
357-
header_position = fp.tell()
358-
buffer = fp.read(END_CENTRAL_DIR_SIZE)
363+
fp.seek(0, 2)
364+
file_size = fp.tell()
359365
except OSError:
360-
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
361-
if len(buffer) != END_CENTRAL_DIR_SIZE:
362-
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
363-
if buffer[:4] != STRING_END_ARCHIVE:
364-
# Bad: End of Central Dir signature
365-
# Check if there's a comment.
366-
try:
367-
fp.seek(0, 2)
368-
file_size = fp.tell()
369-
except OSError:
370-
raise ZipImportError(f"can't read Zip file: {archive!r}",
371-
path=archive)
372-
max_comment_start = max(file_size - MAX_COMMENT_LEN -
373-
END_CENTRAL_DIR_SIZE, 0)
374-
try:
375-
fp.seek(max_comment_start)
376-
data = fp.read()
377-
except OSError:
378-
raise ZipImportError(f"can't read Zip file: {archive!r}",
379-
path=archive)
380-
pos = data.rfind(STRING_END_ARCHIVE)
381-
if pos < 0:
382-
raise ZipImportError(f'not a Zip file: {archive!r}',
383-
path=archive)
366+
raise ZipImportError(f"can't read Zip file: {archive!r}",
367+
path=archive)
368+
max_comment_plus_dirs_size = (
369+
MAX_COMMENT_LEN + END_CENTRAL_DIR_SIZE +
370+
END_CENTRAL_DIR_SIZE_64 + END_CENTRAL_DIR_LOCATOR_SIZE_64)
371+
max_comment_start = max(file_size - max_comment_plus_dirs_size, 0)
372+
try:
373+
fp.seek(max_comment_start)
374+
data = fp.read(max_comment_plus_dirs_size)
375+
except OSError:
376+
raise ZipImportError(f"can't read Zip file: {archive!r}",
377+
path=archive)
378+
pos = data.rfind(STRING_END_ARCHIVE)
379+
pos64 = data.rfind(STRING_END_ZIP_64)
380+
381+
if (pos64 >= 0 and pos64+END_CENTRAL_DIR_SIZE_64+END_CENTRAL_DIR_LOCATOR_SIZE_64==pos):
382+
# Zip64 at "correct" offset from standard EOCD
383+
buffer = data[pos64:pos64 + END_CENTRAL_DIR_SIZE_64]
384+
if len(buffer) != END_CENTRAL_DIR_SIZE_64:
385+
raise ZipImportError(
386+
f"corrupt Zip64 file: Expected {END_CENTRAL_DIR_SIZE_64} byte "
387+
f"zip64 central directory, but read {len(buffer)} bytes.",
388+
path=archive)
389+
header_position = file_size - len(data) + pos64
390+
391+
central_directory_size = int.from_bytes(buffer[40:48], 'little')
392+
central_directory_position = int.from_bytes(buffer[48:56], 'little')
393+
num_entries = int.from_bytes(buffer[24:32], 'little')
394+
elif pos >= 0:
384395
buffer = data[pos:pos+END_CENTRAL_DIR_SIZE]
385396
if len(buffer) != END_CENTRAL_DIR_SIZE:
386397
raise ZipImportError(f"corrupt Zip file: {archive!r}",
387398
path=archive)
399+
388400
header_position = file_size - len(data) + pos
389401

390-
header_size = _unpack_uint32(buffer[12:16])
391-
header_offset = _unpack_uint32(buffer[16:20])
392-
if header_position < header_size:
402+
# Buffer now contains a valid EOCD, and header_position gives the
403+
# starting position of it.
404+
central_directory_size = _unpack_uint32(buffer[12:16])
405+
central_directory_position = _unpack_uint32(buffer[16:20])
406+
num_entries = _unpack_uint16(buffer[8:10])
407+
408+
# N.b. if someday you want to prefer the standard (non-zip64) EOCD,
409+
# you need to adjust position by 76 for arc to be 0.
410+
else:
411+
raise ZipImportError(f'not a Zip file: {archive!r}',
412+
path=archive)
413+
414+
# Buffer now contains a valid EOCD, and header_position gives the
415+
# starting position of it.
416+
# XXX: These are cursory checks but are not as exact or strict as they
417+
# could be. Checking the arc-adjusted value is probably good too.
418+
if header_position < central_directory_size:
393419
raise ZipImportError(f'bad central directory size: {archive!r}', path=archive)
394-
if header_position < header_offset:
420+
if header_position < central_directory_position:
395421
raise ZipImportError(f'bad central directory offset: {archive!r}', path=archive)
396-
header_position -= header_size
397-
arc_offset = header_position - header_offset
422+
header_position -= central_directory_size
423+
# On just-a-zipfile these values are the same and arc_offset is zero; if
424+
# the file has some bytes prepended, `arc_offset` is the number of such
425+
# bytes. This is used for pex as well as self-extracting .exe.
426+
arc_offset = header_position - central_directory_position
398427
if arc_offset < 0:
399428
raise ZipImportError(f'bad central directory size or offset: {archive!r}', path=archive)
400429

@@ -411,6 +440,11 @@ def _read_directory(archive):
411440
raise EOFError('EOF read where not expected')
412441
# Start of file header
413442
if buffer[:4] != b'PK\x01\x02':
443+
if count != num_entries:
444+
raise ZipImportError(
445+
f"mismatched num_entries: {count} should be {num_entries} in {archive!r}",
446+
path=archive,
447+
)
414448
break # Bad: Central Dir File Header
415449
if len(buffer) != 46:
416450
raise EOFError('EOF read where not expected')
@@ -426,9 +460,6 @@ def _read_directory(archive):
426460
comment_size = _unpack_uint16(buffer[32:34])
427461
file_offset = _unpack_uint32(buffer[42:46])
428462
header_size = name_size + extra_size + comment_size
429-
if file_offset > header_offset:
430-
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
431-
file_offset += arc_offset
432463

433464
try:
434465
name = fp.read(name_size)
@@ -440,7 +471,10 @@ def _read_directory(archive):
440471
# slower than reading the data because fseek flushes stdio's
441472
# internal buffers. See issue #8745.
442473
try:
443-
if len(fp.read(header_size - name_size)) != header_size - name_size:
474+
extra_data_len = header_size - name_size
475+
extra_data = memoryview(fp.read(extra_data_len))
476+
477+
if len(extra_data) != extra_data_len:
444478
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
445479
except OSError:
446480
raise ZipImportError(f"can't read Zip file: {archive!r}", path=archive)
@@ -457,6 +491,60 @@ def _read_directory(archive):
457491

458492
name = name.replace('/', path_sep)
459493
path = _bootstrap_external._path_join(archive, name)
494+
495+
# Ordering matches unpacking below.
if (
    file_size == MAX_UINT32 or
    data_size == MAX_UINT32 or
    file_offset == MAX_UINT32
):
    # At least one of the 32-bit header fields holds the 0xffffffff
    # sentinel, so decode extra_data looking for a zip64 extra (which might
    # not be present).  Each extra record is parsed as a 2-byte tag, a
    # 2-byte payload size, then `size` bytes of payload.
    while extra_data:
        if len(extra_data) < 4:
            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
        tag = _unpack_uint16(extra_data[:2])
        size = _unpack_uint16(extra_data[2:4])
        if len(extra_data) < 4 + size:
            raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
        if tag == ZIP64_EXTRA_TAG:
            # Measure the zip64 record by its own declared payload size, not
            # by everything left in extra_data: further extra records may
            # follow it, and they must not make a valid record look corrupt.
            if size % 8 != 0:
                raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
            num_extra_values = size // 8
            if num_extra_values > 3:
                raise ZipImportError(f"can't read header extra: {archive!r}", path=archive)
            # Imported locally so this code does not depend on a
            # module-level `struct` import.
            import struct
            # unpack_from() returns a tuple; the values are consumed
            # front-to-back below, so a mutable list is required.
            values = list(struct.unpack_from(f"<{num_extra_values}Q",
                                             extra_data, offset=4))

            # N.b. Here be dragons: the ordering of these is different than
            # the header fields, and it's really easy to get it wrong since
            # naturally-occurring zips that use all 3 are >4GB
            try:
                if file_size == MAX_UINT32:
                    file_size = values.pop(0)
                if data_size == MAX_UINT32:
                    data_size = values.pop(0)
                if file_offset == MAX_UINT32:
                    file_offset = values.pop(0)
            except IndexError:
                # The record carries fewer 64-bit values than there are
                # sentinel fields; report it as a malformed archive instead
                # of leaking an IndexError to the importer's caller.
                raise ZipImportError(f"can't read header extra: {archive!r}", path=archive) from None

            break

        # For a typical zip, this bytes-slicing only happens 2-3 times, on
        # small data like timestamps and filesizes.
        extra_data = extra_data[4+size:]
    else:
        # Loop ended without finding a zip64 record (no `break`).
        _bootstrap._verbose_message(
            "zipimport: suspected zip64 but no zip64 extra for {!r}",
            path,
        )
539+
# XXX These two statements seem swapped because `central_directory_position`
540+
# is a position within the actual file, but `file_offset` (when compared) is
541+
# as encoded in the entry, not adjusted for this file.
542+
# N.b. this must be after we've potentially read the zip64 extra which can
543+
# change `file_offset`.
544+
if file_offset > central_directory_position:
545+
raise ZipImportError(f'bad local header offset: {archive!r}', path=archive)
546+
file_offset += arc_offset
547+
460548
t = (path, compress, data_size, file_size, file_offset, time, date, crc)
461549
files[name] = t
462550
count += 1

0 commit comments

Comments
 (0)