|
| 1 | +"""Repack wheels to limit per-entry inflation below a threshold. |
| 2 | +
|
| 3 | +PyPI rejects wheels where any zip entry inflates over 50x (zip bomb detection). |
| 4 | +This script re-compresses entries that exceed the limit using deflate with |
| 5 | +frequent Z_FULL_FLUSH to reduce compression efficiency while keeping the |
| 6 | +data genuinely compressed. |
| 7 | +
|
| 8 | +Usage (cibuildwheel repair-wheel-command): |
| 9 | + # Single wheel -> dest_dir (Windows, no prior repair step) |
| 10 | + python tools/repack_wheel.py WHEEL DEST_DIR |
| 11 | +
|
| 12 | + # All wheels in a directory, in-place (after auditwheel/delocate) |
| 13 | + python tools/repack_wheel.py DEST_DIR |
| 14 | +""" |
| 15 | + |
| 16 | +import os |
| 17 | +import struct |
| 18 | +import sys |
| 19 | +import time |
| 20 | +import zlib |
| 21 | +import zipfile |
| 22 | + |
# Chunk size (uncompressed bytes) that repack_wheel passes to compress_chunked;
# the compressor is fully flushed after every chunk of this size.
CHUNK_SIZE = 4096
# Entries whose uncompressed/compressed size ratio is at or above this value
# are re-compressed by repack_wheel.
MAX_INFLATION = 50
| 25 | + |
| 26 | + |
def compress_chunked(data, chunk_size):
    """Deflate *data* as a raw stream, fully flushing after every chunk.

    The data is fed to a level-1 raw-deflate compressor (negative wbits, so
    no zlib header or trailer) in slices of ``chunk_size`` bytes. Each slice
    is followed by a ``Z_FULL_FLUSH``, and the stream is closed with
    ``Z_FINISH``.

    Returns the concatenated compressed output as ``bytes``.
    """
    deflater = zlib.compressobj(1, zlib.DEFLATED, -15)
    output = bytearray()
    for start in range(0, len(data), chunk_size):
        output += deflater.compress(data[start:start + chunk_size])
        # Flush after every slice, including the last one.
        output += deflater.flush(zlib.Z_FULL_FLUSH)
    output += deflater.flush(zlib.Z_FINISH)
    return bytes(output)
| 37 | + |
| 38 | + |
def make_dos_datetime(dt):
    """Pack a ``(year, month, day, hour, minute, second)`` sequence into
    the two 16-bit MS-DOS fields used in zip headers.

    Returns ``(mod_time, mod_date)``. Seconds are stored with two-second
    resolution and the year is stored as an offset from 1980.
    """
    year, month, day = dt[0], dt[1], dt[2]
    hour, minute, second = dt[3], dt[4], dt[5]
    packed_date = ((year - 1980) << 9) | (month << 5) | day
    packed_time = (hour << 11) | (minute << 5) | (second // 2)
    return packed_time, packed_date
| 43 | + |
| 44 | + |
def repack_wheel(src_path, dst_path):
    """Repack a wheel so that no entry inflates by MAX_INFLATION or more.

    Every entry of ``src_path`` is deflated normally (level 6). An entry
    whose uncompressed/compressed ratio reaches ``MAX_INFLATION`` is
    re-compressed with ``compress_chunked``, starting at ``CHUNK_SIZE`` and
    halving the chunk size until the ratio drops below the limit. If even
    the smallest chunk size is not enough, the entry is stored uncompressed.

    If no entry exceeds the limit the archive is not rewritten: the source
    is copied to ``dst_path`` unless both paths are equal. Otherwise a new
    archive is written to ``dst_path + '.tmp'`` and atomically moved over
    ``dst_path``; the temporary file is removed if writing fails.

    ``src_path`` and ``dst_path`` may be the same file (in-place repack).

    Limitations: ZIP64 is not supported, so entries or offsets of 4 GiB or
    more, or more than 65535 entries, make ``struct.pack`` raise
    ``struct.error``. Extra fields and comments are not carried over.
    """
    utf8_name_flag = 0x800  # general-purpose bit 11: file name is UTF-8
    min_chunk_size = 64     # below this, give up and store the entry

    entries = []
    needs_repack = False

    # Read and compress everything before any output is opened, so an
    # in-place repack never reads from a file it is about to replace.
    with zipfile.ZipFile(src_path, 'r') as zin:
        for item in zin.infolist():
            # Read by ZipInfo so duplicate names resolve to this exact entry.
            data = zin.read(item)
            crc = zlib.crc32(data) & 0xFFFFFFFF
            uncompressed_size = len(data)
            compress_type = zipfile.ZIP_DEFLATED

            # Compress normally first
            c = zlib.compressobj(6, zlib.DEFLATED, -15)
            compressed = c.compress(data) + c.flush()

            # Check if inflation exceeds limit
            if len(compressed) > 0 and uncompressed_size / len(compressed) >= MAX_INFLATION:
                needs_repack = True
                t0 = time.time()
                chunk_size = CHUNK_SIZE
                while True:
                    compressed = compress_chunked(data, chunk_size)
                    if uncompressed_size / len(compressed) < MAX_INFLATION:
                        break
                    if chunk_size <= min_chunk_size:
                        # Last resort: a stored entry has a ratio of exactly 1.
                        compress_type = zipfile.ZIP_STORED
                        compressed = data
                        break
                    # Very repetitive data can still exceed the limit at
                    # this chunk size; flush more often and try again.
                    chunk_size = max(min_chunk_size, chunk_size // 2)
                elapsed = time.time() - t0
                inflation = uncompressed_size / len(compressed)
                print(f' {item.filename}: {uncompressed_size} -> {len(compressed)} '
                      f'({len(compressed) / uncompressed_size * 100:.1f}%, {inflation:.1f}x) [{elapsed:.1f}s]')

            # Pure-ASCII names need no flag; anything else is written as
            # UTF-8 and must be marked as such or readers assume cp437.
            try:
                fname = item.filename.encode('ascii')
                flags = 0
            except UnicodeEncodeError:
                fname = item.filename.encode('utf-8')
                flags = utf8_name_flag

            mod_time, mod_date = make_dos_datetime(item.date_time)
            # High byte of "version made by" names the host system that
            # external_attr belongs to; keep it so Unix mode bits stay valid.
            version_made_by = (item.create_system << 8) | 20

            entries.append((
                fname, flags, compress_type, mod_time, mod_date, crc,
                compressed, uncompressed_size, version_made_by,
                item.external_attr,
            ))

    if not needs_repack:
        # Nothing exceeded the limit: keep the original archive bytes.
        if src_path != dst_path:
            import shutil
            shutil.copy2(src_path, dst_path)
        return

    tmp_path = dst_path + '.tmp'
    try:
        with open(tmp_path, 'wb') as f:
            central_dir = []

            for (fname, flags, compress_type, mod_time, mod_date, crc,
                 compressed, uncompressed_size, version_made_by,
                 ext_attr) in entries:
                local_header_offset = f.tell()
                compressed_size = len(compressed)

                # Local file header
                f.write(struct.pack(
                    '<4sHHHHHIIIHH',
                    b'PK\x03\x04',
                    20,  # version needed
                    flags,
                    compress_type,
                    mod_time,
                    mod_date,
                    crc,
                    compressed_size,
                    uncompressed_size,
                    len(fname),
                    0,  # extra length
                ))
                f.write(fname)
                f.write(compressed)

                central_dir.append((
                    fname, flags, compress_type, mod_time, mod_date,
                    crc, compressed_size, uncompressed_size,
                    local_header_offset, version_made_by, ext_attr,
                ))

            # Central directory
            cd_offset = f.tell()
            for (fname, flags, ct, mt, md, crc, cs, us, offset,
                 version_made_by, ext_attr) in central_dir:
                f.write(struct.pack(
                    '<4sHHHHHHIIIHHHHHII',
                    b'PK\x01\x02',
                    version_made_by,
                    20,  # version needed
                    flags,
                    ct, mt, md, crc, cs, us,
                    len(fname),
                    0,  # extra length
                    0,  # comment length
                    0,  # disk number start
                    0,  # internal attributes
                    ext_attr,
                    offset,
                ))
                f.write(fname)

            cd_size = f.tell() - cd_offset

            # End of central directory
            f.write(struct.pack(
                '<4sHHHHIIH',
                b'PK\x05\x06',
                0, 0,  # this disk, disk holding the central directory
                len(central_dir),
                len(central_dir),
                cd_size,
                cd_offset,
                0,  # archive comment length
            ))

        os.replace(tmp_path, dst_path)
    finally:
        # After a successful replace the temp name is gone; it only still
        # exists here if something above failed.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
| 149 | + |
| 150 | + |
def verify_wheel(whl_path):
    """Check a wheel's integrity and report its size and worst inflation.

    Reads every entry via ``ZipFile.testzip()`` to validate CRCs and
    headers, and computes the largest ``file_size / compress_size`` ratio
    over all entries with a non-zero compressed size.

    Returns ``(file_size_in_bytes, max_inflation)``; ``max_inflation`` is 0
    when no entry has a non-zero compressed size.

    Raises ``zipfile.BadZipFile`` if any entry fails the integrity check.
    """
    with zipfile.ZipFile(whl_path, 'r') as zf:
        max_inflation = max(
            (info.file_size / info.compress_size
             for info in zf.infolist() if info.compress_size > 0),
            default=0,
        )
        # testzip() reports the first bad member by name instead of raising,
        # so its result has to be checked explicitly.
        bad_entry = zf.testzip()
    if bad_entry is not None:
        raise zipfile.BadZipFile(
            f'{whl_path}: entry {bad_entry!r} failed the integrity check')
    return os.path.getsize(whl_path), max_inflation
| 161 | + |
| 162 | + |
def main():
    """Command-line entry point.

    Two invocation forms are accepted:

    * ``WHEEL DEST_DIR`` - repack one wheel into ``DEST_DIR`` under the
      same base name (chosen when the first argument is not a directory).
    * ``DEST_DIR`` - repack every ``*.whl`` in the directory in place, in
      sorted name order.

    Any other argument shape prints a usage message to stderr and exits
    with status 1.
    """
    args = sys.argv[1:]
    jobs = None  # list of (display name, source path, destination path)

    if len(args) == 2 and not os.path.isdir(args[0]):
        # Mode: repack_wheel.py WHEEL DEST_DIR
        wheel, out_dir = args
        label = os.path.basename(wheel)
        jobs = [(label, wheel, os.path.join(out_dir, label))]
    elif len(args) == 1 and os.path.isdir(args[0]):
        # Mode: repack_wheel.py DEST_DIR (in-place)
        folder = args[0]
        jobs = [
            (entry, os.path.join(folder, entry), os.path.join(folder, entry))
            for entry in sorted(os.listdir(folder))
            if entry.endswith('.whl')
        ]

    if jobs is None:
        print(f'Usage: {sys.argv[0]} WHEEL DEST_DIR', file=sys.stderr)
        print(f' {sys.argv[0]} DEST_DIR', file=sys.stderr)
        sys.exit(1)

    for label, source, target in jobs:
        print(f'Repacking {label}...')
        repack_wheel(source, target)
        fsize, max_inf = verify_wheel(target)
        print(f' -> {fsize:,} bytes, max entry inflation={max_inf:.1f}x')
| 189 | + |
| 190 | + |
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
0 commit comments