Skip to content

Commit a1f9bd2

Browse files
jcristau and glandium authored
Bug 1853082 - Don't rely on file extension to guess archive type in fetch-content. r=taskgraph-reviewers,bhearsum (#547)
I want to add a fetch for a rust crate archive from crates.io, and those don't come with an extension in the url, so we can't rely on that. Content sniffing is easy enough to put in place. Differential Revision: https://phabricator.services.mozilla.com/D188152 Co-authored-by: Mike Hommey <[email protected]>
1 parent 49c6421 commit a1f9bd2

File tree

1 file changed

+44
-25
lines changed

1 file changed

+44
-25
lines changed

src/taskgraph/run-task/fetch-content

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import contextlib
1010
import datetime
1111
import gzip
1212
import hashlib
13+
import io
1314
import json
1415
import lzma
1516
import multiprocessing
@@ -332,21 +333,41 @@ def gpg_verify_path(path: pathlib.Path, public_key_data: bytes, signature_data:
332333
subprocess.run(["gpgconf", "--kill", "gpg-agent"], env=env)
333334

334335

335-
def open_tar_stream(path: pathlib.Path):
    """Open a (possibly compressed) tar file as a binary stream.

    The decompressor is selected purely from the file extension of ``path``:
    ``.bz2``, ``.gz``/``.tgz``, ``.xz`` and ``.zst`` are decompressed on the
    fly, while ``.tar`` is opened as-is.

    Raises ValueError when the extension is not one of the above.
    """
    suffix = path.suffix
    filename = str(path)

    # The suffixes are mutually exclusive, so the order of the checks below
    # does not matter.
    if suffix == ".tar":
        return path.open("rb")
    if suffix == ".bz2":
        return bz2.open(filename, "rb")
    if suffix in (".gz", ".tgz"):
        return gzip.open(filename, "rb")
    if suffix == ".xz":
        return lzma.open(filename, "rb")
    if suffix == ".zst":
        decompressor = ZstdDecompressor()
        return decompressor.stream_reader(path.open("rb"))

    raise ValueError("unknown archive format for tar file: %s" % path)
336+
class ArchiveTypeNotSupported(Exception):
    """Raised when a file's content is not a recognized archive format.

    The offending path is kept on the ``path`` attribute so that callers can
    report or act on it without parsing the message.
    """

    def __init__(self, path: pathlib.Path):
        # Zero-argument super() starts the MRO lookup right after this class.
        # The previous super(Exception, self) named the wrong class and so
        # skipped Exception (and anything inserted between) in the MRO.
        super().__init__("Archive type not supported for %s" % path)
        self.path = path
339+
340+
341+
def open_stream(path: pathlib.Path):
    """Identify an extractable archive by its content and open it.

    The file extension is ignored; only magic bytes are consulted.

    Returns a ``(typ, fh)`` tuple where ``typ`` is ``"zip"`` or ``"tar"``:

    * for ``"zip"``, ``fh`` is the plain binary file object positioned at 0;
    * for ``"tar"``, ``fh`` is a buffered reader yielding the *decompressed*
      tar stream (gzip, bzip2, xz and zstandard wrappers are recognized, as
      is an uncompressed tar).

    Raises ArchiveTypeNotSupported when the content is neither a zip nor a
    tar. In that case, or if sniffing fails with any other error, the
    underlying file is closed before the exception propagates so the handle
    is not leaked.
    """
    raw = path.open(mode="rb")
    handed_out = False
    try:
        magic = raw.read(6)
        raw.seek(0)
        if magic[:2] == b"PK":
            handed_out = True
            return "zip", raw

        fh = raw
        if magic[:2] == b"\x1f\x8b":
            fh = gzip.GzipFile(fileobj=raw)
        elif magic[:3] == b"BZh":
            fh = bz2.BZ2File(raw)
        elif magic == b"\xfd7zXZ\x00":
            fh = lzma.LZMAFile(raw)
        elif magic[:4] == b"\x28\xb5\x2f\xfd":
            fh = ZstdDecompressor().stream_reader(raw)

        # Wrap in a BufferedReader so we can peek at the (decompressed) tar
        # header without consuming it.
        fh = io.BufferedReader(fh)
        try:
            # A full tar info header is 512 bytes.
            # NOTE(review): peek() may return fewer bytes than requested;
            # a short header then fails the checks below and is reported as
            # unsupported.
            headers = fh.peek(512)
            # 257 is the offset of the ustar magic.
            magic = headers[257 : 257 + 8]
            # For older unix tar, rely on TarInfo.frombuf's checksum check
            if magic in (b"ustar\x0000", b"ustar \x00") or tarfile.TarInfo.frombuf(
                headers[:512], tarfile.ENCODING, "surrogateescape"
            ):
                handed_out = True
                return "tar", fh
        except Exception:
            # Deliberately best-effort: any decompression or header parsing
            # failure simply means "not a tar we can handle".
            pass
        raise ArchiveTypeNotSupported(path)
    finally:
        if not handed_out:
            # The compression wrappers do not own the file object they were
            # given, so close the underlying file explicitly.
            raw.close()
350371

351372

352373
def archive_type(path: pathlib.Path):
@@ -359,7 +380,7 @@ def archive_type(path: pathlib.Path):
359380
return None
360381

361382

362-
def extract_archive(path, dest_dir, typ):
383+
def extract_archive(path, dest_dir):
363384
"""Extract an archive to a destination directory."""
364385

365386
# Resolve paths to absolute variants.
@@ -371,8 +392,8 @@ def extract_archive(path, dest_dir, typ):
371392

372393
# We pipe input to the decompressor program so that we can apply
373394
# custom decompressors that the program may not know about.
395+
typ, ifh = open_stream(path)
374396
if typ == "tar":
375-
ifh = open_tar_stream(path)
376397
# On Windows, the tar program doesn't support things like symbolic
377398
# links, while Windows actually support them. The tarfile module in
378399
# python does. So use that. But since it's significantly slower than
@@ -419,10 +440,8 @@ def repack_archive(
419440
):
420441
assert orig != dest
421442
log("Repacking as %s" % dest)
422-
orig_typ = archive_type(orig)
443+
orig_typ, ifh = open_stream(orig)
423444
typ = archive_type(dest)
424-
if not orig_typ:
425-
raise Exception("Archive type not supported for %s" % orig.name)
426445
if not typ:
427446
raise Exception("Archive type not supported for %s" % dest.name)
428447

@@ -448,7 +467,7 @@ def repack_archive(
448467
ctx = ZstdCompressor()
449468
if orig_typ == "zip":
450469
assert typ == "tar"
451-
zip = zipfile.ZipFile(orig)
470+
zip = zipfile.ZipFile(ifh)
452471
# Convert the zip stream to a tar on the fly.
453472
with ctx.stream_writer(fh) as compressor, tarfile.open(
454473
fileobj=compressor, mode="w:"
@@ -490,7 +509,6 @@ def repack_archive(
490509
raise Exception("Repacking a tar to zip is not supported")
491510
assert typ == "tar"
492511

493-
ifh = open_tar_stream(orig)
494512
if filter:
495513
# To apply the filter, we need to open the tar stream and
496514
# tweak it.
@@ -533,11 +551,12 @@ def fetch_and_extract(url, dest_dir, extract=True, sha256=None, size=None):
533551
if not extract:
534552
return
535553

536-
typ = archive_type(dest_path)
537-
if typ:
538-
extract_archive(dest_path, dest_dir, typ)
554+
try:
555+
extract_archive(dest_path, dest_dir)
539556
log("Removing %s" % dest_path)
540557
dest_path.unlink()
558+
except ArchiveTypeNotSupported:
559+
pass
541560

542561

543562
def fetch_urls(downloads):

0 commit comments

Comments
 (0)