From ad927872aed27a46afbd480de39251f70df359d4 Mon Sep 17 00:00:00 2001 From: Saul Pwanson Date: Thu, 23 Jun 2022 20:51:25 -0700 Subject: [PATCH] initial checkin --- .gitignore | 1 + LICENSE-mit.txt | 19 +++++++ MANIFEST.in | 3 ++ README.md | 31 ++++++++++++ requirements.txt | 1 + setup.py | 33 +++++++++++++ unzip-http | 62 +++++++++++++++++++++++ unzip_http.py | 125 +++++++++++++++++++++++++++++++++++++++++++++++ 8 files changed, 275 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE-mit.txt create mode 100644 MANIFEST.in create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 setup.py create mode 100755 unzip-http create mode 100644 unzip_http.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bee8a64 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__ diff --git a/LICENSE-mit.txt b/LICENSE-mit.txt new file mode 100644 index 0000000..0798bfe --- /dev/null +++ b/LICENSE-mit.txt @@ -0,0 +1,19 @@ +Copyright (c) 2022 Saul Pwanson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..e636148 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,3 @@ +include README.md +include requirements.txt +include LICENSE-mit.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..8094ed7 --- /dev/null +++ b/README.md @@ -0,0 +1,31 @@ +# unzip-http + +Extract files from .zip files over http without downloading the entire archive. + +## Install + + pip install unzip-http + +## Usage + + unzip-http <url> <filenames..> + +Extract `<filenames>` from a remote .zip at `<url>` to stdout. + +If no filenames given, displays .zip contents (filenames and sizes). + +Each filename can be a wildcard glob; all matching files are concatenated and sent to stdout in zipfile order. + +Note: HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers. + +# Python module `unzip_http` + + import unzip_http + + rzf = unzip_http.RemoteZipFile('https://example.com/foo.zip') + binfp = rzf.open('bar.bin') + txtfp = rzf.open_text('baz.txt') + +# Credits + +`unzip-http` was written by [Saul Pwanson](https://saul.pw) and made available for use under the MIT License. 
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..a42590b --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +urllib3 diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..d5f3e55 --- /dev/null +++ b/setup.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: MIT + +from setuptools import setup + + +def readme(): + with open("README.md") as f: + return f.read() + +def requirements(): + with open("requirements.txt") as f: + return f.read().split("\n") + + +setup( + name="unzip-http", + version="0.1", + description="extract files from .zip files over http without downloading entire archive", + long_description=readme(), + long_description_content_type="text/markdown", + classifiers=[ + "Development Status :: 4 - Beta", + "Programming Language :: Python ::3", + ], + keywords="http zip unzip", + author="Saul Pwanson", + url="https://github.com/saulpw/unzip-http", + python_requires=">=3.8", + py_modules=["unzip_http"], + packages=["unzip-http"], + scripts=["unzip-http"], + install_requires=requirements(), +) diff --git a/unzip-http b/unzip-http new file mode 100755 index 0000000..d9efda4 --- /dev/null +++ b/unzip-http @@ -0,0 +1,62 @@ +#!/usr/bin/env python3 + +''' +Usage: + unzip-http <url> <filenames..> + +Extract <filenames> from a remote .zip at <url> to stdout. +If no filenames given, displays .zip contents (filenames and sizes). +Each filename can be a wildcard glob; all matching files are concatenated and sent to stdout in zipfile order. + +HTTP server must send `Accept-Ranges: bytes` and `Content-Length` in headers. 
+''' + +import sys +import io +import time +import fnmatch + +import unzip_http + + +class StreamProgress: + def __init__(self, fp, name='', total=0): + self.name = name + self.fp = fp + self.total = total + self.start_time = time.time() + self.last_update = 0 + self.amtread = 0 + + def read(self, n): + r = self.fp.read(n) + self.amtread += len(r) + now = time.time() + if now - self.last_update > 0.1: + self.last_update = now + + elapsed_s = now - self.start_time + sys.stderr.write(f'\r{elapsed_s:.0f}s {self.amtread/10**6:.02f}/{self.total/10**6:.02f}MB ({self.amtread/10**6/elapsed_s:.02f} MB/s) {self.name}') + + if not r: + sys.stderr.write('\n') + + return r + + +def main(url, *globs): + rzf = unzip_http.RemoteZipFile(url) + for f in rzf.infolist(): + if not globs: + print(f'{f.compress_size/2**20:.02f}MB -> {f.file_size/2**20:.02f}MB {f.filename}') + elif any(fnmatch.fnmatch(f.filename, g) for g in globs): + fp = StreamProgress(rzf.open(f), name=f.filename, total=f.compress_size) + while r := fp.read(2**18): + sys.stdout.buffer.write(r) + + +args = sys.argv[1:] +if not args: + print(__doc__, file=sys.stderr) +else: + main(*args) diff --git a/unzip_http.py b/unzip_http.py new file mode 100644 index 0000000..6dea764 --- /dev/null +++ b/unzip_http.py @@ -0,0 +1,125 @@ +from dataclasses import dataclass + +import sys +import io +import time +import zlib +import struct +import fnmatch + +import urllib3 + + +def error(s): + raise Exception(s) + + +@dataclass +class RemoteZipInfo: + filename:str = '' + date_time:int = 0 + header_offset:int = 0 + compress_type:int = 0 + compress_size:int = 0 + file_size:int = 0 + + +class RemoteZipFile: + fmt_endcdir = 'IHHHHIIH' + fmt_cdirentry = ' len(self._buffer): + r = self.raw.read(2**18) + if not r: + break + self._buffer += self._decompressor.decompress(r) + + ret = self._buffer[:n] + self._buffer = self._buffer[n:] + + return ret