# Builds the Python package on pushes / pull requests touching python/**,
# and publishes it only when manually dispatched with an explicit target.
name: Publish Python Package

on:
  workflow_dispatch:
    inputs:
      publish_target:
        description: 'Publish target (testpypi, pypi, dry-run)'
        required: true
        default: 'dry-run'
        type: choice
        options:
          - dry-run
          - testpypi
          - pypi
  push:
    branches:
      - main
    paths:
      - 'python/**'
  pull_request:
    branches:
      - main
    paths:
      - 'python/**'

jobs:
  build-wheel:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine auditwheel
        working-directory: python

      - name: Build package
        run: python -m build
        working-directory: python

      - name: Check distribution
        run: twine check dist/*
        working-directory: python

      - name: Upload Python package dist artifacts
        uses: actions/upload-artifact@v4
        with:
          name: python-package-dist
          path: python/dist

  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    needs: build-wheel
    # Only runs for a manual dispatch that explicitly selected "pypi";
    # on push / pull_request the input is empty, so the job is skipped.
    if: github.event.inputs.publish_target == 'pypi'
    environment:
      name: pypi
      url: https://pypi.org/p/dapper-python
    permissions:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download Python package dist artifacts
        uses: actions/download-artifact@v4
        with:
          name: python-package-dist
          path: dist
      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1

  testpypi-publish:
    name: Upload release to TestPyPI
    runs-on: ubuntu-latest
    needs: build-wheel
    # Only runs for a manual dispatch that explicitly selected "testpypi".
    if: github.event.inputs.publish_target == 'testpypi'
    environment:
      # NOTE(review): the environment is named "pypi" although this job targets
      # TestPyPI -- confirm this matches the trusted-publisher configuration.
      name: pypi
      url: https://test.pypi.org/p/dapper-python
    permissions:
      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Download Python package dist artifacts
        uses: actions/download-artifact@v4
        with:
          name: python-package-dist
          path: dist
      - name: Publish package distributions to TestPyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          # Without an explicit repository-url the action uploads to its
          # default index (production PyPI), not TestPyPI.
          repository-url: https://test.pypi.org/legacy/
"""Helpers for normalizing shared library file names."""
import re
from typing import Optional, Tuple, Union

# A name containing ".so." that ends in one of these is not treated as a
# shared library and is returned unchanged by normalize_file_name.
_NON_LIBRARY_SUFFIXES = (".gz", ".patch", ".diff", ".hmac", ".qm")

# "-<digits>.<digits>[.<digits>...]" immediately before the trailing ".so".
# Anchored with "$" so a version that is not at the end is never picked up.
_VERSION_SUFFIX_RE = re.compile(r"-(\d+(?:\.\d+)+)\.so$")

# Lengths of the alphanumeric API hash that may precede "-ghc" in Haskell names.
_HASKELL_HASH_LENGTHS = (20, 21, 22)


class NormalizedFileName:
    """
    Represents a normalized file name with optional version and SOABI information.

    Attributes:
        name (str): The normalized file name.
        version (Optional[str]): The version number, if available.
        soabi (Optional[str]): The SOABI version, if available.
        normalized (bool): Indicates if the file name was normalized.
    """

    def __init__(
        self,
        name: str,
        version: Optional[str] = None,
        soabi: Optional[str] = None,
        normalized: bool = False,
    ):
        self.name = name
        self.version = version
        self.soabi = soabi
        self.normalized = normalized

    def __repr__(self) -> str:
        """Return a readable representation showing all four attributes."""
        return (
            f"{type(self).__name__}(name={self.name!r}, version={self.version!r}, "
            f"soabi={self.soabi!r}, normalized={self.normalized!r})"
        )


def normalize_file_name(name: str) -> Union[NormalizedFileName, str]:
    """
    Normalize a shared library file name.

    A name is treated as a shared library when it ends with ".so", or when it
    contains ".so." and does not end with one of the excluded suffixes
    (".gz", ".patch", ".diff", ".hmac", ".qm").

    Args:
        name (str): The file name to normalize.

    Returns:
        Union[NormalizedFileName, str]: A NormalizedFileName object if the file name is a shared library,
        otherwise the original file name.
    """
    if name.endswith(".so") or (".so." in name and not name.endswith(_NON_LIBRARY_SUFFIXES)):
        return normalize_soname(name)
    return name


def normalize_soname(soname: str) -> NormalizedFileName:
    """
    Normalize a shared object file name.

    The SOABI suffix (text after the first ".so.") is split off first. The
    remaining name is then checked, in order, for a CPython tag, a PyPy tag,
    a "libHS" (Haskell) prefix, and finally a dashed version suffix.

    Args:
        soname (str): The shared object file name to normalize.

    Returns:
        NormalizedFileName: A NormalizedFileName object with the normalized name, version, and SOABI information.
    """
    soname, soabi = extract_soabi_version(soname)
    # An empty SOABI string is reported as None.
    soabi_version = soabi or None

    cpython_pos = soname.find(".cpython-")
    if cpython_pos != -1:
        return NormalizedFileName(
            normalize_cpython(soname, cpython_pos), soabi=soabi_version, normalized=True
        )

    pypy_pos = soname.find(".pypy")
    if pypy_pos != -1:
        return NormalizedFileName(
            normalize_pypy(soname, pypy_pos), soabi=soabi_version, normalized=True
        )

    if soname.startswith("libHS"):
        normalized_name, version, normalized = normalize_haskell(soname)
        return NormalizedFileName(normalized_name, version, soabi_version, normalized)

    normalized_name, version = extract_version_suffix(soname)
    if version:
        return NormalizedFileName(normalized_name, version, soabi_version, True)
    return NormalizedFileName(soname, soabi=soabi_version, normalized=False)


def extract_soabi_version(soname: str) -> Tuple[str, str]:
    """
    Extract the SOABI version from a shared object file name.

    Args:
        soname (str): The shared object file name.

    Returns:
        Tuple[str, str]: The file name truncated after the first ".so", and the
        text following the first ".so." (an empty string when there is none).
    """
    base, separator, soabi = soname.partition(".so.")
    if separator:
        return f"{base}.so", soabi
    return soname, ""


def extract_version_suffix(soname: str) -> Tuple[str, Optional[str]]:
    """
    Extract the version number from a shared object file name.

    Only a dotted numeric version directly before the final ".so" is
    recognized (for example "libfoo-1.2.3.so"). A version-like fragment
    elsewhere in the name is left alone.

    Args:
        soname (str): The shared object file name.

    Returns:
        Tuple[str, Optional[str]]: The base file name and the version number.
        When no version suffix is found, the name is returned unchanged with None.
    """
    match = _VERSION_SUFFIX_RE.search(soname)
    if match is None:
        return soname, None
    # Cut at the dash that introduced the version instead of re-splitting the
    # string, so the removed text is exactly what the pattern matched.
    return f"{soname[:match.start()]}.so", match.group(1)


def normalize_cpython(soname: str, pos: int) -> str:
    """
    Normalize a CPython shared object file name.

    Args:
        soname (str): The shared object file name.
        pos (int): The position of the CPython tag in the file name.

    Returns:
        str: The text before the tag followed by ".cpython.so".
    """
    return f"{soname[:pos]}.cpython.so"


def normalize_pypy(soname: str, pos: int) -> str:
    """
    Normalize a PyPy shared object file name.

    Args:
        soname (str): The shared object file name.
        pos (int): The position of the PyPy tag in the file name.

    Returns:
        str: The text before the tag followed by ".pypy.so".
    """
    return f"{soname[:pos]}.pypy.so"


def normalize_haskell(soname: str) -> Tuple[str, Optional[str], bool]:
    """
    Normalize a Haskell shared object file name.

    Expected shape: "<name>-<version>[-<api hash>]-ghc<...>". The API hash is
    dropped when it is a 20 to 22 character alphanumeric segment.

    Args:
        soname (str): The shared object file name.

    Returns:
        Tuple[str, Optional[str], bool]: The normalized file name, version number, and a boolean
        indicating if the file name was normalized. Names without "-ghc", or
        without a "-<version>" segment before it, are returned unchanged with
        (None, False).
    """
    ghc_pos = soname.rfind("-ghc")
    if ghc_pos == -1:
        return soname, None, False

    stem = soname[:ghc_pos]
    head, separator, last_segment = stem.rpartition("-")
    if separator and len(last_segment) in _HASKELL_HASH_LENGTHS and last_segment.isalnum():
        stem = head

    base, separator, version = stem.rpartition("-")
    if not separator:
        # No "-<version>" segment to split off; report as not normalized
        # instead of failing on tuple unpacking.
        return soname, None, False
    return f"{base}.so", version, True
"""Tests for dapper_python.normalize.normalize_file_name."""
import pytest
from dapper_python.normalize import normalize_file_name, NormalizedFileName


def do_soname_normalization_tests(test_cases):
    """
    Run normalize_file_name over each case and compare against the expectation.

    Args:
        test_cases: Iterable of 5-tuples
            (input_name, expected_name, expected_version, expected_soabi, expected_normalized).
            When normalize_file_name returns a plain string (the input was not
            treated as a shared library), only expected_name is compared.
    """
    for input_name, expected_name, expected_version, expected_soabi, expected_normalized in test_cases:
        result = normalize_file_name(input_name)
        if isinstance(result, NormalizedFileName):
            assert result.name == expected_name, input_name
            assert result.version == expected_version, input_name
            assert result.soabi == expected_soabi, input_name
            assert result.normalized == expected_normalized, input_name
        else:
            assert result == expected_name, input_name


def test_basic_normalization():
    test_cases = [
        ("libexample.so", "libexample.so", None, None, False),
        ("libexample.so.1", "libexample.so", None, "1", False),
        ("libexample-1.2.3.so", "libexample.so", "1.2.3", None, True),
    ]
    do_soname_normalization_tests(test_cases)


def test_edge_cases():
    # These are returned unchanged as plain strings. Every case still needs all
    # five fields because the helper unpacks five values per tuple.
    test_cases = [
        ("libexample.so.gz", "libexample.so.gz", None, None, False),
        ("libexample.so.patch", "libexample.so.patch", None, None, False),
        ("libexample.so.diff", "libexample.so.diff", None, None, False),
        ("libexample.so.hmac", "libexample.so.hmac", None, None, False),
        ("libexample.so.qm", "libexample.so.qm", None, None, False),
    ]
    do_soname_normalization_tests(test_cases)


def test_version_extraction():
    test_cases = [
        ("libexample-1.2.3.so", "libexample.so", "1.2.3", None, True),
        ("libexample-1.2.3.4.so", "libexample.so", "1.2.3.4", None, True),
        ("libexample-1.2.3-beta.so", "libexample-1.2.3-beta.so", None, None, False),
    ]
    do_soname_normalization_tests(test_cases)


def test_soabi_handling():
    test_cases = [
        ("libexample.so.0d", "libexample.so", None, "0d", False),
        ("libexample.so.1", "libexample.so", None, "1", False),
        ("libexample.so.1.2.3", "libexample.so", None, "1.2.3", False),
        ("libexample.so.1.2.3.4", "libexample.so", None, "1.2.3.4", False),
    ]
    do_soname_normalization_tests(test_cases)


def test_cpython_normalization():
    test_cases = [
        ("stringprep.cpython-312-x86_64-linux-gnu.so", "stringprep.cpython.so", None, None, True),
        # This one is strange -- has x86-64 instead of x86_64
        ("libpytalloc-util.cpython-312-x86-64-linux-gnu.so", "libpytalloc-util.cpython.so", None, None, True),
        # This one is also a bit odd, has samba4 in the platform tag
        ("libsamba-net.cpython-312-x86-64-linux-gnu-samba4.so.0", "libsamba-net.cpython.so", None, "0", True),
    ]
    do_soname_normalization_tests(test_cases)


def test_pypy_normalization():
    test_cases = [
        ("tklib_cffi.pypy39-pp73-x86_64-linux-gnu.so", "tklib_cffi.pypy.so", None, None, True),
    ]
    do_soname_normalization_tests(test_cases)


def test_haskell_normalization():
    test_cases = [
        ("libHSAgda-2.6.3-F91ij4KwIR0JAPMMfugHqV-ghc9.4.7.so", "libHSAgda.so", "2.6.3", None, True),
        ("libHScpphs-1.20.9.1-1LyMg8r2jodFb2rhIiKke-ghc9.4.7.so", "libHScpphs.so", "1.20.9.1", None, True),
        ("libHSrts-1.0.2_thr_debug-ghc9.4.7.so", "libHSrts.so", "1.0.2_thr_debug", None, True),
    ]
    do_soname_normalization_tests(test_cases)


def test_dash_version_suffix_normalization():
    test_cases = [
        ("libsingular-factory-4.3.2.so", "libsingular-factory.so", "4.3.2", None, True),
        # Filename includes an SOABI version
        ("libvtkIOCGNSReader-9.1.so.9.1.0", "libvtkIOCGNSReader.so", "9.1", "9.1.0", True),
        # No dots in the version number is not normalized -- many false positives with 32/64 bit markers
        ("switch.linux-amd64-64.so", "switch.linux-amd64-64.so", None, None, False),
        # Version number isn't at the end, so not normalized
        ("liblua5.3-luv.so.1", "liblua5.3-luv.so", None, "1", False),
        # v prefixed versions not normalized since most match this false positive
        ("libvtkCommonSystem-pv5.11.so", "libvtkCommonSystem-pv5.11.so", None, None, False),
        # A few letters added to the end of the version number are not normalized
        ("libpsmile.MPI1.so.0d", "libpsmile.MPI1.so", None, "0d", False),
        ("libdsdp-5.8gf.so", "libdsdp-5.8gf.so", None, None, False),
        # Potential + in the middle of a version number also makes so it won't be normalized
        ("libgupnp-dlna-0.10.5+0.10.5.so", "libgupnp-dlna-0.10.5+0.10.5.so", None, None, False),
        ("libsingular-omalloc-4.3.2+0.9.6.so", "libsingular-omalloc-4.3.2+0.9.6.so", None, None, False),
    ]
    do_soname_normalization_tests(test_cases)


def test_weird_soabi_normalization():
    test_cases = [
        # "*.so.0.*" (accidentally created file in happycoders-libsocket-dev? https://bugs.launchpad.net/ubuntu/+source/libsocket/+bug/636598)
        ("*.so.0.*", "*.so", None, "0.*", False),
    ]
    do_soname_normalization_tests(test_cases)