From a5dfaf1706b3ddee9633a6d40bd0540dd2cb3639 Mon Sep 17 00:00:00 2001
From: Carl Flottmann
Date: Tue, 26 Nov 2024 14:02:24 +1000
Subject: [PATCH] feat: added in code and tests for a new wheel file presence
 heuristic in the pypi malware analyzer, which checks for whether a wheel
 file is available with the package.

---
 src/macaron/database/table_definitions.py     |   2 +-
 src/macaron/errors.py                         |   4 +
 .../pypi_heuristics/heuristics.py             |   3 +
 .../metadata/wheel_presence.py                |  74 +++++++
 .../checks/detect_malicious_metadata_check.py |  44 ++++-
 .../pypi/test_wheel_presence.py               | 185 ++++++++++++++++++
 6 files changed, 305 insertions(+), 7 deletions(-)
 create mode 100644 src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_presence.py
 create mode 100644 tests/malware_analyzer/pypi/test_wheel_presence.py

diff --git a/src/macaron/database/table_definitions.py b/src/macaron/database/table_definitions.py
index 61c90da2e..7b173fad4 100644
--- a/src/macaron/database/table_definitions.py
+++ b/src/macaron/database/table_definitions.py
@@ -103,7 +103,7 @@ class PackageURLMixin:
     name: Mapped[str] = mapped_column(String(100), nullable=False, comment="Name of the package.")
 
     #: Version of the package.
-    version: Mapped[str] = mapped_column(String(100), nullable=True, comment="Version of the package.")
+    version: Mapped[str | None] = mapped_column(String(100), nullable=True, comment="Version of the package.")
 
     #: Extra qualifying data for a package such as the name of an OS.
     qualifiers: Mapped[str] = mapped_column(
diff --git a/src/macaron/errors.py b/src/macaron/errors.py
index a3178e8da..b4e8b813f 100644
--- a/src/macaron/errors.py
+++ b/src/macaron/errors.py
@@ -86,3 +86,7 @@ class CycloneDXParserError(MacaronError):
 
 class DependencyAnalyzerError(MacaronError):
     """The DependencyAnalyzer error class."""
+
+
+class HeuristicAnalyzerValueError(MacaronError):
+    """The error raised by a heuristic analyzer when the package data it needs is missing or malformed."""
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
index 0bd74d343..3ac7cb613 100644
--- a/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
+++ b/src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
@@ -31,6 +31,9 @@ class Heuristics(str, Enum):
     #: Indicates that the setup.py file contains suspicious imports, such as base64 and requests.
     SUSPICIOUS_SETUP = "suspicious_setup"
 
+    #: Indicates that the analyzed release of the package does not include a .whl (wheel) file.
+    WHEEL_PRESENCE = "wheel_presence"
+
 
 class HeuristicResult(str, Enum):
     """Result type indicating the outcome of a heuristic."""
diff --git a/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_presence.py b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_presence.py
new file mode 100644
index 000000000..8610948d3
--- /dev/null
+++ b/src/macaron/malware_analyzer/pypi_heuristics/metadata/wheel_presence.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""The heuristic analyzer to check .whl file presence."""
+
+import logging
+
+from macaron.errors import HeuristicAnalyzerValueError
+from macaron.json_tools import JsonType
+from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
+from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
+from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class WheelPresenceAnalyzer(BaseHeuristicAnalyzer):
+    """
+    Analyze to see if a .whl file is available for the package.
+
+    If the analyzed release is distributed with a .whl file, this heuristic passes. Otherwise, the
+    heuristic fails. The file names of the analyzed release are reported with either result.
+    """
+
+    WHEEL: str = "bdist_wheel"
+
+    def __init__(self) -> None:
+        super().__init__(
+            name="wheel_presence_analyzer",
+            heuristic=Heuristics.WHEEL_PRESENCE,
+            depends_on=None,
+        )
+
+    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
+        """Analyze the package.
+
+        Parameters
+        ----------
+        pypi_package_json: PyPIPackageJsonAsset
+            The PyPI package JSON asset object.
+
+        Returns
+        -------
+        tuple[HeuristicResult, dict[str, JsonType]]:
+            The result, and the file names of the analyzed release keyed by its version.
+
+        Raises
+        ------
+        HeuristicAnalyzerValueError
+            If there is no release information, no latest version, or no release entry for the analyzed version.
+        """
+        releases = pypi_package_json.get_releases()
+        if releases is None:  # no release information
+            raise HeuristicAnalyzerValueError("There is no information for any release of this package.")
+
+        version = pypi_package_json.component.version
+        if version is None:  # no version was requested, so analyze the latest release
+            version = pypi_package_json.get_latest_version()
+            if version is None:
+                raise HeuristicAnalyzerValueError("There is no latest version of this package.")
+
+        # A version that is absent from the release information would otherwise escape as an
+        # undocumented KeyError instead of the error type this method declares.
+        if version not in releases:
+            raise HeuristicAnalyzerValueError(f"There is no release information for version {version}.")
+
+        # Report every distribution file of the release, whichever way the heuristic goes.
+        distributions = releases[version]
+        release_files: list[JsonType] = [distribution["filename"] for distribution in distributions]
+        wheel_present = any(distribution["packagetype"] == self.WHEEL for distribution in distributions)
+
+        if wheel_present:
+            return HeuristicResult.PASS, {version: release_files}
+        return HeuristicResult.FAIL, {version: release_files}
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
index 13522e38f..a4d3c1e3e 100644
--- a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
+++ b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -11,6 +11,7 @@
 
 from macaron.database.db_custom_types import DBJsonDict
 from macaron.database.table_definitions import CheckFacts
+from macaron.errors import HeuristicAnalyzerValueError
 from macaron.json_tools import JsonType, json_extract
 from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
@@ -20,6 +21,7 @@
 from macaron.malware_analyzer.pypi_heuristics.metadata.one_release import OneReleaseAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.unchanged_release import UnchangedReleaseAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.metadata.unreachable_project_links import UnreachableProjectLinksAnalyzer
+from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_presence import WheelPresenceAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.sourcecode.suspicious_setup import SuspiciousSetupAnalyzer
 from macaron.slsa_analyzer.analyze_context import AnalyzeContext
 from macaron.slsa_analyzer.build_tool.pip import Pip
@@ -70,6 +72,7 @@ class MaliciousMetadataFacts(CheckFacts):
     UnchangedReleaseAnalyzer,
     CloserReleaseJoinDateAnalyzer,
     SuspiciousSetupAnalyzer,
+    WheelPresenceAnalyzer,
 ]
 
 # The HeuristicResult sequence is aligned with the sequence of ANALYZERS list
@@ -82,6 +85,7 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult,
         HeuristicResult,
         HeuristicResult,
+        HeuristicResult,
     ],
     float,
 ] = {
@@ -93,9 +97,10 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult.SKIP,  # Unchanged Release
         HeuristicResult.FAIL,  # Closer Release Join Date
         HeuristicResult.FAIL,  # Suspicious Setup
+        HeuristicResult.FAIL,  # Wheel Presence
         # No project link, only one release, and the maintainer released it shortly
         # after account registration.
-        # The setup.py file contains suspicious imports.
+        # The setup.py file contains suspicious imports and .whl file isn't present.
     ): Confidence.HIGH,
     (
         HeuristicResult.FAIL,  # Empty Project
@@ -105,9 +110,10 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult.FAIL,  # Unchanged Release
         HeuristicResult.FAIL,  # Closer Release Join Date
         HeuristicResult.FAIL,  # Suspicious Setup
+        HeuristicResult.FAIL,  # Wheel Presence
         # No project link, frequent releases of multiple versions without modifying the content,
         # and the maintainer released it shortly after account registration.
-        # The setup.py file contains suspicious imports.
+        # The setup.py file contains suspicious imports and .whl file isn't present.
     ): Confidence.HIGH,
     (
         HeuristicResult.FAIL,  # Empty Project
@@ -117,9 +123,10 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult.PASS,  # Unchanged Release
         HeuristicResult.FAIL,  # Closer Release Join Date
         HeuristicResult.FAIL,  # Suspicious Setup
+        HeuristicResult.FAIL,  # Wheel Presence
         # No project link, frequent releases of multiple versions,
         # and the maintainer released it shortly after account registration.
-        # The setup.py file contains suspicious imports.
+        # The setup.py file contains suspicious imports and .whl file isn't present.
     ): Confidence.HIGH,
     (
         HeuristicResult.FAIL,  # Empty Project
@@ -129,8 +136,23 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult.FAIL,  # Unchanged Release
         HeuristicResult.FAIL,  # Closer Release Join Date
         HeuristicResult.PASS,  # Suspicious Setup
+        HeuristicResult.PASS,  # Wheel Presence
         # No project link, frequent releases of multiple versions without modifying the content,
-        # and the maintainer released it shortly after account registration.
+        # and the maintainer released it shortly after account registration. Presence of .whl file
+        # has no effect
+    ): Confidence.MEDIUM,
+    (
+        HeuristicResult.FAIL,  # Empty Project
+        HeuristicResult.SKIP,  # Unreachable Project Links
+        HeuristicResult.PASS,  # One Release
+        HeuristicResult.FAIL,  # High Release Frequency
+        HeuristicResult.FAIL,  # Unchanged Release
+        HeuristicResult.FAIL,  # Closer Release Join Date
+        HeuristicResult.PASS,  # Suspicious Setup
+        HeuristicResult.FAIL,  # Wheel Presence
+        # No project link, frequent releases of multiple versions without modifying the content,
+        # and the maintainer released it shortly after account registration. Presence of .whl file
+        # has no effect
     ): Confidence.MEDIUM,
     (
         HeuristicResult.PASS,  # Empty Project
@@ -140,9 +162,10 @@ class MaliciousMetadataFacts(CheckFacts):
         HeuristicResult.PASS,  # Unchanged Release
         HeuristicResult.FAIL,  # Closer Release Join Date
         HeuristicResult.FAIL,  # Suspicious Setup
+        HeuristicResult.FAIL,  # Wheel Presence
         # All project links are unreachable, frequent releases of multiple versions,
         # and the maintainer released it shortly after account registration.
-        # The setup.py file contains suspicious imports.
+        # The setup.py file contains suspicious imports and .whl file isn't present.
     ): Confidence.HIGH,
 }
 
@@ -197,6 +220,11 @@ def run_heuristics(
         -------
         tuple[dict[Heuristics, HeuristicResult], dict[str, JsonType]]
             Containing the analysis results and relevant metadata.
+
+        Raises
+        ------
+        HeuristicAnalyzerValueError
+            If a heuristic analysis fails due to malformed package information.
         """
         results: dict[Heuristics, HeuristicResult] = {}
         detail_info: dict[str, JsonType] = {}
@@ -277,7 +305,11 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
 
                     # Download the PyPI package JSON, but no need to persist it to the filesystem.
                     if pypi_package_json.download(dest=""):
-                        result, detail_info = self.run_heuristics(pypi_package_json)
+                        try:
+                            result, detail_info = self.run_heuristics(pypi_package_json)
+                        except HeuristicAnalyzerValueError:
+                            return CheckResultData(result_tables=[], result_type=CheckResultType.UNKNOWN)
+
                         result_combo: tuple = tuple(result.values())
                         confidence: float | None = SUSPICIOUS_COMBO.get(result_combo, None)
                         result_type = CheckResultType.FAILED
diff --git a/tests/malware_analyzer/pypi/test_wheel_presence.py b/tests/malware_analyzer/pypi/test_wheel_presence.py
new file mode 100644
index 000000000..783eb6f9c
--- /dev/null
+++ b/tests/malware_analyzer/pypi/test_wheel_presence.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+"""Tests for heuristic detecting wheel (.whl) file presence from PyPI."""
+from unittest.mock import MagicMock
+
+import pytest
+
+from macaron.errors import HeuristicAnalyzerValueError
+from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult
+from macaron.malware_analyzer.pypi_heuristics.metadata.wheel_presence import WheelPresenceAnalyzer
+
+
+def test_analyze_no_information(pypi_package_json: MagicMock) -> None:
+    """Test for when there is no release information, so error."""
+    analyzer = WheelPresenceAnalyzer()
+
+    pypi_package_json.get_releases.return_value = None
+
+    # pytest.raises fails the test when nothing is raised and lets any other exception
+    # type propagate as an error; the match pattern pins the failure to the missing
+    # release information.
+    with pytest.raises(HeuristicAnalyzerValueError, match="There is no information for any release of this package"):
+        analyzer.analyze(pypi_package_json)
+
+
+def test_analyze_tar_present(pypi_package_json: MagicMock) -> None:
+    """Test for when only .tar.gz is present, so failed."""
+    analyzer = WheelPresenceAnalyzer()
+    version = "0.1.0"
+    filename = "ttttttttest_nester.py-0.1.0.tar.gz"
+
+    release = {
+        version: [
+            {
+                "comment_text": "",
+                "digests": {
+                    "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3",
+                    "md5": "9203bbb130f8ddb38269f4861c170d04",
+                    "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370",
+                },
+                "downloads": -1,
+                "filename": filename,
+                "has_sig": False,
+                "md5_digest": "9203bbb130f8ddb38269f4861c170d04",
+                "packagetype": "sdist",
+                "python_version": "source",
+                "requires_python": None,
+                "size": 546,
+                "upload_time": "2016-10-13T05:42:27",
+                "upload_time_iso_8601": "2016-10-13T05:42:27.073842Z",
+                "url": "https://files.pythonhosted.org/packages/de/fa/2fbcebaeeb909511139ce28d"
+                f"ac4a77ab2452ba72b49a22b12981b2f375b3/{filename}",
+                "yanked": False,
+                "yanked_reason": None,
+            }
+        ]
+    }
+
+    pypi_package_json.get_releases.return_value = release
+    pypi_package_json.get_latest_version.return_value = version
+    pypi_package_json.component.version = None
+    expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.FAIL, {version: [filename]})
+
+    # Let an unexpected HeuristicAnalyzerValueError propagate: pytest then reports the
+    # failure together with the original message and traceback.
+    actual_result = analyzer.analyze(pypi_package_json)
+
+    # Both the verdict and the list of distribution files for the version must match.
+    assert actual_result == expected_result
+
+
+def test_analyze_whl_present(pypi_package_json: MagicMock) -> None:
+    """Test for when only .whl is present, so pass."""
+    analyzer = WheelPresenceAnalyzer()
+    version = "0.1.0"
+    filename = "ttttttttest_nester.py-0.1.0.whl"
+
+    release = {
+        version: [
+            {
+                "comment_text": "",
+                "digests": {
+                    "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3",
+                    "md5": "9203bbb130f8ddb38269f4861c170d04",
+                    "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370",
+                },
+                "downloads": -1,
+                "filename": filename,
+                "has_sig": False,
+                "md5_digest": "9203bbb130f8ddb38269f4861c170d04",
+                "packagetype": "bdist_wheel",
+                "python_version": "py2.py3",
+                "requires_python": None,
+                "size": 546,
+                "upload_time": "2016-10-13T05:42:27",
+                "upload_time_iso_8601": "2016-10-13T05:42:27.073842Z",
+                "url": "https://files.pythonhosted.org/packages/de/fa/2fbcebaeeb909511139ce28d"
+                f"ac4a77ab2452ba72b49a22b12981b2f375b3/{filename}",
+                "yanked": False,
+                "yanked_reason": None,
+            }
+        ]
+    }
+
+    pypi_package_json.get_releases.return_value = release
+    pypi_package_json.component.version = version
+    expected_result: tuple[HeuristicResult, dict] = (HeuristicResult.PASS, {version: [filename]})
+
+    # Let an unexpected HeuristicAnalyzerValueError propagate: pytest then reports the
+    # failure together with the original message and traceback.
+    actual_result = analyzer.analyze(pypi_package_json)
+
+    # Both the verdict and the list of distribution files for the version must match.
+    assert actual_result == expected_result
+
+
+def test_analyze_both_present(pypi_package_json: MagicMock) -> None:
+    """Test for when both .tar.gz and .whl are present, so passed."""
+    analyzer = WheelPresenceAnalyzer()
+    version = "0.1.0"
+    file_prefix = "ttttttttest_nester.py-0.1.0"
+
+    release = {
+        version: [
+            {
+                "comment_text": "",
+                "digests": {
+                    "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3",
+                    "md5": "9203bbb130f8ddb38269f4861c170d04",
+                    "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370",
+                },
+                "downloads": -1,
+                "filename": f"{file_prefix}.whl",
+                "has_sig": False,
+                "md5_digest": "9203bbb130f8ddb38269f4861c170d04",
+                "packagetype": "bdist_wheel",
+                "python_version": "py2.py3",
+                "requires_python": None,
+                "size": 546,
+                "upload_time": "2016-10-13T05:42:27",
+                "upload_time_iso_8601": "2016-10-13T05:42:27.073842Z",
+                "url": "https://files.pythonhosted.org/packages/de/fa/2fbcebaeeb909511139ce28d"
+                f"ac4a77ab2452ba72b49a22b12981b2f375b3/{file_prefix}.whl",
+                "yanked": False,
+                "yanked_reason": None,
+            },
+            {
+                "comment_text": "",
+                "digests": {
+                    "blake2b_256": "defa2fbcebaeeb909511139ce28dac4a77ab2452ba72b49a22b12981b2f375b3",
+                    "md5": "9203bbb130f8ddb38269f4861c170d04",
+                    "sha256": "168bcccbf5106132e90b85659297700194369b8f6b3e5a03769614f0d200e370",
+                },
+                "downloads": -1,
+                "filename": f"{file_prefix}.tar.gz",
+                "has_sig": False,
+                "md5_digest": "9203bbb130f8ddb38269f4861c170d04",
+                "packagetype": "sdist",
+                "python_version": "source",
+                "requires_python": None,
+                "size": 546,
+                "upload_time": "2016-10-13T05:42:27",
+                "upload_time_iso_8601": "2016-10-13T05:42:27.073842Z",
+                "url": "https://files.pythonhosted.org/packages/de/fa/2fbcebaeeb909511139ce28d"
+                f"ac4a77ab2452ba72b49a22b12981b2f375b3/{file_prefix}.tar.gz",
+                "yanked": False,
+                "yanked_reason": None,
+            },
+        ]
+    }
+
+    pypi_package_json.get_releases.return_value = release
+    pypi_package_json.component.version = version
+    expected_result: tuple[HeuristicResult, dict] = (
+        HeuristicResult.PASS,
+        {version: [f"{file_prefix}.whl", f"{file_prefix}.tar.gz"]},
+    )
+
+    # Let an unexpected HeuristicAnalyzerValueError propagate: pytest then reports the
+    # failure together with the original message and traceback.
+    actual_result = analyzer.analyze(pypi_package_json)
+
+    # Both the verdict and the list of distribution files for the version must match.
+    assert actual_result == expected_result