Skip to content

Commit

Permalink
feat: new metadata-based heuristic analyzing version numbers for sing…
Browse files Browse the repository at this point in the history
…le releases that are too high
  • Loading branch information
art1f1c3R committed Dec 13, 2024
1 parent 81555e2 commit 71140db
Show file tree
Hide file tree
Showing 4 changed files with 804 additions and 0 deletions.
4 changes: 4 additions & 0 deletions src/macaron/malware_analyzer/pypi_heuristics/heuristics.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ class Heuristics(str, Enum):
#: Indicates that the package does not include a .whl file
WHEEL_ABSENCE = "wheel_absence"

#: Indicates that the package's only release has an anomalistic version number, i.e. one whose epoch, major,
#: or minor value is unusually high for a first release
ANOMALISTIC_VERSION = "anomalistic_version"


class HeuristicResult(str, Enum):
"""Result type indicating the outcome of a heuristic."""
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""The heuristic analyzer to check for an anomalistic package version."""

import logging
from datetime import date, timedelta
from enum import Enum

from packaging.version import InvalidVersion, parse

from macaron.errors import HeuristicAnalyzerValueError
from macaron.json_tools import JsonType, json_extract
from macaron.malware_analyzer.datetime_parser import parse_datetime
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset

logger: logging.Logger = logging.getLogger(__name__)


class AnomalisticVersionAnalyzer(BaseHeuristicAnalyzer):
    """Analyze the version number of a single-release package to detect if it is anomalistic.

    A version number is anomalistic if it is above the thresholds for an epoch, major, or minor value.
    If the version does not adhere to PyPI standards (PEP 440, as per the 'packaging' module), this heuristic
    cannot analyze it and the result is ``SKIP``.

    Three versioning schemes are distinguished:

    * Calendar versioning: exactly three release values forming ``year.month.day`` (year as yyyy or yy) that
      name a real calendar date within ``CALENDAR_DAY_TOLERANCE`` days of a distribution's upload date.
    * Calendar-semantic versioning: the major value is the upload year (yyyy or yy), followed by any other
      series of numbers. The minor value is then checked against the major threshold.
    * Semantic versioning: everything else.
    """

    # Format handed to parse_datetime for the "upload_time" field of each distribution.
    DATETIME_FORMAT: str = "%Y-%m-%dT%H:%M:%S"

    MAJOR_THRESHOLD: int = 20
    MINOR_THRESHOLD: int = 40
    EPOCH_THRESHOLD: int = 5

    # Number of days the date in a calendar version may differ from the upload date (timezone buffer).
    CALENDAR_DAY_TOLERANCE: int = 2

    DETAIL_INFO_KEY: str = "versioning"

    def __init__(self) -> None:
        super().__init__(
            name="anomalistic_version_analyzer",
            heuristic=Heuristics.ANOMALISTIC_VERSION,
            depends_on=[(Heuristics.ONE_RELEASE, HeuristicResult.FAIL)],
        )

    @staticmethod
    def _is_calendar_release(release: tuple[int, ...], upload_date: date, tolerance_days: int) -> bool:
        """Return True if ``release`` is a year.month.day date within ``tolerance_days`` of ``upload_date``.

        The comparison is done on real calendar dates, so the tolerance window is honoured across month and
        year boundaries (e.g. ``2024.12.31`` uploaded on 2025-01-01), and impossible dates such as day 0 or
        day 32 never match.
        """
        if len(release) != 3:
            return False

        year, month, day = release
        for offset in range(-tolerance_days, tolerance_days + 1):
            candidate = upload_date + timedelta(days=offset)
            # The year may be written in full (yyyy) or as its last two digits (yy).
            if year in (candidate.year, candidate.year % 100) and (month, day) == (candidate.month, candidate.day):
                return True

        return False

    def analyze(self, pypi_package_json: PyPIPackageJsonAsset) -> tuple[HeuristicResult, dict[str, JsonType]]:
        """Analyze the package.

        Parameters
        ----------
        pypi_package_json: PyPIPackageJsonAsset
            The PyPI package JSON asset object.

        Returns
        -------
        tuple[HeuristicResult, dict[str, JsonType]]:
            The result and related information collected during the analysis. The detail dictionary maps
            ``DETAIL_INFO_KEY`` to the value of the detected ``Versioning`` scheme.

        Raises
        ------
        HeuristicAnalyzerValueError
            If there is no release information available, the number of releases is not exactly one, the
            latest version is missing or not among the releases, or an upload time is missing or malformed.
        """
        releases = pypi_package_json.get_releases()
        if releases is None:  # no release information
            error_msg = "There is no information for any release of this package."
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        if len(releases) != 1:
            error_msg = (
                "This heuristic depends on a single release, but somehow there are multiple when the one release"
                + " heuristic failed."
            )
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        # Since there is only one release, the latest version should be that release
        release = pypi_package_json.get_latest_version()
        if release is None:
            error_msg = "No latest version information available"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg)

        try:
            release_metadata = releases[release]
        except KeyError as release_error:
            error_msg = "The latest release is not available in the list of releases"
            logger.debug(error_msg)
            raise HeuristicAnalyzerValueError(error_msg) from release_error

        try:
            version = parse(release)
        except InvalidVersion:
            return HeuristicResult.SKIP, {self.DETAIL_INFO_KEY: Versioning.INVALID.value}

        detail_info: dict[str, JsonType]
        calendar_semantic = False

        if len(str(version.major)) in (2, 4):
            # possible this version number refers to a date

            for distribution in release_metadata:
                upload_time = json_extract(distribution, ["upload_time"], str)
                if upload_time is None:
                    error_msg = "Missing upload time from release information"
                    logger.debug(error_msg)
                    raise HeuristicAnalyzerValueError(error_msg)

                parsed_time = parse_datetime(upload_time, self.DATETIME_FORMAT)
                if parsed_time is None:
                    error_msg = "Upload time is not of the expected PyPI format"
                    logger.debug(error_msg)
                    raise HeuristicAnalyzerValueError(error_msg)

                # Only the calendar day matters; the time of day is covered by the tolerance window.
                upload_date = date(parsed_time.year, parsed_time.month, parsed_time.day)

                if self._is_calendar_release(version.release, upload_date, self.CALENDAR_DAY_TOLERANCE):
                    # In the format of full_year.month.day or year.month.day, with a buffer for timezone
                    # differences. Only the epoch can make a calendar version anomalistic.
                    detail_info = {self.DETAIL_INFO_KEY: Versioning.CALENDAR.value}
                    if version.epoch > self.EPOCH_THRESHOLD:
                        return HeuristicResult.FAIL, detail_info

                    return HeuristicResult.PASS, detail_info

                if version.major in (parsed_time.year, parsed_time.year % 100):
                    # the major of the version refers to the year published, but the rest is not a date
                    calendar_semantic = True

        if calendar_semantic:
            detail_info = {self.DETAIL_INFO_KEY: Versioning.CALENDAR_SEMANTIC.value}
            # The major value is the year, so analyze starting from the minor instead: the minor value
            # takes the role of the major and is held to the major threshold.
            if version.epoch > self.EPOCH_THRESHOLD:
                return HeuristicResult.FAIL, detail_info
            if version.minor > self.MAJOR_THRESHOLD:
                return HeuristicResult.FAIL, detail_info

            return HeuristicResult.PASS, detail_info

        # semantic versioning
        detail_info = {self.DETAIL_INFO_KEY: Versioning.SEMANTIC.value}

        if version.epoch > self.EPOCH_THRESHOLD:
            return HeuristicResult.FAIL, detail_info
        if version.major > self.MAJOR_THRESHOLD:
            return HeuristicResult.FAIL, detail_info
        if version.minor > self.MINOR_THRESHOLD:
            return HeuristicResult.FAIL, detail_info

        return HeuristicResult.PASS, detail_info


class Versioning(Enum):
    """Versioning schemes reported in the detail information of the anomalistic version heuristic."""

    # The version string is not a valid PEP 440 version, so it cannot be analyzed.
    INVALID = "invalid"

    # The version is a year.month.day date close to the upload date.
    CALENDAR = "calendar"

    # The major value is the upload year, followed by other release numbers.
    CALENDAR_SEMANTIC = "calendar_semantic"

    # Any other valid version.
    SEMANTIC = "semantic"
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from macaron.json_tools import JsonType, json_extract
from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
from macaron.malware_analyzer.pypi_heuristics.metadata.anomalistic_version import AnomalisticVersionAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.empty_project_link import EmptyProjectLinkAnalyzer
from macaron.malware_analyzer.pypi_heuristics.metadata.high_release_frequency import HighReleaseFrequencyAnalyzer
Expand Down Expand Up @@ -73,6 +74,7 @@ class MaliciousMetadataFacts(CheckFacts):
CloserReleaseJoinDateAnalyzer,
SuspiciousSetupAnalyzer,
WheelAbsenceAnalyzer,
AnomalisticVersionAnalyzer,
]

# The HeuristicResult sequence is aligned with the sequence of ANALYZERS list
Expand All @@ -86,6 +88,7 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult,
HeuristicResult,
HeuristicResult,
HeuristicResult,
],
float,
] = {
Expand All @@ -98,9 +101,26 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.FAIL, # Anomalistic Version
# No project link, only one release, and the maintainer released it shortly
# after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
# Anomalistic version has no effect.
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.PASS, # Anomalistic Version
# No project link, only one release, and the maintainer released it shortly
# after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
# Anomalistic version has no effect.
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
Expand All @@ -111,6 +131,7 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.SKIP, # Anomalistic Version
# No project link, frequent releases of multiple versions without modifying the content,
# and the maintainer released it shortly after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
Expand All @@ -124,6 +145,7 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.SKIP, # Anomalistic Version
# No project link, frequent releases of multiple versions,
# and the maintainer released it shortly after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
Expand All @@ -137,6 +159,7 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.PASS, # Suspicious Setup
HeuristicResult.PASS, # Wheel Absence
HeuristicResult.SKIP, # Anomalistic Version
# No project link, frequent releases of multiple versions without modifying the content,
# and the maintainer released it shortly after account registration. Presence/Absence of
# .whl file has no effect
Expand All @@ -150,6 +173,7 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.PASS, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.SKIP, # Anomalistic Version
# No project link, frequent releases of multiple versions without modifying the content,
# and the maintainer released it shortly after account registration. Presence/Absence of
# .whl file has no effect
Expand All @@ -163,10 +187,56 @@ class MaliciousMetadataFacts(CheckFacts):
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.FAIL, # Wheel Absence
HeuristicResult.SKIP, # Anomalistic Version
# All project links are unreachable, frequent releases of multiple versions,
# and the maintainer released it shortly after account registration.
# The setup.py file contains suspicious imports and .whl file isn't present.
): Confidence.HIGH,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.PASS, # Suspicious Setup
HeuristicResult.PASS, # Wheel Absence
HeuristicResult.FAIL, # Anomalistic Version
# No project link, only one release, and the maintainer released it shortly
# after account registration.
# The setup.py file has no effect and .whl file is present.
# The version number is anomalistic.
): Confidence.MEDIUM,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.FAIL, # Suspicious Setup
HeuristicResult.PASS, # Wheel Absence
HeuristicResult.FAIL, # Anomalistic Version
# No project link, only one release, and the maintainer released it shortly
# after account registration.
# The setup.py file has no effect and .whl file is present.
# The version number is anomalistic.
): Confidence.MEDIUM,
(
HeuristicResult.FAIL, # Empty Project
HeuristicResult.SKIP, # Unreachable Project Links
HeuristicResult.FAIL, # One Release
HeuristicResult.SKIP, # High Release Frequency
HeuristicResult.SKIP, # Unchanged Release
HeuristicResult.FAIL, # Closer Release Join Date
HeuristicResult.SKIP, # Suspicious Setup
HeuristicResult.PASS, # Wheel Absence
HeuristicResult.FAIL, # Anomalistic Version
# No project link, only one release, and the maintainer released it shortly
# after account registration.
# The setup.py file has no effect and .whl file is present.
# The version number is anomalistic.
): Confidence.MEDIUM,
}


Expand Down
Loading

0 comments on commit 71140db

Please sign in to comment.