feat: report known malware for all ecosystems

behnazh-w · behnazh-w · commit 688af68f6386 · 2024-11-19T14:22:58.000+10:00
Signed-off-by: behnazh-w <behnaz.hassanshahi@oracle.com>
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -107,7 +107,7 @@ Macaron checks that report integrity issues but do not map to SLSA requirements
    * - Check ID
      - Description
    * - ``mcn_detect_malicious_metadata_1``
-     - This check analyzes the metadata of a package and reports malicious behavior. This check currently supports PyPI packages.
+     - This check performs analysis on PyPI package metadata to detect malicious behavior. It also reports known malware from other ecosystems, but the analysis is currently limited to PyPI packages.
 
 ----------------------
 How does Macaron work?
diff --git a/docs/source/pages/tutorials/detect_malicious_package.rst b/docs/source/pages/tutorials/detect_malicious_package.rst
@@ -13,9 +13,11 @@ In this tutorial we show how to use Macaron to find malicious packages. Imagine
    :widths: 25
    :header-rows: 1
 
-   * - Supported packages
+   * - Supported packages for analysis
    * - Python packages (PyPI)
 
+Note that known malware is reported for packages across all ecosystems.
+
 .. contents:: :local:
 
 
diff --git a/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py b/src/macaron/slsa_analyzer/checks/detect_malicious_metadata_check.py
@@ -5,12 +5,13 @@
 
 import logging
 
-from sqlalchemy import ForeignKey
+import requests
+from sqlalchemy import ForeignKey, String
 from sqlalchemy.orm import Mapped, mapped_column
 
 from macaron.database.db_custom_types import DBJsonDict
 from macaron.database.table_definitions import CheckFacts
-from macaron.json_tools import JsonType
+from macaron.json_tools import JsonType, json_extract
 from macaron.malware_analyzer.pypi_heuristics.base_analyzer import BaseHeuristicAnalyzer
 from macaron.malware_analyzer.pypi_heuristics.heuristics import HeuristicResult, Heuristics
 from macaron.malware_analyzer.pypi_heuristics.metadata.closer_release_join_date import CloserReleaseJoinDateAnalyzer
@@ -28,6 +29,7 @@
 from macaron.slsa_analyzer.package_registry.pypi_registry import PyPIPackageJsonAsset, PyPIRegistry
 from macaron.slsa_analyzer.registry import registry
 from macaron.slsa_analyzer.specs.package_registry_spec import PackageRegistryInfo
+from macaron.util import send_post_http_raw
 
 logger: logging.Logger = logging.getLogger(__name__)
 
@@ -40,13 +42,16 @@ class MaliciousMetadataFacts(CheckFacts):
     #: The primary key.
     id: Mapped[int] = mapped_column(ForeignKey("_check_facts.id"), primary_key=True)  # noqa: A003
 
+    #: Known malware.
+    known_malware: Mapped[str | None] = mapped_column(
+        String, nullable=False, info={"justification": JustificationType.HREF}
+    )
+
     #: Detailed information about the analysis.
     detail_information: Mapped[dict[str, JsonType]] = mapped_column(DBJsonDict, nullable=False)
 
-    #: The result of analysis, which is of dict[Heuristics, HeuristicResult] type.
-    result: Mapped[dict[Heuristics, HeuristicResult]] = mapped_column(
-        DBJsonDict, nullable=False, info={"justification": JustificationType.TEXT}
-    )
+    #: The result of analysis, which can be an empty dictionary.
+    result: Mapped[dict] = mapped_column(DBJsonDict, nullable=False, info={"justification": JustificationType.TEXT})
 
     __mapper_args__ = {
         "polymorphic_identity": "_detect_malicious_metadata_check",
@@ -223,14 +228,43 @@ def run_check(self, ctx: AnalyzeContext) -> CheckResultData:
         CheckResultData
             The result of the check.
         """
+        result_tables: list[CheckFacts] = []
+        # First check if this package is a known malware
+
+        url = "https://api.osv.dev/v1/query"
+        data = {"package": {"purl": ctx.component.purl}}
+        response = send_post_http_raw(url, json_data=data, headers=None)
+        res_obj = None
+        if response:
+            try:
+                res_obj = response.json()
+            except requests.exceptions.JSONDecodeError as error:
+                logger.debug("Unable to get a valid response from %s: %s", url, error)
+        if res_obj:
+            for vuln in res_obj.get("vulns", {}):
+                v_id = json_extract(vuln, ["id"], str)
+                if v_id and v_id.startswith("MAL-"):
+                    result_tables.append(
+                        MaliciousMetadataFacts(
+                            known_malware=f"https://osv.dev/vulnerability/{v_id}",
+                            result={},
+                            detail_information=vuln,
+                            confidence=Confidence.HIGH,
+                        )
+                    )
+            if result_tables:
+                return CheckResultData(
+                    result_tables=result_tables,
+                    result_type=CheckResultType.FAILED,
+                )
+
         package_registry_info_entries = ctx.dynamic_data["package_registries"]
         for package_registry_info_entry in package_registry_info_entries:
             match package_registry_info_entry:
                 case PackageRegistryInfo(
                     build_tool=Pip() | Poetry(),
                     package_registry=PyPIRegistry() as pypi_registry,
                 ) as pypi_registry_info:
-                    result_tables: list[CheckFacts] = []
 
                     # Create an AssetLocator object for the PyPI package JSON object.
                     pypi_package_json = PyPIPackageJsonAsset(
diff --git a/src/macaron/util.py b/src/macaron/util.py
@@ -125,6 +125,79 @@ def send_get_http_raw(
     return response
 
 
+def send_post_http_raw(
+    url: str,
+    json_data: dict | None = None,
+    headers: dict | None = None,
+    timeout: int | None = None,
+    allow_redirects: bool = True,
+) -> Response | None:
+    """Send a POST HTTP request with the given url, data, and headers.
+
+    This method also handle logging when the API server returns error status code.
+
+    Parameters
+    ----------
+    url : str
+        The url of the request.
+    json_data: dict | None
+        The request payload.
+    headers : dict | None
+        The dict that describes the headers of the request.
+    timeout: int | None
+        The request timeout (optional).
+    allow_redirects: bool
+        Whether to allow redirects. Default: True.
+
+    Returns
+    -------
+    Response | None
+        If a Response object is returned and ``allow_redirects`` is ``True`` (the default) it will have a status code of
+        200 (OK). If ``allow_redirects`` is ``False`` the response can instead have a status code of 302. Otherwise, the
+        request has failed and ``None`` will be returned.
+    """
+    logger.debug("POST - %s", url)
+    if not timeout:
+        timeout = defaults.getint("requests", "timeout", fallback=10)
+    error_retries = defaults.getint("requests", "error_retries", fallback=5)
+    retry_counter = error_retries
+    try:
+        response = requests.post(
+            url=url,
+            json=json_data,
+            headers=headers,
+            timeout=timeout,
+            allow_redirects=allow_redirects,
+        )
+    except requests.exceptions.RequestException as error:
+        logger.debug(error)
+        return None
+    if not allow_redirects and response.status_code == 302:
+        # Found, most likely because a redirect is about to happen.
+        return response
+    while response.status_code != 200:
+        logger.debug(
+            "Receiving error code %s from server.",
+            response.status_code,
+        )
+        if retry_counter <= 0:
+            logger.debug("Maximum retries reached: %s", error_retries)
+            return None
+        if response.status_code == 403:
+            check_rate_limit(response)
+        else:
+            return None
+        retry_counter = retry_counter - 1
+        response = requests.get(
+            url=url,
+            headers=headers,
+            timeout=timeout,
+            allow_redirects=allow_redirects,
+        )
+
+    return response
+
+
 def check_rate_limit(response: Response) -> None:
     """Check the remaining calls limit to GitHub API and wait accordingly.
 
diff --git a/tests/integration/cases/tautoak4-hello-world/policy.dl b/tests/integration/cases/tautoak4-hello-world/policy.dl
@@ -0,0 +1,10 @@
+/* Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("check-malicious-package", component_id, "Check the malicious package.") :-
+    check_failed(component_id, "mcn_detect_malicious_metadata_1"). /* Expected outcome: the check fails for this known-malicious package. */
+
+apply_policy_to("check-malicious-package", component_id) :-
+    is_component(component_id, "pkg:npm/tautoak4-hello-world"). /* Same PURL as the one analyzed in this case's test.yaml. */
diff --git a/tests/integration/cases/tautoak4-hello-world/test.yaml b/tests/integration/cases/tautoak4-hello-world/test.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Analyzing a known malicious package.
+
+tags:
+- macaron-python-package
+- macaron-docker-image
+
+steps:
+- name: Run macaron analyze
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:npm/tautoak4-hello-world
+- name: Run macaron verify-policy to verify that the malicious metadata check fails.
+  kind: verify
+  options:
+    policy: policy.dl
diff --git a/tests/integration/cases/type-extension/policy.dl b/tests/integration/cases/type-extension/policy.dl
@@ -0,0 +1,10 @@
+/* Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved. */
+/* Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/. */
+
+#include "prelude.dl"
+
+Policy("check-malicious-package", component_id, "Check the malicious package.") :-
+    check_failed(component_id, "mcn_detect_malicious_metadata_1"). /* Expected outcome: the check fails for this known-malicious package. */
+
+apply_policy_to("check-malicious-package", component_id) :-
+    is_component(component_id, "pkg:pypi/type-extension"). /* Same PURL as the one analyzed in this case's test.yaml. */
diff --git a/tests/integration/cases/type-extension/test.yaml b/tests/integration/cases/type-extension/test.yaml
@@ -0,0 +1,21 @@
+# Copyright (c) 2024 - 2024, Oracle and/or its affiliates. All rights reserved.
+# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.
+
+description: |
+  Analyzing a known malicious package.
+
+tags:
+- macaron-python-package
+- macaron-docker-image
+
+steps:
+- name: Run macaron analyze
+  kind: analyze
+  options:
+    command_args:
+    - -purl
+    - pkg:pypi/type-extension
+- name: Run macaron verify-policy to verify that the malicious metadata check fails.
+  kind: verify
+  options:
+    policy: policy.dl