Skip to content

Commit

Permalink
feat: add command to run repo and commit finder without analysis (#827)
Browse files Browse the repository at this point in the history
This PR adds a new command, find-source, that runs the commit finder and repo finder on a given PURL whilst skipping analysis. It also optionally accepts a repo path, in which case only the commit finder will be called. The results of the operation show up in the logs, and are written to a file in a JSON report format.

Signed-off-by: Ben Selwyn-Smith <[email protected]>
  • Loading branch information
benmss authored Nov 11, 2024
1 parent f39784a commit 2d4582f
Show file tree
Hide file tree
Showing 17 changed files with 745 additions and 262 deletions.
42 changes: 42 additions & 0 deletions src/macaron/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from macaron.errors import ConfigurationError
from macaron.output_reporter.reporter import HTMLReporter, JSONReporter, PolicyReporter
from macaron.policy_engine.policy_engine import run_policy_engine, show_prelude
from macaron.repo_finder import repo_finder
from macaron.slsa_analyzer.analyzer import Analyzer
from macaron.slsa_analyzer.git_service import GIT_SERVICES
from macaron.slsa_analyzer.package_registry import PACKAGE_REGISTRIES
Expand Down Expand Up @@ -212,6 +213,14 @@ def verify_policy(verify_policy_args: argparse.Namespace) -> int:
return os.EX_USAGE


def find_source(find_args: argparse.Namespace) -> int:
    """Run repo and commit finding for a PURL, or commit finding only when a repo is also given.

    Parameters
    ----------
    find_args : argparse.Namespace
        The parsed command-line arguments. The ``package_url`` and ``repo_path`` attributes are read.

    Returns
    -------
    int
        ``os.EX_OK`` if the source was found, otherwise ``os.EX_DATAERR``.
    """
    # A missing or empty repo path is normalised to None before being passed on.
    source_found = repo_finder.find_source(find_args.package_url, find_args.repo_path or None)
    return os.EX_OK if source_found else os.EX_DATAERR


def perform_action(action_args: argparse.Namespace) -> None:
"""Perform the indicated action of Macaron."""
match action_args.action:
Expand Down Expand Up @@ -239,6 +248,17 @@ def perform_action(action_args: argparse.Namespace) -> None:
sys.exit(os.EX_USAGE)

analyze_slsa_levels_single(action_args)

case "find-source":
try:
for git_service in GIT_SERVICES:
git_service.load_defaults()
except ConfigurationError as error:
logger.error(error)
sys.exit(os.EX_USAGE)

find_source(action_args)

case _:
logger.error("Macaron does not support command option %s.", action_args.action)
sys.exit(os.EX_USAGE)
Expand Down Expand Up @@ -444,6 +464,28 @@ def main(argv: list[str] | None = None) -> None:
vp_group.add_argument("-f", "--file", type=str, help="Path to the Datalog policy.")
vp_group.add_argument("-s", "--show-prelude", action="store_true", help="Show policy prelude.")

# Find the repo and commit of a passed PURL, or the commit of a passed PURL and repo.
find_parser = sub_parser.add_parser(name="find-source")

find_parser.add_argument(
"-purl",
"--package-url",
required=True,
type=str,
help=("The PURL string to perform repository and commit finding for."),
)

find_parser.add_argument(
"-rp",
"--repo-path",
required=False,
type=str,
help=(
"The path to a repository that matches the provided PURL, can be local or remote. "
"This argument is only required in cases where the repository cannot be discovered automatically."
),
)

args = main_parser.parse_args(argv)

if not args.action:
Expand Down
1 change: 1 addition & 0 deletions src/macaron/config/defaults.ini
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ use_open_source_insights = True
redirect_urls =
gitbox.apache.org
git-wip-us.apache.org
find_source_should_clone = False

[repofinder.java]
# The list of maven-like repositories to attempt to retrieve artifact POMs from.
Expand Down
24 changes: 23 additions & 1 deletion src/macaron/repo_finder/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,26 @@
# Copyright (c) 2023 - 2023, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2023 - 2024, Oracle and/or its affiliates. All rights reserved.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/.

"""This package contains the dependency resolvers for Java projects."""


def to_domain_from_known_purl_types(purl_type: str) -> str | None:
"""Return the git service domain from a known web-based purl type.
This method is used to handle cases where the purl type value is not the git domain but a pre-defined
repo-based type in https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst.
Note that this method will be updated when there are new pre-defined types as per the PURL specification.
Parameters
----------
purl_type : str
The type field of the PURL.
Returns
-------
str | None
The git service domain corresponding to the purl type or None if the purl type is unknown.
"""
known_types = {"github": "github.com", "bitbucket": "bitbucket.org"}
return known_types.get(purl_type, None)
3 changes: 1 addition & 2 deletions src/macaron/repo_finder/commit_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
from packageurl import PackageURL
from pydriller import Commit, Git

from macaron.repo_finder import repo_finder_deps_dev
from macaron.repo_finder.repo_finder import to_domain_from_known_purl_types
from macaron.repo_finder import repo_finder_deps_dev, to_domain_from_known_purl_types
from macaron.slsa_analyzer.git_service import GIT_SERVICES

logger: logging.Logger = logging.getLogger(__name__)
Expand Down
2 changes: 1 addition & 1 deletion src/macaron/repo_finder/provenance_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@

from macaron.errors import ProvenanceError
from macaron.json_tools import JsonType, json_extract
from macaron.repo_finder import to_domain_from_known_purl_types
from macaron.repo_finder.commit_finder import (
AbstractPurlType,
determine_abstract_purl_type,
extract_commit_from_version,
)
from macaron.repo_finder.repo_finder import to_domain_from_known_purl_types
from macaron.slsa_analyzer.provenance.intoto import InTotoPayload, InTotoV1Payload, InTotoV01Payload

logger: logging.Logger = logging.getLogger(__name__)
Expand Down
159 changes: 137 additions & 22 deletions src/macaron/repo_finder/repo_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,18 @@
import os
from urllib.parse import ParseResult, urlunparse

import git
from packageurl import PackageURL

from macaron.config.defaults import defaults
from macaron.config.global_config import global_config
from macaron.repo_finder import to_domain_from_known_purl_types
from macaron.repo_finder.commit_finder import match_tags
from macaron.repo_finder.repo_finder_base import BaseRepoFinder
from macaron.repo_finder.repo_finder_deps_dev import DepsDevRepoFinder
from macaron.repo_finder.repo_finder_java import JavaRepoFinder
from macaron.repo_finder.repo_utils import generate_report, prepare_repo
from macaron.slsa_analyzer.git_url import GIT_REPOS_DIR

logger: logging.Logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -78,28 +84,6 @@ def find_repo(purl: PackageURL) -> str:
return repo_finder.find_repo(purl)


def to_domain_from_known_purl_types(purl_type: str) -> str | None:
"""Return the git service domain from a known web-based purl type.
This method is used to handle cases where the purl type value is not the git domain but a pre-defined
repo-based type in https://github.com/package-url/purl-spec/blob/master/PURL-TYPES.rst.
Note that this method will be updated when there are new pre-defined types as per the PURL specification.
Parameters
----------
purl_type : str
The type field of the PURL.
Returns
-------
str | None
The git service domain corresponding to the purl type or None if the purl type is unknown.
"""
known_types = {"github": "github.com", "bitbucket": "bitbucket.org"}
return known_types.get(purl_type, None)


def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None:
"""Return the repository path from the PURL string.
Expand Down Expand Up @@ -148,3 +132,134 @@ def to_repo_path(purl: PackageURL, available_domains: list[str]) -> str | None:
fragment="",
)
)


def find_source(purl_string: str, input_repo: str | None) -> bool:
    """Perform repo and commit finding for a passed PURL, or commit finding for a passed PURL and repo.

    The commit is resolved in one of two ways, selected by the ``find_source_should_clone`` option of
    the ``repofinder`` defaults section: either the repository is prepared via ``prepare_repo`` and the
    hash of its head commit is used, or the tags are listed with ``get_tags_via_git_remote`` and the
    first tag returned by ``match_tags`` is used. On success the result is passed to ``generate_report``.

    Every path that returns False logs the reason at error level, so that a failing run of the command
    always explains itself in the logs.

    Parameters
    ----------
    purl_string: str
        The PURL string of the target.
    input_repo: str | None
        The repository path optionally provided by the user.

    Returns
    -------
    bool
        True if the source was found.
    """
    try:
        purl = PackageURL.from_string(purl_string)
    except ValueError as error:
        logger.error("Could not parse PURL: %s", error)
        return False

    if not purl.version:
        logger.error("PURL is missing version.")
        return False

    found_repo = input_repo
    if not input_repo:
        logger.debug("Searching for repo of PURL: %s", purl)
        found_repo = find_repo(purl)

    if not found_repo:
        logger.error("Could not find repo for PURL: %s", purl)
        return False

    # Disable other loggers for cleaner output.
    logging.getLogger("macaron.slsa_analyzer.analyzer").disabled = True
    logging.getLogger("macaron.slsa_analyzer.git_url").disabled = True

    if defaults.getboolean("repofinder", "find_source_should_clone"):
        logger.debug("Preparing repo: %s", found_repo)
        repo_dir = os.path.join(global_config.output_path, GIT_REPOS_DIR)
        git_obj = prepare_repo(
            repo_dir,
            found_repo,
            purl=purl,
        )

        if not git_obj:
            # TODO expand this message to cover cases where the obj was not created due to lack of correct tag.
            logger.error("Could not resolve repository: %s", found_repo)
            return False

        try:
            digest = git_obj.get_head().hash
        except ValueError:
            logger.error("Could not retrieve commit hash from repository.")
            return False
    else:
        # Retrieve the tags.
        tags = get_tags_via_git_remote(found_repo)
        if not tags:
            # Covers both a failed ls-remote (None) and a repository without any tags (empty dict).
            logger.error("Could not retrieve tags from repository: %s", found_repo)
            return False

        matches = match_tags(list(tags), purl.name, purl.version)

        if not matches:
            logger.error("Could not match any tag of repository %s to PURL version: %s", found_repo, purl.version)
            return False

        # The first entry returned by match_tags is the one used.
        matched_tag = matches[0]
        digest = tags[matched_tag]

    if not digest:
        logger.error("Could not find commit for purl / repository: %s / %s", purl, found_repo)
        return False

    if not input_repo:
        # Only report the repository when it was discovered here rather than supplied by the user.
        logger.info("Found repository for PURL: %s", found_repo)

    logger.info("Found commit for PURL: %s", digest)

    if not generate_report(purl_string, digest, found_repo, os.path.join(global_config.output_path, "reports")):
        return False

    return True


def get_tags_via_git_remote(repo: str) -> dict[str, str] | None:
    """Retrieve all tags from a given repository using ls-remote.

    Each output line is expected to have the form ``<hash>\\t<ref>``; lines of any other shape are
    ignored. A ref that ends with ``^{}`` is the peeled entry of an annotated tag, and its hash
    replaces any hash already recorded for that tag. A non-peeled entry never replaces a recorded
    one, so the result does not depend on the order in which the two lines are received.

    Parameters
    ----------
    repo: str
        The repository to perform the operation on.

    Returns
    -------
    dict[str, str] | None
        A dictionary of tags mapped to their commits, or None if the operation failed.
    """
    try:
        tag_data = git.cmd.Git().ls_remote("--tags", repo)
    except git.exc.GitCommandError as error:
        logger.debug("Failed to retrieve tags: %s", error)
        return None

    tags: dict[str, str] = {}
    for tag_line in tag_data.splitlines():
        # Blank lines split into a single field and are rejected here as well.
        fields = tag_line.strip().split("\t")
        if len(fields) != 2:
            continue
        commit, ref = fields

        is_peeled = ref.endswith("^{}")
        if is_peeled:
            ref = ref[:-3]

        # Normalise the name BEFORE the membership check below: the dictionary is keyed by the bare
        # tag name, so comparing the raw "refs/tags/..." ref against it could never match.
        # Only the leading prefix is removed, so tag names that merely contain it are left intact.
        tag = ref.removeprefix("refs/tags/")
        if not tag:
            continue

        if not is_peeled and tag in tags:
            # If a tag already exists, it must be the annotated reference of an annotated tag.
            # In that case we skip the tag as it does not point to the proper source commit.
            # Note that this should only happen if the tags are received out of standard order.
            continue

        tags[tag] = commit

    logger.debug("Found %s tags via ls-remote of %s", len(tags), repo)

    return tags
2 changes: 1 addition & 1 deletion src/macaron/repo_finder/repo_finder_java.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def find_repo(self, purl: PackageURL) -> str:
limit = defaults.getint("repofinder.java", "parent_limit", fallback=10)

if not version:
logger.debug("Version missing for maven artifact: %s:%s", group, artifact)
logger.info("Version missing for maven artifact: %s:%s", group, artifact)
# TODO add support for Java artifacts without a version
return ""

Expand Down
Loading

0 comments on commit 2d4582f

Please sign in to comment.