[WIP] Migrate rust importer #858

Closed · wants to merge 7 commits
209 changes: 25 additions & 184 deletions vulnerabilities/importer.py
@@ -12,7 +12,6 @@
 import logging
 import os
 import shutil
-import tempfile
 import traceback
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -23,9 +22,7 @@
 from typing import Set
 from typing import Tuple
 
-from binaryornot.helpers import is_binary_string
-from git import DiffIndex
-from git import Repo
+from fetchcode.vcs import fetch_via_vcs
 from license_expression import Licensing
 from packageurl import PackageURL
 from univers.version_range import VersionRange
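The new import swaps GitPython for fetchcode. A minimal sketch of how fetch_via_vcs is consumed, based only on the two members this diff relies on (dest_dir and delete()); the URL is just an example:

```python
# Sketch of the fetchcode usage pattern adopted by this PR; only dest_dir
# and delete() are used in the new code, so that is all this exercises.
from fetchcode.vcs import fetch_via_vcs

response = fetch_via_vcs("git+https://gitlab.com/gitlab-org/advisories-community/")
print(response.dest_dir)  # path of the temporary local checkout
response.delete()  # remove the checkout once parsing is done
```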
@@ -312,193 +309,37 @@ def advisory_data(self) -> Iterable[AdvisoryData]:
         raise NotImplementedError
 
 
-# TODO: Needs rewrite
-class GitImporter(Importer):
-    def validate_configuration(self) -> None:
+class ForkError(Exception):
+    pass
 
-        if not self.config.create_working_directory and self.config.working_directory is None:
-            self.error(
-                '"create_working_directory" is not set but "working_directory" is set to '
-                "the default, which calls tempfile.mkdtemp()"
-            )
-
-        if not self.config.create_working_directory and not os.path.exists(
-            self.config.working_directory
-        ):
-            self.error(
-                '"working_directory" does not contain an existing directory and'
-                '"create_working_directory" is not set'
-            )
-
-        if not self.config.remove_working_directory and self.config.working_directory is None:
-            self.error(
-                '"remove_working_directory" is not set and "working_directory" is set to '
-                "the default, which calls tempfile.mkdtemp()"
-            )
+
+class GitImporter(Importer):
+    def __init__(self, repo_url):
+        super().__init__()
+        self.repo_url = repo_url
+        self.vcs_response = None
 
     def __enter__(self):
-        self._ensure_working_directory()
-        self._ensure_repository()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if self.config.remove_working_directory:
-            shutil.rmtree(self.config.working_directory)
-
-    def file_changes(
-        self,
-        subdir: str = None,
-        recursive: bool = False,
-        file_ext: Optional[str] = None,
-    ) -> Tuple[Set[str], Set[str]]:
-        """
-        Returns all added and modified files since last_run_date or cutoff_date (whichever is more
-        recent).
-
-        :param subdir: filter by files in this directory
-        :param recursive: whether to include files in subdirectories
-        :param file_ext: filter files by this extension
-        :return: The first set contains (absolute paths to) added files, the second one modified
-        files
-        """
-        if subdir is None:
-            working_dir = self.config.working_directory
-        else:
-            working_dir = os.path.join(self.config.working_directory, subdir)
+        super().__enter__()
+        self.clone()
+        return self
 
-        path = Path(working_dir)
+    def __exit__(self):
+        self.vcs_response.delete()
 
-        if self.config.last_run_date is None and self.config.cutoff_date is None:
-            if recursive:
-                glob = "**/*"
-            else:
-                glob = "*"
-
-            if file_ext:
-                glob = f"{glob}.{file_ext}"
-
-            return {str(p) for p in path.glob(glob) if p.is_file()}, set()
-
-        return self._collect_file_changes(subdir=subdir, recursive=recursive, file_ext=file_ext)
-
-    def _collect_file_changes(
-        self,
-        subdir: Optional[str],
-        recursive: bool,
-        file_ext: Optional[str],
-    ) -> Tuple[Set[str], Set[str]]:
-
-        added_files, updated_files = set(), set()
-
-        # find the most ancient commit we need to diff with
-        cutoff_commit = None
-        for commit in self._repo.iter_commits(self._repo.head):
-            if commit.committed_date < self.cutoff_timestamp:
-                break
-            cutoff_commit = commit
-
-        if cutoff_commit is None:
-            return added_files, updated_files
-
-        def _is_binary(d: DiffIndex):
-            return is_binary_string(d.b_blob.data_stream.read(1024))
-
-        for d in cutoff_commit.diff(self._repo.head.commit):
-            if not _include_file(d.b_path, subdir, recursive, file_ext) or _is_binary(d):
-                continue
-
-            abspath = os.path.join(self.config.working_directory, d.b_path)
-            if d.new_file:
-                added_files.add(abspath)
-            elif d.a_blob and d.b_blob:
-                if d.a_path != d.b_path:
-                    # consider moved files as added
-                    added_files.add(abspath)
-                elif d.a_blob != d.b_blob:
-                    updated_files.add(abspath)
-
-        # Any file that has been added and then updated inside the window of the git history we
-        # looked at, should be considered "added", not "updated", since it does not exist in the
-        # database yet.
-        updated_files = updated_files - added_files
-
-        return added_files, updated_files
-
-    def _ensure_working_directory(self) -> None:
-        if self.config.working_directory is None:
-            self.config.working_directory = tempfile.mkdtemp()
-        elif self.config.create_working_directory and not os.path.exists(
-            self.config.working_directory
-        ):
-            os.mkdir(self.config.working_directory)
-
-    def _ensure_repository(self) -> None:
-        if not os.path.exists(os.path.join(self.config.working_directory, ".git")):
-            self._clone_repository()
-            return
-        self._repo = Repo(self.config.working_directory)
-
-        if self.config.branch is None:
-            self.config.branch = str(self._repo.active_branch)
-        branch = self.config.branch
-        self._repo.head.reference = self._repo.heads[branch]
-        self._repo.head.reset(index=True, working_tree=True)
-
-        remote = self._find_or_add_remote()
-        self._update_from_remote(remote, branch)
-
-    def _clone_repository(self) -> None:
-        kwargs = {}
-        if self.config.branch:
-            kwargs["branch"] = self.config.branch
-
-        self._repo = Repo.clone_from(
-            self.config.repository_url, self.config.working_directory, **kwargs
-        )
-
-    def _find_or_add_remote(self):
-        remote = None
-        for r in self._repo.remotes:
-            if r.url == self.config.repository_url:
-                remote = r
-                break
-
-        if remote is None:
-            remote = self._repo.create_remote(
-                "added_by_vulnerablecode", url=self.config.repository_url
-            )
-
-        return remote
-
-    def _update_from_remote(self, remote, branch) -> None:
-        fetch_info = remote.fetch()
-        if len(fetch_info) == 0:
-            return
-        branch = self._repo.branches[branch]
-        branch.set_reference(remote.refs[branch.name])
-        self._repo.head.reset(index=True, working_tree=True)
-
-
-def _include_file(
-    path: str,
-    subdir: Optional[str] = None,
-    recursive: bool = False,
-    file_ext: Optional[str] = None,
-) -> bool:
-    match = True
-
-    if subdir:
-        if not subdir.endswith(os.path.sep):
-            subdir = f"{subdir}{os.path.sep}"
-
-        match = match and path.startswith(subdir)
-
-    if not recursive:
-        match = match and (os.path.sep not in path[len(subdir or "") :])
-
-    if file_ext:
-        match = match and path.endswith(f".{file_ext}")
+    def clone(self):
+        try:
+            self.vcs_response = fetch_via_vcs(self.repo_url)
+        except Exception as e:
+            msg = f"Failed to fetch {self.repo_url} via vcs: {e}"
+            logger.error(msg)
+            raise ForkError(msg) from e
 
-    return match
+    def advisory_data(self) -> Iterable[AdvisoryData]:
+        """
+        Return AdvisoryData objects corresponding to the data being imported
+        """
+        raise NotImplementedError
 
 
 # TODO: Needs rewrite
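Taken together, the new GitImporter reduces to a clone-and-cleanup lifecycle. A hedged sketch of how a concrete importer could build on it, using only the API shown in this diff; the subclass name, repository URL, and parser below are hypothetical:

```python
from pathlib import Path
from typing import Iterable

from vulnerabilities.importer import AdvisoryData
from vulnerabilities.importer import GitImporter


class ExampleGitImporter(GitImporter):  # hypothetical subclass
    spdx_license_expression = "MIT"

    def __init__(self):
        # placeholder URL; real importers point at an advisory repository
        super().__init__(repo_url="git+https://example.org/advisory-db/")

    def advisory_data(self) -> Iterable[AdvisoryData]:
        try:
            self.clone()  # populates self.vcs_response via fetch_via_vcs()
            path = Path(self.vcs_response.dest_dir)
            for file in path.glob("**/*.yml"):
                yield parse_example_advisory(file)  # hypothetical parser
        finally:
            if self.vcs_response:
                self.vcs_response.delete()  # always drop the temp checkout
```

This mirrors the try/finally pattern the PR applies in GitLabGitImporter below.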
4 changes: 3 additions & 1 deletion vulnerabilities/importers/__init__.py
@@ -16,6 +16,7 @@
 from vulnerabilities.importers import openssl
 from vulnerabilities.importers import pysec
 from vulnerabilities.importers import redhat
+from vulnerabilities.importers import rust
 
 IMPORTERS_REGISTRY = [
     nginx.NginxImporter,
@@ -26,7 +27,8 @@
     redhat.RedhatImporter,
     pysec.PyPIImporter,
     debian.DebianImporter,
-    gitlab.GitLabAPIImporter,
+    gitlab.GitLabGitImporter,
+    rust.RustImporter,
 ]
 
 IMPORTERS_REGISTRY = {x.qualified_name: x for x in IMPORTERS_REGISTRY}
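The registry keys importers by their dotted qualified_name, so callers can resolve an importer class from a string. A small usage sketch; the exact key string is an assumption inferred from the dict comprehension above:

```python
# Hypothetical lookup of the newly registered Rust importer.
from vulnerabilities.importers import IMPORTERS_REGISTRY

name = "vulnerabilities.importers.rust.RustImporter"  # assumed key format
importer_class = IMPORTERS_REGISTRY[name]
for advisory in importer_class().advisory_data():
    print(advisory)
```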
70 changes: 43 additions & 27 deletions vulnerabilities/importers/gitlab.py
@@ -8,16 +8,15 @@
 #
 
 import logging
-import os
 import traceback
 from datetime import datetime
+from pathlib import Path
 from typing import Iterable
 from typing import List
 from typing import Mapping
 from typing import Optional
 
 import pytz
-import saneyaml
 from dateutil import parser as dateparser
 from django.db.models.query import QuerySet
 from fetchcode.vcs import fetch_via_vcs
@@ -29,7 +28,7 @@
 
 from vulnerabilities.importer import AdvisoryData
 from vulnerabilities.importer import AffectedPackage
-from vulnerabilities.importer import Importer
+from vulnerabilities.importer import GitImporter
 from vulnerabilities.importer import Reference
 from vulnerabilities.importer import UnMergeablePackageError
 from vulnerabilities.improver import Improver
@@ -42,6 +41,7 @@
 from vulnerabilities.utils import AffectedPackage as LegacyAffectedPackage
 from vulnerabilities.utils import build_description
 from vulnerabilities.utils import get_affected_packages_by_patched_package
+from vulnerabilities.utils import load_yaml
 from vulnerabilities.utils import nearest_patched_package
 from vulnerabilities.utils import resolve_version_range
 
@@ -71,31 +71,45 @@ def fork_and_get_dir(url):
     return fetch_via_vcs(url).dest_dir
 
 
-class ForkError(Exception):
-    pass
-
-
-class GitLabAPIImporter(Importer):
+class GitLabGitImporter(GitImporter):
     spdx_license_expression = "MIT"
     license_url = "https://gitlab.com/gitlab-org/advisories-community/-/blob/main/LICENSE"
-    gitlab_url = "git+https://gitlab.com/gitlab-org/advisories-community/"
+
+    def __init__(self):
+        super().__init__(repo_url="git+https://gitlab.com/gitlab-org/advisories-community/")
 
     def advisory_data(self) -> Iterable[AdvisoryData]:
         try:
-            fork_directory = fork_and_get_dir(url=self.gitlab_url)
-        except Exception as e:
-            logger.error(f"Can't clone url {self.gitlab_url}")
-            raise ForkError(self.gitlab_url) from e
-        for root_dir in os.listdir(fork_directory):
-            # skip well known files and directories that contain no advisory data
-            if root_dir in ("ci", "CODEOWNERS", "README.md", "LICENSE", ".git"):
-                continue
-            if root_dir not in PURL_TYPE_BY_GITLAB_SCHEME:
-                logger.error(f"Unknown package type: {root_dir}")
-                continue
-            for root, _, files in os.walk(os.path.join(fork_directory, root_dir)):
-                for file in files:
-                    yield parse_gitlab_advisory(file=os.path.join(root, file))
+            self.clone()
+            path = Path(self.vcs_response.dest_dir)
+
+            glob = "**/*.yml"
+            files = (p for p in path.glob(glob) if p.is_file())
+            for file in files:
+                purl_type = get_gitlab_package_type(path=file, root=path)
+                if not purl_type:
+                    logger.error(f"Unknown gitlab directory structure {file!r}")
+                    continue
+
+                if purl_type in PURL_TYPE_BY_GITLAB_SCHEME:
+                    yield parse_gitlab_advisory(file)
+
+                else:
+                    logger.error(f"Unknown package type {purl_type!r}")
+                    continue
+        finally:
+            if self.vcs_response:
+                self.vcs_response.delete()
+
+
+def get_gitlab_package_type(path: Path, root: Path):
+    """
+    Return a package type extracted from a gitlab advisory path.
+    """
+    relative = path.relative_to(root)
+    parts = relative.parts
+    gitlab_schema = parts[0]
+    return gitlab_schema
 
 
 def get_purl(package_slug):
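get_gitlab_package_type relies on the checkout layout in which the first path segment names the GitLab package scheme. A worked example under that assumption (the paths are hypothetical):

```python
from pathlib import Path

from vulnerabilities.importers.gitlab import get_gitlab_package_type

root = Path("/tmp/advisories-community")  # hypothetical checkout directory
file = root / "pypi" / "django" / "CVE-2021-12345.yml"  # hypothetical advisory
# the first segment of the path relative to the checkout root is the scheme
assert get_gitlab_package_type(path=file, root=root) == "pypi"
```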
@@ -168,10 +182,12 @@ def parse_gitlab_advisory(file):
         identifiers:
         - "GMS-2018-26"
     """
-    with open(file, "r") as f:
-        gitlab_advisory = saneyaml.load(f)
+    gitlab_advisory = load_yaml(file)
 
     if not isinstance(gitlab_advisory, dict):
-        logger.error(f"parse_yaml_file: yaml_file is not of type `dict`: {gitlab_advisory!r}")
+        logger.error(
+            f"parse_gitlab_advisory: unknown gitlab advisory format in {file!r} with data: {gitlab_advisory!r}"
+        )
         return
 
     # refer to schema here https://gitlab.com/gitlab-org/advisories-community/-/blob/main/ci/schema/schema.json
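load_yaml replaces the inline saneyaml call here. A minimal sketch of what such a helper typically does, as an assumption rather than the project's exact implementation:

```python
import saneyaml

def load_yaml(path):
    # read the file and parse it with saneyaml, mirroring the removed inline code
    with open(path) as f:
        return saneyaml.load(f.read())
```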
@@ -261,7 +277,7 @@ def __init__(self) -> None:
 
     @property
     def interesting_advisories(self) -> QuerySet:
-        return Advisory.objects.filter(created_by=GitLabAPIImporter.qualified_name)
+        return Advisory.objects.filter(created_by=GitLabGitImporter.qualified_name)
 
     def get_package_versions(
         self, package_url: PackageURL, until: Optional[datetime] = None