Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 24 additions & 10 deletions meta_creator/metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@
from .gitlab_metadata import get_gitlab_metadata
from .read_tokens import read_token_from_file
from .hermes_process import run_hermes_commands
import json
import os


from .spdx_utils import validate_license

#################### getting metadata from github/gitlab project ####################

@csrf_exempt
Expand Down Expand Up @@ -55,7 +53,15 @@ def data_extraction(request):
if not extracted_metadata:
extracted_metadata = get_gitlab_metadata(gl_url, default_access_token_gitlab)

result['metadata'] = init_curated_metadata(extracted_metadata)
output = validate_license(extracted_metadata)
if output.get('success') is False:
return {
'success': False,
'errors': 'No valid license found in metadata.'
}
else:
result['metadata'] = init_curated_metadata(extracted_metadata)


else:
# TODO we need to pass the token to hermes_process
Expand All @@ -66,12 +72,20 @@ def data_extraction(request):
# hermes_metadata = get_github_metadata(gl_url, default_access_token_GH)

if isinstance(hermes_metadata, dict):
result['metadata'] = init_curated_metadata(hermes_metadata.get('metadata'))
result['warnings'].extend(hermes_metadata.get('warnings', []))
result['errors'].extend(hermes_metadata.get('errors', []))
result['success'] = hermes_metadata.get('success', False)
extracted_metadata = hermes_metadata.get('metadata')
if extracted_metadata:
output = validate_license(extracted_metadata)
if output.get('success') is False:
return {
'success': False,
'errors': 'No valid license found in metadata.'
}
else:
result['metadata'] = init_curated_metadata(hermes_metadata.get('metadata'))
result['warnings'].extend(hermes_metadata.get('warnings', []))
result['errors'].extend(hermes_metadata.get('errors', []))
result['success'] = hermes_metadata.get('success', False)
else:
result['success'] = False
result['errors'].append("HERMES returned unexpected result format.")

return result
58 changes: 58 additions & 0 deletions meta_creator/spdx_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import requests
import re
from functools import lru_cache

SPDX_URL = 'https://raw.githubusercontent.com/spdx/license-list-data/master/json/licenses.json'

@lru_cache(maxsize=1)
def get_spdx_licenses():
    """Return the set of non-deprecated SPDX license identifiers.

    The list is downloaded from ``SPDX_URL`` with a 10 second timeout.
    ``raise_for_status`` turns an HTTP error status into an exception.
    Because of ``lru_cache``, a successful result is reused by later calls
    instead of being downloaded again.

    Returns:
        A set of ``licenseId`` strings taken from the ``licenses`` array of
        the downloaded JSON, skipping entries flagged
        ``isDeprecatedLicenseId``.
    """
    reply = requests.get(SPDX_URL, timeout=10)
    reply.raise_for_status()
    payload = reply.json()

    # Entries without the deprecation flag are treated as current.
    return {
        entry["licenseId"]
        for entry in payload.get("licenses", [])
        if not entry.get("isDeprecatedLicenseId", False)
    }

def extract_license_from_metadata(metadata):
    """Return the license identifier found in *metadata*, or ``None``.

    The ``license`` entry is inspected first and may take three shapes:

    * list -- the first string item containing an ``spdx.org/licenses/<id>``
      URL wins; if no item matches, the lookup falls through to
      ``spdx_license``.
    * dict -- ``spdx_id`` is returned, falling back to ``key``.
    * str  -- the id embedded in an SPDX URL if present, otherwise the
      stripped string itself.

    If none of those apply, a string ``spdx_license`` entry is returned
    stripped. A non-string ``spdx_license`` value is ignored rather than
    raising ``AttributeError``.

    Args:
        metadata: Extracted project metadata; anything that is not a dict
            yields ``None``.

    Returns:
        The license identifier, or ``None`` when nothing usable is found.
    """
    if not isinstance(metadata, dict):
        return None

    def extract_spdx_id_from_url(url):
        """Pull the license id out of an spdx.org license URL, if any."""
        match = re.search(r'spdx\.org/licenses/([A-Za-z0-9\.-]+)', url)
        if not match:
            return None
        spdx_id = match.group(1)
        # The character class accepts '.', so page URLs such as
        # ".../licenses/MIT.html" would otherwise produce "MIT.html",
        # which can never match an SPDX identifier.
        for suffix in ('.html', '.json'):
            if spdx_id.endswith(suffix):
                spdx_id = spdx_id[:-len(suffix)]
                break
        return spdx_id or None

    license_data = metadata.get('license')

    if isinstance(license_data, list):
        for item in license_data:
            if isinstance(item, str):
                spdx_id = extract_spdx_id_from_url(item)
                if spdx_id:
                    return spdx_id

    if isinstance(license_data, dict):
        return license_data.get('spdx_id') or license_data.get('key')

    if isinstance(license_data, str):
        return extract_spdx_id_from_url(license_data) or license_data.strip()

    # Only strings can be stripped; None or other types fall through to None
    # instead of crashing.
    spdx_license = metadata.get('spdx_license')
    if isinstance(spdx_license, str):
        return spdx_license.strip()

    return None

def validate_license(metadata):
    """Check whether *metadata* names a current SPDX license.

    The identifier comes from ``extract_license_from_metadata`` and is
    compared against the set returned by ``get_spdx_licenses``. An exact
    match is tried first. Otherwise the comparison is repeated ignoring
    case, so that a lowercase identifier such as ``mit`` (the extractor
    falls back to the ``key`` field of a license dict) is still accepted.

    Args:
        metadata: Extracted project metadata, passed to the extractor as-is.

    Returns:
        ``{'success': True}`` when the identifier is a known SPDX id,
        ``{'success': False}`` when it is missing, not a string, or unknown.
    """
    license_id = extract_license_from_metadata(metadata)

    # A missing id or a non-string value can never be a valid SPDX id;
    # bail out before fetching the license list.
    if not license_id or not isinstance(license_id, str):
        return {'success': False}

    spdx_licenses = get_spdx_licenses()
    if license_id in spdx_licenses:
        return {'success': True}

    # Fall back to a case-insensitive comparison.
    folded = license_id.casefold()
    matched = any(folded == known.casefold() for known in spdx_licenses)
    return {'success': matched}