From 791a1465c8ca70d2220e797a51b2ebf8cdd8b3cc Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Thu, 25 Sep 2025 11:56:58 +0100 Subject: [PATCH 1/6] WIP: feat(vcs): service layer * Created a new VCSService class to act as the 'glue' between the new generic provider API methods and the view handlers, with mostly higher-level methods that combine multiple API calls and reads/writes to the DB. * Also included the modified VCSRelease class in the same file. This has been kept to maintain good compatibility with InvenioRDM and anyone else who might have overriden it. However, it has been updated to support the new generic structure. There have also been some small changes such as the variable naming (`release_object` and `release_payload` to `db_release` and `generic_release` to make more clear where the data comes from). * The OAuth handlers, tasks, and webhook receivers are also updated as part of this PR, again with small changes to make them compatible with the generic structure. --- invenio_vcs/config.py | 68 ++++ invenio_vcs/oauth/handlers.py | 89 +++++ invenio_vcs/receivers.py | 118 +++++++ invenio_vcs/service.py | 620 ++++++++++++++++++++++++++++++++++ invenio_vcs/tasks.py | 185 ++++++++++ 5 files changed, 1080 insertions(+) create mode 100644 invenio_vcs/config.py create mode 100644 invenio_vcs/oauth/handlers.py create mode 100644 invenio_vcs/receivers.py create mode 100644 invenio_vcs/service.py create mode 100644 invenio_vcs/tasks.py diff --git a/invenio_vcs/config.py b/invenio_vcs/config.py new file mode 100644 index 00000000..ecbcb837 --- /dev/null +++ b/invenio_vcs/config.py @@ -0,0 +1,68 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2023 CERN. +# +# Invenio is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Invenio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio. If not, see . +# +# In applying this licence, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +"""Configuration for GitHub module.""" + +from typing import TYPE_CHECKING + +from flask import current_app + +if TYPE_CHECKING: + from invenio_vcs.providers import RepositoryServiceProviderFactory + +VCS_PROVIDERS = [] + +VCS_RELEASE_CLASS = "invenio_vcs.service:VCSRelease" +"""GitHubRelease class to be used for release handling.""" + +VCS_TEMPLATE_INDEX = "invenio_vcs/settings/index.html" +"""Repositories list template.""" + +VCS_TEMPLATE_VIEW = "invenio_vcs/settings/view.html" +"""Repository detail view template.""" + +VCS_ERROR_HANDLERS = None +"""Definition of the way specific exceptions are handled.""" + +VCS_MAX_CONTRIBUTORS_NUMBER = 30 +"""Max number of contributors of a release to be retrieved from Github.""" + +VCS_CITATION_FILE = None +"""Citation file name.""" + +VCS_CITATION_METADATA_SCHEMA = None +"""Citation metadata schema.""" + +VCS_ZIPBALL_TIMEOUT = 300 +"""Timeout for the zipball download, in seconds.""" + + +def get_provider_list(app=current_app) -> list["RepositoryServiceProviderFactory"]: + return app.config["VCS_PROVIDERS"] + + +def get_provider_by_id(id: str) -> "RepositoryServiceProviderFactory": + providers = get_provider_list() + for provider in providers: + if id == provider.id: + return provider + raise Exception(f"VCS provider with ID {id} not registered") diff --git a/invenio_vcs/oauth/handlers.py b/invenio_vcs/oauth/handlers.py new file mode 100644 index 00000000..3f5f505f --- /dev/null +++ b/invenio_vcs/oauth/handlers.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +# This file is part of Invenio. +# Copyright (C) 2025 CERN. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. + +"""Implement OAuth client handler.""" + +import typing + +from flask import current_app, redirect, url_for +from flask_login import current_user +from invenio_db import db +from invenio_oauth2server.models import Token as ProviderToken +from invenio_oauthclient import oauth_unlink_external_id + +from invenio_vcs.service import VCSService +from invenio_vcs.tasks import disconnect_provider + +if typing.TYPE_CHECKING: + from invenio_vcs.providers import RepositoryServiceProviderFactory + + +class OAuthHandlers: + def __init__(self, provider_factory: "RepositoryServiceProviderFactory") -> None: + self.provider_factory = provider_factory + + def account_setup_handler(self, remote, token, resp): + """Perform post initialization.""" + try: + svc = VCSService( + self.provider_factory.for_user(token.remote_account.user_id) + ) + svc.init_account() + svc.sync() + db.session.commit() + except Exception as e: + current_app.logger.warning(str(e), exc_info=True) + + def disconnect_handler(self, remote): + """Disconnect callback handler for GitHub.""" + # User must be authenticated + if not current_user.is_authenticated: + return current_app.login_manager.unauthorized() + + external_method = self.provider_factory.id + external_ids = [ + i.id + for i in current_user.external_identifiers + if i.method == external_method + ] + if external_ids: + oauth_unlink_external_id(dict(id=external_ids[0], method=external_method)) + + svc = VCSService(self.provider_factory.for_user(current_user.id)) + token = svc.provider.session_token + + if token: + extra_data = token.remote_account.extra_data + + # Delete the token that we issued for GitHub to deliver webhooks + webhook_token_id = extra_data.get("tokens", {}).get("webhook") + ProviderToken.query.filter_by(id=webhook_token_id).delete() + + # Disable every GitHub webhooks from our side + repos = svc.user_enabled_repositories.all() + repos_with_hooks = [] + for repo in repos: + if repo.hook: + repos_with_hooks.append((repo.provider_id, repo.hook)) + svc.disable_repository(repo.provider_id) + + # Commit any changes before running the ascynhronous task + db.session.commit() + + # Send Celery task for webhooks removal and token revocation + disconnect_provider.delay( + self.provider_factory.id, + current_user.id, + token.access_token, + repos_with_hooks, + ) + + # Delete the RemoteAccount (along with the associated RemoteToken) + token.remote_account.delete() + db.session.commit() + + return redirect(url_for("invenio_oauthclient_settings.index")) diff --git a/invenio_vcs/receivers.py b/invenio_vcs/receivers.py new file mode 100644 index 00000000..0220e387 --- /dev/null +++ b/invenio_vcs/receivers.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2023 CERN. +# +# Invenio is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Invenio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio. If not, see . +# +# In applying this licence, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +"""Task for managing GitHub integration.""" + +from invenio_db import db +from invenio_webhooks.models import Receiver + +from invenio_vcs.config import get_provider_by_id +from invenio_vcs.models import Release, ReleaseStatus, Repository +from invenio_vcs.tasks import process_release + +from .errors import ( + InvalidSenderError, + ReleaseAlreadyReceivedError, + RepositoryAccessError, + RepositoryDisabledError, + RepositoryNotFoundError, +) + + +class VCSReceiver(Receiver): + """Handle incoming notification from GitHub on a new release.""" + + def __init__(self, receiver_id): + super().__init__(receiver_id) + self.provider_factory = get_provider_by_id(receiver_id) + + def run(self, event): + """Process an event. + + .. note:: + + We should only do basic server side operation here, since we send + the rest of the processing to a Celery task which will be mainly + accessing the GitHub API. + """ + self._handle_event(event) + + def _handle_event(self, event): + """Handles an incoming github event.""" + is_create_release_event = self.provider_factory.webhook_is_create_release_event( + event.payload + ) + + if is_create_release_event: + self._handle_create_release(event) + + def _handle_create_release(self, event): + """Creates a release in invenio.""" + try: + generic_release, generic_repo = ( + self.provider_factory.webhook_event_to_generic(event.payload) + ) + + # Check if the release already exists + existing_release = Release.query.filter_by( + provider_id=generic_release.id, + ).first() + + if existing_release: + raise ReleaseAlreadyReceivedError(release=existing_release) + + # Create the Release + repo = Repository.get( + self.provider_factory.id, + provider_id=generic_repo.id, + full_name=generic_repo.full_name, + ) + if not repo: + raise RepositoryNotFoundError(generic_repo.full_name) + + if repo.enabled: + release = Release( + provider_id=generic_release.id, + provider=self.provider_factory.id, + tag=generic_release.tag_name, + repository=repo, + event=event, + status=ReleaseStatus.RECEIVED, + ) + db.session.add(release) + else: + raise RepositoryDisabledError(repo=repo) + + # Process the release + # Since 'process_release' is executed asynchronously, we commit the current state of session + db.session.commit() + process_release.delay(self.provider_factory.id, release.provider_id) + + except (ReleaseAlreadyReceivedError, RepositoryDisabledError) as e: + event.response_code = 409 + event.response = dict(message=str(e), status=409) + except (RepositoryAccessError, InvalidSenderError) as e: + event.response_code = 403 + event.response = dict(message=str(e), status=403) + except RepositoryNotFoundError as e: + event.response_code = 404 + event.response = dict(message=str(e), status=404) diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py new file mode 100644 index 00000000..3dc47444 --- /dev/null +++ b/invenio_vcs/service.py @@ -0,0 +1,620 @@ +from abc import abstractmethod +from contextlib import contextmanager +from dataclasses import asdict +from typing import TYPE_CHECKING + +from flask import current_app +from invenio_access.permissions import authenticated_user +from invenio_access.utils import get_identity +from invenio_accounts.models import User, UserIdentity +from invenio_db import db +from invenio_i18n import gettext as _ +from invenio_oauth2server.models import Token as ProviderToken +from invenio_oauthclient import oauth_link_external_id +from invenio_oauthclient.models import RemoteAccount +from sqlalchemy import delete, select +from sqlalchemy.exc import NoResultFound +from werkzeug.utils import cached_property + +from invenio_vcs.config import get_provider_by_id +from invenio_vcs.errors import ( + RemoteAccountDataNotSet, + RemoteAccountNotFound, + RepositoryAccessError, + RepositoryDisabledError, + RepositoryNotFoundError, + UserInfoNoneError, +) +from invenio_vcs.generic_models import GenericRelease, GenericRepository +from invenio_vcs.models import ( + Release, + ReleaseStatus, + Repository, + repository_user_association, +) +from invenio_vcs.proxies import current_vcs +from invenio_vcs.tasks import sync_hooks as sync_hooks_task +from invenio_vcs.utils import iso_utcnow + +if TYPE_CHECKING: + from invenio_vcs.providers import ( + RepositoryServiceProvider, + ) + + +class VCSService: + def __init__(self, provider: "RepositoryServiceProvider") -> None: + self.provider = provider + + @staticmethod + def for_provider_and_user(provider_id: str, user_id: int): + return VCSService(get_provider_by_id(provider_id).for_user(user_id)) + + @staticmethod + def for_provider_and_token(provider_id: str, user_id: int, access_token: str): + return VCSService( + get_provider_by_id(provider_id).for_access_token(user_id, access_token) + ) + + @cached_property + def is_authenticated(self): + return self.provider.session_token is not None + + @property + def user_available_repositories(self): + """Retrieve user repositories from user's remote data.""" + return Repository.query.join(repository_user_association).filter( + repository_user_association.c.user_id == self.provider.user_id, + Repository.provider == self.provider.factory.id, + ) + + @property + def user_enabled_repositories(self): + """Retrieve user repositories from the model.""" + return Repository.query.join(repository_user_association).filter( + repository_user_association.c.user_id == self.provider.user_id, + Repository.provider == self.provider.factory.id, + Repository.hook != None, + ) + + def list_repositories(self): + """Retrieves user repositories, containing db repositories plus remote repositories.""" + repos = {} + for db_repo in self.user_available_repositories: + repos[db_repo.provider_id] = asdict(GenericRepository.from_model(db_repo)) + release_instance = current_vcs.release_api_class( + db_repo.latest_release(), self.provider + ) + repos[db_repo.provider_id]["instance"] = db_repo + repos[db_repo.provider_id]["latest"] = release_instance + + return repos + + def get_repo_latest_release(self, repo): + """Retrieves the repository last release.""" + # Bail out fast if object (Repository) not in DB session. + if repo not in db.session: + return None + + q = repo.releases.filter_by(status=ReleaseStatus.PUBLISHED) + release_object = q.order_by(db.desc(Release.created)).first() + + return current_vcs.release_api_class(release_object, self.provider) + + def list_repo_releases(self, repo): + # Retrieve releases and sort them by creation date + release_instances = [] + for release_object in repo.releases.order_by(Release.created): + release_instances.append( + current_vcs.release_api_class(release_object, self.provider) + ) + return release_instances + + def get_repo_default_branch(self, repo_id): + db_repo = self.user_available_repositories.filter( + Repository.provider_id == repo_id + ).first() + + if db_repo is None: + return None + + return db_repo.default_branch + + def get_last_sync_time(self): + """Retrieves the last sync delta time from github's client extra data. + + Time is computed as the delta between now and the last sync time. + """ + extra_data = self.provider.remote_account.extra_data + if not extra_data.get("last_sync"): + raise RemoteAccountDataNotSet( + self.provider.user_id, + _("Last sync data is not set for user (remote data)."), + ) + + return extra_data["last_sync"] + + def get_repository(self, repo_id=None, repo_name=None): + """Retrieves one repository. + + Checks for access permission. + """ + repo = Repository.get( + self.provider.factory.id, provider_id=repo_id, full_name=repo_name + ) + if not repo: + raise RepositoryNotFoundError(repo_id) + + # Might raise a RepositoryAccessError + self.check_repo_access_permissions(repo) + + return repo + + def check_repo_access_permissions(self, repo: Repository): + """Checks permissions from user on repo. + + Repo has access if any of the following is True: + + - user is the owner of the repo + - user has access to the repo in GitHub (stored in RemoteAccount.extra_data.repos) + """ + if self.provider.user_id and repo: + user_is_collaborator = any( + user.id == self.provider.user_id for user in repo.users + ) + if user_is_collaborator: + return True + + if self.provider.remote_account and self.provider.remote_account.extra_data: + user_has_remote_access_count = self.user_available_repositories.filter( + Repository.provider_id == repo.provider_id + ).count() + if user_has_remote_access_count == 1: + return True + + raise RepositoryAccessError( + user=self.provider.user_id, repo=repo.full_name, repo_id=repo.provider_id + ) + + def sync_repo_users(self, db_repo: Repository): + """ + Synchronises the member users of the repository. + This retrieves a list of the IDs of users from the VCS who have sufficient access to the + repository (i.e. being able to access all details and create/manage webhooks). + The user IDs are compared locally to find Invenio users who have connected their VCS account. + This is then propagated to the database: Invenio users who have access to the repo are added to + the `repository_user_association` table, and ones who no longer have access are removed. + + :return: boolean of whether any changed were made to the DB + """ + + vcs_user_ids = self.provider.list_repository_user_ids(db_repo.provider_id) + if vcs_user_ids is None: + return + + vcs_user_identities: list[UserIdentity] = [] + # Find local users who have connected their VCS accounts with the IDs from the repo members + for extern_user_id in vcs_user_ids: + user_identity = UserIdentity.query.filter_by( + method=self.provider.factory.id, + id=extern_user_id, + ).first() + + if user_identity is None: + continue + + vcs_user_identities.append(user_identity) + + is_changed = False + + # Create user associations that exist in the VCS but not in the DB + for user_identity in vcs_user_identities: + if not any( + db_user.id == user_identity.id_user for db_user in db_repo.users + ): + db_repo.add_user(user_identity.id_user) + is_changed = True + + # Remove user associations that exist in the DB but not in the VCS + for db_user in db_repo.users: + if not any( + user_identity.id_user == db_user.id + for user_identity in vcs_user_identities + ): + db_repo.remove_user(db_user.id) + is_changed = True + + return is_changed + + def sync(self, hooks=True, async_hooks=True): + """Synchronize user repositories. + + :param bool hooks: True for syncing hooks. + :param bool async_hooks: True for sending of an asynchronous task to + sync hooks. + + .. note:: + + Syncing happens from GitHub's direction only. This means that we + consider the information on GitHub as valid, and we overwrite our + own state based on this information. + """ + vcs_repos = self.provider.list_repositories() + if vcs_repos is None: + vcs_repos = {} + + if hooks: + self._sync_hooks(vcs_repos.keys(), asynchronous=async_hooks) + + # Update changed names for repositories stored in DB + db_repos = ( + Repository.query.join(repository_user_association) + .filter( + repository_user_association.c.user_id == self.provider.user_id, + Repository.provider == self.provider.factory.id, + ) + .all() + ) + + for db_repo in db_repos: + vcs_repo = vcs_repos.get(db_repo.provider_id) + if not vcs_repo: + continue + + changed_users = self.sync_repo_users(db_repo) + changed_model = vcs_repo.to_model(db_repo) + if changed_users or changed_model: + db.session.add(db_repo) + + # Remove ownership from repositories that the user has no longer + # 'admin' permissions, or have been deleted. + delete_stmt = delete(repository_user_association).where( + repository_user_association.c.user_id == self.provider.user_id, + Repository.provider == self.provider.factory.id, + ~Repository.provider_id.in_(vcs_repos.keys()), + repository_user_association.c.repository_id == Repository.id, + ) + db.session.execute(delete_stmt) + + # Add new repos from VCS to the DB (without the hook activated) + for _, vcs_repo in vcs_repos.items(): + # We cannot just check the repo from the existing `db_repos` list as this only includes the repos to which the user + # already has access. E.g. a repo from the VCS might already exist in our DB but the user doesn't yet have access to it. + corresponding_db_repo = Repository.query.filter( + Repository.provider_id == vcs_repo.id, + Repository.provider == self.provider.factory.id, + ).first() + + if corresponding_db_repo is None: + # We do not yet have this repo registered for any user at all in our DB, so we need to create it. + corresponding_db_repo = Repository.create( + provider=self.provider.factory.id, + provider_id=vcs_repo.id, + html_url=vcs_repo.html_url, + default_branch=vcs_repo.default_branch, + full_name=vcs_repo.full_name, + description=vcs_repo.description, + license_spdx=vcs_repo.license_spdx, + ) + + # In any case (even if we already have the repo) we need to sync its member users + # E.g. maybe the repo is in our DB but the user for which this sync has been trigerred isn't registered as a member + self.sync_repo_users(corresponding_db_repo) + + # Update last sync + self.provider.remote_account.extra_data.update( + dict( + last_sync=iso_utcnow(), + ) + ) + self.provider.remote_account.extra_data.changed() + db.session.add(self.provider.remote_account) + + def _sync_hooks(self, repo_ids, asynchronous=True): + """Check if a hooks sync task needs to be started.""" + if not asynchronous: + for repo_id in repo_ids: + try: + self.sync_repo_hook(repo_id) + except RepositoryAccessError: + current_app.logger.warning( + str(RepositoryAccessError), exc_info=True + ) + except NoResultFound: + pass # Repository not in DB yet + else: + # If hooks will run asynchronously, we need to commit any changes done so far + db.session.commit() + sync_hooks_task.delay( + self.provider.factory.id, self.provider.user_id, list(repo_ids) + ) + + def sync_repo_hook(self, repo_id): + """Sync a GitHub repo's hook with the locally stored repo.""" + # Get the hook that we may have set in the past + hook = self.provider.get_first_valid_webhook(repo_id) + vcs_repo = self.provider.get_repository(repo_id) + assert vcs_repo is not None + + # If hook on GitHub exists, get or create corresponding db object and + # enable the hook. Otherwise remove the old hook information. + db_repo = Repository.get(self.provider.factory.id, provider_id=repo_id) + + if hook: + if not db_repo: + db_repo = Repository.create( + provider=self.provider.factory.id, + provider_id=repo_id, + html_url=vcs_repo.html_url, + default_branch=vcs_repo.default_branch, + full_name=vcs_repo.full_name, + description=vcs_repo.description, + license_spdx=vcs_repo.license_spdx, + ) + self.sync_repo_users(db_repo) + if not db_repo.enabled: + self.mark_repo_enabled(db_repo, hook.id) + else: + if db_repo: + self.mark_repo_disabled(db_repo) + + def mark_repo_disabled(self, db_repo: Repository): + """Disables an user repository.""" + db_repo.hook = None + db_repo.enabled_by_user_id = None + + def mark_repo_enabled(self, db_repo: Repository, hook_id: str): + """Enables an user repository.""" + db_repo.hook = hook_id + db_repo.enabled_by_user_id = self.provider.user_id + + def init_account(self): + """Setup a new GitHub account.""" + if not self.provider.remote_account: + raise RemoteAccountNotFound( + self.provider.user_id, _("Remote account was not found for user.") + ) + + user = self.provider.get_own_user() + if user is None: + raise UserInfoNoneError + + # Setup local access tokens to be used by the webhooks + hook_token = ProviderToken.create_personal( + f"{self.provider.factory.id}-webhook", + self.provider.user_id, + scopes=["webhooks:event"], + is_internal=True, + ) + # Initial structure of extra data + self.provider.remote_account.extra_data = dict( + id=user.id, + login=user.username, + name=user.display_name, + tokens=dict( + webhook=hook_token.id, + ), + last_sync=iso_utcnow(), + ) + + oauth_link_external_id( + User(id=self.provider.user_id), + dict(id=user.id, method=self.provider.factory.id), + ) + + db.session.add(self.provider.remote_account) + + def enable_repository(self, repository_id): + db_repo = self.user_available_repositories.filter( + Repository.provider_id == repository_id + ).first() + if db_repo is None: + raise RepositoryNotFoundError( + repository_id, _("Failed to enable repository.") + ) + + hook_id = self.provider.create_webhook(repository_id) + if hook_id is None: + return False + + self.mark_repo_enabled(db_repo, hook_id) + return True + + def disable_repository(self, repository_id, hook_id=None): + db_repo = self.user_available_repositories.filter( + Repository.provider_id == repository_id + ).first() + + if db_repo is None: + raise RepositoryNotFoundError( + repository_id, _("Failed to disable repository.") + ) + + if not db_repo.enabled: + raise RepositoryDisabledError(repository_id) + + if not self.provider.delete_webhook(repository_id, hook_id): + return False + + self.mark_repo_disabled(db_repo) + return True + + +class VCSRelease: + """A GitHub release.""" + + def __init__(self, release: Release, provider: "RepositoryServiceProvider"): + """Constructor.""" + self.db_release = release + self.provider = provider + self._resolved_zipball_url = None + + @cached_property + def record(self): + """Release record.""" + return self.resolve_record() + + @cached_property + def event(self): + """Get release event.""" + return self.db_release.event + + @cached_property + def payload(self): + """Return event payload.""" + return self.event.payload + + @cached_property + def _generic_release_and_repo(self): + return self.provider.factory.webhook_event_to_generic(self.payload) + + @cached_property + def generic_release(self) -> "GenericRelease": + """Return release metadata.""" + return self._generic_release_and_repo[0] + + @cached_property + def generic_repo(self) -> "GenericRepository": + """Return repo metadata.""" + return self._generic_release_and_repo[1] + + @cached_property + def db_repo(self) -> Repository: + """Return repository model from database.""" + if self.db_release.repository_id: + repository = self.db_release.repository + else: + repository = Repository.query.filter_by( + user_id=self.event.user_id, provider_id=self.provider.factory.id + ).one() + return repository + + @cached_property + def release_file_name(self): + """Returns release zipball file name.""" + tag_name = self.generic_release.tag_name + repo_name = self.generic_repo.full_name + filename = f"{repo_name}-{tag_name}.zip" + return filename + + @cached_property + def release_zipball_url(self): + """Returns the release zipball URL.""" + return self.generic_release.zipball_url + + @cached_property + def user_identity(self): + """Generates release owner's user identity.""" + identity = get_identity(self.db_repo.enabled_by_user) + identity.provides.add(authenticated_user) + identity.user = self.db_repo.enabled_by_user + return identity + + @cached_property + def contributors(self): + """Get list of contributors to a repository. + + The list of contributors is fetched from Github API, filtered for type "User" and sorted by contributions. + + :returns: a generator of objects that contains contributors information. + :raises UnexpectedGithubResponse: when Github API returns a status code other than 200. + """ + max_contributors = current_app.config.get("VCS_MAX_CONTRIBUTORS_NUMBER", 30) + return self.provider.list_repository_contributors( + self.db_repo.provider_id, max=max_contributors + ) + + @cached_property + def owner(self): + """Get owner of repository as a creator.""" + try: + return self.provider.get_repository_owner(self.db_repo.provider_id) + except Exception: + return None + + # Helper functions + + def is_first_release(self): + """Checks whether the current release is the first release of the repository.""" + latest_release = self.db_repo.latest_release(ReleaseStatus.PUBLISHED) + return True if not latest_release else False + + def test_zipball(self): + """Test if the zipball URL is accessible and return the resolved URL.""" + return self.resolve_zipball_url() + + def resolve_zipball_url(self, cache=True): + """Resolve the zipball URL. + + This method will try to resolve the zipball URL by making a HEAD request, + handling the following edge cases: + + - In the case of a 300 Multiple Choices response, which can happen when a tag + and branch have the same name, it will try to fetch an "alternate" link. + - If the access token does not have the required scopes/permissions to access + public links, it will fallback to a non-authenticated request. + """ + if self._resolved_zipball_url and cache: + return self._resolved_zipball_url + + url = self.release_zipball_url + url = self.provider.resolve_release_zipball_url(url) + + if cache: + self._resolved_zipball_url = url + + return url + + # High level API + + def release_failed(self): + """Set release status to FAILED.""" + self.db_release.status = ReleaseStatus.FAILED + + def release_processing(self): + """Set release status to PROCESSING.""" + self.db_release.status = ReleaseStatus.PROCESSING + + def release_published(self): + """Set release status to PUBLISHED.""" + self.db_release.status = ReleaseStatus.PUBLISHED + + @contextmanager + def fetch_zipball_file(self): + """Fetch release zipball file using the current github session.""" + timeout = current_app.config.get("VCS_ZIPBALL_TIMEOUT", 300) + zipball_url = self.resolve_zipball_url() + return self.provider.fetch_release_zipball(zipball_url, timeout) + + def publish(self): + """Publish a GitHub release.""" + raise NotImplementedError + + def process_release(self): + """Processes a github release.""" + raise NotImplementedError + + def resolve_record(self): + """Resolves a record from the release. To be implemented by the API class implementation.""" + raise NotImplementedError + + def serialize_record(self): + """Serializes the release record.""" + raise NotImplementedError + + @property + @abstractmethod + def badge_title(self): + """Stores a string to render in the record badge title (e.g. 'DOI').""" + return None + + @property + @abstractmethod + def badge_value(self): + """Stores a string to render in the record badge value (e.g. '10.1234/invenio.1234').""" + raise NotImplementedError + + @property + def record_url(self): + """Release self url (e.g. github HTML url).""" + raise NotImplementedError diff --git a/invenio_vcs/tasks.py b/invenio_vcs/tasks.py new file mode 100644 index 00000000..469df9b8 --- /dev/null +++ b/invenio_vcs/tasks.py @@ -0,0 +1,185 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Invenio. +# Copyright (C) 2023 CERN. +# Copyright (C) 2024 KTH Royal Institute of Technology. +# +# Invenio is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Invenio is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Invenio. If not, see . +# +# In applying this licence, CERN does not waive the privileges and immunities +# granted to it by virtue of its status as an Intergovernmental Organization +# or submit itself to any jurisdiction. + +"""Task for managing GitHub integration.""" + +import datetime +from typing import TYPE_CHECKING + +from celery import shared_task +from flask import current_app, g +from invenio_db import db +from invenio_i18n import gettext as _ +from invenio_oauthclient.models import RemoteAccount +from invenio_oauthclient.proxies import current_oauthclient + +from invenio_vcs.config import get_provider_by_id +from invenio_vcs.errors import CustomGitHubMetadataError, RepositoryAccessError +from invenio_vcs.models import Release, ReleaseStatus +from invenio_vcs.proxies import current_vcs + +if TYPE_CHECKING: + from invenio_vcs.service import VCSRelease + + +def _get_err_obj(msg): + """Generate the error entry with a Sentry ID.""" + err = {"errors": msg} + if hasattr(g, "sentry_event_id"): + err["error_id"] = str(g.sentry_event_id) + return err + + +def release_gh_metadata_handler(release: "VCSRelease", ex): + """Handler for CustomGithubMetadataError.""" + release.db_release.errors = _get_err_obj(str(ex)) + db.session.commit() + + +def release_default_exception_handler(release: "VCSRelease", ex): + """Default handler.""" + release.db_release.errors = _get_err_obj(str(ex)) + db.session.commit() + + +DEFAULT_ERROR_HANDLERS = [ + (CustomGitHubMetadataError, release_gh_metadata_handler), + (Exception, release_default_exception_handler), +] + + +@shared_task(max_retries=6, default_retry_delay=10 * 60, rate_limit="100/m") +def disconnect_provider(provider_id, user_id, access_token, repo_hooks): + """Uninstall webhooks.""" + # Note at this point the remote account and all associated data have + # already been deleted. The celery task is passed the access_token to make + # some last cleanup and afterwards delete itself remotely. + + # Local import to avoid circular imports + from .service import VCSService + + try: + # Create a nested transaction to make sure that hook deletion + token revoke is atomic + with db.session.begin_nested(): + svc = VCSService.for_provider_and_token(provider_id, user_id, access_token) + + for repo_id, repo_hook in repo_hooks: + if svc.disable_repository(repo_id, repo_hook): + current_app.logger.info( + _("Deleted hook from github repository."), + extra={"hook": repo_hook, "repo": repo_id}, + ) + + # If we finished our clean-up successfully, we can revoke the token + svc.provider.revoke_token(access_token) + except Exception as exc: + # Retry in case GitHub may be down... + disconnect_provider.retry(exc=exc) + + +@shared_task(max_retries=6, default_retry_delay=10 * 60, rate_limit="100/m") +def sync_hooks(provider, user_id, repositories): + """Sync repository hooks for a user.""" + # Local import to avoid circular imports + from .service import VCSService + + try: + # Sync hooks + svc = VCSService.for_provider_and_user(provider, user_id) + for repo_id in repositories: + try: + with db.session.begin_nested(): + svc.sync_repo_hook(repo_id) + # We commit per repository, because while the task is running + db.session.commit() + except RepositoryAccessError as e: + current_app.logger.warning(str(e), exc_info=True) + pass # Repository not in DB yet + except Exception as exc: + current_app.logger.warning(str(exc), exc_info=True) + sync_hooks.retry(exc=exc) + + +@shared_task(ignore_result=True, max_retries=5, default_retry_delay=10 * 60) +def process_release(provider, release_id): + """Process a received Release.""" + release_model = Release.query.filter( + Release.provider_id == release_id, + Release.status.in_([ReleaseStatus.RECEIVED, ReleaseStatus.FAILED]), + ).one() + + provider = get_provider_by_id(provider).for_user( + release_model.repository.enabled_by_user_id + ) + release = current_vcs.release_api_class(release_model, provider) + + matched_error_cls = None + matched_ex = None + + try: + release.process_release() + db.session.commit() + except Exception as ex: + error_handlers = current_vcs.release_error_handlers + matched_ex = None + for error_cls, handler in error_handlers + DEFAULT_ERROR_HANDLERS: + if isinstance(ex, error_cls): + handler(release, ex) + matched_error_cls = error_cls + matched_ex = ex + break + + if matched_error_cls is Exception: + process_release.retry(ex=matched_ex) + + +@shared_task(ignore_result=True) +def refresh_accounts(provider, expiration_threshold=None): + """Refresh stale accounts, avoiding token expiration. + + :param expiration_threshold: Dictionary containing timedelta parameters + referring to the maximum inactivity time. + """ + expiration_date = datetime.datetime.now( + tz=datetime.timezone.utc + ) - datetime.timedelta(**(expiration_threshold or {"days": 6 * 30})) + + remote = current_oauthclient.oauth.remote_apps[provider] + remote_accounts_to_be_updated = RemoteAccount.query.filter( + RemoteAccount.updated < expiration_date, + RemoteAccount.client_id == remote.consumer_key, + ) + for remote_account in remote_accounts_to_be_updated: + sync_account.delay(provider, remote_account.user_id) + + +@shared_task(ignore_result=True) +def sync_account(provider, user_id): + """Sync a user account.""" + # Local import to avoid circular imports + from .service import VCSService + + # Start a nested transaction so every data writing inside sync is executed atomically + with db.session.begin_nested(): + svc = VCSService.for_provider_and_user(provider, user_id) + svc.sync(hooks=False, async_hooks=False) From 273eda41fdcfe89136c4aad49b10764fbbb3d55e Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 8 Oct 2025 16:53:12 +0200 Subject: [PATCH 2/6] chore: formatting/pydoc fixes --- invenio_vcs/config.py | 29 +++++---------- invenio_vcs/oauth/handlers.py | 9 +++-- invenio_vcs/receivers.py | 30 +++++----------- invenio_vcs/service.py | 67 ++++++++++++++++++++++++----------- invenio_vcs/tasks.py | 33 +++++------------ 5 files changed, 78 insertions(+), 90 deletions(-) diff --git a/invenio_vcs/config.py b/invenio_vcs/config.py index ecbcb837..2ed3cef0 100644 --- a/invenio_vcs/config.py +++ b/invenio_vcs/config.py @@ -1,26 +1,11 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023 CERN. -# -# Invenio is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio. If not, see . +# Copyright (C) 2025 CERN. # -# In applying this licence, CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization -# or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. -"""Configuration for GitHub module.""" +"""Configuration for the VCS module.""" from typing import TYPE_CHECKING @@ -32,7 +17,7 @@ VCS_PROVIDERS = [] VCS_RELEASE_CLASS = "invenio_vcs.service:VCSRelease" -"""GitHubRelease class to be used for release handling.""" +"""VCSRelease class to be used for release handling.""" VCS_TEMPLATE_INDEX = "invenio_vcs/settings/index.html" """Repositories list template.""" @@ -44,7 +29,7 @@ """Definition of the way specific exceptions are handled.""" VCS_MAX_CONTRIBUTORS_NUMBER = 30 -"""Max number of contributors of a release to be retrieved from Github.""" +"""Max number of contributors of a release to be retrieved from vcs.""" VCS_CITATION_FILE = None """Citation file name.""" @@ -57,10 +42,12 @@ def get_provider_list(app=current_app) -> list["RepositoryServiceProviderFactory"]: + """Get a list of configured VCS provider factories.""" return app.config["VCS_PROVIDERS"] def get_provider_by_id(id: str) -> "RepositoryServiceProviderFactory": + """Get a specific VCS provider by its registered ID.""" providers = get_provider_list() for provider in providers: if id == provider.id: diff --git a/invenio_vcs/oauth/handlers.py b/invenio_vcs/oauth/handlers.py index 3f5f505f..82814ad2 100644 --- a/invenio_vcs/oauth/handlers.py +++ b/invenio_vcs/oauth/handlers.py @@ -23,7 +23,10 @@ class OAuthHandlers: + """Provider-agnostic handler overrides to ensure VCS events are executed at certain points throughout the OAuth lifecyle.""" + def __init__(self, provider_factory: "RepositoryServiceProviderFactory") -> None: + """Instance are non-user-specific.""" self.provider_factory = provider_factory def account_setup_handler(self, remote, token, resp): @@ -39,7 +42,7 @@ def account_setup_handler(self, remote, token, resp): current_app.logger.warning(str(e), exc_info=True) def disconnect_handler(self, remote): - """Disconnect callback handler for GitHub.""" + """Disconnect callback handler for the provider.""" # User must be authenticated if not current_user.is_authenticated: return current_app.login_manager.unauthorized() @@ -59,11 +62,11 @@ def disconnect_handler(self, remote): if token: extra_data = token.remote_account.extra_data - # Delete the token that we issued for GitHub to deliver webhooks + # Delete the token that we issued for vcs to deliver webhooks webhook_token_id = extra_data.get("tokens", {}).get("webhook") ProviderToken.query.filter_by(id=webhook_token_id).delete() - # Disable every GitHub webhooks from our side + # Disable every vcs webhooks from our side repos = svc.user_enabled_repositories.all() repos_with_hooks = [] for repo in repos: diff --git a/invenio_vcs/receivers.py b/invenio_vcs/receivers.py index 0220e387..6e6093d4 100644 --- a/invenio_vcs/receivers.py +++ b/invenio_vcs/receivers.py @@ -1,26 +1,11 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023 CERN. -# -# Invenio is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio. If not, see . +# Copyright (C) 2025 CERN. # -# In applying this licence, CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization -# or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. -"""Task for managing GitHub integration.""" +"""Task for managing vcs integration.""" from invenio_db import db from invenio_webhooks.models import Receiver @@ -39,9 +24,10 @@ class VCSReceiver(Receiver): - """Handle incoming notification from GitHub on a new release.""" + """Handle incoming notification from vcs on a new release.""" def __init__(self, receiver_id): + """Constructor.""" super().__init__(receiver_id) self.provider_factory = get_provider_by_id(receiver_id) @@ -52,12 +38,12 @@ def run(self, event): We should only do basic server side operation here, since we send the rest of the processing to a Celery task which will be mainly - accessing the GitHub API. + accessing the vcs API. """ self._handle_event(event) def _handle_event(self, event): - """Handles an incoming github event.""" + """Handles an incoming vcs event.""" is_create_release_event = self.provider_factory.webhook_is_create_release_event( event.payload ) diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py index 3dc47444..60e63c2a 100644 --- a/invenio_vcs/service.py +++ b/invenio_vcs/service.py @@ -1,3 +1,11 @@ +# -*- coding: utf-8 -*- +# This file is part of Invenio. +# Copyright (C) 2025 CERN. +# +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. +"""Higher-level operations for the view handlers and upstream code to use.""" + from abc import abstractmethod from contextlib import contextmanager from dataclasses import asdict @@ -11,8 +19,7 @@ from invenio_i18n import gettext as _ from invenio_oauth2server.models import Token as ProviderToken from invenio_oauthclient import oauth_link_external_id -from invenio_oauthclient.models import RemoteAccount -from sqlalchemy import delete, select +from sqlalchemy import delete from sqlalchemy.exc import NoResultFound from werkzeug.utils import cached_property @@ -43,21 +50,31 @@ class VCSService: + """ + High level glue operations that operate on both the VCS and the DB. + + Because provider instances are user-specific, this class is too. + """ + def __init__(self, provider: "RepositoryServiceProvider") -> None: + """Please construct the service using the `for_provider_and_user` method instead.""" self.provider = provider @staticmethod def for_provider_and_user(provider_id: str, user_id: int): + """Construct VCSService for a locally configured provider and a user with a DB-queried access token.""" return VCSService(get_provider_by_id(provider_id).for_user(user_id)) @staticmethod def for_provider_and_token(provider_id: str, user_id: int, access_token: str): + """Construct VCSService for a locally configured provider and a user with a predefined access token.""" return VCSService( get_provider_by_id(provider_id).for_access_token(user_id, access_token) ) @cached_property def is_authenticated(self): + """Whether we have a valid VCS API token for the user. Should (almost) always return True.""" return self.provider.session_token is not None @property @@ -102,7 +119,7 @@ def get_repo_latest_release(self, repo): return current_vcs.release_api_class(release_object, self.provider) def list_repo_releases(self, repo): - # Retrieve releases and sort them by creation date + """Retrieve releases and sort them by creation date.""" release_instances = [] for release_object in repo.releases.order_by(Release.created): release_instances.append( @@ -111,6 +128,7 @@ def list_repo_releases(self, repo): return release_instances def get_repo_default_branch(self, repo_id): + """Return the locally-synced default branch.""" db_repo = self.user_available_repositories.filter( Repository.provider_id == repo_id ).first() @@ -121,7 +139,7 @@ def get_repo_default_branch(self, repo_id): return db_repo.default_branch def get_last_sync_time(self): - """Retrieves the last sync delta time from github's client extra data. + """Retrieves the last sync delta time from VCS's client extra data. Time is computed as the delta between now and the last sync time. """ @@ -156,7 +174,7 @@ def check_repo_access_permissions(self, repo: Repository): Repo has access if any of the following is True: - user is the owner of the repo - - user has access to the repo in GitHub (stored in RemoteAccount.extra_data.repos) + - user has access to the repo in the VCS """ if self.provider.user_id and repo: user_is_collaborator = any( @@ -179,6 +197,7 @@ def check_repo_access_permissions(self, repo: Repository): def sync_repo_users(self, db_repo: Repository): """ Synchronises the member users of the repository. + This retrieves a list of the IDs of users from the VCS who have sufficient access to the repository (i.e. being able to access all details and create/manage webhooks). The user IDs are compared locally to find Invenio users who have connected their VCS account. @@ -187,7 +206,6 @@ def sync_repo_users(self, db_repo: Repository): :return: boolean of whether any changed were made to the DB """ - vcs_user_ids = self.provider.list_repository_user_ids(db_repo.provider_id) if vcs_user_ids is None: return @@ -235,8 +253,8 @@ def sync(self, hooks=True, async_hooks=True): .. note:: - Syncing happens from GitHub's direction only. This means that we - consider the information on GitHub as valid, and we overwrite our + Syncing happens from the VCS' direction only. This means that we + consider the information on VCS as valid, and we overwrite our own state based on this information. """ vcs_repos = self.provider.list_repositories() @@ -330,13 +348,13 @@ def _sync_hooks(self, repo_ids, asynchronous=True): ) def sync_repo_hook(self, repo_id): - """Sync a GitHub repo's hook with the locally stored repo.""" + """Sync a VCS repo's hook with the locally stored repo.""" # Get the hook that we may have set in the past hook = self.provider.get_first_valid_webhook(repo_id) vcs_repo = self.provider.get_repository(repo_id) assert vcs_repo is not None - # If hook on GitHub exists, get or create corresponding db object and + # If hook on the VCS exists, get or create corresponding db object and # enable the hook. Otherwise remove the old hook information. db_repo = Repository.get(self.provider.factory.id, provider_id=repo_id) @@ -359,17 +377,17 @@ def sync_repo_hook(self, repo_id): self.mark_repo_disabled(db_repo) def mark_repo_disabled(self, db_repo: Repository): - """Disables an user repository.""" + """Marks a repository as disabled.""" db_repo.hook = None db_repo.enabled_by_user_id = None def mark_repo_enabled(self, db_repo: Repository, hook_id: str): - """Enables an user repository.""" + """Marks a repository as enabled.""" db_repo.hook = hook_id db_repo.enabled_by_user_id = self.provider.user_id def init_account(self): - """Setup a new GitHub account.""" + """Setup a new VCS account.""" if not self.provider.remote_account: raise RemoteAccountNotFound( self.provider.user_id, _("Remote account was not found for user.") @@ -405,6 +423,7 @@ def init_account(self): db.session.add(self.provider.remote_account) def enable_repository(self, repository_id): + """Creates the hook for a repository and marks it as enabled.""" db_repo = self.user_available_repositories.filter( Repository.provider_id == repository_id ).first() @@ -421,6 +440,7 @@ def enable_repository(self, repository_id): return True def disable_repository(self, repository_id, hook_id=None): + """Deletes the hook for a repository and marks it as disabled.""" db_repo = self.user_available_repositories.filter( Repository.provider_id == repository_id ).first() @@ -441,7 +461,14 @@ def disable_repository(self, repository_id, hook_id=None): class VCSRelease: - """A GitHub release.""" + """ + Represents a release and common high-level operations that can be performed on it. + + This class is often overriden upstream (e.g. in `invenio-rdm-records`) to specify + what a 'publish' event should do on a given Invenio implementation. + This module does not attempt to publish a record or anything similar, as `invenio-vcs` + is designed to work on any Invenio instance (not just RDM). + """ def __init__(self, release: Release, provider: "RepositoryServiceProvider"): """Constructor.""" @@ -466,6 +493,7 @@ def payload(self): @cached_property def _generic_release_and_repo(self): + """Converts the VCS-specific payload into a tuple of (GenericRelease, GenericRepository).""" return self.provider.factory.webhook_event_to_generic(self.payload) @cached_property @@ -514,10 +542,9 @@ def user_identity(self): def contributors(self): """Get list of contributors to a repository. - The list of contributors is fetched from Github API, filtered for type "User" and sorted by contributions. + The list of contributors is fetched from the VCS, filtered for type "User" and sorted by contributions. :returns: a generator of objects that contains contributors information. - :raises UnexpectedGithubResponse: when Github API returns a status code other than 200. """ max_contributors = current_app.config.get("VCS_MAX_CONTRIBUTORS_NUMBER", 30) return self.provider.list_repository_contributors( @@ -581,17 +608,17 @@ def release_published(self): @contextmanager def fetch_zipball_file(self): - """Fetch release zipball file using the current github session.""" + """Fetch release zipball file using the current VCS session.""" timeout = current_app.config.get("VCS_ZIPBALL_TIMEOUT", 300) zipball_url = self.resolve_zipball_url() return self.provider.fetch_release_zipball(zipball_url, timeout) def publish(self): - """Publish a GitHub release.""" + """Publish a VCS release.""" raise NotImplementedError def process_release(self): - """Processes a github release.""" + """Processes a VCS release.""" raise NotImplementedError def resolve_record(self): @@ -616,5 +643,5 @@ def badge_value(self): @property def record_url(self): - """Release self url (e.g. github HTML url).""" + """Release self url (e.g. VCS HTML url).""" raise NotImplementedError diff --git a/invenio_vcs/tasks.py b/invenio_vcs/tasks.py index 469df9b8..01d5416b 100644 --- a/invenio_vcs/tasks.py +++ b/invenio_vcs/tasks.py @@ -1,27 +1,12 @@ # -*- coding: utf-8 -*- -# # This file is part of Invenio. -# Copyright (C) 2023 CERN. +# Copyright (C) 2025 CERN. # Copyright (C) 2024 KTH Royal Institute of Technology. # -# Invenio is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# Invenio is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Invenio. If not, see . -# -# In applying this licence, CERN does not waive the privileges and immunities -# granted to it by virtue of its status as an Intergovernmental Organization -# or submit itself to any jurisdiction. +# Invenio is free software; you can redistribute it and/or modify it +# under the terms of the MIT License; see LICENSE file for more details. -"""Task for managing GitHub integration.""" +"""Task for managing vcs integration.""" import datetime from typing import TYPE_CHECKING @@ -34,7 +19,7 @@ from invenio_oauthclient.proxies import current_oauthclient from invenio_vcs.config import get_provider_by_id -from invenio_vcs.errors import CustomGitHubMetadataError, RepositoryAccessError +from invenio_vcs.errors import CustomVCSMetadataError, RepositoryAccessError from invenio_vcs.models import Release, ReleaseStatus from invenio_vcs.proxies import current_vcs @@ -51,7 +36,7 @@ def _get_err_obj(msg): def release_gh_metadata_handler(release: "VCSRelease", ex): - """Handler for CustomGithubMetadataError.""" + """Handler for CustomvcsMetadataError.""" release.db_release.errors = _get_err_obj(str(ex)) db.session.commit() @@ -63,7 +48,7 @@ def release_default_exception_handler(release: "VCSRelease", ex): DEFAULT_ERROR_HANDLERS = [ - (CustomGitHubMetadataError, release_gh_metadata_handler), + (CustomVCSMetadataError, release_gh_metadata_handler), (Exception, release_default_exception_handler), ] @@ -86,14 +71,14 @@ def disconnect_provider(provider_id, user_id, access_token, repo_hooks): for repo_id, repo_hook in repo_hooks: if svc.disable_repository(repo_id, repo_hook): current_app.logger.info( - _("Deleted hook from github repository."), + _("Deleted hook from vcs repository."), extra={"hook": repo_hook, "repo": repo_id}, ) # If we finished our clean-up successfully, we can revoke the token svc.provider.revoke_token(access_token) except Exception as exc: - # Retry in case GitHub may be down... + # Retry in case vcs may be down... disconnect_provider.retry(exc=exc) From d7eba10f57347c387f2fdce328d912dba8f6a7df Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Wed, 22 Oct 2025 11:13:55 +0200 Subject: [PATCH 3/6] fix(vcs): minor renames/fixes --- invenio_vcs/oauth/handlers.py | 6 +++--- invenio_vcs/service.py | 25 ++++++++++++------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/invenio_vcs/oauth/handlers.py b/invenio_vcs/oauth/handlers.py index 82814ad2..39056250 100644 --- a/invenio_vcs/oauth/handlers.py +++ b/invenio_vcs/oauth/handlers.py @@ -57,7 +57,7 @@ def disconnect_handler(self, remote): oauth_unlink_external_id(dict(id=external_ids[0], method=external_method)) svc = VCSService(self.provider_factory.for_user(current_user.id)) - token = svc.provider.session_token + token = svc.provider.remote_token if token: extra_data = token.remote_account.extra_data @@ -70,9 +70,9 @@ def disconnect_handler(self, remote): repos = svc.user_enabled_repositories.all() repos_with_hooks = [] for repo in repos: - if repo.hook: + if repo.hook is not None: repos_with_hooks.append((repo.provider_id, repo.hook)) - svc.disable_repository(repo.provider_id) + svc.mark_repo_disabled(repo.provider_id) # Commit any changes before running the ascynhronous task db.session.commit() diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py index 60e63c2a..b3f84025 100644 --- a/invenio_vcs/service.py +++ b/invenio_vcs/service.py @@ -75,7 +75,7 @@ def for_provider_and_token(provider_id: str, user_id: int, access_token: str): @cached_property def is_authenticated(self): """Whether we have a valid VCS API token for the user. Should (almost) always return True.""" - return self.provider.session_token is not None + return self.provider.remote_token is not None @property def user_available_repositories(self): @@ -168,7 +168,7 @@ def get_repository(self, repo_id=None, repo_name=None): return repo - def check_repo_access_permissions(self, repo: Repository): + def check_repo_access_permissions(self, db_repo: Repository): """Checks permissions from user on repo. Repo has access if any of the following is True: @@ -176,22 +176,24 @@ def check_repo_access_permissions(self, repo: Repository): - user is the owner of the repo - user has access to the repo in the VCS """ - if self.provider.user_id and repo: + if self.provider.user_id and db_repo: user_is_collaborator = any( - user.id == self.provider.user_id for user in repo.users + user.id == self.provider.user_id for user in db_repo.users ) if user_is_collaborator: return True if self.provider.remote_account and self.provider.remote_account.extra_data: user_has_remote_access_count = self.user_available_repositories.filter( - Repository.provider_id == repo.provider_id + Repository.provider_id == db_repo.provider_id ).count() if user_has_remote_access_count == 1: return True raise RepositoryAccessError( - user=self.provider.user_id, repo=repo.full_name, repo_id=repo.provider_id + user=self.provider.user_id, + repo=db_repo.full_name, + repo_id=db_repo.provider_id, ) def sync_repo_users(self, db_repo: Repository): @@ -428,9 +430,9 @@ def enable_repository(self, repository_id): Repository.provider_id == repository_id ).first() if db_repo is None: - raise RepositoryNotFoundError( - repository_id, _("Failed to enable repository.") - ) + raise RepositoryNotFoundError(repository_id) + + # No further access check needed: the repo was already in the user's available repo list. hook_id = self.provider.create_webhook(repository_id) if hook_id is None: @@ -444,11 +446,8 @@ def disable_repository(self, repository_id, hook_id=None): db_repo = self.user_available_repositories.filter( Repository.provider_id == repository_id ).first() - if db_repo is None: - raise RepositoryNotFoundError( - repository_id, _("Failed to disable repository.") - ) + raise RepositoryNotFoundError(repository_id) if not db_repo.enabled: raise RepositoryDisabledError(repository_id) From 85aaa9a5b8895f90709e995a75434ed01d180be1 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Thu, 23 Oct 2025 11:31:23 +0200 Subject: [PATCH 4/6] WIP: remove html_url --- invenio_vcs/service.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py index b3f84025..ab6be90f 100644 --- a/invenio_vcs/service.py +++ b/invenio_vcs/service.py @@ -310,7 +310,6 @@ def sync(self, hooks=True, async_hooks=True): corresponding_db_repo = Repository.create( provider=self.provider.factory.id, provider_id=vcs_repo.id, - html_url=vcs_repo.html_url, default_branch=vcs_repo.default_branch, full_name=vcs_repo.full_name, description=vcs_repo.description, @@ -365,7 +364,6 @@ def sync_repo_hook(self, repo_id): db_repo = Repository.create( provider=self.provider.factory.id, provider_id=repo_id, - html_url=vcs_repo.html_url, default_branch=vcs_repo.default_branch, full_name=vcs_repo.full_name, description=vcs_repo.description, From 51485cf3f9359c259e974a0520584a4fee993dd2 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Fri, 24 Oct 2025 10:20:25 +0200 Subject: [PATCH 5/6] WIP: sync bug fixes --- invenio_vcs/oauth/handlers.py | 2 +- invenio_vcs/receivers.py | 1 + invenio_vcs/service.py | 10 +++++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/invenio_vcs/oauth/handlers.py b/invenio_vcs/oauth/handlers.py index 39056250..544a18a9 100644 --- a/invenio_vcs/oauth/handlers.py +++ b/invenio_vcs/oauth/handlers.py @@ -72,7 +72,7 @@ def disconnect_handler(self, remote): for repo in repos: if repo.hook is not None: repos_with_hooks.append((repo.provider_id, repo.hook)) - svc.mark_repo_disabled(repo.provider_id) + svc.mark_repo_disabled(repo) # Commit any changes before running the ascynhronous task db.session.commit() diff --git a/invenio_vcs/receivers.py b/invenio_vcs/receivers.py index 6e6093d4..55d0469b 100644 --- a/invenio_vcs/receivers.py +++ b/invenio_vcs/receivers.py @@ -61,6 +61,7 @@ def _handle_create_release(self, event): # Check if the release already exists existing_release = Release.query.filter_by( provider_id=generic_release.id, + provider=self.provider_factory.id, ).first() if existing_release: diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py index ab6be90f..6bb788f6 100644 --- a/invenio_vcs/service.py +++ b/invenio_vcs/service.py @@ -276,12 +276,15 @@ def sync(self, hooks=True, async_hooks=True): .all() ) + # Make sure we don't run the user sync step more than once for any repo + user_synced_repo_ids: set[str] = set() for db_repo in db_repos: vcs_repo = vcs_repos.get(db_repo.provider_id) if not vcs_repo: continue changed_users = self.sync_repo_users(db_repo) + user_synced_repo_ids.add(db_repo.provider_id) changed_model = vcs_repo.to_model(db_repo) if changed_users or changed_model: db.session.add(db_repo) @@ -316,9 +319,10 @@ def sync(self, hooks=True, async_hooks=True): license_spdx=vcs_repo.license_spdx, ) - # In any case (even if we already have the repo) we need to sync its member users - # E.g. maybe the repo is in our DB but the user for which this sync has been trigerred isn't registered as a member - self.sync_repo_users(corresponding_db_repo) + if vcs_repo.id not in user_synced_repo_ids: + # In any case (even if we already have the repo) we need to sync its member users unless we have already done so. + # E.g. maybe the repo is in our DB but the user for which this sync has been trigerred isn't registered as a member + self.sync_repo_users(corresponding_db_repo) # Update last sync self.provider.remote_account.extra_data.update( From 54d44d070f49104a0e1c2c7b6ce4478e64fc8574 Mon Sep 17 00:00:00 2001 From: Pal Kerecsenyi Date: Fri, 24 Oct 2025 14:02:55 +0200 Subject: [PATCH 6/6] WIP: simplify get_repository method --- invenio_vcs/receivers.py | 1 - invenio_vcs/service.py | 6 ++---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/invenio_vcs/receivers.py b/invenio_vcs/receivers.py index 55d0469b..e5492ed7 100644 --- a/invenio_vcs/receivers.py +++ b/invenio_vcs/receivers.py @@ -71,7 +71,6 @@ def _handle_create_release(self, event): repo = Repository.get( self.provider_factory.id, provider_id=generic_repo.id, - full_name=generic_repo.full_name, ) if not repo: raise RepositoryNotFoundError(generic_repo.full_name) diff --git a/invenio_vcs/service.py b/invenio_vcs/service.py index 6bb788f6..10bd0198 100644 --- a/invenio_vcs/service.py +++ b/invenio_vcs/service.py @@ -152,14 +152,12 @@ def get_last_sync_time(self): return extra_data["last_sync"] - def get_repository(self, repo_id=None, repo_name=None): + def get_repository(self, repo_id): """Retrieves one repository. Checks for access permission. """ - repo = Repository.get( - self.provider.factory.id, provider_id=repo_id, full_name=repo_name - ) + repo = Repository.get(self.provider.factory.id, provider_id=repo_id) if not repo: raise RepositoryNotFoundError(repo_id)