#
# This file is part of Invenio.
# Copyright (C) 2025 CERN.
#
# Invenio is free software; you can redistribute it and/or modify it
# under the terms of the MIT License; see LICENSE file for more details.

"""Switch to generic git services."""

import uuid
from datetime import datetime, timezone

import sqlalchemy as sa
from alembic import op
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy_utils import JSONType, UUIDType

# revision identifiers, used by Alembic.
revision = "1754318294"
down_revision = "b0eaee37b545"
# An Alembic branch cannot be renamed, so we keep the branch label
# `invenio-github` even though the module has been renamed to `invenio-vcs`.
branch_labels = ()
depends_on = None
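# The migration chain can therefore still be targeted by its original label,
# e.g. `invenio alembic upgrade invenio-github@head` (assuming the standard
# invenio-db Alembic CLI).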


def upgrade():
    """Upgrade database."""
    op.rename_table("github_repositories", "vcs_repositories")
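    # Generic providers may not use numeric repository identifiers, so the
    # provider-side ID is stored as a string from here on.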
    op.alter_column(
        "vcs_repositories",
        "github_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
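    # The stored webhook ID likewise becomes a string, for the same reason as
    # `provider_id` above.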
    op.alter_column(
        "vcs_repositories",
        "hook",
        type_=sa.String(length=255),
        nullable=True,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_repositories",
        # We use the provider name "github" by default as this is what we're already using across the codebase
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
    op.add_column(
        "vcs_repositories",
        sa.Column(
            "default_branch", sa.String(255), nullable=False, server_default="master"
        ),
    )
    op.add_column(
        "vcs_repositories", sa.Column("description", sa.String(10000), nullable=True)
    )
    op.add_column(
        # Nullable for now (see below)
        "vcs_repositories",
        sa.Column("html_url", sa.String(10000), nullable=True),
    )
    op.add_column(
        "vcs_repositories", sa.Column("license_spdx", sa.String(255), nullable=True)
    )
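    # `user_id` (the single owner) becomes `enabled_by_id`: the user who enabled the
    # webhook. Per-user repository access is now tracked in the `vcs_repository_users`
    # association table created at the end of this upgrade.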
    op.alter_column("vcs_repositories", "user_id", new_column_name="enabled_by_id")
    op.drop_index("ix_github_repositories_name", table_name="vcs_repositories")
    op.drop_index("ix_github_repositories_github_id", table_name="vcs_repositories")

    # Because these constraints rely on the `provider` column, they are dropped
    # automatically when that column is removed during downgrade, so we don't
    # need a separate drop command for them.
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_provider_id"),
        table_name="vcs_repositories",
        columns=["provider", "provider_id"],
    )
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_name"),
        table_name="vcs_repositories",
        columns=["provider", "name"],
    )

    """
    # Migrate data from the OAuth remote `extra_data` field to the repositories table
    # where we will now store everything directly.
    #
    # We need to recreate the SQLAlchemy models for `RemoteAccount` and `Repository` here but
    # in a much more lightweight way. We cannot simply import the models because (a) they depend
    # on the full Invenio app being initialised and all extensions available and (b) we need
    # to work with the models as they stand precisely at this point in the migration chain
    # rather than the model file itself, which may be at a later commit.
    #
    # We only include here the columns, constraints, and relations that we actually need to
    # perform the migration, therefore keeping these models as lightweight as possible.
    remote_account_table = sa.table(
        "oauthclient_remoteaccount",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("user_id", sa.Integer, sa.ForeignKey("accounts_user.id")),
        sa.Column("client_id", sa.String(255)),
        sa.Column("extra_data", MutableDict.as_mutable(JSONType)),
    )
    vcs_repositories_table = sa.table(
        "vcs_repositories",
        sa.Column("id", UUIDType, primary_key=True),
        sa.Column("provider_id", sa.String(255), nullable=True),
        sa.Column("provider", sa.String(255), nullable=True),
        sa.Column("description", sa.String(10000), nullable=True),
        sa.Column("html_url", sa.String(10000), nullable=False),
        sa.Column("license_spdx", sa.String(255), nullable=True),
        sa.Column("default_branch", sa.String(255), nullable=False),
        sa.Column("name", sa.String(255), nullable=False),
        sa.Column("hook", sa.String(255), nullable=True),
        sa.Column(
            "enabled_by_id", sa.Integer, sa.ForeignKey("accounts_user.id"), nullable=True
        ),
        sa.Column("created", sa.DateTime, nullable=False),
        sa.Column("updated", sa.DateTime, nullable=False),
    )

    # This is the recommended way to run SQLAlchemy operations in a migration, see
    # https://alembic.sqlalchemy.org/en/latest/ops.html#alembic.operations.Operations.execute
    session = op.get_bind()

    # We don't know the client ID as this is a config variable.
    # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence
    # of the `repos` key in the `extra_data` JSON. Sadly, we cannot make this very efficient because
    # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the
    # instance might be using MySQL/SQLite where we store it as `TEXT`.

    remote_accounts = session.execute(sa.select(remote_account_table))
    for remote_account in remote_accounts.mappings():
        if "repos" not in remote_account["extra_data"]:
            continue

        repos = remote_account["extra_data"]["repos"]

        for id, github_repo in repos.items():
            # `id` (the dict key) is a string because JSON keys must be strings

            matching_db_repo_id = session.scalar(
                sa.select(vcs_repositories_table).filter_by(provider_id=id)
            )

            if matching_db_repo_id is None:
                # We are now storing _all_ repositories (even non-enabled ones) in the DB.
                # The repo-user association will be created on the first sync after this migration,
                # when we download the list of users with access to the repo from the GitHub API.
                session.execute(
                    vcs_repositories_table.insert().values(
                        id=uuid.uuid4(),
                        provider_id=id,
                        provider="github",
                        description=github_repo["description"],
                        name=github_repo["full_name"],
                        default_branch=github_repo["default_branch"],
                        # So far we have only supported github.com, so we can safely assume the URL
                        html_url=f'https://github.com/{github_repo["full_name"]}',
                        # We have never stored this; it is currently queried at runtime. When the first
                        # sync happens after this migration, we will download all the license IDs from the VCS.
                        license_spdx=None,
                        # This repo wasn't enabled
                        hook=None,
                        enabled_by_id=None,
                        created=datetime.now(tz=timezone.utc),
                        updated=datetime.now(tz=timezone.utc),
                    )
                )
            else:
                session.execute(
                    vcs_repositories_table.update()
                    .filter_by(id=matching_db_repo_id)
                    .values(
                        description=github_repo["description"],
                        name=github_repo["full_name"],
                        default_branch=github_repo["default_branch"],
                        html_url=f'https://github.com/{github_repo["full_name"]}',
                        updated=datetime.now(tz=timezone.utc),
                    )
                )

        # Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp
        session.execute(
            remote_account_table.update()
            .filter_by(id=remote_account["id"])
            .values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]})
        )

    """

    # We initially set this to nullable=True so we can create the column without an error
    # (it would be null for existing records) but after the SQLAlchemy operations above we
    # have populated it so we can mark it non-nullable.
    op.alter_column(
        "vcs_repositories", "html_url", nullable=False, existing_nullable=True
    )

    op.rename_table("github_releases", "vcs_releases")
    op.alter_column(
        "vcs_releases",
        "release_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_releases",
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
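    # On PostgreSQL we can switch the `errors` column from the generic JSON type to
    # JSONB, which supports efficient JSON querying; other backends keep storing it
    # as text.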
    if op.get_context().dialect.name == "postgresql":
        op.alter_column(
            "vcs_releases",
            "errors",
            type_=sa.dialects.postgresql.JSONB,
            postgresql_using="errors::text::jsonb",
        )

    op.drop_constraint(
        op.f("uq_github_releases_release_id"), table_name="vcs_releases", type_="unique"
    )
    # A given provider cannot have duplicate release IDs.
    # These constraints are also dropped automatically when the `provider` column is dropped.
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider"),
        table_name="vcs_releases",
        columns=["provider_id", "provider"],
    )
    # A specific repository from a given provider cannot have multiple releases with the same tag.
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider_tag"),
        table_name="vcs_releases",
        columns=["provider_id", "provider", "tag"],
    )

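    # New association table linking repositories to the users who can access them;
    # it is populated on the first sync after this migration (see the note in the
    # data-migration block above).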
    op.create_table(
        "vcs_repository_users",
        sa.Column("repository_id", UUIDType(), primary_key=True),
        sa.Column("user_id", sa.Integer(), primary_key=True),
        sa.ForeignKeyConstraint(
            ["repository_id"],
            ["vcs_repositories.id"],
            name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"),
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["accounts_user.id"],
            name=op.f("fk_vcs_repository_users_user_id_accounts_user"),
        ),
    )
    # ### end Alembic commands ###


def downgrade():
    """Downgrade database."""

    # Currently, the downgrade can only be performed **without data**: the tables are
    # transformed back, but the data will not be migrated. The upgrade has a large amount
    # of custom logic for migrating the data into the new format, and this is not
    # replicated/reversed for the downgrade.
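    # In particular, on PostgreSQL the `provider_id::integer` casts below will fail
    # if any rows contain provider IDs that are not purely numeric.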

    op.alter_column(
        "vcs_repositories",
        "enabled_by_id",
        new_column_name="user_id",
    )
    op.drop_table("vcs_repository_users")

    op.rename_table("vcs_repositories", "github_repositories")
    op.alter_column(
        "github_repositories",
        "provider_id",
        new_column_name="github_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        postgresql_using="provider_id::integer",
    )
    op.alter_column(
        "github_repositories",
        "hook",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=True,
        postgresql_using="hook::integer",
    )
    op.drop_column("github_repositories", "provider")
    op.drop_column("github_repositories", "description")
    op.drop_column("github_repositories", "html_url")
    op.drop_column("github_repositories", "license_spdx")
    op.drop_column("github_repositories", "default_branch")
    op.create_index(
        op.f("ix_github_repositories_github_id"),
        "github_repositories",
        ["github_id"],
        unique=True,
    )
    op.create_index(
        op.f("ix_github_repositories_name"),
        "github_repositories",
        ["name"],
        unique=True,
    )

    op.rename_table("vcs_releases", "github_releases")
    op.alter_column(
        "github_releases",
        "provider_id",
        new_column_name="release_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        postgresql_using="provider_id::integer",
    )
    op.drop_column("github_releases", "provider")
    if op.get_context().dialect.name == "postgresql":
        op.alter_column(
            "github_releases",
            "errors",
            type_=sa.dialects.postgresql.JSON,
            postgresql_using="errors::text::json",
        )
    op.create_unique_constraint(
        op.f("uq_github_releases_release_id"),
        table_name="github_releases",
        columns=["release_id"],
    )
    # ### end Alembic commands ###