Skip to content

Commit 9f1e07b

Browse files
committed
WIP: feat(vcs): new data model
* Updated the data model to accommodate the new generic approach to VCS integration. This involves renaming the `github_...` tables to `vcs_...`, adding a new column to the relevant tables to identify which provider the records relate to, and more.
* Added an Alembic migration, including moving the repository data from `oauthclient_remoteaccount` to the `vcs_repositories` table, which is a complex and long-running operation. This will be supplemented by a manual migration guide for instances like Zenodo where a several-minute full DB lock is not feasible. When to use the automated migration versus the manual one will be clarified in the docs.
* Added a repo-user many-to-many mapping table. By not storing repos in the Remote Accounts table, we need a different way of associating users with the repos they have access to. This table is synced using code that will be included in other PRs.
* This PR contains only the data model changes themselves and not the associated functional changes needed to do anything useful.
* This commit on its own is UNRELEASABLE. We will merge multiple commits related to the VCS upgrade into the `vcs-staging` branch and then merge them all into `master` once we have a fully release-ready prototype. At that point, we will create a squash commit.
1 parent a841b14 commit 9f1e07b

File tree

2 files changed

+681
-0
lines changed

2 files changed

+681
-0
lines changed
Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
#
2+
# This file is part of Invenio.
3+
# Copyright (C) 2025 CERN.
4+
#
5+
# Invenio is free software; you can redistribute it and/or modify it
6+
# under the terms of the MIT License; see LICENSE file for more details.
7+
8+
"""Switch to generic git services"""
9+
10+
import uuid
11+
from datetime import datetime, timezone
12+
13+
import sqlalchemy as sa
14+
from alembic import op
15+
from sqlalchemy.ext.mutable import MutableDict
16+
from sqlalchemy_utils import JSONType, UUIDType
17+
18+
# revision identifiers, used by Alembic.
revision = "1754318294"
down_revision = "b0eaee37b545"
# You cannot rename an Alembic branch. So we will have to keep
# the branch label `invenio-github` despite changing the module
# to `invenio-vcs`.
# NOTE(review): no label is declared on this revision itself (empty tuple);
# presumably `invenio-github` is declared on an earlier revision of this
# branch and applies here through it -- TODO confirm.
branch_labels = ()
depends_on = None
26+
27+
28+
def upgrade():
    """Upgrade database.

    Transforms the GitHub-specific schema into the provider-agnostic VCS one:

    * ``github_repositories`` -> ``vcs_repositories``: ``github_id`` becomes the
      string column ``provider_id``, ``hook`` becomes a string, ``user_id`` is
      renamed to ``enabled_by_id`` and the ``provider``, ``default_branch``,
      ``description``, ``html_url`` and ``license_spdx`` columns are added.
    * ``github_releases`` -> ``vcs_releases``: ``release_id`` becomes the string
      column ``provider_id``, a ``provider`` column is added and, on
      PostgreSQL, ``errors`` is converted to JSONB.
    * The ``vcs_repository_users`` association table is created.

    The migration of repository data out of ``oauthclient_remoteaccount`` is
    currently disabled (it is kept below as a string literal), so only rows
    that already exist in the repositories table are carried over.
    """
    op.rename_table("github_repositories", "vcs_repositories")
    op.alter_column(
        "vcs_repositories",
        "github_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.alter_column(
        "vcs_repositories",
        "hook",
        type_=sa.String(length=255),
        nullable=True,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_repositories",
        # We use the provider name "github" by default as this is what we're already using across the codebase
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
    op.add_column(
        "vcs_repositories",
        sa.Column(
            "default_branch", sa.String(255), nullable=False, server_default="master"
        ),
    )
    op.add_column(
        "vcs_repositories", sa.Column("description", sa.String(10000), nullable=True)
    )
    op.add_column(
        # Nullable for now (see below)
        "vcs_repositories",
        sa.Column("html_url", sa.String(10000), nullable=True),
    )
    op.add_column(
        "vcs_repositories", sa.Column("license_spdx", sa.String(255), nullable=True)
    )
    # `existing_type`/`existing_nullable` are passed explicitly because some
    # backends (e.g. MySQL) have to restate the full column definition in
    # order to rename a column.
    op.alter_column(
        "vcs_repositories",
        "user_id",
        new_column_name="enabled_by_id",
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    # The indexes keep their original names after the table rename. The owning
    # table is passed explicitly because some backends need it to drop an index.
    op.drop_index("ix_github_repositories_name", table_name="vcs_repositories")
    op.drop_index("ix_github_repositories_github_id", table_name="vcs_repositories")

    # Because they rely on the `provider` column, these are automatically
    # deleted when downgrading so we don't need a separate drop command
    # for them.
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_provider_id"),
        table_name="vcs_repositories",
        columns=["provider", "provider_id"],
    )
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_repositories_provider_name"),
        table_name="vcs_repositories",
        columns=["provider", "name"],
    )

    # NOTE(review): the data migration below is wrapped in a string literal and
    # is therefore NOT executed. It is kept verbatim for reference. Before
    # re-enabling it, reconcile the foreign key target used by its lightweight
    # table definitions (`account_user.id`) with the one used by the
    # `vcs_repository_users` table created at the end of this function
    # (`accounts_user.id`).
    """
    # Migrate data from the OAuth remote `extra_data` field to the repositories table
    # where we will now store everything directly.
    #
    # We need to recreate the SQLAlchemy models for `RemoteAccount` and `Repository` here but
    # in a much more lightweight way. We cannot simply import the models because (a) they depend
    # on the full Invenio app being initialised and all extensions available and (b) we need
    # to work with the models as they stand precisely at this point in the migration chain
    # rather than the model file itself which may be at a later commit.
    #
    # We only include here the columns, constraints, and relations that we actually need to
    # perform the migration, therefore keeping these models as lightweight as possible.
    remote_account_table = sa.table(
        "oauthclient_remoteaccount",
        sa.Column("id", sa.Integer, primary_key=True),
        sa.Column("user_id", sa.Integer, sa.ForeignKey("account_user.id")),
        sa.Column("client_id", sa.String(255)),
        sa.Column("extra_data", MutableDict.as_mutable(JSONType)),
    )
    vcs_repositories_table = sa.table(
        "vcs_repositories",
        sa.Column("id", UUIDType, primary_key=True),
        sa.Column("provider_id", sa.String(255), nullable=True),
        sa.Column("provider", sa.String(255), nullable=True),
        sa.Column("description", sa.String(10000), nullable=True),
        sa.Column("html_url", sa.String(10000), nullable=False),
        sa.Column("license_spdx", sa.String(255), nullable=True),
        sa.Column("default_branch", sa.String(255), nullable=False),
        sa.Column("name", sa.String(255), nullable=False),
        sa.Column("hook", sa.String(255), nullable=True),
        sa.Column(
            "enabled_by_id", sa.Integer, sa.ForeignKey("account_user.id"), nullable=True
        ),
        sa.Column("created", sa.DateTime, nullable=False),
        sa.Column("updated", sa.DateTime, nullable=False),
    )

    # This is the recommended way to run SQLAlchemy operations in a migration, see https://alembic.sqlalchemy.org/en/latest/ops.html#alembic.operations.Operations.execute
    session = op.get_bind()

    # We don't know the client ID as this is a config variable.
    # So to find the RemoteAccounts that correspond to GitHub, we need to check for the existence
    # of the `repos` key in the `extra_data` JSON. We cannot make this very efficient sadly, because
    # (a) in Postgres we are using JSON not JSONB so there is no efficient JSON querying and (b) the
    # instance might be using MySQL/SQLite where we store it as `TEXT`.

    remote_accounts = session.execute(sa.select(remote_account_table))
    for remote_account in remote_accounts.mappings():
        if "repos" not in remote_account["extra_data"]:
            continue

        repos = remote_account["extra_data"]["repos"]

        for id, github_repo in repos.items():
            # `id` (the dict key) is a string because JSON keys must be strings

            matching_db_repo_id = session.scalar(
                sa.select(vcs_repositories_table).filter_by(provider_id=id)
            )

            if matching_db_repo_id is None:
                # We are now storing _all_ repositories (even non-enabled ones) in the DB.
                # The repo-user association will be created on the first sync after this migration, we need to download
                # the list of users with access to the repo from the GitHub API.
                session.execute(
                    vcs_repositories_table.insert().values(
                        id=uuid.uuid4(),
                        provider_id=id,
                        provider="github",
                        description=github_repo["description"],
                        name=github_repo["full_name"],
                        default_branch=github_repo["default_branch"],
                        # So far we have only supported github.com so we can safely assume the URL
                        html_url=f'https://github.com/{github_repo["full_name"]}',
                        # We have never stored this, it is queried at runtime right now. When the first
                        # sync happens after this migration, we will download all the license IDs from the VCS.
                        license_spdx=None,
                        # This repo wasn't enabled
                        hook=None,
                        enabled_by_id=None,
                        created=datetime.now(tz=timezone.utc),
                        updated=datetime.now(tz=timezone.utc),
                    )
                )
            else:
                session.execute(
                    vcs_repositories_table.update()
                    .filter_by(id=matching_db_repo_id)
                    .values(
                        description=github_repo["description"],
                        name=github_repo["full_name"],
                        default_branch=github_repo["default_branch"],
                        html_url=f'https://github.com/{github_repo["full_name"]}',
                        updated=datetime.now(tz=timezone.utc),
                    )
                )

        # Remove `repos` from the existing `extra_data`, leaving only the last sync timestamp
        session.execute(
            remote_account_table.update()
            .filter_by(id=remote_account["id"])
            .values(extra_data={"last_sync": remote_account["extra_data"]["last_sync"]})
        )

    """

    # `html_url` was created as nullable so that the column could be added to a
    # table that already contains rows. Since the data migration above is
    # disabled, those rows still hold NULL at this point and the NOT NULL
    # change below would fail. Backfill them with the same URL scheme the data
    # migration uses (only github.com has been supported so far).
    # NOTE(review): assumes `name` holds the full `owner/repo` name, as the
    # disabled migration implies -- confirm.
    repositories = sa.table(
        "vcs_repositories",
        sa.column("name", sa.String),
        sa.column("html_url", sa.String),
    )
    op.execute(
        repositories.update()
        .where(repositories.c.html_url.is_(None))
        .values(html_url=sa.literal("https://github.com/") + repositories.c.name)
    )

    # Every row now has a value, so the column can be marked non-nullable.
    op.alter_column(
        "vcs_repositories",
        "html_url",
        nullable=False,
        existing_type=sa.String(10000),
        existing_nullable=True,
    )

    op.rename_table("github_releases", "vcs_releases")
    op.alter_column(
        "vcs_releases",
        "release_id",
        new_column_name="provider_id",
        type_=sa.String(length=255),
        nullable=False,
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.add_column(
        "vcs_releases",
        sa.Column("provider", sa.String(255), nullable=False, server_default="github"),
    )
    if op.get_context().dialect.name == "postgresql":
        op.alter_column(
            "vcs_releases",
            "errors",
            type_=sa.dialects.postgresql.JSONB,
            postgresql_using="errors::text::jsonb",
        )

    op.drop_constraint(
        op.f("uq_github_releases_release_id"), table_name="vcs_releases", type_="unique"
    )
    # A given provider cannot have duplicate repository IDs.
    # These constraints are also inherently deleted when the `provider` column is dropped
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider"),
        table_name="vcs_releases",
        columns=["provider_id", "provider"],
    )
    # A specific repository from a given provider cannot have multiple releases of the same tag
    op.create_unique_constraint(
        constraint_name=op.f("uq_vcs_releases_provider_id_provider_tag"),
        table_name="vcs_releases",
        columns=["provider_id", "provider", "tag"],
    )

    op.create_table(
        "vcs_repository_users",
        sa.Column("repository_id", UUIDType(), primary_key=True),
        sa.Column("user_id", sa.Integer(), primary_key=True),
        sa.ForeignKeyConstraint(
            ["repository_id"],
            ["vcs_repositories.id"],
            name=op.f("fk_vcs_repository_users_repository_id_vcs_repositories"),
        ),
        sa.ForeignKeyConstraint(
            ["user_id"],
            ["accounts_user.id"],
            name=op.f("fk_vcs_repository_users_user_id_accounts_user"),
        ),
    )
    # ### end Alembic commands ###
256+
257+
258+
def downgrade():
    """Downgrade database.

    Restores the ``github_repositories`` / ``github_releases`` schema: drops
    the ``vcs_repository_users`` table, renames the tables and columns back,
    converts ``provider_id`` and ``hook`` back to integers, drops the columns
    added by the upgrade and recreates the original unique indexes and
    constraint.
    """

    # Currently, the downgrade can only be performed **without data**. The tables are transformed but
    # data will not be successfully migrated. The upgrade migration has a large amount of custom logic
    # for migrating the data into the new format, and this is not replicated/reversed for downgrading.

    # `existing_type`/`existing_nullable` are passed explicitly because some
    # backends (e.g. MySQL) have to restate the full column definition in
    # order to rename a column.
    op.alter_column(
        "vcs_repositories",
        "enabled_by_id",
        new_column_name="user_id",
        existing_type=sa.Integer(),
        existing_nullable=True,
    )
    op.drop_table("vcs_repository_users")

    op.rename_table("vcs_repositories", "github_repositories")
    op.alter_column(
        "github_repositories",
        "provider_id",
        new_column_name="github_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        # PostgreSQL has no implicit string -> integer cast, so spell it out.
        postgresql_using="provider_id::integer",
    )
    op.alter_column(
        "github_repositories",
        "hook",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=True,
        postgresql_using="hook::integer",
    )
    # No explicit drop is issued for the unique constraints the upgrade created
    # on (`provider`, ...): the upgrade relies on them going away together
    # with the `provider` column.
    op.drop_column("github_repositories", "provider")
    op.drop_column("github_repositories", "description")
    op.drop_column("github_repositories", "html_url")
    op.drop_column("github_repositories", "license_spdx")
    op.drop_column("github_repositories", "default_branch")
    op.create_index(
        op.f("ix_github_repositories_github_id"),
        "github_repositories",
        ["github_id"],
        unique=True,
    )
    op.create_index(
        op.f("ix_github_repositories_name"),
        "github_repositories",
        ["name"],
        unique=True,
    )

    op.rename_table("vcs_releases", "github_releases")
    op.alter_column(
        "github_releases",
        "provider_id",
        new_column_name="release_id",
        type_=sa.Integer(),
        nullable=True,
        existing_type=sa.String(length=255),
        existing_nullable=False,
        postgresql_using="provider_id::integer",
    )
    op.drop_column("github_releases", "provider")
    if op.get_context().dialect.name == "postgresql":
        # Reverse of the JSON -> JSONB conversion done by the upgrade.
        op.alter_column(
            "github_releases",
            "errors",
            type_=sa.dialects.postgresql.JSON,
            postgresql_using="errors::text::json",
        )
    op.create_unique_constraint(
        op.f("uq_github_releases_release_id"),
        table_name="github_releases",
        columns=["release_id"],
    )
    # ### end Alembic commands ###

0 commit comments

Comments
 (0)