Skip to content

Commit

Permalink
feat: Prepare for the removal of collection_file_item. Make collectio…
Browse files Browse the repository at this point in the history
…n_file_item_id nullable. Populate collection_file on release, record and compiled_release.
  • Loading branch information
jpmckinney committed Nov 2, 2024
1 parent b5f7bda commit fb3746e
Show file tree
Hide file tree
Showing 14 changed files with 822 additions and 166 deletions.
2 changes: 0 additions & 2 deletions docs/database.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ Database tables
- A collection, like the collection of files written to a crawl directory by Kingfisher Collect. See :ref:`db-collection`.
* - ``collection_file``
- A file containing a release package or record package, like the files written by Kingfisher Collect.
* - ``collection_file_item``
- A passthrough table. `#324 <https://github.com/open-contracting/kingfisher-process/issues/324>`__
* - ``collection_note``
- A collection note. See :ref:`db-collection_note`.
* - ``package_data``
Expand Down
3 changes: 1 addition & 2 deletions process/management/commands/addchecks.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,10 @@ def handle_collection(self, collection, *args, **options):

# SELECT DISTINCT collection_file_id FROM release
# LEFT OUTER JOIN release_check ON release.id = release_check.release_id
# INNER JOIN collection_file_item ON collection_file_item_id = collection_file_item.id
# WHERE collection_id = :collection_id AND release_check.id IS NULL
qs = (
model.objects.filter(**{"collection": collection, f"{related_name}__isnull": True})
.values_list("collection_file_item__collection_file", flat=True)
.values_list("collection_file", flat=True)
.distinct()
)

Expand Down
2 changes: 1 addition & 1 deletion process/management/commands/checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def _check_collection_file(collection_file):
related_name = "recordcheck"

items = model.objects.select_related("data", "package_data").filter(
**{"collection_file_item__collection_file": collection_file, f"{related_name}__isnull": True}
**{"collection_file": collection_file, f"{related_name}__isnull": True}
)

logger.info("Checking %s %s for collection file %s", items.count(), items_key, collection_file)
Expand Down
2 changes: 1 addition & 1 deletion process/management/commands/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def callback(client_state, channel, method, properties, input_message):

match collection.data_type["format"]:
case Format.record_package:
items = Record.objects.filter(collection_file_item__collection_file=collection_file)
items = Record.objects.filter(collection_file=collection_file)
publish_routing_key = "compiler_record"
case Format.release_package:
# Return if another compiler worker received a message for the same compilable collection.
Expand Down
12 changes: 4 additions & 8 deletions process/management/commands/file_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from process.exceptions import EmptyFormatError, UnsupportedFormatError
from process.models import (
CollectionFile,
CollectionFileItem,
CollectionNote,
CompiledRelease,
Data,
Expand Down Expand Up @@ -116,7 +115,7 @@ def process_file(collection_file) -> int | None:
"""
Load file for a given collection.
Create the whole collection_file, collection_file_item, etc. structure.
Create the collection_file and either the release, record or compiled_release.
If the collection should be upgraded, create the same structure for upgraded collection as well.
:param collection_file: collection file for which should be releases checked
Expand Down Expand Up @@ -257,9 +256,6 @@ def _read_data_from_file(filename, data_type):


def _store_data(collection_file, package, releases_or_records, data_type, *, upgrade=False):
collection_file_item = CollectionFileItem(collection_file=collection_file, number=0)
collection_file_item.save()

for release_or_record in releases_or_records:
if upgrade:
with create_logger_note(collection_file.collection, "ocdskit"):
Expand All @@ -280,15 +276,15 @@ def _store_data(collection_file, package, releases_or_records, data_type, *, upg
case Format.record_package:
Record(
collection=collection_file.collection,
collection_file_item=collection_file_item,
collection_file=collection_file,
package_data=get_or_create(PackageData, package),
data=data,
ocid=release_or_record["ocid"],
).save()
case Format.release_package:
Release(
collection=collection_file.collection,
collection_file_item=collection_file_item,
collection_file=collection_file,
package_data=get_or_create(PackageData, package),
data=data,
ocid=release_or_record["ocid"],
Expand All @@ -298,7 +294,7 @@ def _store_data(collection_file, package, releases_or_records, data_type, *, upg
case Format.compiled_release:
CompiledRelease(
collection=collection_file.collection,
collection_file_item=collection_file_item,
collection_file=collection_file,
data=data,
ocid=release_or_record["ocid"],
release_date=release_or_record.get("date") or "",
Expand Down
7 changes: 3 additions & 4 deletions process/management/commands/wiper.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ def handle(self, *args, **options):

def delete_collection(collection_id):
tables = [
("record", None), # references collection_file_item
("release", None), # references collection_file_item
("compiled_release", None), # references collection_file_item
("collection_file_item", "collection_file"), # references collection_file
("record", None), # references collection_file
("release", None), # references collection_file
("compiled_release", None), # references collection_file
("processing_step", None), # references collection_file
("collection_file", None),
("collection_note", None),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Generated by Django 4.2.15 on 2024-11-02 03:30

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
("process", "0042_compiledrelease_release_date_release_release_date"),
]

operations = [
migrations.AddField(
model_name="compiledrelease",
name="collection_file",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfile"
),
),
migrations.AddField(
model_name="record",
name="collection_file",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfile"
),
),
migrations.AddField(
model_name="release",
name="collection_file",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfile"
),
),
migrations.AddIndex(
model_name="compiledrelease",
index=models.Index(fields=["collection_file"], name="compiled_release_collection_file_id_idx"),
),
migrations.AddIndex(
model_name="record",
index=models.Index(fields=["collection_file"], name="record_collection_file_id_idx"),
),
migrations.AddIndex(
model_name="release",
index=models.Index(fields=["collection_file"], name="release_collection_file_id_idx"),
),
migrations.AlterField(
model_name="compiledrelease",
name="collection_file_item",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfileitem"
),
),
migrations.AlterField(
model_name="record",
name="collection_file_item",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfileitem"
),
),
migrations.AlterField(
model_name="release",
name="collection_file_item",
field=models.ForeignKey(
db_index=False, null=True, on_delete=django.db.models.deletion.CASCADE, to="process.collectionfileitem"
),
),
migrations.RunSQL(
"""
UPDATE release
SET collection_file_id = collection_file_item.collection_file_id
FROM collection_file_item
WHERE collection_file_item_id = collection_file_item.id
"""
),
migrations.RunSQL(
"""
UPDATE record
SET collection_file_id = collection_file_item.collection_file_id
FROM collection_file_item
WHERE collection_file_item_id = collection_file_item.id
"""
),
migrations.RunSQL(
"""
UPDATE compiled_release
SET collection_file_id = collection_file_item.collection_file_id
FROM collection_file_item
WHERE collection_file_item_id = collection_file_item.id
"""
),
]
17 changes: 11 additions & 6 deletions process/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
# DjangoJSONEncoder serializes Decimal values as strings. simplejson serializes Decimal values as numbers.
from simplejson import JSONEncoder

# # We set `db_table` so that the table names are identical to those created by SQLAlchemy in an earlier version. We
# don't use `unique=True` or `db_index=True`, because they create an additional index for the text fields `hash_md5`
# # We set `db_table` so that the table names are identical to those created by SQLAlchemy in an earlier version.
# We don't use `unique=True` or `db_index=True`, because they create an additional index for the text fields `hash_md5`
# and `ocid`. Instead, we set `Meta.constraints` and `Meta.indexes`.
#
# https://docs.djangoproject.com/en/4.2/ref/databases/#indexes-for-varchar-and-text-columns
Expand Down Expand Up @@ -242,7 +242,6 @@ class Name(models.TextChoices):
class Meta:
db_table = "processing_step"
indexes = [
# ForeignKey with db_index=False.
models.Index(name="processing_step_collection_id_ocid_idx", fields=["collection", "name", "ocid"]),
]

Expand Down Expand Up @@ -315,7 +314,8 @@ class Release(models.Model):
"""A release."""

collection = models.ForeignKey(Collection, on_delete=models.CASCADE, db_index=False)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False)
collection_file = models.ForeignKey(CollectionFile, on_delete=models.CASCADE, db_index=False, null=True)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False, null=True)

ocid = models.TextField(blank=True)
release_id = models.TextField(blank=True)
Expand All @@ -331,6 +331,7 @@ class Meta:
models.Index(fields=["collection", "ocid"]),
# ForeignKey with db_index=False.
models.Index(name="release_collection_id_idx", fields=["collection"]),
models.Index(name="release_collection_file_id_idx", fields=["collection_file"]),
models.Index(name="release_collection_file_item_id_idx", fields=["collection_file_item"]),
models.Index(name="release_data_id_idx", fields=["data"]),
models.Index(name="release_package_data_id_idx", fields=["package_data"]),
Expand All @@ -348,7 +349,8 @@ class Record(models.Model):
"""A record."""

collection = models.ForeignKey(Collection, on_delete=models.CASCADE, db_index=False)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False)
collection_file = models.ForeignKey(CollectionFile, on_delete=models.CASCADE, db_index=False, null=True)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False, null=True)

ocid = models.TextField(blank=True)

Expand All @@ -362,6 +364,7 @@ class Meta:
models.Index(fields=["collection", "ocid"]),
# ForeignKey with db_index=False.
models.Index(name="record_collection_id_idx", fields=["collection"]),
models.Index(name="record_collection_file_id_idx", fields=["collection_file"]),
models.Index(name="record_collection_file_item_id_idx", fields=["collection_file_item"]),
models.Index(name="record_data_id_idx", fields=["data"]),
models.Index(name="record_package_data_id_idx", fields=["package_data"]),
Expand All @@ -377,7 +380,8 @@ class CompiledRelease(models.Model):
"""A compiled release."""

collection = models.ForeignKey(Collection, on_delete=models.CASCADE, db_index=False)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False)
collection_file = models.ForeignKey(CollectionFile, on_delete=models.CASCADE, db_index=False, null=True)
collection_file_item = models.ForeignKey(CollectionFileItem, on_delete=models.CASCADE, db_index=False, null=True)

ocid = models.TextField(blank=True)
release_date = models.TextField(blank=True)
Expand All @@ -391,6 +395,7 @@ class Meta:
models.Index(fields=["collection", "ocid"]),
# ForeignKey with db_index=False.
models.Index(name="compiled_release_collection_id_idx", fields=["collection"]),
models.Index(name="compiled_release_collection_file_id_idx", fields=["collection_file"]),
models.Index(name="compiled_release_collection_file_item_id_idx", fields=["collection_file_item"]),
models.Index(name="compiled_release_data_id_idx", fields=["data"]),
]
Expand Down
7 changes: 2 additions & 5 deletions process/processors/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ocdsextensionregistry import ProfileBuilder
from ocdsextensionregistry.exceptions import ExtensionWarning

from process.models import CollectionFile, CollectionFileItem, CollectionNote, CompiledRelease, Data
from process.models import CollectionFile, CollectionNote, CompiledRelease, Data
from process.util import create_note, create_warnings_note, get_or_create

logger = logging.getLogger(__name__)
Expand All @@ -18,14 +18,11 @@ def save_compiled_release(merged, collection, ocid):
collection_file = CollectionFile(collection=collection, filename=f"{ocid}.json")
collection_file.save()

collection_file_item = CollectionFileItem(collection_file=collection_file, number=0)
collection_file_item.save()

data = get_or_create(Data, merged)

release = CompiledRelease(
collection=collection,
collection_file_item=collection_file_item,
collection_file=collection_file,
data=data,
ocid=ocid,
release_date=merged.get("date") or "",
Expand Down
Loading

0 comments on commit fb3746e

Please sign in to comment.