[u 6/6] Configure mirroring per source (#7066)

nadove-ucsc · nadove-ucsc · commit 81888a757d7f · 2025-10-14T22:51:50.000-07:00
diff --git a/src/azul/indexer/__init__.py b/src/azul/indexer/__init__.py
@@ -10,6 +10,7 @@
 )
 import json
 import logging
+
 import math
 from typing import (
     Any,
@@ -412,7 +413,7 @@ class SourceConfig(SerializableAttrs):
     """
     Configuration
     """
-    pass
+    mirror: bool = attrs.field(default=True)
 
 
 @attrs.frozen(kw_only=True, order=True)
diff --git a/src/azul/indexer/mirror_controller.py b/src/azul/indexer/mirror_controller.py
@@ -136,28 +136,32 @@ def mirror_source(self, catalog: CatalogName, source_json: JSON):
         source = plugin.source_ref_cls.from_json(source_json)
         assert source.id in plugin.list_source_ids(authentication=None), R(
             'Cannot mirror non-public source', source)
-        # The desired partition size depends on the maximum number of messages
-        # we can send in one Lambda invocation, because queueing the individual
-        # mirror_file messages turns out to dominate the running time of
-        # handling a mirror_source message.
-        partition_size = int(
-            aws.sqs_fifo_rate_limit  # max. # of SendMessage calls per second
-            * self.client.queues.batch_size  # number of messages per call
-            * config.mirror_lambda_timeout  # max. duration of the invocation
-            / config.mirroring_concurrency  # number of concurrent invocations
-            / 2  # safety margin
-        )
-        source = plugin.partition_source_for_mirroring(catalog, source, partition_size)
-        prefix = source.prefix
-        log.info('Queueing %d partitions of source %r in catalog %r',
-                 prefix.num_partitions, str(source.spec), catalog)
+        if not source.config.mirror:
+            log.info('Not mirroring source %r in catalog %r because `no_mirror` flag is present',
+                     str(source.spec), str(catalog))
+        else:
+            # The desired partition size depends on the maximum number of messages
+            # we can send in one Lambda invocation, because queueing the individual
+            # mirror_file messages turns out to dominate the running time of
+            # handling a mirror_source message.
+            partition_size = int(
+                aws.sqs_fifo_rate_limit  # max. # of SendMessage calls per second
+                * self.client.queues.batch_size  # number of messages per call
+                * config.mirror_lambda_timeout  # max. duration of the invocation
+                / config.mirroring_concurrency  # number of concurrent invocations
+                / 2  # safety margin
+            )
+            source = plugin.partition_source_for_mirroring(catalog, source, partition_size)
+            prefix = source.prefix
+            log.info('Queueing %d partitions of source %r in catalog %r',
+                     prefix.num_partitions, str(source.spec), catalog)
 
-        def message(partition: str) -> SQSMessage:
-            log.debug('Queueing partition %r', partition)
-            return self.mirror_partition_message(catalog, source, partition)
+            def message(partition: str) -> SQSMessage:
+                log.debug('Queueing partition %r', partition)
+                return self.mirror_partition_message(catalog, source, partition)
 
-        messages = map(message, prefix.partition_prefixes())
-        self.client.queue_mirror_messages(messages)
+            messages = map(message, prefix.partition_prefixes())
+            self.client.queue_mirror_messages(messages)
 
     def mirror_partition(self,
                          catalog: CatalogName,
diff --git a/test/integration_test.py b/test/integration_test.py
@@ -296,7 +296,8 @@ def managed_access_sources_by_catalog(self
     def _select_source(self,
                        catalog: CatalogName,
                        *,
-                       public: bool | None = None
+                       public: bool | None = None,
+                       mirror: bool = False,
                        ) -> SourceRef | None:
         """
         Choose an indexed source at random.
@@ -309,6 +310,11 @@ def _select_source(self,
                        public sources. If false, choose a non-public source, or
                        return `None` if the catalog contains no non-public
                        sources.
+
+        :param mirror: If true, only consider sources that lack the
+                       `no_mirror` flag. If none remain, `None` is returned
+                       only when `public` is false; otherwise an assertion
+                       fails. If false, ignore the flag when choosing.
         """
         plugin = self.repository_plugin(catalog)
         sources = set(plugin.sources)
@@ -327,6 +333,12 @@ def _select_source(self,
                 sources &= ma_sources
             else:
                 assert False, public
+        if mirror:
+            sources = {
+                source
+                for source in sources
+                if 'no_mirror' not in SourceSpec.parse_flags_only(source)
+            }
         if len(sources) == 0:
             assert public is False, 'An IT catalog must contain at least one public source'
             return None
@@ -1710,7 +1722,7 @@ def _test_mirroring(self, *, delete: bool):
                 if c.is_integration_test_catalog and c.mirror_max_file_size >= 0
             ]
             sources_by_catalog = {
-                catalog: [self._select_source(catalog, public=True)]
+                catalog: [self._select_source(catalog, public=True, mirror=True)]
                 for catalog in catalogs
             }