Skip to content

Commit cfbbd68

Browse files
🐛 Fix PSQL OpErr on archive creation by batching (#6993)
This fixes SQLAlchemy `OperationalError`s when creating large archives with the PSQL backend by batching query filters using `DEFAULT_FILTER_SIZE` (999). The default value was re-used from PR #6889, in which it was originally set to avoid exceeding the `SQLITE_MAX_VARIABLE_NUMBER` limit for archive imports with the SQLite backend.

Key changes:

- Add a `filter_size` parameter to `create_archive()`, `_collect_required_entities()` and all helper functions called therein to apply batching to query filters, using the default value of 999 (not exposed to the user via the CLI)
- Move `batch_iter()` to `aiida.common.utils` for reuse
- Drop `QueryParams` in favor of the explicit `batch_size` and `filter_size` arguments
- Update graph traversal to batch node ID queries as well

Some import functions still need batching (see #6907), which will be done in a follow-up PR.

---------

Co-authored-by: Daniel Hollas <[email protected]>
1 parent cf690e4 commit cfbbd68

File tree

17 files changed

+680
-307
lines changed

17 files changed

+680
-307
lines changed

docs/source/nitpick-exceptions

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ py:class aiida.cmdline.params.types.choice.T
127127
py:class aiida.common.lang.T
128128
py:class aiida.engine.processes.functions.N
129129
py:class aiida.engine.processes.functions.R_co
130+
py:class aiida.common.utils.T
131+
py:class aiida.common.utils.R
130132

131133
### third-party packages
132134
# Note: These exceptions are needed if

src/aiida/cmdline/commands/cmd_archive.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from aiida.common.exceptions import CorruptStorage, IncompatibleStorageSchema, UnreachableStorage
2727
from aiida.common.links import GraphTraversalRules
2828
from aiida.common.log import AIIDA_LOGGER
29+
from aiida.common.utils import DEFAULT_BATCH_SIZE, DEFAULT_FILTER_SIZE
2930

3031
EXTRAS_MODE_EXISTING = ['keep_existing', 'update_existing', 'mirror', 'none']
3132
EXTRAS_MODE_NEW = ['import', 'none']
@@ -130,7 +131,11 @@ def inspect(ctx, archive, version, meta_data, database):
130131
)
131132
@click.option('--compress', default=6, show_default=True, type=int, help='Level of compression to use (0-9).')
132133
@click.option(
133-
'-b', '--batch-size', default=1000, type=int, help='Stream database rows in batches, to reduce memory usage.'
134+
'-b',
135+
'--batch-size',
136+
default=DEFAULT_BATCH_SIZE,
137+
type=int,
138+
help='Stream database rows in batches, to reduce memory usage.',
134139
)
135140
@click.option(
136141
'--test-run',
@@ -210,6 +215,7 @@ def create(
210215
'overwrite': force,
211216
'compression': compress,
212217
'batch_size': batch_size,
218+
'filter_size': DEFAULT_FILTER_SIZE, # Implementation detail, not exposed to user via CLI
213219
'test_run': dry_run,
214220
}
215221

src/aiida/common/utils.py

Lines changed: 56 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
import sys
1919
from collections.abc import Iterable, Iterator
2020
from datetime import datetime, timedelta
21-
from typing import TYPE_CHECKING, Any, Callable
21+
from typing import TYPE_CHECKING, Any, Callable, TypeVar, overload
2222
from uuid import UUID
2323

2424
from aiida.common.typing import Self
@@ -32,6 +32,9 @@
3232
except ImportError:
3333
from typing_extensions import TypeAlias
3434

35+
T = TypeVar('T')
36+
R = TypeVar('R')
37+
3538

3639
def get_new_uuid() -> str:
3740
"""Return a new UUID (typically to be used for new nodes)."""
@@ -608,3 +611,55 @@ def format_directory_size(size_in_bytes: int) -> str:
608611

609612
# Format the size to two decimal places
610613
return f'{converted_size:.2f} {prefixes[index]}'
614+
615+
616+
@overload
def batch_iter(iterable: Iterable[T], size: int, transform: None = None) -> Iterable[tuple[int, list[T]]]: ...


@overload
def batch_iter(iterable: Iterable[T], size: int, transform: Callable[[T], R]) -> Iterable[tuple[int, list[R]]]: ...


def batch_iter(
    iterable: Iterable[T], size: int, transform: Callable[[T], Any] | None = None
) -> Iterable[tuple[int, list[Any]]]:
    """Yield an iterable in batches of a set number of items.

    Note, the final yield may be less than this size.

    :param transform: a transform to apply to each item
    :returns: (number of items, list of items)
    """
    # Identity transform when none is supplied.
    apply = transform if transform is not None else (lambda item: item)
    batch: list[Any] = []
    for element in iterable:
        batch.append(apply(element))
        if len(batch) >= size:
            yield len(batch), batch
            batch = []
    # Emit the trailing partial batch, if any items remain.
    if batch:
        yield len(batch), batch
646+
647+
648+
# NOTE: `sqlite` has an `SQLITE_MAX_VARIABLE_NUMBER` compile-time flag.
649+
# On older `sqlite` versions, this was set to 999 by default,
650+
# while for newer versions it is generally higher, see:
651+
# https://www.sqlite.org/limits.html
652+
# If `DEFAULT_FILTER_SIZE` is set too high, the limit can be hit when large `IN` queries are
653+
# constructed through AiiDA, leading to SQLAlchemy `OperationalError`s.
654+
# On modern systems, the limit might be in the hundreds of thousands, however, as it is OS-
655+
# and/or Python version dependent and we don't know its size, we set the value to 999 for safety.
656+
# From manual benchmarking, this value for batching also seems to give reasonable performance.
657+
DEFAULT_FILTER_SIZE: int = 999
658+
659+
# NOTE: `DEFAULT_BATCH_SIZE` controls how many database rows are fetched and processed at once during
660+
# streaming operations (e.g., `QueryBuilder.iterall()`, `QueryBuilder.iterdict()`). This prevents
661+
# loading entire large result sets into memory at once, which could cause memory exhaustion when
662+
# working with datasets containing thousands or millions of records. The value of 1000 provides a
663+
# balance between memory efficiency and database round-trip overhead. Setting it too low increases
664+
# the number of database queries needed, while setting it too high increases memory consumption.
665+
DEFAULT_BATCH_SIZE: int = 1000

src/aiida/storage/sqlite_zip/migrations/legacy_to_main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ def _json_to_sqlite(
131131
outpath: Path, data: dict, node_repos: Dict[str, List[Tuple[str, Optional[str]]]], batch_size: int = 100
132132
) -> None:
133133
"""Convert a JSON archive format to SQLite."""
134-
from aiida.tools.archive.common import batch_iter
134+
from aiida.common.utils import batch_iter
135135

136136
from . import v1_db_schema as v1_schema
137137

src/aiida/tools/archive/common.py

Lines changed: 1 addition & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
import urllib.parse
1212
import urllib.request
1313
from html.parser import HTMLParser
14-
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Type
14+
from typing import Dict, Type
1515

1616
from aiida.orm import AuthInfo, Comment, Computer, Entity, Group, Log, Node, User
1717
from aiida.orm.entities import EntityTypes
@@ -28,30 +28,6 @@
2828
}
2929

3030

31-
def batch_iter(
32-
iterable: Iterable[Any], size: int, transform: Optional[Callable[[Any], Any]] = None
33-
) -> Iterable[Tuple[int, List[Any]]]:
34-
"""Yield an iterable in batches of a set number of items.
35-
36-
Note, the final yield may be less than this size.
37-
38-
:param transform: a transform to apply to each item
39-
:returns: (number of items, list of items)
40-
"""
41-
transform = transform or (lambda x: x)
42-
current = []
43-
length = 0
44-
for item in iterable:
45-
current.append(transform(item))
46-
length += 1
47-
if length >= size:
48-
yield length, current
49-
current = []
50-
length = 0
51-
if current:
52-
yield length, current
53-
54-
5531
class HTMLGetLinksParser(HTMLParser):
5632
"""If a filter_extension is passed, only links with extension matching
5733
the given one will be returned.

0 commit comments

Comments (0)