
Commit 20ae036

TallJimbo authored and timj committed
Move data ID expansion implementation to DataCoordinate.
This adds a new method to Registry and SqlRegistry without a RemoteRegistry implementation; that's why it isn't decorated with abstractmethod yet. I'll fix that after I make the DimensionRecord containers serializable.

This also adds a check to DataCoordinate.standardize that any passed keys are actually dimension names. That addresses a long-standing problem: a user would get a keyword argument to some higher-level API wrong, it would be forwarded as **kwargs down to DataCoordinate.standardize, and it would then be silently ignored. It turns out we were doing that even in our own test utility code!
1 parent decbdc9 · commit 20ae036
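To illustrate the new check, a minimal sketch (hypothetical repository path and data ID values; the available dimension names depend on the configured universe)::

    from lsst.daf.butler import Butler, DataCoordinate

    butler = Butler("repo")  # hypothetical repository
    universe = butler.registry.dimensions

    # Before this commit the misspelled key below was forwarded through
    # **kwargs and silently dropped; it now raises ValueError because it
    # is not a dimension name in any possible data ID.
    try:
        DataCoordinate.standardize(
            {"instrument": "HSC", "exposure": 903342},
            universe=universe,
            exposure_id=903342,  # typo'd key; not a dimension name
        )
    except ValueError as err:
        print(err)  # Unrecognized key(s) for data ID: {'exposure_id'}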

File tree

6 files changed: +139 −86 lines changed

python/lsst/daf/butler/core/_containers/_dimension_record/_abstract_set.py

Lines changed: 4 additions & 1 deletion
@@ -21,7 +21,10 @@
 
 from __future__ import annotations
 
-__all__ = ("HeterogeneousDimensionRecordAbstractSet", "HomogeneousDimensionRecordAbstractSet")
+__all__ = (
+    "HeterogeneousDimensionRecordAbstractSet",
+    "HomogeneousDimensionRecordAbstractSet",
+)
 
 from abc import abstractmethod
 from typing import Any, Collection, Iterator, Mapping

python/lsst/daf/butler/core/dimensions/_coordinate.py

Lines changed: 84 additions & 14 deletions
@@ -26,7 +26,14 @@
 
 from __future__ import annotations
 
-__all__ = ("DataCoordinate", "DataId", "DataIdKey", "DataIdValue", "SerializedDataCoordinate")
+__all__ = (
+    "DataCoordinate",
+    "DataId",
+    "DataIdKey",
+    "DataIdValue",
+    "InconsistentDataIdError",
+    "SerializedDataCoordinate",
+)
 
 from abc import abstractmethod
 import numbers
@@ -53,6 +60,7 @@
 
 if TYPE_CHECKING:  # Imports needed only for type annotations; may be circular.
     from ._universe import DimensionUniverse
+    from .._containers import HeterogeneousDimensionRecordAbstractSet
     from ...registry import Registry
 
 DataIdKey = Union[str, Dimension]
@@ -67,6 +75,12 @@
 """
 
 
+class InconsistentDataIdError(ValueError):
+    """Exception raised when a data ID contains contradictory key-value pairs,
+    according to dimension relationships.
+    """
+
+
 class SerializedDataCoordinate(BaseModel):
     """Simplified model for serializing a `DataCoordinate`."""
 
@@ -149,6 +163,7 @@ def standardize(
         graph: Optional[DimensionGraph] = None,
         universe: Optional[DimensionUniverse] = None,
         defaults: Optional[DataCoordinate] = None,
+        records: Optional[HeterogeneousDimensionRecordAbstractSet] = None,
         **kwargs: Any
     ) -> DataCoordinate:
         """Standardize the supplied dataId.
@@ -173,6 +188,10 @@ def standardize(
             Default dimension key-value pairs to use when needed. These are
             never used to infer ``graph``, and are ignored if a different value
             is provided for the same key in ``mapping`` or ``**kwargs``.
+        records : `HeterogeneousDimensionRecordAbstractSet`, optional
+            Container of `DimensionRecord` instances that may be used to
+            fill in missing keys and/or attach records. If provided, the
+            returned object is guaranteed to have `hasRecords` return `True`.
         **kwargs
             Additional keyword arguments are treated like additional key-value
             pairs in ``mapping``.
@@ -190,33 +209,41 @@ def standardize(
             Raised if a key-value pair for a required dimension is missing.
         """
         d: Dict[str, DataIdValue] = {}
+        r: Dict[str, Optional[DimensionRecord]] = {}
         if isinstance(mapping, DataCoordinate):
             if graph is None:
-                if not kwargs:
+                if not kwargs and records is None:
                     # Already standardized to exactly what we want.
                     return mapping
-            elif kwargs.keys().isdisjoint(graph.dimensions.names):
-                # User provided kwargs, but told us not to use them by
-                # passing in dimensions that are disjoint from those kwargs.
-                # This is not necessarily user error - it's a useful pattern
-                # to pass in all of the key-value pairs you have and let the
-                # code here pull out only what it needs.
-                return mapping.subset(graph)
             assert universe is None or universe == mapping.universe
             universe = mapping.universe
             d.update((name, mapping[name]) for name in mapping.graph.required.names)
             if mapping.hasFull():
                 d.update((name, mapping[name]) for name in mapping.graph.implied.names)
+            if mapping.hasRecords():
+                r.update((name, mapping.records[name]) for name in mapping.graph.elements.names)
         elif isinstance(mapping, NamedKeyMapping):
             d.update(mapping.byName())
         elif mapping is not None:
             d.update(mapping)
         d.update(kwargs)
-        if graph is None:
-            if defaults is not None:
+        if universe is None:
+            if graph is not None:
+                universe = graph.universe
+            elif defaults is not None:
                 universe = defaults.universe
-        elif universe is None:
-            raise TypeError("universe must be provided if graph is not.")
+            else:
+                raise TypeError("universe must be provided if graph and defaults are not.")
+        if not (d.keys() <= universe.getStaticDimensions().names):
+            # We silently ignore keys that aren't relevant for this particular
+            # data ID, but keys that aren't relevant for any possible data ID
+            # are a bug that we want to report to the user. This is especially
+            # important because other code frequently forwards unrecognized
+            # kwargs here.
+            raise ValueError(
+                f"Unrecognized key(s) for data ID: {d.keys() - universe.getStaticDimensions().names}."
+            )
+        if graph is None:
             graph = DimensionGraph(universe, names=d.keys())
         if not graph.dimensions:
             return DataCoordinate.makeEmpty(graph.universe)
@@ -227,6 +254,46 @@ def standardize(
         else:
             for k, v in defaults.items():
                 d.setdefault(k.name, v)
+        if records is not None:
+            for element in graph.primaryKeyTraversalOrder:
+                record = r.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
+                if record is ...:
+                    if isinstance(element, Dimension) and d.get(element.name) is None:
+                        if element in graph.required:
+                            raise LookupError(
+                                f"No value or null value for required dimension {element.name}."
+                            )
+                        d[element.name] = None
+                        record = None
+                    else:
+                        subset_data_id = DataCoordinate.standardize(d, graph=element.graph)
+                        try:
+                            record = records.by_definition[element].by_data_id[subset_data_id]
+                        except KeyError:
+                            record = None
+                    r[element.name] = record
+                if record is not None:
+                    for dimension in element.implied:
+                        value = getattr(record, dimension.name)
+                        if d.setdefault(dimension.name, value) != value:
+                            raise InconsistentDataIdError(
+                                f"Data ID {d} has {dimension.name}={d[dimension.name]!r}, "
+                                f"but {element.name} implies {dimension.name}={value!r}."
+                            )
+                else:
+                    if element in graph.required:
+                        raise LookupError(
+                            f"Could not find record for required dimension {element.name} via {d}."
+                        )
+                    if element.alwaysJoin:
+                        raise InconsistentDataIdError(
+                            f"Could not fetch record for element {element.name} via {d}, "
+                            "but it is marked alwaysJoin=True; this means one or more dimensions are not "
+                            "related."
+                        )
+                    for dimension in element.implied:
+                        d.setdefault(dimension.name, None)
+                        r.setdefault(dimension.name, None)
         if d.keys() >= graph.dimensions.names:
             values = tuple(d[name] for name in graph._dataCoordinateIndices.keys())
         else:
@@ -238,7 +305,10 @@ def standardize(
             # numbers.Integral; convert that to int.
             values = tuple(int(val) if isinstance(val, numbers.Integral)  # type: ignore
                            else val for val in values)
-        return _BasicTupleDataCoordinate(graph, values)
+        result: DataCoordinate = _BasicTupleDataCoordinate(graph, values)
+        if r.keys() >= graph.elements.names:
+            result = result.expanded(r)
+        return result
 
     @staticmethod
     def makeEmpty(universe: DimensionUniverse) -> DataCoordinate:
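A sketch of the new calling pattern this enables (data ID values are hypothetical; ``cache`` comes from the new `Registry.getDimensionRecordCache` method added below)::

    cache = butler.registry.getDimensionRecordCache()
    data_id = DataCoordinate.standardize(
        {"instrument": "HSC", "detector": 53},
        universe=butler.registry.dimensions,
        records=cache,
    )
    # Records were looked up and attached during standardization.
    assert data_id.hasRecords()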

python/lsst/daf/butler/registries/sql.py

Lines changed: 16 additions & 60 deletions
@@ -60,11 +60,10 @@
     DimensionGraph,
     DimensionRecord,
     DimensionUniverse,
+    HeterogeneousDimensionRecordCache,
     HomogeneousDimensionRecordIterable,
-    NamedKeyMapping,
     NameLookupMapping,
     Progress,
-    ScalarDataCoordinateSet,
     StorageClassFactory,
     Timespan,
 )
@@ -76,7 +75,6 @@
     CollectionType,
     RegistryDefaults,
     ConflictingDefinitionError,
-    InconsistentDataIdError,
     OrphanedRecordError,
     CollectionSearch,
 )
@@ -619,6 +617,14 @@ def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
         # Docstring inherited from lsst.daf.butler.registry.Registry
         return self._managers.datastores.findDatastores(ref)
 
+    def getDimensionRecordCache(self) -> HeterogeneousDimensionRecordCache:
+        # Docstring inherited.
+        callbacks = {
+            element.name: self._managers.dimensions[element].fetch
+            for element in self.dimensions.getStaticElements()
+        }
+        return HeterogeneousDimensionRecordCache(self.dimensions, callbacks)
+
     def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                      records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
                      withDefaults: bool = True,
@@ -628,63 +634,13 @@ def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[Dimen
             defaults = None
         else:
             defaults = self.defaults.dataId
-        standardized = DataCoordinate.standardize(dataId, graph=graph, universe=self.dimensions,
-                                                  defaults=defaults, **kwargs)
-        if standardized.hasRecords():
-            return standardized
-        if records is None:
-            records = {}
-        elif isinstance(records, NamedKeyMapping):
-            records = records.byName()
-        else:
-            records = dict(records)
-        if isinstance(dataId, DataCoordinate) and dataId.hasRecords():
-            records.update(dataId.records.byName())
-        keys = standardized.byName()
-        for element in standardized.graph.primaryKeyTraversalOrder:
-            record = records.get(element.name, ...)  # Use ... to mean not found; None might mean NULL
-            if record is ...:
-                if isinstance(element, Dimension) and keys.get(element.name) is None:
-                    if element in standardized.graph.required:
-                        raise LookupError(
-                            f"No value or null value for required dimension {element.name}."
-                        )
-                    keys[element.name] = None
-                    record = None
-                else:
-                    storage = self._managers.dimensions[element]
-                    dataIdSet = ScalarDataCoordinateSet(
-                        DataCoordinate.standardize(keys, graph=element.graph)
-                    )
-                    fetched = tuple(storage.fetch(dataIdSet))
-                    try:
-                        (record,) = fetched
-                    except ValueError:
-                        record = None
-                records[element.name] = record
-            if record is not None:
-                for d in element.implied:
-                    value = getattr(record, d.name)
-                    if keys.setdefault(d.name, value) != value:
-                        raise InconsistentDataIdError(
-                            f"Data ID {standardized} has {d.name}={keys[d.name]!r}, "
-                            f"but {element.name} implies {d.name}={value!r}."
-                        )
-            else:
-                if element in standardized.graph.required:
-                    raise LookupError(
-                        f"Could not fetch record for required dimension {element.name} via keys {keys}."
-                    )
-                if element.alwaysJoin:
-                    raise InconsistentDataIdError(
-                        f"Could not fetch record for element {element.name} via keys {keys}, "
-                        "but it is marked alwaysJoin=True; this means one or more dimensions are not "
-                        "related."
-                    )
-                for d in element.implied:
-                    keys.setdefault(d.name, None)
-                    records.setdefault(d.name, None)
-        return DataCoordinate.standardize(keys, graph=standardized.graph).expanded(records=records)
+        cache = self.getDimensionRecordCache()
+        if records is not None:
+            for record in records.values():
+                if record is not None:
+                    cache.add(record)
+        return DataCoordinate.standardize(dataId, graph=graph, defaults=defaults, records=cache,
+                                          universe=self.dimensions, **kwargs)
 
     def insertDimensionData(self, element: Union[DimensionElement, str],
                             *data: Union[Mapping[str, Any], DimensionRecord],
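With the traversal loop moved into `DataCoordinate.standardize`, `expandDataId` reduces to cache setup plus a single `standardize` call; the two forms below should be equivalent (data ID values are hypothetical)::

    expanded = butler.registry.expandDataId(instrument="HSC", detector=53)

    cache = butler.registry.getDimensionRecordCache()
    also_expanded = DataCoordinate.standardize(
        instrument="HSC",
        detector=53,
        universe=butler.registry.dimensions,
        defaults=butler.registry.defaults.dataId,
        records=cache,
    )
    assert expanded == also_expanded and also_expanded.hasRecords()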

python/lsst/daf/butler/registry/_exceptions.py

Lines changed: 3 additions & 5 deletions
@@ -28,11 +28,9 @@
     "UnsupportedIdGeneratorError",
 )
 
-
-class InconsistentDataIdError(ValueError):
-    """Exception raised when a data ID contains contradictory key-value pairs,
-    according to dimension relationships.
-    """
+# This exception moved for intra-daf_butler dependency reasons; import here and
+# re-export for backwards compatibility.
+from ..core import InconsistentDataIdError
 
 
 class ConflictingDefinitionError(Exception):
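Assuming `lsst.daf.butler.core` re-exports the `__all__` of its dimensions submodule (as it does for `DataCoordinate`), both import paths resolve to the same class::

    from lsst.daf.butler.core import InconsistentDataIdError as NewLocation
    from lsst.daf.butler.registry import InconsistentDataIdError as OldLocation

    assert NewLocation is OldLocation  # same class, two import paths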

python/lsst/daf/butler/registry/_registry.py

Lines changed: 31 additions & 0 deletions
@@ -60,6 +60,7 @@
     DimensionGraph,
     DimensionRecord,
     DimensionUniverse,
+    HeterogeneousDimensionRecordCache,
     HomogeneousDimensionRecordIterable,
     NameLookupMapping,
     StorageClassFactory,
@@ -1007,6 +1008,36 @@ def getDatasetLocations(self, ref: DatasetRef) -> Iterable[str]:
         """
         raise NotImplementedError()
 
+    def getDimensionRecordCache(self) -> HeterogeneousDimensionRecordCache:
+        """Return a container that fetches and caches `DimensionRecord` objects
+        from the database.
+
+        Returns
+        -------
+        cache : `HeterogeneousDimensionRecordCache`
+            A container that directly holds already-fetched `DimensionRecord`
+            objects and automatically fetches new ones as requested (see class
+            documentation for more information).
+
+        Notes
+        -----
+        This provides a simpler, faster interface for fetching dimension data
+        when the data IDs desired are already known exactly; use
+        `queryDimensionRecords` to return records via a more flexible
+        expression. One can also use the result of a call to
+        `queryDimensionRecords` to directly populate a cache::
+
+            cache = butler.registry.getDimensionRecordCache()
+            cache.update(butler.registry.queryDimensionRecords(...))
+
+        To obtain a container that does not automatically fetch missing
+        records, construct a `HeterogeneousDimensionRecordSet` from the cache::
+
+            cache = butler.registry.getDimensionRecordCache()
+            set = HeterogeneousDimensionRecordSet(cache.universe, cache)
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def expandDataId(self, dataId: Optional[DataId] = None, *, graph: Optional[DimensionGraph] = None,
                      records: Optional[NameLookupMapping[DimensionElement, Optional[DimensionRecord]]] = None,
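The nested lookup in `DataCoordinate.standardize` above (``records.by_definition[element].by_data_id[...]``) shows how the cache is indexed; a usage sketch under the same assumptions (data ID values are hypothetical)::

    cache = butler.registry.getDimensionRecordCache()
    universe = butler.registry.dimensions

    # The first access fetches from the database via the per-element
    # callbacks; later accesses for the same data ID hit the cache.
    detector_id = DataCoordinate.standardize(
        {"instrument": "HSC", "detector": 53}, universe=universe
    )
    record = cache.by_definition[universe["detector"]].by_data_id[detector_id]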

python/lsst/daf/butler/tests/utils.py

Lines changed: 1 addition & 6 deletions
@@ -249,10 +249,6 @@ def addDataset(self, dataId, run=None, datasetType=None):
         """Create a new example metric and add it to the named run with the
         given dataId.
 
-        Overwrites tags, so this does not try to associate the new dataset with
-        existing tags. (If/when tags are needed this can be added to the
-        arguments of this function.)
-
         Parameters
         ----------
         dataId : `dict`
@@ -270,5 +266,4 @@ def addDataset(self, dataId, run=None, datasetType=None):
         self.butler.put(metric,
                         self.datasetType if datasetType is None else datasetType,
                         dataId,
-                        run=run,
-                        tags=())
+                        run=run)
