Skip to content

Commit 14cd533

Browse files
Merge pull request #241 from bento-platform/feat/discovery/config-model
feat: discovery module with config model
2 parents 6c98817 + aeb7854 commit 14cd533

22 files changed

+949
-1
lines changed

README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,15 @@ additional code to set up error handling and basic Bento service boilerplate.
9797

9898
`db` contains common base classes for setting up database managers.
9999

100+
### `discovery`
101+
102+
`discovery` contains models and helper functions for the Bento Discovery Configuration specification, used
103+
in [Katsu](https://github.com/bento-platform/katsu).
104+
105+
#### Guides
106+
107+
* [Discovery configuration: structure and validation](./docs/discovery/discovery_config.md)
108+
100109
### `drs`
101110

102111
`drs` provides utilities for fetching data and record metadata from

bento_lib/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from . import apps
44
from . import auth
5+
from . import discovery
56
from . import drs
67
from . import events
78
from . import schemas
@@ -15,6 +16,7 @@
1516
"__version__",
1617
"apps",
1718
"auth",
19+
"discovery",
1820
"drs",
1921
"events",
2022
"schemas",

bento_lib/_internal.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
import logging
2+
from structlog.stdlib import BoundLogger, get_logger
23

34
__all__ = ["internal_logger"]
45

56
logging.basicConfig(level=logging.NOTSET)
67

7-
internal_logger = logging.getLogger("bento_lib")
8+
internal_logger: BoundLogger = get_logger("bento_lib")

bento_lib/discovery/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from . import helpers
2+
from . import models
3+
4+
__all__ = ["helpers", "models"]

bento_lib/discovery/exceptions.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
__all__ = ["DiscoveryValidationError"]
2+
3+
4+
class DiscoveryValidationError(Exception):
    """Raised when a discovery configuration fails reference/duplicate validation."""

bento_lib/discovery/helpers.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
from pathlib import Path
2+
from structlog.stdlib import BoundLogger
3+
4+
from bento_lib._internal import internal_logger
5+
from .exceptions import DiscoveryValidationError
6+
from .models.config import DiscoveryConfig
7+
8+
__all__ = [
9+
"load_discovery_config",
10+
]
11+
12+
13+
# Failure reasons shared by the log events and the DiscoveryValidationError messages below, so that both
# always use identical wording.
FIELD_DEF_NOT_FOUND = "field definition not found"
FIELD_ALREADY_SEEN = "field already seen"
15+
16+
17+
def _load_discovery_config_json(config_path: Path | str) -> DiscoveryConfig:
    """
    Read a discovery configuration JSON file and validate it into a DiscoveryConfig model.

    :param config_path: Location of the JSON configuration file.
    :return: The configuration, parsed and validated by Pydantic.
    :raises OSError: If the file cannot be opened or read.
    :raises pydantic.ValidationError: If the file contents do not match the DiscoveryConfig model.
    """
    # JSON documents are UTF-8; decode explicitly rather than relying on the platform's locale encoding, which
    # would otherwise mis-decode (or fail on) non-ASCII titles/descriptions on non-UTF-8 systems.
    with open(config_path, "r", encoding="utf-8") as fh:
        return DiscoveryConfig.model_validate_json(fh.read())
20+
21+
22+
def _validate_references_and_duplicates(cfg: DiscoveryConfig, logger: BoundLogger) -> None:
    """
    Cross-check the overview and search sections of a config against its field definitions.

    Validation stops at the first problem found: the problem is logged as an error and then raised. Overview
    charts are checked first, then search entries. A field may appear once as a chart AND once as a search
    entry; only repeats within the same area are rejected. Finally, any defined field which is referenced by
    neither area produces a warning log entry (not an error).

    :param cfg: The already-loaded discovery configuration to check.
    :param logger: Logger which receives error/warning events.
    :raises DiscoveryValidationError: If a chart or search entry names a field missing from ``cfg.fields``,
        or repeats a field already used earlier in the same area.
    """
    defined = cfg.fields

    # --- overview charts: every chart must point at a defined field, and each field may be charted once ------
    charted: set[str] = set()
    for section_pos, overview_section in enumerate(cfg.overview):
        title = overview_section.section_title
        for chart_pos, chart in enumerate(overview_section.charts):
            name = chart.field

            problem: str | None = None
            if name not in defined:
                problem = FIELD_DEF_NOT_FOUND
            elif name in charted:
                problem = FIELD_ALREADY_SEEN

            if problem is None:
                charted.add(name)
                continue

            where = f"overview > section {title} [{section_pos}] > {name} {chart.chart_type} [{chart_pos}]"
            logger.error(f"overview {problem}", section=title, field=name, chart_idx=chart_pos)
            raise DiscoveryValidationError(f"{where}: {problem}")

    # --- search entries: same two rules, tracked separately from the overview charts -------------------------
    searched: set[str] = set()
    for section_pos, search_section in enumerate(cfg.search):
        title = search_section.section_title
        for entry_pos, name in enumerate(search_section.fields):
            problem = None
            if name not in defined:
                problem = FIELD_DEF_NOT_FOUND
            elif name in searched:
                problem = FIELD_ALREADY_SEEN

            if problem is None:
                searched.add(name)
                continue

            where = f"search > section {title} [{section_pos}] > {name} [{entry_pos}]"
            logger.error(f"search {problem}", section=title, field=name)
            raise DiscoveryValidationError(f"{where}: {problem}")

    # --- unused definitions: warn (don't fail) about fields that nothing in the config points at -------------
    used = charted | searched
    for def_pos, name in enumerate(defined):
        if name in used:
            continue
        logger.warning("field not referenced", field=name, field_idx=def_pos)
62+
63+
64+
def load_discovery_config(config_path: Path | str, logger: BoundLogger | None = None) -> DiscoveryConfig:
    """
    Load a discovery configuration from a JSON file and validate its internal references.

    The file is first parsed into a DiscoveryConfig (a Pydantic validation error is raised if the config is in
    the wrong format). The parsed config is then checked so that:

      a) all fields used in the overview and search sections are defined;
      b) no field is listed more than once as a chart, or more than once as a search filter;
      c) a warning is issued for any defined field which the config doesn't reference anywhere.

    :param config_path: Location of the JSON configuration file.
    :param logger: Logger for validation errors/warnings; the library's internal logger is used if not given.
    :return: The loaded and validated configuration.
    :raises DiscoveryValidationError: If check (a) or (b) fails.
    """
    discovery_config = _load_discovery_config_json(config_path)

    # Fall back to the library-wide logger when the caller doesn't supply one.
    validation_logger = logger or internal_logger
    _validate_references_and_duplicates(discovery_config, validation_logger)

    # References are valid at this point, so the config is safe to hand back.
    return discovery_config
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from . import config, fields, overview, search
2+
3+
__all__ = [
4+
"config",
5+
"fields",
6+
"overview",
7+
"search",
8+
]

bento_lib/discovery/models/config.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from pydantic import BaseModel
2+
3+
from .fields import FieldDefinition
4+
from .overview import OverviewSection
5+
from .search import SearchSection
6+
7+
__all__ = [
8+
"DiscoveryConfigRules",
9+
"DiscoveryConfig",
10+
]
11+
12+
13+
# Integer rule settings of a discovery configuration (the `rules` entry of DiscoveryConfig). Both are required.
# NOTE(review): what each value governs is not visible in this module - presumably `count_threshold` is a
# minimum count for showing results and `max_query_parameters` caps the number of search filters per query;
# confirm against the Discovery Configuration specification.
class DiscoveryConfigRules(BaseModel):
    count_threshold: int
    max_query_parameters: int
16+
17+
18+
# Top-level model for a discovery configuration document. All four entries are required.
class DiscoveryConfig(BaseModel):
    # Overview sections, each holding a list of charts.
    overview: list[OverviewSection]
    # Search sections, each holding a list of field IDs.
    search: list[SearchSection]
    # Field definitions keyed by field ID; overview charts and search sections refer to fields by these keys.
    fields: dict[str, FieldDefinition]
    # Numeric rule settings.
    rules: DiscoveryConfigRules

bento_lib/discovery/models/fields.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
from pydantic import AliasChoices, BaseModel, Field
2+
from typing import Literal
3+
4+
__all__ = [
5+
"BaseFieldDefinition",
6+
# string
7+
"StringFieldConfig",
8+
"StringFieldDefinition",
9+
# number
10+
"BaseNumberFieldConfig",
11+
"ManualBinsNumberFieldConfig",
12+
"AutoBinsNumberFieldConfig",
13+
"NumberFieldDefinition",
14+
# date
15+
"DateFieldConfig",
16+
"DateFieldDefinition",
17+
# sum type:
18+
"FieldDefinition",
19+
]
20+
21+
22+
# Shared field settings for `data_type`: on input, the value is accepted under either the "data_type" key or the
# "datatype" key.
DataTypeField = Field(validation_alias=AliasChoices("data_type", "datatype"))
23+
24+
25+
# Properties common to every field definition, regardless of data type. The type-specific subclasses narrow
# `data_type` to a single literal and add a matching `config` entry.
class BaseFieldDefinition(BaseModel):
    mapping: str
    title: str  # TODO: make optional and pull from Bento schema if not set
    description: str  # TODO: make optional and pull from Bento schema if not set
    # Accepted on input as either "data_type" or "datatype" (see DataTypeField).
    data_type: Literal["string", "number", "date"] = DataTypeField
    # --- The below fields are currently valid, but need to be reworked for new search ---------------------------------
    mapping_for_search_filter: str | None = None
    group_by: str | None = None
    group_by_value: str | None = None
    value_mapping: str | None = None
    # ------------------------------------------------------------------------------------------------------------------
36+
37+
38+
# Type-specific configuration for string fields.
class StringFieldConfig(BaseModel):
    # No default is given, so the key must be present; its value may be a list of strings or null.
    # NOTE(review): presumably the list of allowed values, with null meaning "not restricted" - confirm.
    enum: list[str] | None
40+
41+
42+
# Field definition for string-typed fields: `data_type` must be "string", and a string config is required.
class StringFieldDefinition(BaseFieldDefinition):
    data_type: Literal["string"] = DataTypeField
    config: StringFieldConfig
45+
46+
47+
# Configuration shared by both number-field binning variants (manual bins and automatic bins).
class BaseNumberFieldConfig(BaseModel):
    units: str
49+
50+
51+
# Number-field configuration where the bins are listed explicitly.
# NOTE(review): presumably the listed numbers are bin boundaries - confirm against the specification.
class ManualBinsNumberFieldConfig(BaseNumberFieldConfig):
    bins: list[int | float]
53+
54+
55+
# Number-field configuration where bins are described by integer parameters instead of being listed explicitly.
# NOTE(review): the model does not check the values against each other (e.g. minimum vs. maximum); the
# precise meaning of the taper values is not visible here - confirm against the specification.
class AutoBinsNumberFieldConfig(BaseNumberFieldConfig):
    bin_size: int
    taper_left: int
    taper_right: int
    minimum: int
    maximum: int
61+
62+
63+
# Field definition for number-typed fields: `data_type` must be "number", and the config must match either the
# manual-bins or the auto-bins model.
class NumberFieldDefinition(BaseFieldDefinition):
    data_type: Literal["number"] = DataTypeField
    config: ManualBinsNumberFieldConfig | AutoBinsNumberFieldConfig
66+
67+
68+
# Type-specific configuration for date fields.
class DateFieldConfig(BaseModel):
    bin_by: Literal["month"]  # Currently only binning by month is implemented
70+
71+
72+
# Field definition for date-typed fields: `data_type` must be "date", and a date config is required.
class DateFieldDefinition(BaseFieldDefinition):
    data_type: Literal["date"] = DataTypeField
    config: DateFieldConfig
75+
76+
77+
# Sum type covering every concrete field definition: a field is exactly one of date, number, or string.
FieldDefinition = DateFieldDefinition | NumberFieldDefinition | StringFieldDefinition
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from pydantic import BaseModel
2+
from typing import Literal
3+
4+
__all__ = [
5+
"OverviewChart",
6+
"OverviewSection",
7+
]
8+
9+
10+
# A single chart within an overview section.
class OverviewChart(BaseModel):
    # ID of the field which the chart displays.
    field: str
    # Kind of chart to render; restricted to the listed literals.
    chart_type: Literal["bar", "choropleth", "histogram", "pie"]
13+
14+
15+
# A titled group of charts in the overview.
class OverviewSection(BaseModel):
    section_title: str
    charts: list[OverviewChart]

0 commit comments

Comments
 (0)