diff --git a/cookiecutter/mapper-template/cookiecutter.json b/cookiecutter/mapper-template/cookiecutter.json index 9d7b6385f..073319d08 100644 --- a/cookiecutter/mapper-template/cookiecutter.json +++ b/cookiecutter/mapper-template/cookiecutter.json @@ -5,6 +5,7 @@ "mapper_id": "mapper-{{ cookiecutter.name.lower() }}", "library_name": "{{ cookiecutter.mapper_id.replace('-', '_') }}", "variant": "None (Skip)", + "faker_extra": false, "include_ci_files": ["GitHub", "None (Skip)"], "license": ["Apache-2.0"], "ide": ["VSCode", "None"], @@ -14,6 +15,7 @@ "admin_email": "Provide your [bold yellow]email[/]", "mapper_id": "The ID of the tap, in kebab-case", "library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.", + "faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?", "include_ci_files": "Whether to include CI files for a common CI services", "license": "The license for the project", "ide": "Add configuration files for your preferred IDE" diff --git a/cookiecutter/mapper-template/{{cookiecutter.mapper_id}}/pyproject.toml b/cookiecutter/mapper-template/{{cookiecutter.mapper_id}}/pyproject.toml index d0ae4b1a9..fcbd018e4 100644 --- a/cookiecutter/mapper-template/{{cookiecutter.mapper_id}}/pyproject.toml +++ b/cookiecutter/mapper-template/{{cookiecutter.mapper_id}}/pyproject.toml @@ -31,7 +31,7 @@ packages = [ [tool.poetry.dependencies] python = ">=3.8" -singer-sdk = { version="~=0.34.1" } +singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} } fs-s3fs = { version = "~=1.1.1", optional = true } [tool.poetry.group.dev.dependencies] diff --git a/cookiecutter/tap-template/cookiecutter.json b/cookiecutter/tap-template/cookiecutter.json index 4eae4d8ad..cd5c01bf9 100644 --- a/cookiecutter/tap-template/cookiecutter.json +++ b/cookiecutter/tap-template/cookiecutter.json @@ -14,6 
+14,7 @@ "JWT", "Custom or N/A" ], + "faker_extra": false, "include_ci_files": ["GitHub", "None"], "license": ["Apache-2.0", "None"], "ide": ["VSCode", "None"], @@ -25,6 +26,7 @@ "library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.", "stream_type": "The type of stream the source provides", "auth_method": "The [bold red]authentication[/] method used by the source, for REST and GraphQL sources", + "faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?", "include_ci_files": "Whether to include CI files for a common CI services", "license": "The license for the project", "ide": "Add configuration files for your preferred IDE" diff --git a/cookiecutter/tap-template/{{cookiecutter.tap_id}}/pyproject.toml b/cookiecutter/tap-template/{{cookiecutter.tap_id}}/pyproject.toml index b1902b724..a48b5f551 100644 --- a/cookiecutter/tap-template/{{cookiecutter.tap_id}}/pyproject.toml +++ b/cookiecutter/tap-template/{{cookiecutter.tap_id}}/pyproject.toml @@ -31,7 +31,7 @@ packages = [ [tool.poetry.dependencies] python = ">=3.8" importlib-resources = { version = "==6.1.*", python = "<3.9" } -singer-sdk = { version="~=0.34.1" } +singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} } fs-s3fs = { version = "~=1.1.1", optional = true } {%- if cookiecutter.stream_type in ["REST", "GraphQL"] %} requests = "~=2.31.0" diff --git a/cookiecutter/target-template/cookiecutter.json b/cookiecutter/target-template/cookiecutter.json index 2490a31db..5c1ff2544 100644 --- a/cookiecutter/target-template/cookiecutter.json +++ b/cookiecutter/target-template/cookiecutter.json @@ -6,6 +6,7 @@ "library_name": "{{ cookiecutter.target_id.replace('-', '_') }}", "variant": "None (Skip)", "serialization_method": ["Per record", "Per batch", "SQL"], + "faker_extra": false, "include_ci_files": ["GitHub", "None 
(Skip)"], "license": ["Apache-2.0"], "ide": ["VSCode", "None"], @@ -16,6 +17,7 @@ "mapper_id": "The ID of the tap, in kebab-case", "library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.", "serialization_method": "The serialization method to use for loading data", + "faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?", "include_ci_files": "Whether to include CI files for a common CI services", "license": "The license for the project", "ide": "Add configuration files for your preferred IDE" diff --git a/cookiecutter/target-template/{{cookiecutter.target_id}}/pyproject.toml b/cookiecutter/target-template/{{cookiecutter.target_id}}/pyproject.toml index 7bcceaf69..ebe332f8e 100644 --- a/cookiecutter/target-template/{{cookiecutter.target_id}}/pyproject.toml +++ b/cookiecutter/target-template/{{cookiecutter.target_id}}/pyproject.toml @@ -30,7 +30,7 @@ packages = [ [tool.poetry.dependencies] python = ">=3.8" -singer-sdk = { version="~=0.34.1" } +singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} } fs-s3fs = { version = "~=1.1.1", optional = true } {%- if cookiecutter.serialization_method != "SQL" %} requests = "~=2.31.0" diff --git a/docs/stream_maps.md b/docs/stream_maps.md index 8d84d9ea0..b7bffd36b 100644 --- a/docs/stream_maps.md +++ b/docs/stream_maps.md @@ -80,23 +80,13 @@ These capabilities are all out of scope _by design_: a transformation tool like [dbt](https://www.getdbt.com), or (b) create a custom mapper plugin with inline lookup logic. -### A feature for all Singer users, enabled by the SDK +## A feature for all Singer users, enabled by the SDK -The mapping features described here are create for the **_users_** of SDK-based taps and targets. 
+The mapping features described here are created for the **_users_** of SDK-based taps and targets, which support inline transformations with `stream_maps` and `stream_map_config` out-of-box. -Developers simply enable the feature using the instructions below, and then users can benefit from having inline transformation capabilities out-of-box on their favorite taps and targets. +**Note:** to support non-SDK taps and targets, the standalone inline mapper plugin [`meltano-map-transformer`](https://hub.meltano.com/mappers/meltano-map-transformer/) follows all specifications defined here and can apply mapping transformations between _any_ Singer tap and target, even if they are not built using the SDK. -**Note:** to support non-SDK taps and targets, we are also creating a standalone inline mapper plugin (`meltano-map-transform`), which follows all specifications defined here and can apply mapping transformations between _any_ Singer tap and target, even if they are not built using the SDK. - -## Enabling Stream Maps in SDK-Based Plugins - -To support inline mapping functions, the developer only needs to declare two plugin settings, -called `stream_maps` and `stream_map_config`, and declare both settings as `object` type. (For example: -`Property("stream_maps, ObjectType())` if using the python helper classes or -`"stream_maps": {"type": "object"}` if using native JSON Schema declarations.) - -If the `stream_maps` setting is detected, the following behaviors will be implemented -by the SDK automatically: +The following behaviors are implemented by the SDK automatically: 1. For taps, the SCHEMA and RECORD messages will automatically be transformed, duplicated, filtered, or aliased, as per the `stream_maps` config settings _after_ @@ -108,7 +98,7 @@ by the SDK automatically: setting _prior_ to any Sink processing functions. - This means that the target developer can assume that all streams and records are transformed, aliased, filtered, etc. 
_before_ any custom target code is executed. -3. The upcoming standalone mapper plugin (`meltano-map-transform`) is a hybrid tap/target which +3. The standalone mapper plugin [`meltano-map-transformer`](https://hub.meltano.com/mappers/meltano-map-transformer/) is a hybrid tap/target which simply receives input from a tap, transforms all stream and schema messages via the `stream_maps` config option, and then emits the resulting stream(s) to a downstream target. @@ -122,8 +112,7 @@ by the SDK automatically: The `stream_maps` config expects a mapping of stream names to a structured transform object. -Here is a sample `stream_maps` transformation which removes all references to `email` and -adds `email_domain` and `email_hash` as new properties: +Here is a sample `stream_maps` transformation which obfuscates `phone_number` with a fake value, removes all references to `email` and adds `email_domain` and `email_hash` as new properties: `meltano.yml` or `config.json`: @@ -138,9 +127,18 @@ stream_maps: email_domain: owner_email.split('@')[-1] # for uniqueness checks email_hash: md5(config['hash_seed'] + owner_email) + # generate a fake phone number + phone_number: fake.phone_number() stream_map_config: # hash outputs are not able to be replicated without the original seed: hash_seed: 01AWZh7A6DzGm6iJZZ2T +faker_config: + # set specific seed + seed: 0 + # set specific locales + locale: + - en_US + - en_GB ``` ```` @@ -151,11 +149,19 @@ stream_map_config: "customers": { "email": null, "email_domain": "owner_email.split('@')[-1]", - "email_hash": "md5(config['hash_seed'] + owner_email)" + "email_hash": "md5(config['hash_seed'] + owner_email)", + "phone_number": "fake.phone_number()" } }, "stream_map_config": { "hash_seed": "01AWZh7A6DzGm6iJZZ2T" + }, + "faker_config": { + "seed": 0, + "locale": [ + "en_US", + "en_GB" + ] } } ``` @@ -236,6 +242,11 @@ can be referenced directly by mapping expressions. 
- `record` - an alias for the record values dictionary in the current stream. - `_` - same as `record` but shorter to type - `self` - the existing property value if the property already exists +- `fake` - a [`Faker`](https://faker.readthedocs.io/en/master/) instance, configurable via `faker_config` (see previous example) - see the built-in [standard providers](https://faker.readthedocs.io/en/master/providers.html) for available methods + + ```{tip} + The `fake` object is only available if the plugin specifies `faker` as an additional dependency (through the `singer-sdk` `faker` extra, or directly). + ``` #### Automatic Schema Detection diff --git a/e2e-tests/cookiecutters/mapper-base.json b/e2e-tests/cookiecutters/mapper-base.json index 01834631d..24d67ad61 100644 --- a/e2e-tests/cookiecutters/mapper-base.json +++ b/e2e-tests/cookiecutters/mapper-base.json @@ -6,6 +6,7 @@ "mapper_id": "mapper-base", "library_name": "mapper_base", "variant": "None (Skip)", + "faker_extra": false, "include_ci_files": "None (Skip)", "license": "Apache-2.0", "ide": "VSCode", diff --git a/e2e-tests/cookiecutters/tap-faker.json b/e2e-tests/cookiecutters/tap-faker.json new file mode 100644 index 000000000..c01d30642 --- /dev/null +++ b/e2e-tests/cookiecutters/tap-faker.json @@ -0,0 +1,18 @@ +{ + "cookiecutter": { + "source_name": "AutomaticTestTap", + "admin_name": "Automatic Tester", + "admin_email": "auto.tester@example.com", + "tap_id": "tap-faker", + "library_name": "tap_faker", + "variant": "None (Skip)", + "stream_type": "REST", + "auth_method": "Bearer Token", + "include_ci_files": "None (Skip)", + "faker_extra": true, + "license": "Apache-2.0", + "ide": "VSCode", + "_template": "../tap-template/", + "_output_dir": "." 
+ } +} diff --git a/e2e-tests/cookiecutters/tap-graphql-jwt.json b/e2e-tests/cookiecutters/tap-graphql-jwt.json index 28b3dfee4..4fe00ef7b 100644 --- a/e2e-tests/cookiecutters/tap-graphql-jwt.json +++ b/e2e-tests/cookiecutters/tap-graphql-jwt.json @@ -9,6 +9,7 @@ "stream_type": "GraphQL", "auth_method": "JWT", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-other-custom.json b/e2e-tests/cookiecutters/tap-other-custom.json index d0aabab09..b78fcd51b 100644 --- a/e2e-tests/cookiecutters/tap-other-custom.json +++ b/e2e-tests/cookiecutters/tap-other-custom.json @@ -9,6 +9,7 @@ "stream_type": "Other", "auth_method": "Custom or N/A", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-api_key-github.json b/e2e-tests/cookiecutters/tap-rest-api_key-github.json index cb162402a..1d474c2fd 100644 --- a/e2e-tests/cookiecutters/tap-rest-api_key-github.json +++ b/e2e-tests/cookiecutters/tap-rest-api_key-github.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "API Key", "include_ci_files": "GitHub", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-basic_auth.json b/e2e-tests/cookiecutters/tap-rest-basic_auth.json index aa91e3c0d..6c9a1a509 100644 --- a/e2e-tests/cookiecutters/tap-rest-basic_auth.json +++ b/e2e-tests/cookiecutters/tap-rest-basic_auth.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "Basic Auth", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-bearer_token.json b/e2e-tests/cookiecutters/tap-rest-bearer_token.json index 274039845..0e032babb 100644 --- 
a/e2e-tests/cookiecutters/tap-rest-bearer_token.json +++ b/e2e-tests/cookiecutters/tap-rest-bearer_token.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "Bearer Token", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-custom.json b/e2e-tests/cookiecutters/tap-rest-custom.json index 67d72dea9..9a75b7b69 100644 --- a/e2e-tests/cookiecutters/tap-rest-custom.json +++ b/e2e-tests/cookiecutters/tap-rest-custom.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "Custom or N/A", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-jwt.json b/e2e-tests/cookiecutters/tap-rest-jwt.json index aa3729388..572129c74 100644 --- a/e2e-tests/cookiecutters/tap-rest-jwt.json +++ b/e2e-tests/cookiecutters/tap-rest-jwt.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "JWT", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-rest-oauth2.json b/e2e-tests/cookiecutters/tap-rest-oauth2.json index 905349aac..97b2ff41b 100644 --- a/e2e-tests/cookiecutters/tap-rest-oauth2.json +++ b/e2e-tests/cookiecutters/tap-rest-oauth2.json @@ -9,6 +9,7 @@ "stream_type": "REST", "auth_method": "OAuth2", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/tap-sql-custom.json b/e2e-tests/cookiecutters/tap-sql-custom.json index 81ac625d0..264d2d500 100644 --- a/e2e-tests/cookiecutters/tap-sql-custom.json +++ b/e2e-tests/cookiecutters/tap-sql-custom.json @@ -9,6 +9,7 @@ "stream_type": "SQL", "auth_method": "Custom or N/A", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": 
"Apache-2.0", "ide": "VSCode", "_template": "../tap-template/", diff --git a/e2e-tests/cookiecutters/target-per_record.json b/e2e-tests/cookiecutters/target-per_record.json index 89f923911..9e55e598d 100644 --- a/e2e-tests/cookiecutters/target-per_record.json +++ b/e2e-tests/cookiecutters/target-per_record.json @@ -8,6 +8,7 @@ "variant": "None (Skip)", "serialization_method": "Per record", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "./sdk/cookiecutter/target-template", diff --git a/e2e-tests/cookiecutters/target-sql.json b/e2e-tests/cookiecutters/target-sql.json index 881b3ebe4..edd037dae 100644 --- a/e2e-tests/cookiecutters/target-sql.json +++ b/e2e-tests/cookiecutters/target-sql.json @@ -8,6 +8,7 @@ "variant": "None (Skip)", "serialization_method": "SQL", "include_ci_files": "None (Skip)", + "faker_extra": false, "license": "Apache-2.0", "ide": "VSCode", "_template": "./sdk/cookiecutter/target-template", diff --git a/noxfile.py b/noxfile.py index f323db08b..875e6670d 100644 --- a/noxfile.py +++ b/noxfile.py @@ -65,7 +65,7 @@ def _clean_py312_deps(session: Session, dependencies: list[str]) -> None: def mypy(session: Session) -> None: """Check types with mypy.""" args = session.posargs or ["singer_sdk"] - session.install(".[s3,testing,parquet]") + session.install(".[faker,parquet,s3,testing]") session.install( "exceptiongroup", "mypy", @@ -87,7 +87,7 @@ def mypy(session: Session) -> None: def tests(session: Session) -> None: """Execute pytest tests and compute coverage.""" _clean_py312_deps(session, test_dependencies) - session.install(".[s3,parquet]") + session.install(".[faker,parquet,s3]") session.install(*test_dependencies) sqlalchemy_version = os.environ.get("SQLALCHEMY_VERSION") @@ -144,7 +144,7 @@ def update_snapshots(session: Session) -> None: args = session.posargs or ["-m", "snapshot"] _clean_py312_deps(session, test_dependencies) - session.install(".") + session.install(".[faker]") 
session.install(*test_dependencies) session.run("pytest", "--snapshot-update", *args) diff --git a/poetry.lock b/poetry.lock index 62d34b9ef..0d923c767 100644 --- a/poetry.lock +++ b/poetry.lock @@ -619,6 +619,21 @@ files = [ [package.extras] test = ["pytest (>=6)"] +[[package]] +name = "faker" +version = "22.5.0" +description = "Faker is a Python package that generates fake data for you." +optional = true +python-versions = ">=3.8" +files = [ + {file = "Faker-22.5.0-py3-none-any.whl", hash = "sha256:9a510a31090cc47a7ef7d95c1da8126a891a9d076c7e26b01ad02e1bcf915c3e"}, + {file = "Faker-22.5.0.tar.gz", hash = "sha256:a4e689e2f4e62474245364bbd82cec045dcbbf85a539ee742a515fb4e93a6dd5"}, +] + +[package.dependencies] +python-dateutil = ">=2.4" +typing-extensions = {version = ">=3.10.0.1", markers = "python_version <= \"3.8\""} + [[package]] name = "filelock" version = "3.12.4" @@ -2612,6 +2627,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] docs = ["furo", "myst-parser", "sphinx", "sphinx-autobuild", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-notfound-page", "sphinx-reredirects"] +faker = ["faker"] parquet = ["numpy", "numpy", "pyarrow"] s3 = ["fs-s3fs"] testing = ["pytest", "pytest-durations"] @@ -2619,4 +2635,4 @@ testing = ["pytest", "pytest-durations"] [metadata] lock-version = "2.0" python-versions = ">=3.8" -content-hash = "1b837f356d2b6ed6e90e55059808bbbf85b398b8bce9275b432e27688156044e" +content-hash = "cf326e02c7b02fbe1d12a80b3d16a4485357799c06585b3a36b025ad02bde08f" diff --git a/pyproject.toml b/pyproject.toml index e72322388..4bafb308d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,6 +90,9 @@ pyarrow = { version = ">=13", optional = true } pytest = {version=">=7.2.1", optional = true} pytest-durations = {version = ">=1.2.0", optional = true} +# installed as optional 'faker' extra +faker = {version = "~=22.5", optional = true} + [tool.poetry.extras] docs = [ "sphinx", @@ -107,6 +110,7 @@ testing = [ 
"pytest-durations" ] parquet = ["numpy", "pyarrow"] +faker = ["faker"] [tool.poetry.group.dev.dependencies] coverage = {extras = ["toml"], version = ">=7.4"} diff --git a/singer_sdk/helpers/capabilities.py b/singer_sdk/helpers/capabilities.py index b18d53748..151cd1c83 100644 --- a/singer_sdk/helpers/capabilities.py +++ b/singer_sdk/helpers/capabilities.py @@ -7,9 +7,12 @@ from warnings import warn from singer_sdk.typing import ( + ArrayType, BooleanType, IntegerType, + NumberType, ObjectType, + OneOf, PropertiesList, Property, StringType, @@ -34,6 +37,33 @@ ObjectType(), description="User-defined config values to be used within map expressions.", ), + Property( + "faker_config", + ObjectType( + Property( + "seed", + OneOf(NumberType, StringType, BooleanType), + description=( + "Value to seed the Faker generator for deterministic output: " + "https://faker.readthedocs.io/en/master/#seeding-the-generator" + ), + ), + Property( + "locale", + OneOf(StringType, ArrayType(StringType)), + description=( + "One or more LCID locale strings to produce localized output for: " + "https://faker.readthedocs.io/en/master/#localization" + ), + ), + ), + description=( + "Config for the [`Faker`](https://faker.readthedocs.io/en/master/) " + "instance variable `fake` used within map expressions. Only applicable if " + "the plugin specifies `faker` as an additional dependency (through the " + "`singer-sdk` `faker` extra or directly)." 
+ ), + ), ).to_dict() FLATTENING_CONFIG = PropertiesList( Property( diff --git a/singer_sdk/mapper.py b/singer_sdk/mapper.py index d3cd9400d..cd5faf93c 100644 --- a/singer_sdk/mapper.py +++ b/singer_sdk/mapper.py @@ -10,6 +10,7 @@ import copy import datetime import hashlib +import importlib.util import logging import typing as t @@ -28,6 +29,8 @@ if t.TYPE_CHECKING: import sys + from faker import Faker + if sys.version_info >= (3, 10): from typing import TypeAlias # noqa: ICN003 else: @@ -231,6 +234,7 @@ def __init__( self, stream_alias: str, map_config: dict, + faker_config: dict, raw_schema: dict, key_properties: t.Sequence[str] | None, map_transform: dict, @@ -241,6 +245,7 @@ def __init__( Args: stream_alias: Stream name. map_config: Stream map configuration. + faker_config: Faker configuration. raw_schema: Original stream's JSON schema. key_properties: Primary key of the source stream. map_transform: Dictionary of transformations to apply to the stream. @@ -254,6 +259,8 @@ def __init__( ) self.map_config = map_config + self.faker_config = faker_config + self._transform_fn: t.Callable[[dict], dict | None] self._filter_fn: t.Callable[[dict], bool] ( @@ -262,6 +269,7 @@ def __init__( self.transformed_schema, ) = self._init_functions_and_schema(stream_map=map_transform) self.expr_evaluator = simpleeval.EvalWithCompoundTypes(functions=self.functions) + self.fake = self._init_faker_instance() def transform(self, record: dict) -> dict | None: """Return a transformed record. 
@@ -324,6 +332,10 @@ def _eval( names["_"] = record # Add a shorthand alias in case of reserved words in names names["record"] = record # ...and a longhand alias names["config"] = self.map_config # Allow map config access within transform + + if self.fake: + names["fake"] = self.fake + if property_name and property_name in record: # Allow access to original property value if applicable names["self"] = record[property_name] @@ -593,6 +605,24 @@ def transform_fn(record: dict) -> dict | None: return filter_fn, transform_fn, transformed_schema + def _init_faker_instance(self) -> Faker | None: + if not importlib.util.find_spec("faker"): + return None + + from faker import Faker + + if self.faker_config: + faker_seed = self.faker_config.get("seed") + faker_locale = self.faker_config.get("locale") + + if faker_seed is not None: + Faker.seed(faker_seed) + + if faker_locale is not None: + return Faker(faker_locale) + + return Faker() + class PluginMapper: """Inline map tranformer.""" @@ -613,6 +643,7 @@ def __init__( """ self.stream_maps: dict[str, list[StreamMap]] = {} self.map_config = plugin_config.get("stream_map_config", {}) + self.faker_config = plugin_config.get("faker_config", {}) self.flattening_options = get_flattening_options(plugin_config) self.default_mapper_type: type[DefaultStreamMap] = SameRecordTransform self.logger = logger @@ -751,6 +782,7 @@ def register_raw_stream_schema( # noqa: PLR0912, C901 stream_alias=stream_alias, map_transform=stream_def, map_config=self.map_config, + faker_config=self.faker_config, raw_schema=schema, key_properties=key_properties, flattening_options=self.flattening_options, diff --git a/tests/core/test_mapper.py b/tests/core/test_mapper.py index 46a3eecf5..2f17684d6 100644 --- a/tests/core/test_mapper.py +++ b/tests/core/test_mapper.py @@ -582,12 +582,11 @@ def discover_streams(self): ) @pytest.mark.snapshot() @pytest.mark.parametrize( - "stream_maps,flatten,flatten_max_depth,snapshot_name", + "stream_maps,config,snapshot_name", 
[ pytest.param( {}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "no_map.jsonl", id="no_map", ), @@ -597,8 +596,7 @@ def discover_streams(self): "email_hash": "md5(email)", }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "keep_all_fields.jsonl", id="keep_all_fields", ), @@ -610,8 +608,7 @@ def discover_streams(self): "__else__": None, }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "only_mapped_fields.jsonl", id="only_mapped_fields", ), @@ -623,8 +620,7 @@ def discover_streams(self): "__else__": "__NULL__", }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "only_mapped_fields_null_string.jsonl", id="only_mapped_fields_null_string", ), @@ -636,57 +632,49 @@ def discover_streams(self): "__else__": None, }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "changed_key_properties.jsonl", id="changed_key_properties", ), pytest.param( {"mystream": None, "sourced_stream_1": {"__source__": "mystream"}}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "sourced_stream_1.jsonl", id="sourced_stream_1", ), pytest.param( {"mystream": "__NULL__", "sourced_stream_1": {"__source__": "mystream"}}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "sourced_stream_1_null_string.jsonl", id="sourced_stream_1_null_string", ), pytest.param( {"sourced_stream_2": {"__source__": "mystream"}, "__else__": None}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "sourced_stream_2.jsonl", id="sourced_stream_2", ), pytest.param( {"mystream": {"__alias__": "aliased_stream"}}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "aliased_stream.jsonl", id="aliased_stream", ), pytest.param( {}, - True, - 0, + {"flattening_enabled": True, "flattening_max_depth": 0}, "flatten_depth_0.jsonl", id="flatten_depth_0", ), pytest.param( {}, - True, - 1, + 
{"flattening_enabled": True, "flattening_max_depth": 1}, "flatten_depth_1.jsonl", id="flatten_depth_1", ), pytest.param( {}, - True, - 10, + {"flattening_enabled": True, "flattening_max_depth": 10}, "flatten_all.jsonl", id="flatten_all", ), @@ -697,8 +685,7 @@ def discover_streams(self): "__key_properties__": ["email_hash"], }, }, - True, - 10, + {"flattening_enabled": True, "flattening_max_depth": 10}, "map_and_flatten.jsonl", id="map_and_flatten", ), @@ -708,15 +695,13 @@ def discover_streams(self): "email": None, }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "drop_property.jsonl", id="drop_property", ), pytest.param( {"mystream": {"email": "__NULL__"}}, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "drop_property_null_string.jsonl", id="drop_property_null_string", ), @@ -727,8 +712,7 @@ def discover_streams(self): "__else__": None, }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "non_pk_passthrough.jsonl", id="non_pk_passthrough", ), @@ -739,29 +723,41 @@ def discover_streams(self): "__else__": None, }, }, - False, - 0, + {"flattening_enabled": False, "flattening_max_depth": 0}, "record_to_column.jsonl", id="record_to_column", ), + pytest.param( + { + "mystream": { + "cc": "fake.credit_card_number()", + "__else__": None, + }, + }, + { + "flattening_enabled": False, + "flattening_max_depth": 0, + "faker_config": { + "locale": "en_US", + "seed": 123456, + }, + }, + "fake_credit_card_number.jsonl", + id="fake_credit_card_number", + ), ], ) def test_mapped_stream( snapshot: Snapshot, snapshot_dir: Path, stream_maps: dict, - flatten: bool, - flatten_max_depth: int | None, + config: dict, snapshot_name: str, ): snapshot.snapshot_dir = snapshot_dir.joinpath("mapped_stream") tap = MappedTap( - config={ - "stream_maps": stream_maps, - "flattening_enabled": flatten, - "flattening_max_depth": flatten_max_depth, - }, + config={"stream_maps": stream_maps, **config}, ) buf = 
io.StringIO() with redirect_stdout(buf): diff --git a/tests/snapshots/mapped_stream/fake_credit_card_number.jsonl b/tests/snapshots/mapped_stream/fake_credit_card_number.jsonl new file mode 100644 index 000000000..6db048524 --- /dev/null +++ b/tests/snapshots/mapped_stream/fake_credit_card_number.jsonl @@ -0,0 +1,6 @@ +{"type":"STATE","value":{}} +{"type":"SCHEMA","stream":"mystream","schema":{"type":"object","properties":{"cc":{"type":["string","null"]}}},"key_properties":[]} +{"type":"RECORD","stream":"mystream","record":{"cc":"4201040137208265027"},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"RECORD","stream":"mystream","record":{"cc":"675987782884"},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"RECORD","stream":"mystream","record":{"cc":"502011811259"},"time_extracted":"2022-01-01T00:00:00+00:00"} +{"type":"STATE","value":{"bookmarks":{"mystream":{}}}}