Skip to content

Commit

Permalink
feat(taps): Generate fake data with stream maps (#2170)
Browse files Browse the repository at this point in the history
* Add configurable `Faker` instance as a stream maps expression property

* Bump `faker` now that the SDK dependency on Python 3.7 has been dropped

* Define `faker` as an extra

* Allow configuration of `Faker` instance with top-level `faker_config` settings

* Install with additional `faker` extra for Nox mypy type checking

* REVERT ME: remove faker dependency

* Revert "REVERT ME: remove faker dependency"

This reverts commit 4e7ea15.

* Prefer checks over exception handling

* Add docs for `fake`

* Add `faker_config` property to stream maps config schema

* Fix out-of-date docs for leveraging stream maps as a developer of an SDK-based plugin

* Fix phrasing

* Add note on the condition under which the `fake` variable is available

* Update tap/target cookiecutter templates with Faker prompt

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Also update mapper cookiecutter template

* Update replay files

* Use shorter name

* use admonition

* Add mapper test case

* Add missing snapshot

* Add missing extras to nox session

* Add example to JSON tab

---------

Co-authored-by: Edgar Ramírez Mondragón <[email protected]>
Co-authored-by: Edgar Ramírez Mondragón <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
4 people authored Jan 31, 2024
1 parent 41ca39e commit 2fbe530
Show file tree
Hide file tree
Showing 27 changed files with 198 additions and 67 deletions.
2 changes: 2 additions & 0 deletions cookiecutter/mapper-template/cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"mapper_id": "mapper-{{ cookiecutter.name.lower() }}",
"library_name": "{{ cookiecutter.mapper_id.replace('-', '_') }}",
"variant": "None (Skip)",
"faker_extra": false,
"include_ci_files": ["GitHub", "None (Skip)"],
"license": ["Apache-2.0"],
"ide": ["VSCode", "None"],
Expand All @@ -14,6 +15,7 @@
"admin_email": "Provide your [bold yellow]email[/]",
"mapper_id": "The ID of the tap, in kebab-case",
"library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.",
"faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?",
"include_ci_files": "Whether to include CI files for a common CI services",
"license": "The license for the project",
"ide": "Add configuration files for your preferred IDE"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ packages = [

[tool.poetry.dependencies]
python = ">=3.8"
singer-sdk = { version="~=0.34.1" }
singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} }
fs-s3fs = { version = "~=1.1.1", optional = true }

[tool.poetry.group.dev.dependencies]
Expand Down
2 changes: 2 additions & 0 deletions cookiecutter/tap-template/cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"JWT",
"Custom or N/A"
],
"faker_extra": false,
"include_ci_files": ["GitHub", "None"],
"license": ["Apache-2.0", "None"],
"ide": ["VSCode", "None"],
Expand All @@ -25,6 +26,7 @@
"library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.",
"stream_type": "The type of stream the source provides",
"auth_method": "The [bold red]authentication[/] method used by the source, for REST and GraphQL sources",
"faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?",
"include_ci_files": "Whether to include CI files for a common CI services",
"license": "The license for the project",
"ide": "Add configuration files for your preferred IDE"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ packages = [
[tool.poetry.dependencies]
python = ">=3.8"
importlib-resources = { version = "==6.1.*", python = "<3.9" }
singer-sdk = { version="~=0.34.1" }
singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} }
fs-s3fs = { version = "~=1.1.1", optional = true }
{%- if cookiecutter.stream_type in ["REST", "GraphQL"] %}
requests = "~=2.31.0"
Expand Down
2 changes: 2 additions & 0 deletions cookiecutter/target-template/cookiecutter.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"library_name": "{{ cookiecutter.target_id.replace('-', '_') }}",
"variant": "None (Skip)",
"serialization_method": ["Per record", "Per batch", "SQL"],
"faker_extra": false,
"include_ci_files": ["GitHub", "None (Skip)"],
"license": ["Apache-2.0"],
"ide": ["VSCode", "None"],
Expand All @@ -16,6 +17,7 @@
"mapper_id": "The ID of the tap, in kebab-case",
"library_name": "The name of the library, in snake_case. This is how the library will be imported in Python.",
"serialization_method": "The serialization method to use for loading data",
"faker_extra": "Add [bold orange1][link=https://faker.readthedocs.io/en/master/]Faker[/link][/] as an extra dependency to support generating fake data in stream maps?",
"include_ci_files": "Whether to include CI files for a common CI services",
"license": "The license for the project",
"ide": "Add configuration files for your preferred IDE"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ packages = [

[tool.poetry.dependencies]
python = ">=3.8"
singer-sdk = { version="~=0.34.1" }
singer-sdk = { version="~=0.34.1"{{ ', extras = ["faker"]' if cookiecutter.faker_extra }} }
fs-s3fs = { version = "~=1.1.1", optional = true }
{%- if cookiecutter.serialization_method != "SQL" %}
requests = "~=2.31.0"
Expand Down
47 changes: 29 additions & 18 deletions docs/stream_maps.md
Original file line number Diff line number Diff line change
Expand Up @@ -80,23 +80,13 @@ These capabilities are all out of scope _by design_:
a transformation tool like [dbt](https://www.getdbt.com), or (b) create a custom mapper
plugin with inline lookup logic.

### A feature for all Singer users, enabled by the SDK
## A feature for all Singer users, enabled by the SDK

The mapping features described here are create for the **_users_** of SDK-based taps and targets.
The mapping features described here are created for the **_users_** of SDK-based taps and targets, which support inline transformations with `stream_maps` and `stream_map_config` out-of-box.

Developers simply enable the feature using the instructions below, and then users can benefit from having inline transformation capabilities out-of-box on their favorite taps and targets.
**Note:** to support non-SDK taps and targets, the standalone inline mapper plugin [`meltano-map-transformer`](https://hub.meltano.com/mappers/meltano-map-transformer/) follows all specifications defined here and can apply mapping transformations between _any_ Singer tap and target, even if they are not built using the SDK.

**Note:** to support non-SDK taps and targets, we are also creating a standalone inline mapper plugin (`meltano-map-transform`), which follows all specifications defined here and can apply mapping transformations between _any_ Singer tap and target, even if they are not built using the SDK.

## Enabling Stream Maps in SDK-Based Plugins

To support inline mapping functions, the developer only needs to declare two plugin settings,
called `stream_maps` and `stream_map_config`, and declare both settings as `object` type. (For example:
`Property("stream_maps, ObjectType())` if using the python helper classes or
`"stream_maps": {"type": "object"}` if using native JSON Schema declarations.)

If the `stream_maps` setting is detected, the following behaviors will be implemented
by the SDK automatically:
The following behaviors are implemented by the SDK automatically:

1. For taps, the SCHEMA and RECORD messages will automatically be transformed,
duplicated, filtered, or aliased, as per the `stream_maps` config settings _after_
Expand All @@ -108,7 +98,7 @@ by the SDK automatically:
setting _prior_ to any Sink processing functions.
- This means that the target developer can assume that all streams and records are
transformed, aliased, filtered, etc. _before_ any custom target code is executed.
3. The upcoming standalone mapper plugin (`meltano-map-transform`) is a hybrid tap/target which
3. The standalone mapper plugin [`meltano-map-transformer`](https://hub.meltano.com/mappers/meltano-map-transformer/) is a hybrid tap/target which
simply receives input from a tap, transforms all stream and schema messages via the
`stream_maps` config option, and then emits the resulting stream(s) to a downstream
target.
Expand All @@ -122,8 +112,7 @@ by the SDK automatically:

The `stream_maps` config expects a mapping of stream names to a structured transform object.

Here is a sample `stream_maps` transformation which removes all references to `email` and
adds `email_domain` and `email_hash` as new properties:
Here is a sample `stream_maps` transformation which obfuscates `phone_number` with a fake value, removes all references to `email` and adds `email_domain` and `email_hash` as new properties:

`meltano.yml` or `config.json`:

Expand All @@ -138,9 +127,18 @@ stream_maps:
email_domain: owner_email.split('@')[-1]
# for uniqueness checks
email_hash: md5(config['hash_seed'] + owner_email)
# generate a fake phone number
phone_number: fake.phone_number()
stream_map_config:
# hash outputs are not able to be replicated without the original seed:
hash_seed: 01AWZh7A6DzGm6iJZZ2T
faker_config:
# set specific seed
seed: 0
# set specific locales
locale:
- en_US
- en_GB
```
````

Expand All @@ -151,11 +149,19 @@ stream_map_config:
"customers": {
"email": null,
"email_domain": "owner_email.split('@')[-1]",
"email_hash": "md5(config['hash_seed'] + owner_email)"
"email_hash": "md5(config['hash_seed'] + owner_email)",
"phone_number": "fake.phone_number()"
}
},
"stream_map_config": {
"hash_seed": "01AWZh7A6DzGm6iJZZ2T"
},
"faker_config": {
"seed": 0,
"locale": [
"en_US",
"en_GB"
]
}
}
```
Expand Down Expand Up @@ -236,6 +242,11 @@ can be referenced directly by mapping expressions.
- `record` - an alias for the record values dictionary in the current stream.
- `_` - same as `record` but shorter to type
- `self` - the existing property value if the property already exists
- `fake` - a [`Faker`](https://faker.readthedocs.io/en/master/) instance, configurable via `faker_config` (see previous example) - see the built-in [standard providers](https://faker.readthedocs.io/en/master/providers.html) for available methods

```{tip}
The `fake` object is only available if the plugin specifies `faker` as an additional dependency (through the `singer-sdk` `faker` extra, or directly).
```

#### Automatic Schema Detection

Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/mapper-base.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"mapper_id": "mapper-base",
"library_name": "mapper_base",
"variant": "None (Skip)",
"faker_extra": false,
"include_ci_files": "None (Skip)",
"license": "Apache-2.0",
"ide": "VSCode",
Expand Down
18 changes: 18 additions & 0 deletions e2e-tests/cookiecutters/tap-faker.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"cookiecutter": {
"source_name": "AutomaticTestTap",
"admin_name": "Automatic Tester",
"admin_email": "auto.tester@example.com",
"tap_id": "tap-faker",
"library_name": "tap_faker",
"variant": "None (Skip)",
"stream_type": "REST",
"auth_method": "Bearer Token",
"include_ci_files": "None (Skip)",
"faker_extra": true,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
"_output_dir": "."
}
}
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-graphql-jwt.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "GraphQL",
"auth_method": "JWT",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-other-custom.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "Other",
"auth_method": "Custom or N/A",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-api_key-github.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "API Key",
"include_ci_files": "GitHub",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-basic_auth.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "Basic Auth",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-bearer_token.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "Bearer Token",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-custom.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "Custom or N/A",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-jwt.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "JWT",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-rest-oauth2.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "REST",
"auth_method": "OAuth2",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/tap-sql-custom.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"stream_type": "SQL",
"auth_method": "Custom or N/A",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "../tap-template/",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/target-per_record.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"variant": "None (Skip)",
"serialization_method": "Per record",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "./sdk/cookiecutter/target-template",
Expand Down
1 change: 1 addition & 0 deletions e2e-tests/cookiecutters/target-sql.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"variant": "None (Skip)",
"serialization_method": "SQL",
"include_ci_files": "None (Skip)",
"faker_extra": false,
"license": "Apache-2.0",
"ide": "VSCode",
"_template": "./sdk/cookiecutter/target-template",
Expand Down
6 changes: 3 additions & 3 deletions noxfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def _clean_py312_deps(session: Session, dependencies: list[str]) -> None:
def mypy(session: Session) -> None:
"""Check types with mypy."""
args = session.posargs or ["singer_sdk"]
session.install(".[s3,testing,parquet]")
session.install(".[faker,parquet,s3,testing]")
session.install(
"exceptiongroup",
"mypy",
Expand All @@ -87,7 +87,7 @@ def mypy(session: Session) -> None:
def tests(session: Session) -> None:
"""Execute pytest tests and compute coverage."""
_clean_py312_deps(session, test_dependencies)
session.install(".[s3,parquet]")
session.install(".[faker,parquet,s3]")
session.install(*test_dependencies)

sqlalchemy_version = os.environ.get("SQLALCHEMY_VERSION")
Expand Down Expand Up @@ -144,7 +144,7 @@ def update_snapshots(session: Session) -> None:
args = session.posargs or ["-m", "snapshot"]

_clean_py312_deps(session, test_dependencies)
session.install(".")
session.install(".[faker]")
session.install(*test_dependencies)
session.run("pytest", "--snapshot-update", *args)

Expand Down
18 changes: 17 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ pyarrow = { version = ">=13", optional = true }
pytest = {version=">=7.2.1", optional = true}
pytest-durations = {version = ">=1.2.0", optional = true}

# installed as optional 'faker' extra
faker = {version = "~=22.5", optional = true}

[tool.poetry.extras]
docs = [
"sphinx",
Expand All @@ -107,6 +110,7 @@ testing = [
"pytest-durations"
]
parquet = ["numpy", "pyarrow"]
faker = ["faker"]

[tool.poetry.group.dev.dependencies]
coverage = {extras = ["toml"], version = ">=7.4"}
Expand Down
Loading

0 comments on commit 2fbe530

Please sign in to comment.