Skip to content

Commit

Permalink
Singer/Meltano: Add example singerfile-to-cratedb
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Dec 8, 2023
1 parent ed998d6 commit 17a8ba3
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 0 deletions.
2 changes: 2 additions & 0 deletions framework/singer-meltano/singerfile-to-cratedb/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
catalog.json
tap_countries.singer
63 changes: 63 additions & 0 deletions framework/singer-meltano/singerfile-to-cratedb/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Meltano Singer File -> CrateDB example

## About

Import data from a file in Singer format (JSONL) into CrateDB, using
[tap-singer-jsonl] and [meltano-target-cratedb].

## Configuration

### tap-singer-jsonl

Within the `extractors` section, see the `config.local.paths` setting of
`tap-singer-jsonl` for how to configure JSONL files in Singer format as
pipeline source(s).

### target-cratedb

Within the `loaders` section, at `target-cratedb`, adjust
`config.sqlalchemy_url` to match your database connectivity settings
as pipeline target.

## Usage

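Acquire an example file in Singer format (JSONL).
```shell
wget --no-clobber https://github.com/MeltanoLabs/target-postgres/raw/v0.0.9/target_postgres/tests/data_files/tap_countries.singer
```

Install dependencies.
```shell
meltano install
```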

Discover data schema.
```shell
meltano invoke tap-singer-jsonl --discover > catalog.json
```

Run the plugin standalone, as a test drive.
```shell
meltano invoke tap-singer-jsonl --catalog catalog.json
```

Invoke data transfer to CrateDB database.
```shell
meltano run tap-singer-jsonl target-cratedb
```

## Screenshot

Enjoy the list of countries.
```shell
crash --command 'SELECT "code", "name", "capital", "emoji", "languages[1]" FROM "melty"."countries" ORDER BY "name" LIMIT 42;'
```

![image](https://github.com/crate-workbench/meltano-target-cratedb/assets/453543/fa7076cc-267e-446c-a4f3-aa1283778ace)


## Development

To link the sandbox to a development installation of [meltano-target-cratedb],
configure the `pip_url` of the component like this:

```yaml
pip_url: --editable=/path/to/sources/meltano-target-cratedb
```

[meltano-target-cratedb]: https://github.com/crate-workbench/meltano-target-cratedb
[tap-singer-jsonl]: https://github.com/kgpayne/tap-singer-jsonl
52 changes: 52 additions & 0 deletions framework/singer-meltano/singerfile-to-cratedb/meltano.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# A Meltano project is just a directory on your filesystem containing text-based files.
# At a minimum, a Meltano project must contain a project file named `meltano.yml`,
# which contains your project configuration, and tells Meltano that a particular
# directory is a Meltano project.
---
version: 1
default_environment: dev
send_anonymous_usage_stats: false
project_id: f14797b9-9d1c-414c-851c-c91e08ddbc2e

environments:
- name: dev
- name: staging
- name: prod

plugins:

  # Configure data source.
  # In Singer jargon, it is an "extractor", wrapped into a "tap".
  extractors:

  - name: tap-singer-jsonl
    variant: kgpayne
    pip_url: git+https://github.com/crate-workbench/tap-singer-jsonl@fix-paths
    config:
      source: local
      add_record_metadata: false
      local:
        # Note: Configure Singer file(s) here.
        paths:
        - "tap_countries.singer"

  # Configure data sinks.
  # In Singer jargon, it is a "loader", wrapped into a "target".
  loaders:

  - name: target-jsonl
    variant: andyh1203
    pip_url: target-jsonl

  - name: target-cratedb
    namespace: cratedb
    variant: cratedb
    # Acquire from PyPI.
    pip_url: meltano-target-cratedb
    # Acquire from GitHub.
    # pip_url: git+https://github.com/crate-workbench/meltano-target-cratedb.git

    # Note: Configure your database server and credentials here.
    config:
      sqlalchemy_url: crate://crate@localhost/
      add_record_metadata: true
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"plugin_type": "extractors",
"name": "tap-singer-jsonl",
"namespace": "tap_singer_jsonl",
"variant": "kgpayne",
"label": "Singer JSONL",
"docs": "https://hub.meltano.com/extractors/tap-singer-jsonl--kgpayne",
"repo": "https://github.com/kgpayne/tap-singer-jsonl",
"pip_url": "tap-singer-jsonl",
"executable": "tap-singer-jsonl",
"description": "Read Singer-formatted JSONL Files",
"logo_url": "https://hub.meltano.com/assets/logos/extractors/singer.png",
"capabilities": [
"discover"
],
"settings_group_validation": [
[
"local.folders"
],
[
"local.paths"
],
[
"source",
"s3.bucket"
],
[
"source",
"s3.paths"
]
],
"settings": [
{
"name": "source",
"kind": "string",
"value": "local",
"label": "Source",
"description": "The source configuration to use when reading `.singer.gz` files. Currently `local` and `s3` are supported."
},
{
"name": "add_record_metadata",
"kind": "boolean",
"value": true,
"label": "Add Record Metadata",
"description": "Whether to inject `_sdc_*` metadata columns."
},
{
"name": "local.folders",
"kind": "array",
"label": "Folders",
"description": "Array of directory paths to scan for `.singer.gz` files."
},
{
"name": "local.recursive",
"kind": "boolean",
"value": false,
"label": "Recursive",
"description": "Whether to scan directories recursively when discovering `.singer.gz` files."
},
{
"name": "local.paths",
"kind": "array",
"label": "Paths",
"description": "Array of file paths to singer-formatted files. **Note:** extension is ignored, and compression is inferred automatically by `smart_open`. Both `local.folders` and `local.paths` can be specified together."
},
{
"name": "s3.bucket",
"kind": "string",
"label": "Bucket",
"description": "S3 bucket name."
},
{
"name": "s3.prefix",
"kind": "string",
"label": "Prefix",
"description": "S3 key prefix. **Note:** key prefixes will be scanned recursively."
},
{
"name": "s3.paths",
"kind": "array",
"label": "Paths",
"description": "S3 file paths to singer-formatted files. **Note:** extension is ignored, and compression is inferred automatically by `smart_open`. Both `s3.prefix` and `s3.paths` can be specified together."
}
]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{
"plugin_type": "loaders",
"name": "target-jsonl",
"namespace": "target_jsonl",
"variant": "andyh1203",
"label": "JSON Lines (JSONL)",
"docs": "https://hub.meltano.com/loaders/target-jsonl--andyh1203",
"repo": "https://github.com/andyh1203/target-jsonl",
"pip_url": "target-jsonl",
"description": "JSONL loader",
"logo_url": "https://hub.meltano.com/assets/logos/loaders/jsonl.png",
"settings": [
{
"name": "destination_path",
"kind": "string",
"value": "output",
"label": "Destination Path",
"description": "Sets the destination path the JSONL files are written to, relative\nto the project root.\n\nThe directory needs to exist already, it will not be created\nautomatically.\n\nTo write JSONL files to the project root, set an empty string (`\"\"`).\n"
},
{
"name": "do_timestamp_file",
"kind": "boolean",
"value": false,
"label": "Include Timestamp in File Names",
"description": "Specifies if the files should get timestamped.\n\nBy default, the resulting file will not have a timestamp in the file name (i.e. `exchange_rate.jsonl`).\n\nIf this option gets set to `true`, the resulting file will have a timestamp associated with it (i.e. `exchange_rate-{timestamp}.jsonl`).\n"
},
{
"name": "custom_name",
"kind": "string",
"label": "Custom File Name Override",
"description": "Specifies a custom name for the filename, instead of the stream name.\n\nThe file name will be `{custom_name}-{timestamp}.jsonl`, if `do_timestamp_file` is `true`.\nOtherwise the file name will be `{custom_name}.jsonl`.\n\nIf custom name is not provided, the stream name will be used.\n"
}
]
}
20 changes: 20 additions & 0 deletions framework/singer-meltano/singerfile-to-cratedb/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
[tool.poe.tasks]

test = [

# Acquire Singer file in JSONL format.
{ cmd = "wget --no-clobber https://github.com/MeltanoLabs/target-postgres/raw/v0.0.9/target_postgres/tests/data_files/tap_countries.singer" },

# Install recipe.
{ cmd = "meltano install" },

# Discover data schema.
{ shell = "meltano invoke tap-singer-jsonl --discover > catalog.json" },

# Run the plugin standalone, as a test drive.
{ cmd = "meltano invoke tap-singer-jsonl --catalog catalog.json" },

# Invoke pipeline, loading data into database, for real.
{ cmd = "meltano run tap-singer-jsonl target-cratedb" },

]

0 comments on commit 17a8ba3

Please sign in to comment.