From 0500bae8e563a6064cd47f29c9f0f7c2cd02f545 Mon Sep 17 00:00:00 2001 From: Fabian Steeg Date: Wed, 13 Mar 2024 16:29:40 +0100 Subject: [PATCH] Support strapi export data in rppd-to-lobid workflows (RPB-59) --- conf/rppd-test-to-lobid.flux | 15 +++++++++++++-- conf/rppd-to-lobid.flux | 18 ++++++++++++++---- transformRppd.sh | 3 +++ 3 files changed, 30 insertions(+), 6 deletions(-) diff --git a/conf/rppd-test-to-lobid.flux b/conf/rppd-test-to-lobid.flux index 17f0a5d9..eabebaf3 100644 --- a/conf/rppd-test-to-lobid.flux +++ b/conf/rppd-test-to-lobid.flux @@ -1,7 +1,18 @@ -FLUX_DIR + "output/test-output-rppd.json" +// Defaults, use Allegro test data: +// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio_Test.txt OUT_FILE=test-output-rppd.json" +// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux" + +// To use Strapi export test data: +// zgrep -a '"type":"api::person.person"' conf/strapi-export-test.tar.gz > conf/output/test-rppd-export.json +// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux IN_FILE=test-rppd-export.json RECORD_PATH=data" + +default IN_FILE = "test-output-rppd.json"; // pass e.g. IN_FILE=test-rppd-export.json +default RECORD_PATH = ""; // pass e.g. 
RECORD_PATH=data + +FLUX_DIR + "output/" + IN_FILE | open-file | as-lines -| decode-json +| decode-json(recordPath=RECORD_PATH) | fix(FLUX_DIR + "rppd-to-lobid.fix") | batch-reset(batchsize="1") | encode-json(prettyPrinting="true") diff --git a/conf/rppd-to-lobid.flux b/conf/rppd-to-lobid.flux index fce6039e..6ece8734 100644 --- a/conf/rppd-to-lobid.flux +++ b/conf/rppd-to-lobid.flux @@ -1,11 +1,21 @@ -default outfile = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix -"conf/output/output-rppd-strapi.ndjson" +// Defaults, use Allegro data: +// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson" +// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux" + +// To use Strapi export data: +// zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.json +// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.json RECORD_PATH=data" + +default IN_FILE = "output-rppd-strapi.ndjson"; // pass e.g. IN_FILE=rppd-export.json +default RECORD_PATH = ""; // pass e.g. 
RECORD_PATH=data +default OUT_FILE = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix +"conf/output/" + IN_FILE | open-file | as-lines -| decode-json +| decode-json(recordPath=RECORD_PATH) | fix(FLUX_DIR + "rppd-to-lobid.fix") | batch-reset(batchsize="1000") | encode-json(prettyPrinting="false") | json-to-elasticsearch-bulk(idkey="id", type="authority", index="gnd-rppd-test") -| write(outfile) +| write(OUT_FILE) ; diff --git a/transformRppd.sh b/transformRppd.sh index 5092f1fd..c0be6604 100644 --- a/transformRppd.sh +++ b/transformRppd.sh @@ -7,5 +7,8 @@ sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT sbt "runMain rpb.ETL conf/rppd-to-gnd-mapping.flux" sbt "runMain rpb.ETL conf/rppd-rppdId-with-label-map.flux" sbt "runMain rpb.ETL conf/rppd-to-lobid.flux" +# Or from Strapi export: +# zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.jsonl +# sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.jsonl RECORD_PATH=data" # Indexing happens in rppd/transformAndIndexRppd.sh (lobid-gnd repo, branch 'rppd'), which calls this script