Skip to content

Commit

Permalink
Support strapi export data in rppd-to-lobid workflows (RPB-59)
Browse files Browse the repository at this point in the history
  • Loading branch information
fsteeg committed Mar 13, 2024
1 parent e29df3b commit 0500bae
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 6 deletions.
15 changes: 13 additions & 2 deletions conf/rppd-test-to-lobid.flux
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
FLUX_DIR + "output/test-output-rppd.json"
// Defaults, use Allegro test data:
// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio_Test.txt OUT_FILE=test-output-rppd.json"
// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux"

// To use Strapi export test data:
// zgrep -a '"type":"api::person.person"' conf/strapi-export-test.tar.gz > conf/output/test-rppd-export.json
// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux IN_FILE=test-rppd-export.json RECORD_PATH=data"

default IN_FILE = "test-output-rppd.json"; // pass e.g. IN_FILE=test-rppd-export.json
default RECORD_PATH = ""; // pass e.g. RECORD_PATH=data

FLUX_DIR + "output/" + IN_FILE
| open-file
| as-lines
| decode-json
| decode-json(recordPath=RECORD_PATH)
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| batch-reset(batchsize="1")
| encode-json(prettyPrinting="true")
Expand Down
18 changes: 14 additions & 4 deletions conf/rppd-to-lobid.flux
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
default outfile = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix
"conf/output/output-rppd-strapi.ndjson"
// Defaults, use Allegro data:
// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux"

// To use Strapi export data:
// zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.json
// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.json RECORD_PATH=data"

default IN_FILE = "output-rppd-strapi.ndjson"; // pass e.g. IN_FILE=rppd-export.json
default RECORD_PATH = ""; // pass e.g. RECORD_PATH=data
default OUT_FILE = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix
"conf/output/" + IN_FILE
| open-file
| as-lines
| decode-json
| decode-json(recordPath=RECORD_PATH)
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| batch-reset(batchsize="1000")
| encode-json(prettyPrinting="false")
| json-to-elasticsearch-bulk(idkey="id", type="authority", index="gnd-rppd-test")
| write(outfile)
| write(OUT_FILE)
;
3 changes: 3 additions & 0 deletions transformRppd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,8 @@ sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT
sbt "runMain rpb.ETL conf/rppd-to-gnd-mapping.flux"
sbt "runMain rpb.ETL conf/rppd-rppdId-with-label-map.flux"
sbt "runMain rpb.ETL conf/rppd-to-lobid.flux"
# Or from Strapi export:
# zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.jsonl
# sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.jsonl RECORD_PATH=data"

# Indexing happens in rppd/transformAndIndexRppd.sh (lobid-gnd repo, branch 'rppd'), which calls this script

0 comments on commit 0500bae

Please sign in to comment.