Skip to content

Commit

Permalink
Merge branch 'rpb-59' of https://github.com/hbz/rpb into main
Browse files Browse the repository at this point in the history
  • Loading branch information
fsteeg committed Mar 20, 2024
2 parents a29ab60 + b7ed84a commit 099de52
Show file tree
Hide file tree
Showing 11 changed files with 186 additions and 61 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,29 @@ sbt "runMain rpb.ETL conf/rpb-test-titel-to-lobid.flux"

This writes individual `.json` files for each record in the input data to `output/`.

### Export strapi data

```bash
sbt "runMain rpb.ETL conf/test-export-strapi-to-lobid.flux"
```

This writes individual `.json` files for Strapi records to `output/`.

### Compare export data

```bash
sbt "runMain rpb.ETL conf/test-export-compare-strapi.flux PICK=all_equal('f36_','u') PATH=articles"
sbt "runMain rpb.ETL conf/test-export-compare-strapi.flux PICK=all_equal('f36_','s') PATH=independent-works"
sbt "runMain rpb.ETL conf/test-export-compare-strapi.flux PICK=all_equal('f36_','sbd') PATH=independent-works"
sbt "runMain rpb.ETL conf/test-export-compare-strapi.flux PICK=all_equal('f36t','MultiVolumeBook') PATH=independent-works"
```

This selects parts of the test data and writes two files:

1) for each test record, get the data from the Strapi HTTP API, convert the result to the lobid format, write to `test-lobid-output-from-strapi.json`
2) convert each record directly to lobid, write to `test-lobid-output-from-file.json`

We can then compare the two files (e.g. in VS Code: Select for Compare, Format Document) to see differences. Since fields that are not defined in the Strapi content types are omitted upon import, missing data here points to missing fields in the Strapi content types.

### Validate output

Expand Down
Empty file added conf/output/test-output-85.json
Empty file.
98 changes: 49 additions & 49 deletions conf/output/test-output-rppd.json

Large diffs are not rendered by default.

15 changes: 13 additions & 2 deletions conf/rppd-test-to-lobid.flux
Original file line number Diff line number Diff line change
@@ -1,7 +1,18 @@
FLUX_DIR + "output/test-output-rppd.json"
// Defaults, use Allegro test data:
// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio_Test.txt OUT_FILE=test-output-rppd.json"
// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux"

// To use Strapi export test data:
// zgrep -a '"type":"api::person.person"' conf/strapi-export-test.tar.gz > conf/output/test-rppd-export.json
// sbt "runMain rpb.ETL conf/rppd-test-to-lobid.flux IN_FILE=test-rppd-export.json RECORD_PATH=data"

default IN_FILE = "test-output-rppd.json"; // pass e.g. IN_FILE=test-rppd-export.json
default RECORD_PATH = ""; // pass e.g. RECORD_PATH=data

FLUX_DIR + "output/" + IN_FILE
| open-file
| as-lines
| decode-json
| decode-json(recordPath=RECORD_PATH)
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| batch-reset(batchsize="1")
| encode-json(prettyPrinting="true")
Expand Down
7 changes: 2 additions & 5 deletions conf/rppd-to-lobid.fix
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ do once("map")
end

# Kommentar Doku: #14 "x" eintragen, falls der Datensatz nicht im Webopac erscheinen soll, z.B. noch nicht aufgearbeitete ps-Sätze
if exists("doNotIndex")
if all_equal("doNotIndex", "true")
reject()
end

Expand Down Expand Up @@ -318,10 +318,7 @@ move_field("_temp", "gndSubjectCategory[]")
#1ny (Datum der letzten inhaltlichen Änderung) -> describedBy.dateModified
# Kommentar Doku: JJJJMMTT, z.B. 20120928 für 28.09.2012

if any_match("dateModified", "(\\d{4})(\\d{2})(\\d{2})")
replace_all("dateModified", "(\\d{4})(\\d{2})(\\d{2})", "$1-$2-$3")
copy_field("dateModified", "describedBy.dateModified")
end
copy_field("dateModified", "describedBy.dateModified")

# -------
#1z1 (1. biogr. Anmerkung) -> biographicalOrHistoricalInformation
Expand Down
18 changes: 14 additions & 4 deletions conf/rppd-to-lobid.flux
Original file line number Diff line number Diff line change
@@ -1,11 +1,21 @@
default outfile = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix
"conf/output/output-rppd-strapi.ndjson"
// Defaults, use Allegro data:
// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux"

// To use Strapi export data:
// zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.json
// sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.json RECORD_PATH=data"

default IN_FILE = "output-rppd-strapi.ndjson"; // pass e.g. IN_FILE=rppd-export.json
default RECORD_PATH = ""; // pass e.g. RECORD_PATH=data
default OUT_FILE = "conf/output/bulk/rppd/bulk-rppd-${i}.jsonl"; // lobid-gnd expects *.jsonl suffix
"conf/output/" + IN_FILE
| open-file
| as-lines
| decode-json
| decode-json(recordPath=RECORD_PATH)
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| batch-reset(batchsize="1000")
| encode-json(prettyPrinting="false")
| json-to-elasticsearch-bulk(idkey="id", type="authority", index="gnd-rppd-test")
| write(outfile)
| write(OUT_FILE)
;
3 changes: 3 additions & 0 deletions conf/rppd-to-strapi.fix
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ call_macro("add_bio", field: "f1ä?")
call_macro("add_bio", field: "f1ö?")
call_macro("add_bio", field: "f1ß?")

# Allegro: "20240312" -> 2024-03-12
replace_all("dateModified", "(\\d{4})(\\d{2})(\\d{2})", "$1-$2-$3")

# gndIdentifier ist 'required' und 'unique'
if all_match("gndIdentifier", "Familienmitglied|Keine GND-Ansetzung")
paste("gndIdentifier", "~Keine GND-Ansetzung für", "rppdId")
Expand Down
32 changes: 32 additions & 0 deletions conf/test-export-compare-rppd.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Compare lobid output for the RPPD test records from two sources, so that
// differences between the Strapi round trip and the direct conversion show up.
//
// Pipeline 1: get the RPPD test data; for each record,
// fetch the entry from the Strapi persons API, convert that to lobid, write.

// Usage (the first command creates the input file read by both pipelines):
// sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio_Test.txt OUT_FILE=test-output-rppd.json"
// sbt -mem 2048 "runMain rpb.ETL conf/test-export-compare-rppd.flux"
FLUX_DIR + "output/test-output-rppd.json"
| open-file
| as-lines
| decode-json
// Inline fix: turn each record's rppdId into a Strapi query URL that filters
// persons by that rppdId, and keep only that field.
| fix("
prepend(rppdId, 'https://rpb-cms-test.lobid.org/api/persons?populate=*&filters[rppdId][$eq]=')
retain(rppdId)
")
// NOTE(review): presumably passes the remaining field value (the URL) on as a
// plain string for open-http -- confirm against the Metafacture docs.
| literal-to-object
| log-object("Strapi URL: ")
| open-http
| as-records
// Use the `attributes` of each entry in the response's `data` array as records.
| decode-json(recordPath="data.[*].attributes")
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| encode-json
| write(FLUX_DIR + "output/test-rppd-output-from-strapi.json")
;

// Pipeline 2: to compare, convert the same test data directly to lobid
// (same rppd-to-lobid.fix, no Strapi request), write.
FLUX_DIR + "output/test-output-rppd.json"
| open-file
| as-lines
| decode-json
| fix(FLUX_DIR + "rppd-to-lobid.fix")
| encode-json
| write(FLUX_DIR + "output/test-rppd-output-from-file.json")
;
37 changes: 37 additions & 0 deletions conf/test-export-compare-strapi.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Compare lobid output for selected RPB test records from two sources.
//
// This script defines no defaults for PICK and PATH; both are used below and
// so have to be passed on the command line:
//   PICK: a Fix condition selecting the records, e.g. PICK=all_equal('f36_','u')
//   PATH: the Strapi API collection to query, e.g. PATH=articles
// Example:
// sbt "runMain rpb.ETL conf/test-export-compare-strapi.flux PICK=all_equal('f36_','u') PATH=articles"
//
// Pipeline 1: get test data for the specified type; for each record,
// fetch the entry from Strapi, convert that to lobid, write.
FLUX_DIR + "output/test-output-strapi.json"
| open-file
| as-lines
| decode-json
// Inline fix: drop records not matching PICK, then turn each record's f00_
// into a Strapi query URL that filters PATH by that f00_, and keep only that field.
| fix("
unless " + PICK + "
reject()
end
prepend(f00_, 'https://rpb-cms-test.lobid.org/api/" + PATH + "?populate=*&filters[f00_][$eq]=')
retain(f00_)
")
// NOTE(review): presumably passes the remaining field value (the URL) on as a
// plain string for open-http -- confirm against the Metafacture docs.
| literal-to-object
| log-object("Strapi URL: ")
| open-http
| as-records
// Use the `attributes` of each entry in the response's `data` array as records.
| decode-json(recordPath="data.[*].attributes")
| fix(FLUX_DIR + "rpb-titel-to-lobid.fix")
| encode-json
| write(FLUX_DIR + "output/test-lobid-output-from-strapi.json")
;

// Pipeline 2: to compare, convert the same selection of test data directly
// to lobid (same PICK filter and rpb-titel-to-lobid.fix, no Strapi request), write.
FLUX_DIR + "output/test-output-strapi.json"
| open-file
| as-lines
| decode-json
| fix("
unless " + PICK + "
reject()
end
")
| fix(FLUX_DIR + "rpb-titel-to-lobid.fix")
| encode-json
| write(FLUX_DIR + "output/test-lobid-output-from-file.json")
;
9 changes: 9 additions & 0 deletions conf/test-export-strapi-to-lobid.flux
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Export test: request articles from the Strapi test API (pageSize=5),
// convert each entry to lobid and write the results to output/.
// sbt "runMain rpb.ETL conf/test-export-strapi-to-lobid.flux"
"https://rpb-cms-test.lobid.org/api/articles?populate=*&pagination[pageSize]=5"
| open-http
| as-records
// Use the `attributes` of each entry in the response's `data` array as records.
| decode-json(recordPath="data.[*].attributes")
| fix(FLUX_DIR + "rpb-titel-to-lobid.fix")
// NOTE(review): batchsize="1" together with ${i} in the output file name
// presumably yields one numbered .json file per record -- confirm.
| batch-reset(batchsize="1")
| encode-json(prettyPrinting="true")
| write(FLUX_DIR + "output/test-strapi-to-lobid-output-${i}.json")
;
5 changes: 4 additions & 1 deletion transformRppd.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@ set -u

bash transformBeacons.sh
rm conf/output/bulk/rppd/*
sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
sbt "runMain rpb.ETL conf/rppd-to-gnd-mapping.flux"
sbt "runMain rpb.ETL conf/rppd-rppdId-with-label-map.flux"
sbt "runMain rpb.ETL conf/rppd-to-strapi.flux IN_FILE=RPB-Export_HBZ_Bio.txt OUT_FILE=output-rppd-strapi.ndjson"
sbt "runMain rpb.ETL conf/rppd-to-lobid.flux"
# Or from Strapi export:
# zgrep -a '"type":"api::person.person"' conf/strapi-export.tar.gz > conf/output/rppd-export.jsonl
# sbt "runMain rpb.ETL conf/rppd-to-lobid.flux IN_FILE=rppd-export.jsonl RECORD_PATH=data"

# Indexing happens in rppd/transformAndIndexRppd.sh (lobid-gnd repo, branch 'rppd'), which calls this script

0 comments on commit 099de52

Please sign in to comment.