Apply CSV-driven cleanup substitutions to MARC 260$c values.

Summary of the changes below:
  * Makefile: the %.mrcx rule now also depends on refdata/subst-260c.csv and
    runs scripts/substitute-marc.fix as an additional Catmandu fix.
  * scripts/extract-subst-260c.py (new): reads tab-separated rows from stdin,
    skips the header row, and writes fully quoted CSV rows to stdout of the
    form "<recid>/<orig260c>","<from>[-<till>]". Rows whose original value
    differs from the new one only by a trailing ".", "-", "-." or by
    surrounding "[...]" / "[...]." are not written.
  * scripts/substitute-marc.fix (new): builds the key "<001>/<260c>", looks it
    up in refdata/subst-260c.csv and, on a hit, overwrites 260$c.
  * test/15_mrcx.bats: new test asserting that no 260$c subfield in
    slices/suoja-pirtti-00000.mrcx contains "Merkur".

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Leading whitespace (Makefile recipe tabs, 4-space Python indent,
2-space fix/bats indent) was reconstructed to match the hunk line counts;
verify against the repository before applying.

diff --git a/Makefile b/Makefile
index 59a200f..b472c8f 100644
--- a/Makefile
+++ b/Makefile
@@ -55,8 +55,8 @@ refdata/RDAMediaType.nt:
 %-preprocessed.alephseq: %-in.alephseq
 	uniq $< | scripts/filter-duplicates.py | $(UCONV) -x Any-NFC -i | scripts/filter-fennica-repl.py >$@
 
-%.mrcx: %-preprocessed.alephseq refdata/iso639-2-fi.csv
-	$(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML --fix scripts/filter-marc.fix --fix scripts/strip-personal-info.fix --fix scripts/preprocess-marc.fix <$< >$@
+%.mrcx: %-preprocessed.alephseq refdata/iso639-2-fi.csv refdata/subst-260c.csv
+	$(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML --fix scripts/filter-marc.fix --fix scripts/strip-personal-info.fix --fix scripts/preprocess-marc.fix --fix scripts/substitute-marc.fix <$< >$@
 
 %-bf2.rdf: %.mrcx
 	$(XSLTPROC) --stringparam baseuri $(URIBASEFENNICA) $(MARC2BIBFRAME2)/xsl/marc2bibframe2.xsl $^ >$@
diff --git a/scripts/extract-subst-260c.py b/scripts/extract-subst-260c.py
new file mode 100755
index 0000000..9d4486e
--- /dev/null
+++ b/scripts/extract-subst-260c.py
@@ -0,0 +1,44 @@
+#!/usr/bin/env python3
+
+import sys
+import csv
+
+reader = csv.reader(sys.stdin, dialect='excel-tab')
+writer = csv.writer(sys.stdout, quoting=csv.QUOTE_ALL)
+
+for lineno, row in enumerate(reader):
+    if lineno == 0:
+        continue # skip header
+
+    recid = row[0]
+    orig260c = row[1]
+    new260c_from = row[4]
+    new260c_till = row[5]
+
+    key = recid + "/" + orig260c
+    val = new260c_from
+
+    if new260c_till:
+        val += "-" + new260c_till
+
+    # skip trivial cases (already handled by conversion)
+    if orig260c == val:
+        continue
+
+    if orig260c == val + ".":
+        continue
+
+    if orig260c == val + "-":
+        continue
+
+    if orig260c == val + "-.":
+        continue
+
+    if orig260c == "[" + val + "]":
+        continue
+
+    if orig260c == "[" + val + "].":
+        continue
+
+
+    writer.writerow([key, val])
diff --git a/scripts/substitute-marc.fix b/scripts/substitute-marc.fix
new file mode 100644
index 0000000..dec9239
--- /dev/null
+++ b/scripts/substitute-marc.fix
@@ -0,0 +1,11 @@
+# Apply cleanup substitutions from CSV files to MARC records
+
+marc_map('001',recid)
+marc_map('260c',orig260c)
+if exists(orig260c)
+  paste(substval,recid,orig260c,join_char:"/")
+  lookup(substval,'refdata/subst-260c.csv',delete:1)
+  if exists(substval)
+    marc_set('260c',$.substval)
+  end
+end
diff --git a/test/15_mrcx.bats b/test/15_mrcx.bats
index 0ac0d17..8519247 100644
--- a/test/15_mrcx.bats
+++ b/test/15_mrcx.bats
@@ -105,3 +105,9 @@ setup () {
   make slices/sjubroder-00450.mrcx
   xmllint --format slices/sjubroder-00450.mrcx | grep -q 'tag="490"'
 }
+
+@test "MARCXML: cleans up bad 260c values" {
+  make slices/suoja-pirtti-00000.mrcx
+  run bash -c "xmllint --format slices/suoja-pirtti-00000.mrcx | grep -A 3 'tag=.260.' | grep 'code=.c.' | grep 'Merkur'"
+  [ "$status" -ne 0 ]
+}