-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathMakefile
159 lines (108 loc) · 6.14 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# External (non-standard) tools invoked by the recipes below.
# Any of them can be replaced on the command line, e.g. `make RIOT=/opt/jena/bin/riot`.
CATMANDU    := catmandu
XSLTPROC    := xsltproc
RSPARQL     := rsparql
RIOT        := riot
SPARQL      := sparql
UCONV       := uconv
RDF2HDT     := rdf2hdt
HDTSEARCH   := hdtSearch
HDTSPARQL   := hdtsparql.sh

# Location of the marc2bibframe2 checkout (its xsl/ directory is used by the
# %-bf2.rdf rule). Left as a recursively expanded variable so that PATH_PREFIX,
# which is not defined in this file, is resolved at the point of use.
MARC2BIBFRAME2 = $(PATH_PREFIX)../marc2bibframe2

# Other configuration settings
# SPARQL endpoint queried when building the refdata/ extracts.
FINTOSPARQL    := http://api.dev.finto.fi/sparql
# Passed to the BIBFRAME stylesheet as its `baseuri` parameter.
URIBASEFENNICA := http://urn.fi/URN:NBN:fi:bib:me:
# Exported to the Java tools as JVM_ARGS / JAVA_OPTIONS; the quotes are part of the value.
JVMARGS        := "-Xmx4G"
# Pattern rules used internally

# Split input/<name>.alephseq via scripts/split-input.sh (given the output
# prefix split-input/<name> and the dump on stdin), then record the md5sums of
# the resulting split-input/<name>-*-in.alephseq parts in split-input/<name>.md5.
# The checksums are computed from inside split-input/ so the listed file names
# carry no directory prefix; `&&` makes sure md5sum never runs in the wrong
# directory if the cd fails.
split-input/%.md5: input/%.alephseq
	scripts/split-input.sh split-input/$* <$<
	cd split-input && md5sum $*-*-in.alephseq >$(@F)
# Generic checksum rule: <file>.md5 holds the md5sum output for <file>.
%.md5: %
	md5sum $< >$@
# Hand the split-input checksum file and the slices checksum target to
# scripts/update-slices.sh (presumably it refreshes only the slices whose
# checksum changed -- confirm against the script).
slices/%.md5: split-input/%.md5
	scripts/update-slices.sh $< $@
# Derive the 260c substitution table from the gzipped dates CSV.
refdata/subst-260c.csv: refdata/fennica-dates.csv.gz
	zcat $< \
	  | scripts/extract-subst-260c.py >$@
# Reference data fetched from the $(FINTOSPARQL) endpoint: each target is the
# result of running its sparql/extract-*.rq query remotely.

# CSV result set.
refdata/iso639-2-fi.csv: sparql/extract-iso639-2-fi.rq
	$(RSPARQL) --service $(FINTOSPARQL) --query $< --results=CSV >$@

# N-Triples results; refdata/<name>.nt comes from sparql/extract-<name>.rq.
refdata/iso639-1-2-mapping.nt refdata/cn-labels.nt: refdata/%.nt: sparql/extract-%.rq
	$(RSPARQL) --service $(FINTOSPARQL) --query $< --results=NT >$@
# RDA term lists downloaded from rdaregistry.info. The rules have no
# prerequisites, so an existing file is never re-downloaded.

# Carrier and media types are stored exactly as served; the remote file name
# equals the local one, hence $(@F).
refdata/RDACarrierType.nt refdata/RDAMediaType.nt:
	curl -H 'Accept: text/html' -s http://rdaregistry.info/termList/$(@F) >$@

# Content types additionally get the doubled slash in `RDAContentType//`
# collapsed to a single one.
refdata/RDAContentType.nt:
	curl -H 'Accept: text/html' -s http://rdaregistry.info/termList/RDAContentType.nt \
	  | sed -e 's|RDAContentType//|RDAContentType/|g' >$@
# Clean up a raw slice: drop adjacent duplicate lines, run the duplicate
# filter script, normalise to Unicode NFC, then apply the Fennica replacement
# filter script.
%-preprocessed.alephseq: %-in.alephseq
	uniq $< \
	  | scripts/filter-duplicates.py \
	  | $(UCONV) -x Any-NFC -i \
	  | scripts/filter-fennica-repl.py >$@
# Convert a preprocessed Aleph sequential slice to MARCXML with Catmandu,
# applying the four fix scripts in the order given. The two refdata CSV files
# are not named in the recipe; presumably the fix scripts read them -- confirm.
%.mrcx: %-preprocessed.alephseq refdata/iso639-2-fi.csv refdata/subst-260c.csv
	$(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML \
	  --fix scripts/filter-marc.fix \
	  --fix scripts/strip-personal-info.fix \
	  --fix scripts/preprocess-marc.fix \
	  --fix scripts/substitute-marc.fix \
	  <$< >$@
# MARCXML -> BIBFRAME 2 RDF/XML using the marc2bibframe2 stylesheet, with
# $(URIBASEFENNICA) passed as the stylesheet's `baseuri` parameter.
%-bf2.rdf: %.mrcx
	$(XSLTPROC) --stringparam baseuri $(URIBASEFENNICA) $(MARC2BIBFRAME2)/xsl/marc2bibframe2.xsl $< >$@
# Run $(RIOT) in quiet mode on an .rdf file and save its output as the .nt file.
%.nt: %.rdf
	$(RIOT) -q $< >$@
# Rewrite URIs in the BIBFRAME triples and pass the result through the
# bad-N-Triples filter. The filter's stderr goes to a .log file next to the
# target (same name, .nt replaced by .log).
%-rewritten.nt: %-bf2.nt
	scripts/rewrite-uris.py $< \
	  | scripts/filter-bad-ntriples.py >$@ 2>$(basename $@).log
# Run the bf-to-schema.rq SPARQL query over the rewritten triples and pass the
# result through the bad-N-Triples filter. The filter's stdout becomes the
# target; its stderr is kept in the matching .log file. (stdout is redirected
# exactly once; a stray second `>$@` has been dropped.)
%-schema.nt: %-rewritten.nt
	JVM_ARGS=$(JVMARGS) $(SPARQL) --graph $< --query sparql/bf-to-schema.rq --out=NT \
	  | scripts/filter-bad-ntriples.py >$@ 2>$(patsubst %.nt,%.log,$@)
# Run the reconcile.rq query with the slice's schema triples as --graph and
# each of the five reference data sets (prerequisites 2..6, in the order
# listed) as a --namedGraph.
%-reconciled.nt: %-schema.nt \
                 refdata/iso639-1-2-mapping.nt \
                 refdata/RDACarrierType.nt \
                 refdata/RDAContentType.nt \
                 refdata/RDAMediaType.nt \
                 refdata/cn-labels.nt
	JVM_ARGS=$(JVMARGS) $(SPARQL) --graph $< $(foreach g,$(wordlist 2,6,$^),--namedGraph $(g)) --query sparql/reconcile.rq --out=NT >$@
# Per-slice work keys: result of create-work-keys.rq over the rewritten triples.
%-work-keys.nt: %-rewritten.nt
	JVM_ARGS=$(JVMARGS) $(SPARQL) \
	  --data $< \
	  --query sparql/create-work-keys.rq \
	  --out=NT >$@
# From here on prerequisite lists are expanded a second time when a rule is
# considered, so `$$(...)` calls below can use the pattern stem (`$$*`).
.SECONDEXPANSION:

# Combine the work keys of every slice of a data set into one file. The
# prerequisite list is computed by listing the slice input files
# (slices/<stem>-?????-in.alephseq) and renaming each to its -work-keys.nt
# counterpart.
refdata/%-work-keys.nt: $$(shell ls slices/$$*-?????-in.alephseq | sed -e 's/-in.alephseq/-work-keys.nt/')
	$(RIOT) $^ >$@
# Combine the agent keys of every slice of a data set into one file; the
# prerequisites are the -agent-keys.nt counterparts of the listed slice inputs.
refdata/%-agent-keys.nt: $$(shell ls slices/$$*-?????-in.alephseq | sed -e 's/-in.alephseq/-agent-keys.nt/')
	$(RIOT) $^ >$@
# Feed a combined keys file to create-merge-transformations.py on stdin and
# store its stdout as the transformations file.
%-transformations.nt: %-keys.nt
	scripts/create-merge-transformations.py <$< >$@
# Apply the data set's work transformations to one reconciled slice via
# merge.rq. The second prerequisite is found by removing the first
# `-<digits or X>` run from the stem, which is expected to leave the data set
# name.
slices/%-merged.nt: slices/%-reconciled.nt refdata/$$(shell echo $$*|sed -e 's/-[0-9X]\+//')-work-transformations.nt
	$(SPARQL) --data $< --data $(lastword $^) --query sparql/merge.rq --out=NT >$@
# Per-slice agent keys: result of create-agent-keys.rq over the merged slice.
slices/%-agent-keys.nt: slices/%-merged.nt
	JVM_ARGS=$(JVMARGS) $(SPARQL) \
	  --data $< \
	  --query sparql/create-agent-keys.rq \
	  --out=NT >$@
# Second merge pass: apply the data set's agent transformations to a merged
# slice, again via merge.rq. The transformations file is located by removing
# the first `-<digits or X>` run from the stem.
slices/%-merged2.nt: slices/%-merged.nt refdata/$$(shell echo $$*|sed -e 's/-[0-9X]\+//')-agent-transformations.nt
	$(SPARQL) --data $< --data $(lastword $^) --query sparql/merge.rq --out=NT >$@
# One pretty-printed MARCXML file per data set: concatenate all of its
# preprocessed slices and convert them with Catmandu, applying only the
# filter-marc and strip-personal-info fixes.
merged/%.mrcx: $$(shell ls slices/$$*-?????-in.alephseq | sed -e 's/-in.alephseq/-preprocessed.alephseq/')
	cat $^ \
	  | $(CATMANDU) convert MARC --type ALEPHSEQ to MARC --type XML --pretty 1 \
	      --fix scripts/filter-marc.fix \
	      --fix scripts/strip-personal-info.fix >$@
# One N-Triples file per data set: every slice's -merged2.nt output plus
# refdata/fennica-collection.ttl, passed together through $(RIOT).
merged/%-merged.nt: $$(shell ls slices/$$*-?????-in.alephseq | sed -e 's/-in.alephseq/-merged2.nt/') refdata/fennica-collection.ttl
	$(RIOT) $^ >$@
# Build an HDT file from N-Triples, then (re)generate its index for later
# querying: any stale index files next to the target ($@.index*) are removed
# first, and $(HDTSEARCH) is then run once against the new file with `-q 0`.
%.hdt: %.nt
	$(RDF2HDT) $< $@
	rm -f $@.index*
	$(HDTSEARCH) -q 0 $@
# Final output: run the consolidate-works.rq query (its text is passed as a
# single command-line argument) against the data set's merged HDT file.
output/%.nt: merged/%-merged.hdt
	JAVA_OPTIONS=$(JVMARGS) $(HDTSPARQL) $< "$$(cat sparql/consolidate-works.rq)" >$@
# Targets to be run externally
# `all` is the default goal: it depends on the `slice` and `consolidate`
# aggregate targets and has no recipe of its own.
all: slice consolidate
# Everything `clean` removes, plus the split input parts, the slices with
# their checksum files, and all CSV / N-Triples reference data.
realclean: clean
	rm -f split-input/*.alephseq
	rm -f split-input/*.md5
	rm -f slices/*.alephseq
	rm -f slices/*.md5
	rm -f refdata/*.csv
	rm -f refdata/*.nt
# Remove generated intermediate files while keeping the slice inputs, their
# checksums and the downloaded/extracted reference data.
clean:
	rm -f refdata/*-work-keys.nt refdata/*-work-transformations.nt
	rm -f refdata/*-agent-keys.nt refdata/*-agent-transformations.nt
	rm -f slices/*-preprocessed.alephseq
	rm -f slices/*.mrcx
	rm -f slices/*.rdf
	rm -f slices/*.nt slices/*.log
	rm -f merged/*.nt merged/*.mrcx
# Aggregate targets, one per pipeline stage. Each expands, when the Makefile
# is read, to the stage's output for every file that exists at that moment.
# Whole data sets currently present in input/:
input_dumps  := $(wildcard input/*.alephseq)
# Slice input files currently present in slices/:
slice_inputs := $(wildcard slices/*-in.alephseq)

# Stages driven by whole input dumps.
slice:                 $(input_dumps:input/%.alephseq=slices/%.md5)
marcdist:              $(input_dumps:input/%.alephseq=merged/%.mrcx)
work-transformations:  $(input_dumps:input/%.alephseq=refdata/%-work-transformations.nt)
agent-transformations: $(input_dumps:input/%.alephseq=refdata/%-agent-transformations.nt)
merge:                 $(input_dumps:input/%.alephseq=merged/%-merged.nt)
consolidate:           $(input_dumps:input/%.alephseq=output/%.nt)

# Stages driven by individual slices.
preprocess: $(slice_inputs:%-in.alephseq=%-preprocessed.alephseq)
mrcx:       $(slice_inputs:%-in.alephseq=%.mrcx)
rdf:        $(slice_inputs:%-in.alephseq=%-bf2.rdf)
rewrite:    $(slice_inputs:%-in.alephseq=%-rewritten.nt)
work-keys:  $(slice_inputs:%-in.alephseq=%-work-keys.nt)
schema:     $(slice_inputs:%-in.alephseq=%-schema.nt)
reconcile:  $(slice_inputs:%-in.alephseq=%-reconciled.nt)
agent-keys: $(slice_inputs:%-in.alephseq=%-agent-keys.nt)
# Every command-style target is declared phony so that a stray file with the
# same name (e.g. `reconcile` or `agent-keys`) can never make it look up to date.
.PHONY: all realclean clean \
        slice preprocess marcdist mrcx rdf rewrite \
        work-keys work-transformations schema reconcile \
        agent-keys agent-transformations merge consolidate
.DEFAULT_GOAL := all

# Most recipes write their target through `>$@`, so a failing command leaves a
# truncated file behind. Have make delete the target of a failed recipe so the
# next run rebuilds it instead of treating it as up to date.
.DELETE_ON_ERROR:

# retain all intermediate files
.SECONDARY: