Skip to content

Commit 1c8f416

Browse files
committed
Add script for specifying designer roles in elib records (and move unspecified to instance)
1 parent aa64202 commit 1c8f416

File tree

2 files changed

+180
-2
lines changed

2 files changed

+180
-2
lines changed

librisxl-tools/scripts/merge-works.sh

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,9 @@ SWEDISH_FICTION=$CLUSTERS_DIR/4-swedish-fiction.tsv
3434
#NO_ANONYMOUS_TRANSLATIONS=$CLUSTERS_DIR/5-no-anonymous-translations.tsv
3535

3636
LANGUAGE_IN_TITLE=$NORMALIZATIONS_DIR/1-titles-with-language
37-
CONTRIBUTION=$NORMALIZATIONS_DIR/2-contribution
38-
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/3-roles-to-instance
37+
# Report directory for the Elib designer-role step, which runs second among the
# normalizations (after language-in-title, before contribution).
ELIB_DESIGNERS=$NORMALIZATIONS_DIR/2-elib-cover-designer
38+
CONTRIBUTION=$NORMALIZATIONS_DIR/3-contribution
39+
ROLES_TO_INSTANCE=$NORMALIZATIONS_DIR/4-roles-to-instance
3940

4041
# Clustering step 1 TODO: run only on recently updated records after first run
4142
echo "Finding new clusters..."
@@ -87,6 +88,12 @@ time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDIS
8788
$ARGS --report $LANGUAGE_IN_TITLE src/main/groovy/datatool/scripts/mergeworks/normalize/language-in-work-title.groovy 2>/dev/null
8889
echo "$(count_lines $LANGUAGE_IN_TITLE/MODIFIED.txt) records affected, report in $LANGUAGE_IN_TITLE"
8990

91+
echo
92+
echo "Specifying designer roles in Elib records..."
93+
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -jar build/libs/whelktool.jar \
94+
$ARGS --report $ELIB_DESIGNERS scripts/cleanups/2023/05/lxl-4183-elib-cover-designer.groovy 2>/dev/null
95+
echo "$(count_lines $ELIB_DESIGNERS/MODIFIED.txt) records affected, report in $ELIB_DESIGNERS"
96+
9097
echo
9198
echo "Normalizing contribution..."
9299
time java -Dxl.secret.properties=$HOME/secret.properties-$ENV -Dclusters=$SWEDISH_FICTION -jar build/libs/whelktool.jar \
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
import groovy.transform.Memoized
2+
3+
import java.util.concurrent.ConcurrentHashMap
4+
5+
// Report files written by the two passes below.
PrintWriter matchedAndSpecified = getReportWriter("matched.tsv")
PrintWriter unmatchedSpecifiedAnyway = getReportWriter("mismatched.tsv")
PrintWriter matchedInOtherWork = getReportWriter("matched-in-other-work.tsv")
PrintWriter notSpecifiedMovedToInstance = getReportWriter("not-specified-moved-to-instance.txt")

// Selection for the first pass: non-deleted Elib bib records that carry a summary
// on either the work or the instance.
def where = """
collection = 'bib'
AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%'
AND (data#>>'{@graph, 1, instanceOf, summary}' is not null OR data#>>'{@graph, 1, summary}' is not null)
AND deleted = false
"""

// Summary label -> relator IRI assigned to names credited with that label.
ROLES = [
        'Formgivare:'       : 'https://id.kb.se/relator/designer',
        'Omslag:'           : 'https://id.kb.se/relator/coverDesigner',
        'Omslagsformgivare:': 'https://id.kb.se/relator/coverDesigner',
]

// The role list of a contribution whose only role is "unspecified contributor".
OTHER = [['@id': 'https://id.kb.se/relator/unspecifiedContributor']]

// State shared between the passes. Both maps start out with the two designer role IRIs as keys;
// the first pass tests contribution roles against knownAgents.keySet().
// NOTE(review): the declared value type is Set<String>, but the passes store lists under
// name/agent keys -- confirm the intended shape.
Map<String, Set<String>> knownNames = new ConcurrentHashMap([
        'https://id.kb.se/relator/designer'     : ConcurrentHashMap.newKeySet(),
        'https://id.kb.se/relator/coverDesigner': ConcurrentHashMap.newKeySet()])
Map<String, Set<String>> knownAgents = new ConcurrentHashMap([
        'https://id.kb.se/relator/designer'     : ConcurrentHashMap.newKeySet(),
        'https://id.kb.se/relator/coverDesigner': ConcurrentHashMap.newKeySet()])

// Ids of records the first pass has fully dealt with; the second pass skips them.
Set<String> handled = ConcurrentHashMap.newKeySet()
30+
31+
// First pass: Elib records with a summary.
// Parses designer credits out of the summary text, uses them to replace the
// "unspecified contributor" role on matching work contributions, and records which
// names/agents have been seen with which roles for use by the second pass.
selectBySqlWhere(where) { bib ->
    def id = bib.doc.shortId
    def instance = bib.graph[1]
    def summary = asList(instance['instanceOf']['summary']) + asList(instance['summary'])

    // name -> list of role IRIs credited to that name in this record's summary
    def nameToRoles = parseRoles(summary.findResults { it['label'] }.join(' '))
    nameToRoles.each { creditedName, creditedRoles ->
        // The accumulator is shared between records, so it must tolerate concurrent add();
        // a plain ArrayList can lose elements or throw when mutated from several threads.
        knownNames.computeIfAbsent(creditedName, f -> Collections.synchronizedList([])).add(creditedRoles)
    }

    List workContribution = instance['instanceOf']['contribution']
    if (!workContribution) {
        return
    }

    // Contributions without an agent are dropped; removeAll reports whether anything was removed.
    def modified = workContribution.removeAll { !it.agent }

    Set existingRoles = workContribution.collect { asList(it.role)*.'@id' }.grep().flatten()

    if (existingRoles.contains('https://id.kb.se/relator/unspecifiedContributor') && nameToRoles) {
        // Exact name match between an unspecified contributor and a summary credit.
        workContribution.each { c ->
            if (asList(c.role) == OTHER) {
                def agentName = name(loadIfLink(c.agent))
                def roles = nameToRoles[agentName]
                if (roles) {
                    c['role'] = roles.collect { ['@id': it] }
                    matchedAndSpecified.println([id, c.agent, roles].join('\t'))
                    nameToRoles.remove(agentName)
                    modified = true
                }
            }
        }

        def other = workContribution.findAll { asList(it.role) == OTHER }

        // Exactly one credit left and exactly one unspecified contributor left:
        // pair them even though the names did not match.
        if (nameToRoles.size() == 1 && other.size() == 1) {
            def c = other[0]
            def remainingName = nameToRoles.keySet()[0]
            def roles = nameToRoles[remainingName]
            c['role'] = roles.collect { ['@id': it] }
            other.clear()
            unmatchedSpecifiedAnyway.println([id, c.agent, remainingName, roles].join('\t'))
            modified = true
        }

        // No unspecified contributors remain, so the second pass can skip this record.
        if (other.isEmpty()) {
            handled.add(id)
        }
    }

    // Remember every agent that carries one of the designer roles
    // (the role IRIs are the initial keys of knownAgents).
    workContribution.each { c ->
        def roles = asList(c.role)*.'@id'
        if (knownAgents.keySet().intersect(roles)) {
            knownAgents.computeIfAbsent(c.agent, f -> Collections.synchronizedList([])).add(roles)
        }
    }

    if (modified) {
        bib.scheduleSave()
    }
}
95+
96+
// Second pass: every non-deleted Elib bib record that the first pass did not mark as handled.
// Tries to specify remaining "unspecified contributor" roles from what was learned about the
// same agent or name in other records; whatever is still unspecified afterwards is moved from
// the work's contribution to the instance's contribution.
selectBySqlWhere("collection = 'bib' AND data#>>'{@graph, 0, identifiedBy}' LIKE '%Elib%' AND deleted = false") { bib ->
    def id = bib.doc.shortId
    if (id in handled) {
        return
    }
    def instance = bib.graph[1]
    List workContribution = instance['instanceOf']['contribution']
    if (!workContribution) {
        return
    }

    workContribution.removeAll { !it.agent }

    workContribution.each { c ->
        if (asList(c.role) == OTHER) {
            // Prefer what is known about this exact agent; fall back to the agent's name.
            def roles = knownAgents[c.agent] ?: knownNames[name(loadIfLink(c.agent))]
            if (roles) {
                def countByRole = roles.countBy { it }.sort { -it.value }
                // Only act when every observation agrees and there are more than two of them.
                if (countByRole.size() == 1) {
                    countByRole.find { it.value > 2 }?.with { entry ->
                        def role = entry.key
                        def count = entry.value
                        // Each observation recorded by the first pass is itself a list of role
                        // IRIs, so emit one {'@id': ...} object per IRI instead of putting the
                        // whole list into a single '@id'.
                        c['role'] = asList(role).collect { ['@id': it] }
                        matchedInOtherWork.println([id, c.agent, role, count].join('\t'))
                        bib.scheduleSave()
                    }
                }
            }
        }
    }

    // Anything still unspecified is moved to the instance.
    workContribution.removeAll { c ->
        if (asList(c.role) == OTHER) {
            instance['contribution'] = asList(instance['contribution']) + c
            notSpecifiedMovedToInstance.println(id)
            bib.scheduleSave()
            return true
        }
        return false
    }
}
137+
138+
/**
 * Extracts designer credits from summary text.
 *
 * For every label in ROLES (e.g. 'Omslag:'), each occurrence of the label followed by
 * text up to the next '[', ',' or '"' is read as a name credited with that label's role.
 *
 * @param summary text to scan
 * @return map from trimmed name to the list of distinct role IRIs found for that name;
 *         empty when no credit is found
 */
private Map parseRoles(String summary) {
    def nameToRoles = [:]
    // Accumulate per name, label by label. Several labels share one role IRI, so an
    // intermediate map keyed by role would let the last label overwrite the earlier ones.
    ROLES.each { label, roleId ->
        summary
                .findAll(/${label}[^\[,"]+/)
                .collect { it.substring(label.size()).trim() }
                .findAll { it } // a label followed only by whitespace names nobody
                .each { n ->
                    def roles = nameToRoles.get(n, [])
                    if (!roles.contains(roleId)) {
                        roles << roleId
                    }
                }
    }
    return nameToRoles
}
155+
156+
/**
 * Display name of an agent.
 *
 * @param agent agent description
 * @return the agent's 'name' when it is set (Groovy truth), otherwise
 *         'givenName' and 'familyName' joined by a single space
 */
private String name(Map agent) {
    if (agent.name) {
        return agent.name
    }
    return "${agent.givenName} ${agent.familyName}"
}
159+
160+
/**
 * Resolves a map that may be a link.
 *
 * @param m a map that either has an '@id' entry or is used as-is
 * @return the result of loadThing for the '@id' when one is present, otherwise m itself
 */
private Map loadIfLink(Map m) {
    def link = m['@id']
    if (link) {
        return loadThing(link)
    }
    return m
}
163+
164+
/**
 * Loads the second '@graph' entry of the record selected by the given id.
 * Results are memoized per id.
 *
 * @param id identifier passed on to selectByIds
 * @return graph[1] of the selected record, or an empty map when the callback is never invoked
 */
@Memoized
private Map loadThing(def id) {
    def loaded = [:]
    selectByIds([id]) { doc -> loaded = doc.graph[1] }
    return loaded
}

0 commit comments

Comments
 (0)