From 37172ed613feb59c8b1e8e89baad674d42ee4256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Thu, 11 Jul 2024 17:01:33 +0200 Subject: [PATCH 1/4] Add whelktool script for linking SAB classification --- whelktool/scripts/2024/sab/main.groovy | 158 +++++++++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 whelktool/scripts/2024/sab/main.groovy diff --git a/whelktool/scripts/2024/sab/main.groovy b/whelktool/scripts/2024/sab/main.groovy new file mode 100644 index 0000000000..186e70d1f9 --- /dev/null +++ b/whelktool/scripts/2024/sab/main.groovy @@ -0,0 +1,158 @@ +SAB = "https://id.kb.se/term/kssb" + +SAB_MAP = [:] + +missing = [:] + +boolean interpretClassification(Map thing) { + var modified = false + var isInstance = 'instanceOf' in thing + + for (Map cls : asList(thing.classification)) { + if (asList(cls.inScheme).any { + it[ID] == SAB || it.code?.toLowerCase()?.startsWith("kssb") + }) { + List sabRefs = null + + var clsCode = cls.code + + if (isInstance) { + var mediaSubdiv = clsCode.find(/(\/[A-Z]+)/) + if (mediaSubdiv in SAB_MAP) { + var basecode = clsCode.replace(mediaSubdiv, '') + if (basecode in SAB_MAP) { + sabRefs = [ [(ID): SAB_MAP[basecode]], [(ID): SAB_MAP[mediaSubdiv]] ] + } + } + } + + if (!sabRefs) { + if (clsCode in SAB_MAP) { + sabRefs = [ [(ID): SAB_MAP[clsCode]] ] + } else if (clsCode) { + sabRefs = splitSabCode(clsCode) + } + } + + if (sabRefs) { + cls.clear() + if (sabRefs.size() == 1) { + cls.putAll(sabRefs[0]) + } else { + cls[TYPE] = 'Classification' + cls.code = clsCode + cls.inScheme = [(ID): SAB] + cls.broader = sabRefs + + var missed = sabRefs.findAll { ID !in it && it[TYPE] != 'ShelfLocal' } + if (missed) { + missed.each { + missing.get(it.code, []) << clsCode + } + } + } + modified = true + } else if (clsCode && !clsCode.contains('z ')) { + missing.get(clsCode, []) << '' + } + } + } + + if (isInstance) { + if (ID !in thing.instanceOf) { + modified |= interpretClassification(thing.instanceOf) + } + } + + return modified +} + +List splitSabCode(String code) { + var chunks = parseSabCode(code) + return chunks.findResults { chunk -> + if (chunk.size() == 0) { + return null + } + + if (chunk.indexOf(' ') > -1) { + if (chunk.startsWith('z ')) { + return [(TYPE): 'ShelfLocal', label: chunk.substring(2)] + } + return null + } + + // TODO: Improve? Other parts may be subcomponents of chunks[0]... + if (chunk =~ /^[.:(]/) { + var prefixedCode = chunks[0][0] + chunk + if (prefixedCode in SAB_MAP) { + chunk = prefixedCode + } + } + + if (chunk in SAB_MAP) { + return [(ID): SAB_MAP[chunk]] + } + + //var slug = URLEncoder.encode(chunk) + //return [(ID): "${SAB}/${slug}"] + return [code: chunk] + } +} + +List parseSabCode(String code) { + // TODO: starts with any /[a-z]/? + if (code.startsWith('u')) { + code = code.substring(1) + ',u' + } + + var spaceIdx = code.indexOf('z ') + var rest = [] + if (spaceIdx > -1) { + rest << code.substring(spaceIdx) + code = code.substring(0, spaceIdx) + } + + return code.split(/(?=z\s+.+|\(\w+\)|[,\/=:.-])/) + rest +} + + +selectBySqlWhere(""" + data#>>'{@graph,0,inDataset,0,@id}' IN ('https://id.kb.se/dataset/sab', 'https://id.kb.se/dataset/sab/precoordinated') AND + data#>>'{@graph,1,inScheme,@id}' = 'https://id.kb.se/term/kssb' AND + data#>>'{@graph,1,@type}' != 'Collection' +""") { + def cls = it.graph[1] + + // FIXME: Make *all* sab.ttl codes unique! (Use altLabel for "shortcode"?) + def code = cls.code + if (cls[TYPE].indexOf('Subdivision') > -1) { + def firstIdChar = URLDecoder.decode(cls[ID].substring(SAB.size() + 1))[0] + if (!cls.code.startsWith(firstIdChar)) { + code = firstIdChar + code + } + } + //if (code != cls.code) println code + ' => ' + cls.code + + SAB_MAP[code] = cls[ID] +} +println "Loaded ${SAB_MAP.size()} SAB references" + + +selectBySqlWhere(""" + collection <> 'hold' AND + data#>>'{@graph,1}' LIKE '%kssb%' +""") { data -> +/* +selectByIds(['8rkj0wql14q40gb']) { data -> +*/ + def (record, instance) = data.graph + + if (interpretClassification(instance)) { + data.scheduleSave() + } +} + +missingLog = getReportWriter("sab-missing.txt") +missing.keySet().sort().each { + missingLog.println "${it} ${missing[it].size()} ${missing[it].unique().join(' | ')}" +} From 3f4aead081c9925c65dac436fcad977592b98bc7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Mon, 9 Dec 2024 19:43:27 +0100 Subject: [PATCH 2/4] Type labelled entities in SAB as just Resources --- whelktool/scripts/2024/sab/main.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/whelktool/scripts/2024/sab/main.groovy b/whelktool/scripts/2024/sab/main.groovy index 186e70d1f9..4f410adbc8 100644 --- a/whelktool/scripts/2024/sab/main.groovy +++ b/whelktool/scripts/2024/sab/main.groovy @@ -44,7 +44,7 @@ boolean interpretClassification(Map thing) { cls.inScheme = [(ID): SAB] cls.broader = sabRefs - var missed = sabRefs.findAll { ID !in it && it[TYPE] != 'ShelfLocal' } + var missed = sabRefs.findAll { ID !in it && it[TYPE] != 'Resource' } if (missed) { missed.each { missing.get(it.code, []) << clsCode @@ -76,7 +76,7 @@ List splitSabCode(String code) { if (chunk.indexOf(' ') > -1) { if (chunk.startsWith('z ')) { - return [(TYPE): 'ShelfLocal', label: chunk.substring(2)] + return [(TYPE): 'Resource', label: chunk.substring(2)] } return null } From d2c188cb4adaf8b63cb8a5d22ebddbe9f9baec24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Tue, 17 Dec 2024 18:50:55 +0100 Subject: [PATCH 3/4] Fix handling of missing or multiple SAB codes --- whelktool/scripts/2024/sab/main.groovy | 104 ++++++++++++++++--------- 1 file changed, 68 insertions(+), 36 deletions(-) diff --git a/whelktool/scripts/2024/sab/main.groovy b/whelktool/scripts/2024/sab/main.groovy index 4f410adbc8..56158dd02a 100644 --- a/whelktool/scripts/2024/sab/main.groovy +++ b/whelktool/scripts/2024/sab/main.groovy @@ -8,52 +8,37 @@ boolean interpretClassification(Map thing) { var modified = false var isInstance = 'instanceOf' in thing + List additionalCodes = [] + for (Map cls : asList(thing.classification)) { if (asList(cls.inScheme).any { it[ID] == SAB || it.code?.toLowerCase()?.startsWith("kssb") }) { - List sabRefs = null - - var clsCode = cls.code + def clsCode = cls.code - if (isInstance) { - var mediaSubdiv = clsCode.find(/(\/[A-Z]+)/) - if (mediaSubdiv in SAB_MAP) { - var basecode = clsCode.replace(mediaSubdiv, '') - if (basecode in SAB_MAP) { - sabRefs = [ [(ID): SAB_MAP[basecode]], [(ID): SAB_MAP[mediaSubdiv]] ] - } - } + if (clsCode instanceof List && clsCode.size() > 0) { + additionalCodes = clsCode[1..-1].findAll { it instanceof String } + clsCode = clsCode[0] } - if (!sabRefs) { - if (clsCode in SAB_MAP) { - sabRefs = [ [(ID): SAB_MAP[clsCode]] ] - } else if (clsCode) { - sabRefs = splitSabCode(clsCode) - } + if (clsCode !instanceof String) { + continue } - if (sabRefs) { + var newCls = getClassification(clsCode, isInstance) + if (newCls != null) { cls.clear() - if (sabRefs.size() == 1) { - cls.putAll(sabRefs[0]) - } else { - cls[TYPE] = 'Classification' - cls.code = clsCode - cls.inScheme = [(ID): SAB] - cls.broader = sabRefs - - var missed = sabRefs.findAll { ID !in it && it[TYPE] != 'Resource' } - if (missed) { - missed.each { - missing.get(it.code, []) << clsCode - } - } - } + cls.putAll(newCls) modified = true - } else if (clsCode && !clsCode.contains('z ')) { - missing.get(clsCode, []) << '' + } + } + } + + if (thing.classification instanceof List && additionalCodes) { + additionalCodes.each { + var newCls = getClassification(it, isInstance) + if (newCls != null) { + thing.classification << newCls } } } @@ -67,6 +52,53 @@ boolean interpretClassification(Map thing) { return modified } +Map getClassification(String clsCode, isInstance=false) { + List sabRefs = null + + if (isInstance) { + var mediaSubdiv = clsCode.find(/(\/[A-Z]+)/) + if (mediaSubdiv in SAB_MAP) { + var basecode = clsCode.replace(mediaSubdiv, '') + if (basecode in SAB_MAP) { + sabRefs = [ [(ID): SAB_MAP[basecode]], [(ID): SAB_MAP[mediaSubdiv]] ] + } + } + } + + if (!sabRefs) { + if (clsCode in SAB_MAP) { + sabRefs = [ [(ID): SAB_MAP[clsCode]] ] + } else if (clsCode) { + sabRefs = splitSabCode(clsCode) + } + } + + if (sabRefs) { + var cls = [:] + if (sabRefs.size() == 1) { + cls.putAll(sabRefs[0]) + } else { + cls[TYPE] = 'Classification' + cls.code = clsCode + cls.inScheme = [(ID): SAB] + cls.broader = sabRefs + + var missed = sabRefs.findAll { ID !in it && it[TYPE] != 'Resource' } + if (missed) { + missed.each { + missing.get(it.code, []) << clsCode + } + } + } + + return cls + } else if (!clsCode.contains('z ')) { + missing.get(clsCode, []) << '' + } + + return null +} + List splitSabCode(String code) { var chunks = parseSabCode(code) return chunks.findResults { chunk -> @@ -143,7 +175,7 @@ selectBySqlWhere(""" data#>>'{@graph,1}' LIKE '%kssb%' """) { data -> /* -selectByIds(['8rkj0wql14q40gb']) { data -> +selectByIds(['8rkj0wql14q40gb', '2kc9d80d2kl6v14']) { data -> */ def (record, instance) = data.graph From 7dc6e7487790ac0e3a256c7b321ce25a5e4f91c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20Lindstr=C3=B6m?= Date: Wed, 18 Dec 2024 13:53:23 +0100 Subject: [PATCH 4/4] Fix handling of SAB code as list with one item --- whelktool/scripts/2024/sab/main.groovy | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/whelktool/scripts/2024/sab/main.groovy b/whelktool/scripts/2024/sab/main.groovy index 56158dd02a..59a5432b21 100644 --- a/whelktool/scripts/2024/sab/main.groovy +++ b/whelktool/scripts/2024/sab/main.groovy @@ -17,7 +17,9 @@ boolean interpretClassification(Map thing) { def clsCode = cls.code if (clsCode instanceof List && clsCode.size() > 0) { - additionalCodes = clsCode[1..-1].findAll { it instanceof String } + if (clsCode.size() > 1) { + additionalCodes += clsCode[1..-1].findAll { it instanceof String } + } clsCode = clsCode[0] } @@ -175,7 +177,7 @@ selectBySqlWhere(""" data#>>'{@graph,1}' LIKE '%kssb%' """) { data -> /* -selectByIds(['8rkj0wql14q40gb', '2kc9d80d2kl6v14']) { data -> +selectByIds(['8rkj0wql14q40gb', '2kc9d80d2kl6v14', 'vc5xkl562s3bj8w']) { data -> */ def (record, instance) = data.graph