Skip to content

Commit

Permalink
Update duplicate instance query
Browse files Browse the repository at this point in the history
  • Loading branch information
wwelling committed Sep 19, 2024
1 parent 5a64008 commit 1fa948d
Showing 1 changed file with 1 addition and 1 deletion.
2 changes: 1 addition & 1 deletion duplicate-instance-report/nodes/query.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,6 @@
"outputPath": "/mnt/workflows/${tenantId}/duplicate-instance-report/instance-report-report-${timestamp}.csv",
"resultType": "CSV",
"includeHeader": true,
"query": "WITH oclc_from_srs_marctab AS (SELECT instance_id::text, LTRIM(REGEXP_REPLACE(content, '[^\\d]', '', 'g'), '0') AS \"content\" FROM public.srs_marctab WHERE field = '035' AND ord = 1 AND (sf = 'a' OR sf = 'z') AND \"content\" LIKE '(OCoLC)%'), isbn_from_instance_identifiers AS (SELECT instance_id, SUBSTRING(identifier FROM '^[^ ]*') AS \"content\" FROM folio_reporting.instance_identifiers WHERE identifier_type_name = 'ISBN'), issn_from_srs_marctab AS (SELECT instance_id::text, \"content\" FROM public.srs_marctab WHERE field = '022' AND ord = 1 AND sf = 'a'), lccn_from_instance_identifiers AS (SELECT instance_id, identifier AS \"content\" FROM folio_reporting.instance_identifiers WHERE identifier_type_name = 'LCCN'), call_number_from_holdings_ext AS (SELECT instance_id, TRIM(COALESCE(call_number_prefix, '') || ' ' || COALESCE(call_number, '') || ' ' || COALESCE(call_number_suffix, '')) AS \"content\" FROM folio_reporting.holdings_ext), instance_hrid_from_instance_ext AS (SELECT instance_id, instance_hrid AS \"content\" FROM folio_reporting.instance_ext), identified_instance AS (SELECT ie.instance_id, MAX(CASE WHEN oclcsm.content IS NOT NULL THEN oclcsm.content ELSE NULL END) AS OCLC, MAX(CASE WHEN isbnii.content IS NOT NULL THEN LEFT(SUBSTRING(isbnii.content FROM GREATEST(LENGTH(isbnii.content) - 9, 1)), 9) ELSE NULL END) AS ISBN, MAX(CASE WHEN issnsm.content IS NOT NULL THEN issnsm.content ELSE NULL END) AS ISSN, MAX(CASE WHEN lccnii.content IS NOT NULL THEN lccnii.content ELSE NULL END) AS LCCN, MAX(CASE WHEN cnhe.content IS NOT NULL THEN cnhe.content ELSE NULL END) AS CALL_NUMBER, MAX(CASE WHEN ihie.content IS NOT NULL THEN ihie.content ELSE NULL END) AS HRID FROM folio_reporting.instance_ext ie LEFT JOIN oclc_from_srs_marctab oclcsm ON ie.instance_id = oclcsm.instance_id LEFT JOIN isbn_from_instance_identifiers isbnii ON ie.instance_id = isbnii.instance_id LEFT JOIN issn_from_srs_marctab issnsm ON ie.instance_id = issnsm.instance_id LEFT JOIN lccn_from_instance_identifiers lccnii ON ie.instance_id = lccnii.instance_id LEFT JOIN call_number_from_holdings_ext cnhe ON ie.instance_id = cnhe.instance_id LEFT JOIN instance_hrid_from_instance_ext ihie ON ie.instance_id = ihie.instance_id GROUP BY ie.instance_id) SELECT ii1.HRID AS HRID, ii2.HRID AS HRID2, CASE WHEN ii1.OCLC = ii2.OCLC IS NOT NULL THEN TRUE ELSE FALSE END AS OCLC, CASE WHEN ii1.ISBN = ii2.ISBN IS NOT NULL THEN TRUE ELSE FALSE END AS ISBN, CASE WHEN ii1.ISSN = ii2.ISSN IS NOT NULL THEN TRUE ELSE FALSE END AS ISSN, CASE WHEN ii1.CALL_NUMBER = ii2.CALL_NUMBER IS NOT NULL THEN TRUE ELSE FALSE END AS CALL_NUMBER, CASE WHEN ii1.LCCN = ii2.LCCN IS NOT NULL THEN TRUE ELSE FALSE END AS LCCN FROM identified_instance ii1 INNER JOIN identified_instance ii2 ON ii1.OCLC = ii2.OCLC OR ii1.ISBN = ii2.ISBN OR ii1.ISSN = ii2.ISSN OR ii1.LCCN = ii2.LCCN OR ii1.CALL_NUMBER = ii2.CALL_NUMBER WHERE ii1.instance_id <> ii2.instance_id LIMIT 1000",
"query": "WITH oclc_from_srs_marctab AS (SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^\\d]', '', 'g'), '0') AS oclc FROM public.srs_marctab WHERE field = '035' AND ord = 1 AND sf IN ('a', 'z') AND content LIKE '(OCoLC)%'), oclc_duplicates AS (SELECT oclc FROM oclc_from_srs_marctab WHERE oclc <> '' GROUP BY oclc HAVING COUNT(*) > 1), oclc_data AS (SELECT s.instance_id::text, s.oclc FROM oclc_from_srs_marctab s JOIN oclc_duplicates d ON s.oclc = d.oclc), isbn_from_instance_identifiers AS (SELECT instance_id, NULLIF(SUBSTRING(LEFT(identifier, POSITION(' ' IN identifier) - 1) FROM GREATEST(LENGTH(LEFT(identifier, POSITION(' ' IN identifier) - 1)) - 9, 1) FOR 9), '') AS isbn FROM folio_reporting.instance_identifiers WHERE identifier_type_name = 'ISBN'), isbn_duplicates AS (SELECT isbn FROM isbn_from_instance_identifiers WHERE isbn <> '' GROUP BY isbn HAVING COUNT(*) > 1), isbn_data AS (SELECT s.instance_id, s.isbn FROM isbn_from_instance_identifiers s JOIN isbn_duplicates d ON s.isbn = d.isbn), issn_from_srs_marctab AS (SELECT instance_id, content AS issn FROM public.srs_marctab WHERE field = '022' AND ord = 1 AND sf = 'a'), issn_duplicates AS (SELECT issn FROM issn_from_srs_marctab WHERE issn <> '' GROUP BY issn HAVING COUNT(*) > 1), issn_data AS (SELECT s.instance_id::text, s.issn FROM issn_from_srs_marctab s JOIN issn_duplicates d ON s.issn = d.issn), lccn_from_instance_identifiers AS (SELECT instance_id, identifier AS lccn FROM folio_reporting.instance_identifiers WHERE identifier_type_name = 'LCCN'), lccn_duplicates AS (SELECT lccn FROM lccn_from_instance_identifiers WHERE lccn <> '' GROUP BY lccn HAVING COUNT(*) > 1), lccn_data AS (SELECT s.instance_id, s.lccn FROM lccn_from_instance_identifiers s JOIN lccn_duplicates d ON s.lccn = d.lccn), call_number_from_holdings_ext AS (SELECT instance_id, TRIM(CONCAT_WS(' ', COALESCE(NULLIF(TRIM(call_number_prefix), ''), ''), COALESCE(NULLIF(TRIM(call_number), ''), ''), COALESCE(NULLIF(TRIM(call_number_suffix), ''), ''))) AS call_number FROM folio_reporting.holdings_ext), call_number_duplicates AS (SELECT call_number FROM call_number_from_holdings_ext WHERE TRIM(call_number) <> '' GROUP BY call_number HAVING COUNT(*) > 1), call_number_data AS (SELECT s.instance_id, s.call_number FROM call_number_from_holdings_ext s JOIN call_number_duplicates d ON s.call_number = d.call_number), duplicate_instances AS (SELECT ie.instance_id, ie.instance_hrid, oclc.oclc, isbn.isbn, issn.issn, lccn.lccn, cn.call_number FROM folio_reporting.instance_ext ie LEFT JOIN oclc_data oclc ON ie.instance_id = oclc.instance_id LEFT JOIN isbn_data isbn ON ie.instance_id = isbn.instance_id LEFT JOIN issn_data issn ON ie.instance_id = issn.instance_id LEFT JOIN lccn_data lccn ON ie.instance_id = lccn.instance_id LEFT JOIN call_number_data cn ON ie.instance_id = cn.instance_id WHERE oclc.oclc IS NOT NULL OR isbn.isbn IS NOT NULL OR issn.issn IS NOT NULL OR lccn.lccn IS NOT NULL OR cn.call_number IS NOT NULL) SELECT di1.instance_hrid AS HRID, di2.instance_hrid AS HRID2, MAX(CASE WHEN (di1.OCLC IS NOT NULL AND di1.OCLC = di2.OCLC) THEN 'T' ELSE NULL END) AS OCLC, MAX(CASE WHEN (di1.ISBN IS NOT NULL AND di1.ISBN = di2.ISBN) THEN 'T' ELSE NULL END) AS ISBN, MAX(CASE WHEN (di1.ISSN IS NOT NULL AND di1.ISSN = di2.ISSN) THEN 'T' ELSE NULL END) AS ISSN, MAX(CASE WHEN (di1.CALL_NUMBER IS NOT NULL AND di1.CALL_NUMBER = di2.CALL_NUMBER) THEN 'T' ELSE NULL END) AS CALL_NUMBER, MAX(CASE WHEN (di1.LCCN IS NOT NULL AND di1.LCCN = di2.LCCN) THEN 'T' ELSE NULL END) AS LCCN FROM duplicate_instances di1 INNER JOIN duplicate_instances di2 ON di1.instance_id <> di2.instance_id AND (di1.oclc = di2.oclc OR di1.isbn = di2.isbn OR di1.issn = di2.issn OR di1.lccn = di2.lccn OR di1.call_number = di2.call_number) GROUP BY di1.instance_hrid, di2.instance_hrid",
"asyncBefore": true
}

0 comments on commit 1fa948d

Please sign in to comment.