From 4cd761f4a3cde20bbe8998266789c315f80f04cd Mon Sep 17 00:00:00 2001 From: William Welling Date: Thu, 19 Sep 2024 08:15:12 -0500 Subject: [PATCH] Add Formatted SQL for Issue #350 --- duplicate-instance-report/sql/call_number.sql | 45 +++++++ .../sql/duplicate_instances.sql | 121 ++++++++++++++++++ duplicate-instance-report/sql/isbn.sql | 54 ++++++++ duplicate-instance-report/sql/issn.sql | 54 ++++++++ duplicate-instance-report/sql/lccn.sql | 48 +++++++ duplicate-instance-report/sql/oclc.sql | 64 +++++++++ 6 files changed, 386 insertions(+) create mode 100644 duplicate-instance-report/sql/call_number.sql create mode 100644 duplicate-instance-report/sql/duplicate_instances.sql create mode 100644 duplicate-instance-report/sql/isbn.sql create mode 100644 duplicate-instance-report/sql/issn.sql create mode 100644 duplicate-instance-report/sql/lccn.sql create mode 100644 duplicate-instance-report/sql/oclc.sql diff --git a/duplicate-instance-report/sql/call_number.sql b/duplicate-instance-report/sql/call_number.sql new file mode 100644 index 00000000..200cb29a --- /dev/null +++ b/duplicate-instance-report/sql/call_number.sql @@ -0,0 +1,45 @@ +-- CALL NUMBER + +-- - [ ] 1. The call numbers do not need to be normalized +-- - [ ] 2. An source instance is considered to match a target instance if any of the source instance's holding's call numbers match any of the target instance's holding's call numbers. +-- - [ ] 3. This comparison should be done using the call_number, call_number_prefix, call_number_suffix columns of the holdings_ext table in folio reporting. + +-- matches +WITH call_number_from_holdings_ext AS ( + SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number + FROM folio_reporting.holdings_ext +) +SELECT call_number, COUNT(*) FROM call_number_from_holdings_ext WHERE TRIM(call_number) <> '' GROUP BY call_number HAVING COUNT(*) > 1 ORDER BY count DESC; + + +-- grouped by matches +WITH call_number_from_holdings_ext AS ( + SELECT TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number + FROM folio_reporting.holdings_ext +), +call_number_duplicates AS ( + SELECT call_number + FROM call_number_from_holdings_ext + WHERE TRIM(call_number) <> '' + GROUP BY call_number + HAVING COUNT(*) > 1 +) +SELECT * FROM call_number_duplicates; + + +-- order by identifier +WITH call_number_from_holdings_ext AS ( + SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number + FROM folio_reporting.holdings_ext +), +call_number_duplicates AS ( + SELECT call_number + FROM call_number_from_holdings_ext + WHERE TRIM(call_number) <> '' + GROUP BY call_number + HAVING COUNT(*) > 1 +) +SELECT s.instance_id, s.call_number +FROM call_number_from_holdings_ext s +JOIN call_number_duplicates d ON s.call_number = d.call_number +ORDER BY s.call_number; diff --git a/duplicate-instance-report/sql/duplicate_instances.sql b/duplicate-instance-report/sql/duplicate_instances.sql new file mode 100644 index 00000000..65d3445f --- /dev/null +++ b/duplicate-instance-report/sql/duplicate_instances.sql @@ -0,0 +1,121 @@ +WITH oclc_from_srs_marctab AS ( + SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc + FROM public.srs_marctab + WHERE field = '035' + AND ord = 1 + AND sf IN ('a', 'z') + AND content LIKE '(OCoLC)%' +), +oclc_duplicates AS ( + SELECT oclc + FROM oclc_from_srs_marctab + WHERE oclc <> '' + GROUP BY oclc + HAVING COUNT(*) > 1 +), +oclc_data AS ( + SELECT s.instance_id::text, s.oclc + FROM oclc_from_srs_marctab s + JOIN oclc_duplicates d ON s.oclc = d.oclc +), +isbn_from_instance_identifiers AS ( + SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'ISBN' +), +isbn_duplicates AS ( + SELECT isbn + FROM isbn_from_instance_identifiers + WHERE isbn <> '' + GROUP BY isbn + HAVING COUNT(*) > 1 +), +isbn_data AS ( + SELECT s.instance_id, s.isbn + FROM isbn_from_instance_identifiers s + JOIN isbn_duplicates d ON s.isbn = d.isbn +), +issn_from_srs_marctab AS ( + SELECT instance_id, content AS issn + FROM public.srs_marctab + WHERE field = '022' + AND ord = 1 + AND sf = 'a' +), +issn_duplicates AS ( + SELECT issn + FROM issn_from_srs_marctab + WHERE issn <> '' + GROUP BY issn + HAVING COUNT(*) > 1 +), +issn_data AS ( + SELECT s.instance_id::text, s.issn + FROM issn_from_srs_marctab s + JOIN issn_duplicates d ON s.issn = d.issn +), +lccn_from_instance_identifiers AS ( + SELECT instance_id, identifier AS lccn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'LCCN' +), +lccn_duplicates AS ( + SELECT lccn + FROM lccn_from_instance_identifiers + WHERE lccn <> '' + GROUP BY lccn + HAVING COUNT(*) > 1 +), +lccn_data AS ( + SELECT s.instance_id, s.lccn + FROM lccn_from_instance_identifiers s + JOIN lccn_duplicates d ON s.lccn = d.lccn +), +call_number_from_holdings_ext AS ( + SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) IS NOT NULL AS call_number + FROM folio_reporting.holdings_ext +), +call_number_duplicates AS ( + SELECT call_number + FROM call_number_from_holdings_ext + WHERE TRIM(call_number) <> '' + GROUP BY call_number + HAVING COUNT(*) > 1 +), +call_number_data AS ( + SELECT s.instance_id, s.call_number + FROM call_number_from_holdings_ext s + JOIN call_number_duplicates d ON s.call_number = d.call_number +), +duplicate_instances AS ( + SELECT ie.instance_id, ie.instance_hrid, oclc.oclc, isbn.isbn, issn.issn, lccn.lccn, cn.call_number + FROM folio_reporting.instance_ext ie + LEFT JOIN oclc_data oclc ON ie.instance_id = oclc.instance_id + LEFT JOIN isbn_data isbn ON ie.instance_id = isbn.instance_id + LEFT JOIN issn_data issn ON ie.instance_id = issn.instance_id + LEFT JOIN lccn_data lccn ON ie.instance_id = lccn.instance_id + LEFT JOIN call_number_data cn ON ie.instance_id = cn.instance_id + WHERE oclc.oclc IS NOT NULL + OR isbn.isbn IS NOT NULL + OR issn.issn IS NOT NULL + OR lccn.lccn IS NOT NULL + OR cn.call_number IS NOT NULL +) +SELECT + di1.instance_hrid AS HRID, + di2.instance_hrid AS HRID2, + MAX(CASE WHEN (di1.OCLC IS NOT NULL AND di1.OCLC = di2.OCLC) THEN 'T' ELSE NULL END) AS OCLC, + MAX(CASE WHEN (di1.ISBN IS NOT NULL AND di1.ISBN = di2.ISBN) THEN 'T' ELSE NULL END) AS ISBN, + MAX(CASE WHEN (di1.ISSN IS NOT NULL AND di1.ISSN = di2.ISSN) THEN 'T' ELSE NULL END) AS ISSN, + MAX(CASE WHEN (di1.LCCN IS NOT NULL AND di1.LCCN = di2.LCCN) THEN 'T' ELSE NULL END) AS LCCN, + MAX(CASE WHEN (di1.CALL_NUMBER IS NOT NULL AND di1.CALL_NUMBER = di2.CALL_NUMBER) THEN 'T' ELSE NULL END) AS CALL_NUMBER +FROM + duplicate_instances di1 +INNER JOIN + duplicate_instances di2 ON di1.instance_id <> di2.instance_id AND + (di1.oclc = di2.oclc OR + di1.isbn = di2.isbn OR + di1.issn = di2.issn OR + di1.lccn = di2.lccn OR + di1.call_number = di2.call_number) +GROUP BY di1.instance_hrid, di2.instance_hrid; diff --git a/duplicate-instance-report/sql/isbn.sql b/duplicate-instance-report/sql/isbn.sql new file mode 100644 index 00000000..abd3e870 --- /dev/null +++ b/duplicate-instance-report/sql/isbn.sql @@ -0,0 +1,54 @@ +-- ISBN + +-- **ISBN Match** + +-- - [ ] 1. The ISBN will need to be normalized as follows: +-- - [ ] a. Trim everything after and including the first white space +-- - [ ] b. Take the last 10 characters +-- - [ ] c. Of those last 10 characters, remove the last character +-- - [ ] d. `9780134553351 (foo) : $16.50` -> `9780134553351` -> `~978~ 013455335 ~1~` -> `013455335` +-- - [ ] 2. The Normalized ISBN of the source instance is considered a match if it is identical to the Normalized ISBN of the target instance. +-- - [ ] 3. The instance_identifiers table should be used for this comparison + +-- matches +WITH isbn_from_instance_identifiers AS ( + SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'ISBN' +) +SELECT isbn, COUNT(*) FROM isbn_from_instance_identifiers WHERE isbn <> '' GROUP BY isbn HAVING COUNT(*) > 1; + + +-- grouped by matches +WITH isbn_from_instance_identifiers AS ( + SELECT NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'ISBN' +), +isbn_duplicates AS ( + SELECT isbn + FROM isbn_from_instance_identifiers + WHERE isbn <> '' + GROUP BY isbn + HAVING COUNT(*) > 1 +) +SELECT * FROM isbn_duplicates; + + +-- order by identifier +WITH isbn_from_instance_identifiers AS ( + SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'ISBN' +), +isbn_duplicates AS ( + SELECT isbn + FROM isbn_from_instance_identifiers + WHERE isbn <> '' + GROUP BY isbn + HAVING COUNT(*) > 1 +) +SELECT s.instance_id, s.isbn +FROM isbn_from_instance_identifiers s +JOIN isbn_duplicates d ON s.isbn = d.isbn +ORDER BY s.isbn; diff --git a/duplicate-instance-report/sql/issn.sql b/duplicate-instance-report/sql/issn.sql new file mode 100644 index 00000000..39bfa588 --- /dev/null +++ b/duplicate-instance-report/sql/issn.sql @@ -0,0 +1,54 @@ +-- ISSN + +-- - [ ] 1. The ISSNs do not need to be normalized +-- - [ ] 2. The ISSN is considered to be a match when the field:022 ord:0 sf:a or the source instance is identical the field:022 ord:0 sf:a of the target record. +-- - [ ] 3. The srs_marctab table should be used for this comparison + +-- matches +WITH issn_from_srs_marctab AS ( + SELECT instance_id::text, content AS issn + FROM public.srs_marctab + WHERE field = '022' + AND ord = 1 + AND sf = 'a' +) +SELECT issn, COUNT(*) FROM issn_from_srs_marctab WHERE issn <> '' GROUP BY issn HAVING COUNT(*) > 1 ORDER BY count DESC; + + +-- grouped by matches +WITH issn_from_srs_marctab AS ( + SELECT content AS issn + FROM public.srs_marctab + WHERE field = '022' + AND ord = 1 + AND sf = 'a' +), +issn_duplicates AS ( + SELECT issn + FROM issn_from_srs_marctab + WHERE issn <> '' + GROUP BY issn + HAVING COUNT(*) > 1 +) +SELECT * FROM issn_duplicates; + + +-- order by identifier +WITH issn_from_srs_marctab AS ( + SELECT instance_id, content AS issn + FROM public.srs_marctab + WHERE field = '022' + AND ord = 1 + AND sf = 'a' +), +issn_duplicates AS ( + SELECT issn + FROM issn_from_srs_marctab + WHERE issn <> '' + GROUP BY issn + HAVING COUNT(*) > 1 +) +SELECT s.instance_id::text, s.issn +FROM issn_from_srs_marctab s +JOIN issn_duplicates d ON s.issn = d.issn +ORDER BY s.issn; diff --git a/duplicate-instance-report/sql/lccn.sql b/duplicate-instance-report/sql/lccn.sql new file mode 100644 index 00000000..713d939f --- /dev/null +++ b/duplicate-instance-report/sql/lccn.sql @@ -0,0 +1,48 @@ +-- LCCN + +-- - [ ] 1. The LCCN's do not need to be normalized +-- - [ ] 2. The LCCN is considered to be a match if the source LCCN is identical to the target LCCN +-- - [ ] 3. The instance_identifiers table should be used for this comparison + +-- matches +WITH lccn_from_instance_identifiers AS ( + SELECT instance_id, identifier AS lccn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'LCCN' +) +SELECT lccn, COUNT(*) FROM lccn_from_instance_identifiers WHERE lccn <> '' GROUP BY lccn HAVING COUNT(*) > 1 ORDER BY count DESC; + + +-- grouped by matches +WITH lccn_from_instance_identifiers AS ( + SELECT identifier AS lccn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'LCCN' +), +lccn_duplicates AS ( + SELECT lccn + FROM lccn_from_instance_identifiers + WHERE lccn <> '' + GROUP BY lccn + HAVING COUNT(*) > 1 +) +SELECT * FROM lccn_duplicates; + + +-- order by identifier +WITH lccn_from_instance_identifiers AS ( + SELECT instance_id, identifier AS lccn + FROM folio_reporting.instance_identifiers + WHERE identifier_type_name = 'LCCN' +), +lccn_duplicates AS ( + SELECT lccn + FROM lccn_from_instance_identifiers + WHERE lccn <> '' + GROUP BY lccn + HAVING COUNT(*) > 1 +) +SELECT s.instance_id, s.lccn +FROM lccn_from_instance_identifiers s +JOIN lccn_duplicates d ON s.lccn = d.lccn +ORDER BY s.lccn; diff --git a/duplicate-instance-report/sql/oclc.sql b/duplicate-instance-report/sql/oclc.sql new file mode 100644 index 00000000..c85166ec --- /dev/null +++ b/duplicate-instance-report/sql/oclc.sql @@ -0,0 +1,64 @@ +-- OCLC Matching + +-- **OCLC Match** + +-- - [ ] 1. The OCLC numbers will need to be normalized. +-- - [ ] a. Only treat values with the `(OCLoC)` prefix as an OCLC number. Ignore all fields lacking this prefix. +-- - [ ] b. All non numerical characters shoud be removed. +-- - [ ] c. Trim all leading 0s +-- - [ ] 2. A Normalized OCLC number is considered to be a match when: +-- - [ ] a. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches the Normalized OCLC number from the field:035 ord:1 sf:a of the target record +-- - [ ] b. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches any of the Normalized OCLC numbers within the field:035 sf:z fields of the target record. +-- - [ ] c. The srs_marctab table should be used for this comparison + +-- matches +WITH oclc_from_srs_marctab AS ( + SELECT instance_id::text, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc + FROM public.srs_marctab + WHERE field = '035' + AND ord = 1 + AND sf IN ('a', 'z') + AND content LIKE '(OCoLC)%' +) +SELECT oclc, COUNT(*) FROM oclc_from_srs_marctab WHERE oclc <> '' GROUP BY oclc HAVING COUNT(*) > 1 ORDER BY count DESC; + + +-- grouped by matches +WITH oclc_from_srs_marctab AS ( + SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc + FROM public.srs_marctab + WHERE field = '035' + AND ord = 1 + AND sf IN ('a', 'z') + AND content LIKE '(OCoLC)%' +), +oclc_duplicates AS ( + SELECT oclc + FROM oclc_from_srs_marctab + WHERE oclc <> '' + GROUP BY oclc + HAVING COUNT(*) > 1 +) +SELECT * FROM oclc_duplicates; + + +-- order identifier +WITH oclc_from_srs_marctab AS ( + SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc + FROM public.srs_marctab + WHERE field = '035' + AND ord = 1 + AND sf IN ('a', 'z') + AND content LIKE '(OCoLC)%' +), +oclc_duplicates AS ( + SELECT oclc + FROM oclc_from_srs_marctab + WHERE oclc <> '' + GROUP BY oclc + HAVING COUNT(*) > 1 +) +SELECT s.instance_id::text, s.oclc +FROM oclc_from_srs_marctab s +JOIN oclc_duplicates d ON s.oclc = d.oclc +ORDER BY s.oclc;