From 4cd761f4a3cde20bbe8998266789c315f80f04cd Mon Sep 17 00:00:00 2001
From: William Welling <wwelling@tamu.edu>
Date: Thu, 19 Sep 2024 08:15:12 -0500
Subject: [PATCH] Add Formatted SQL for Issue #350

---
 duplicate-instance-report/sql/call_number.sql |  45 +++++++
 .../sql/duplicate_instances.sql               | 121 ++++++++++++++++++
 duplicate-instance-report/sql/isbn.sql        |  54 ++++++++
 duplicate-instance-report/sql/issn.sql        |  54 ++++++++
 duplicate-instance-report/sql/lccn.sql        |  48 +++++++
 duplicate-instance-report/sql/oclc.sql        |  64 +++++++++
 6 files changed, 386 insertions(+)
 create mode 100644 duplicate-instance-report/sql/call_number.sql
 create mode 100644 duplicate-instance-report/sql/duplicate_instances.sql
 create mode 100644 duplicate-instance-report/sql/isbn.sql
 create mode 100644 duplicate-instance-report/sql/issn.sql
 create mode 100644 duplicate-instance-report/sql/lccn.sql
 create mode 100644 duplicate-instance-report/sql/oclc.sql

diff --git a/duplicate-instance-report/sql/call_number.sql b/duplicate-instance-report/sql/call_number.sql
new file mode 100644
index 00000000..200cb29a
--- /dev/null
+++ b/duplicate-instance-report/sql/call_number.sql
@@ -0,0 +1,45 @@
+-- CALL NUMBER
+
+-- - [ ] 1. The call numbers do not need to be normalized
+-- - [ ] 2. An source instance is considered to match a target instance if any of the source instance's holding's call numbers match any of the target instance's holding's call numbers.
+-- - [ ] 3. This comparison should be done using the call_number, call_number_prefix, call_number_suffix columns of the holdings_ext table in folio reporting.
+
+-- matches
+WITH call_number_from_holdings_ext AS (
+    SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
+    FROM folio_reporting.holdings_ext
+)
+SELECT call_number, COUNT(*) FROM call_number_from_holdings_ext WHERE TRIM(call_number) <> '' GROUP BY call_number HAVING COUNT(*) > 1 ORDER BY count DESC;
+
+
+-- grouped by matches
+WITH call_number_from_holdings_ext AS (
+    SELECT TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
+    FROM folio_reporting.holdings_ext
+),
+call_number_duplicates AS (
+  SELECT call_number
+  FROM call_number_from_holdings_ext
+  WHERE TRIM(call_number) <> ''
+  GROUP BY call_number
+  HAVING COUNT(*) > 1
+)
+SELECT * FROM call_number_duplicates;
+
+
+-- order by identifier
+WITH call_number_from_holdings_ext AS (
+    SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
+    FROM folio_reporting.holdings_ext
+),
+call_number_duplicates AS (
+  SELECT call_number
+  FROM call_number_from_holdings_ext
+  WHERE TRIM(call_number) <> ''
+  GROUP BY call_number
+  HAVING COUNT(*) > 1
+)
+SELECT s.instance_id, s.call_number
+FROM call_number_from_holdings_ext s
+JOIN call_number_duplicates d ON s.call_number = d.call_number
+ORDER BY s.call_number;
diff --git a/duplicate-instance-report/sql/duplicate_instances.sql b/duplicate-instance-report/sql/duplicate_instances.sql
new file mode 100644
index 00000000..65d3445f
--- /dev/null
+++ b/duplicate-instance-report/sql/duplicate_instances.sql
@@ -0,0 +1,121 @@
+WITH oclc_from_srs_marctab AS (
+  SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc
+  FROM public.srs_marctab
+  WHERE field = '035'
+    AND ord = 1
+    AND sf IN ('a', 'z')
+    AND content LIKE '(OCoLC)%'
+),
+oclc_duplicates AS (
+  SELECT oclc
+  FROM oclc_from_srs_marctab
+  WHERE oclc <> ''
+  GROUP BY oclc
+  HAVING COUNT(*) > 1
+), 
+oclc_data AS (
+  SELECT s.instance_id::text, s.oclc
+  FROM oclc_from_srs_marctab s
+  JOIN oclc_duplicates d ON s.oclc = d.oclc
+),
+isbn_from_instance_identifiers AS (
+    SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'ISBN'
+),
+isbn_duplicates AS (
+  SELECT isbn
+  FROM isbn_from_instance_identifiers
+  WHERE isbn <> ''
+  GROUP BY isbn
+  HAVING COUNT(*) > 1
+),
+isbn_data AS (
+  SELECT s.instance_id, s.isbn
+  FROM isbn_from_instance_identifiers s
+  JOIN isbn_duplicates d ON s.isbn = d.isbn
+),
+issn_from_srs_marctab AS (
+    SELECT instance_id, content AS issn
+    FROM public.srs_marctab
+    WHERE field = '022'
+      AND ord = 1
+      AND sf = 'a'
+),
+issn_duplicates AS (
+  SELECT issn
+  FROM issn_from_srs_marctab
+  WHERE issn <> ''
+  GROUP BY issn
+  HAVING COUNT(*) > 1
+),
+issn_data AS (
+  SELECT s.instance_id::text, s.issn
+  FROM issn_from_srs_marctab s
+  JOIN issn_duplicates d ON s.issn = d.issn
+),
+lccn_from_instance_identifiers AS (
+    SELECT instance_id, identifier AS lccn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'LCCN'
+),
+lccn_duplicates AS (
+  SELECT lccn
+  FROM lccn_from_instance_identifiers
+  WHERE lccn <> ''
+  GROUP BY lccn
+  HAVING COUNT(*) > 1
+),
+lccn_data AS (
+  SELECT s.instance_id, s.lccn
+  FROM lccn_from_instance_identifiers s
+  JOIN lccn_duplicates d ON s.lccn = d.lccn
+),
+call_number_from_holdings_ext AS (
+    SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) IS NOT NULL AS call_number
+    FROM folio_reporting.holdings_ext
+),
+call_number_duplicates AS (
+  SELECT call_number
+  FROM call_number_from_holdings_ext
+  WHERE TRIM(call_number) <> ''
+  GROUP BY call_number
+  HAVING COUNT(*) > 1
+),
+call_number_data AS (
+  SELECT s.instance_id, s.call_number
+  FROM call_number_from_holdings_ext s
+  JOIN call_number_duplicates d ON s.call_number = d.call_number
+),
+duplicate_instances AS (
+  SELECT ie.instance_id, ie.instance_hrid, oclc.oclc, isbn.isbn, issn.issn, lccn.lccn, cn.call_number
+  FROM folio_reporting.instance_ext ie
+  LEFT JOIN oclc_data oclc ON ie.instance_id = oclc.instance_id
+  LEFT JOIN isbn_data isbn ON ie.instance_id = isbn.instance_id
+  LEFT JOIN issn_data issn ON ie.instance_id = issn.instance_id
+  LEFT JOIN lccn_data lccn ON ie.instance_id = lccn.instance_id
+  LEFT JOIN call_number_data cn ON ie.instance_id = cn.instance_id
+  WHERE oclc.oclc IS NOT NULL
+     OR isbn.isbn IS NOT NULL
+     OR issn.issn IS NOT NULL
+     OR lccn.lccn IS NOT NULL
+     OR cn.call_number IS NOT NULL
+)
+SELECT
+  di1.instance_hrid AS HRID,
+  di2.instance_hrid AS HRID2,
+  MAX(CASE WHEN (di1.OCLC IS NOT NULL AND di1.OCLC = di2.OCLC) THEN 'T' ELSE NULL END) AS OCLC,
+  MAX(CASE WHEN (di1.ISBN IS NOT NULL AND di1.ISBN = di2.ISBN) THEN 'T' ELSE NULL END) AS ISBN,
+  MAX(CASE WHEN (di1.ISSN IS NOT NULL AND di1.ISSN = di2.ISSN) THEN 'T' ELSE NULL END) AS ISSN,
+  MAX(CASE WHEN (di1.LCCN IS NOT NULL AND di1.LCCN = di2.LCCN) THEN 'T' ELSE NULL END) AS LCCN,
+  MAX(CASE WHEN (di1.CALL_NUMBER IS NOT NULL AND di1.CALL_NUMBER = di2.CALL_NUMBER) THEN 'T' ELSE NULL END) AS CALL_NUMBER
+FROM
+  duplicate_instances di1
+INNER JOIN
+  duplicate_instances di2 ON di1.instance_id <> di2.instance_id AND
+    (di1.oclc = di2.oclc OR
+     di1.isbn = di2.isbn OR
+     di1.issn = di2.issn OR
+     di1.lccn = di2.lccn OR
+     di1.call_number = di2.call_number)
+GROUP BY di1.instance_hrid, di2.instance_hrid;
diff --git a/duplicate-instance-report/sql/isbn.sql b/duplicate-instance-report/sql/isbn.sql
new file mode 100644
index 00000000..abd3e870
--- /dev/null
+++ b/duplicate-instance-report/sql/isbn.sql
@@ -0,0 +1,54 @@
+-- ISBN
+
+-- **ISBN Match**
+
+-- - [ ] 1.   The ISBN will need to be normalized as follows:
+--   - [ ] a. Trim everything after and including the first white space
+--   - [ ] b. Take the last 10 characters
+--   - [ ] c. Of those last 10 characters, remove the last character
+--   - [ ] d.  `9780134553351 (foo) : $16.50` -> `9780134553351` -> `~978~ 013455335 ~1~` -> `013455335`
+-- - [ ] 2. The Normalized ISBN of the source instance is considered a match if it is identical to the Normalized ISBN of the target instance.
+-- - [ ] 3. The instance_identifiers table should be used for this comparison
+
+-- matches
+WITH isbn_from_instance_identifiers AS (
+    SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'ISBN'
+)
+SELECT isbn, COUNT(*) FROM isbn_from_instance_identifiers WHERE isbn <> '' GROUP BY isbn HAVING COUNT(*) > 1;
+
+
+-- grouped by matches
+WITH isbn_from_instance_identifiers AS (
+    SELECT NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'ISBN'
+),
+isbn_duplicates AS (
+  SELECT isbn
+  FROM isbn_from_instance_identifiers
+  WHERE isbn <> ''
+  GROUP BY isbn
+  HAVING COUNT(*) > 1
+)
+SELECT * FROM isbn_duplicates;
+
+
+-- order by identifier
+WITH isbn_from_instance_identifiers AS (
+    SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'ISBN'
+),
+isbn_duplicates AS (
+  SELECT isbn
+  FROM isbn_from_instance_identifiers
+  WHERE isbn <> ''
+  GROUP BY isbn
+  HAVING COUNT(*) > 1
+)
+SELECT s.instance_id, s.isbn
+FROM isbn_from_instance_identifiers s
+JOIN isbn_duplicates d ON s.isbn = d.isbn
+ORDER BY s.isbn;
diff --git a/duplicate-instance-report/sql/issn.sql b/duplicate-instance-report/sql/issn.sql
new file mode 100644
index 00000000..39bfa588
--- /dev/null
+++ b/duplicate-instance-report/sql/issn.sql
@@ -0,0 +1,54 @@
+-- ISSN
+
+-- - [ ] 1. The ISSNs do not need to be normalized
+-- - [ ] 2. The ISSN is considered to be a match when the field:022 ord:0 sf:a or the source instance is identical the field:022 ord:0 sf:a of the target record.
+-- - [ ] 3. The srs_marctab table should be used for this comparison
+
+-- matches
+WITH issn_from_srs_marctab AS (
+    SELECT instance_id::text, content AS issn
+    FROM public.srs_marctab
+    WHERE field = '022' 
+      AND ord = 1 
+      AND sf = 'a'
+)
+SELECT issn, COUNT(*) FROM issn_from_srs_marctab WHERE issn <> '' GROUP BY issn HAVING COUNT(*) > 1 ORDER BY count DESC;
+
+
+-- grouped by matches
+WITH issn_from_srs_marctab AS (
+    SELECT content AS issn
+    FROM public.srs_marctab
+    WHERE field = '022' 
+      AND ord = 1 
+      AND sf = 'a'
+),
+issn_duplicates AS (
+  SELECT issn
+  FROM issn_from_srs_marctab
+  WHERE issn <> ''
+  GROUP BY issn
+  HAVING COUNT(*) > 1
+)
+SELECT * FROM issn_duplicates;
+
+
+-- order by identifier
+WITH issn_from_srs_marctab AS (
+    SELECT instance_id, content AS issn
+    FROM public.srs_marctab
+    WHERE field = '022' 
+      AND ord = 1 
+      AND sf = 'a'
+),
+issn_duplicates AS (
+  SELECT issn
+  FROM issn_from_srs_marctab
+  WHERE issn <> ''
+  GROUP BY issn
+  HAVING COUNT(*) > 1
+)
+SELECT s.instance_id::text, s.issn
+FROM issn_from_srs_marctab s
+JOIN issn_duplicates d ON s.issn = d.issn
+ORDER BY s.issn;
diff --git a/duplicate-instance-report/sql/lccn.sql b/duplicate-instance-report/sql/lccn.sql
new file mode 100644
index 00000000..713d939f
--- /dev/null
+++ b/duplicate-instance-report/sql/lccn.sql
@@ -0,0 +1,48 @@
+-- LCCN
+
+-- - [ ] 1. The LCCN's do not need to be normalized
+-- - [ ] 2. The LCCN is considered to be a match if the source LCCN is identical to the target LCCN
+-- - [ ] 3. The instance_identifiers table should be used for this comparison
+
+-- matches
+WITH lccn_from_instance_identifiers AS (
+    SELECT instance_id, identifier AS lccn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'LCCN'
+)
+SELECT lccn, COUNT(*) FROM lccn_from_instance_identifiers WHERE lccn <> '' GROUP BY lccn HAVING COUNT(*) > 1 ORDER BY count DESC;
+
+
+-- grouped by matches
+WITH lccn_from_instance_identifiers AS (
+    SELECT identifier AS lccn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'LCCN'
+),
+lccn_duplicates AS (
+  SELECT lccn
+  FROM lccn_from_instance_identifiers
+  WHERE lccn <> ''
+  GROUP BY lccn
+  HAVING COUNT(*) > 1
+)
+SELECT * FROM lccn_duplicates;
+
+
+-- order by identifier
+WITH lccn_from_instance_identifiers AS (
+    SELECT instance_id, identifier AS lccn
+    FROM folio_reporting.instance_identifiers
+    WHERE identifier_type_name = 'LCCN'
+),
+lccn_duplicates AS (
+  SELECT lccn
+  FROM lccn_from_instance_identifiers
+  WHERE lccn <> ''
+  GROUP BY lccn
+  HAVING COUNT(*) > 1
+)
+SELECT s.instance_id, s.lccn
+FROM lccn_from_instance_identifiers s
+JOIN lccn_duplicates d ON s.lccn = d.lccn
+ORDER BY s.lccn;
diff --git a/duplicate-instance-report/sql/oclc.sql b/duplicate-instance-report/sql/oclc.sql
new file mode 100644
index 00000000..c85166ec
--- /dev/null
+++ b/duplicate-instance-report/sql/oclc.sql
@@ -0,0 +1,64 @@
+-- OCLC Matching
+
+-- **OCLC Match**
+
+-- - [ ] 1. The OCLC numbers will need to be normalized.
+--   - [ ] a. Only treat values with the `(OCLoC)` prefix as an OCLC number. Ignore all fields lacking this prefix.
+--   - [ ] b. All non numerical characters shoud be removed.
+--   - [ ] c. Trim all leading 0s 
+-- - [ ] 2. A Normalized OCLC number is considered to be a match when:
+--   - [ ] a. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches the Normalized OCLC number from the field:035 ord:1 sf:a of the target record
+--   - [ ] b. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches any of the Normalized OCLC numbers within the field:035 sf:z fields of the target record.
+--   - [ ] c. The srs_marctab table should be used for this comparison
+
+-- matches
+WITH oclc_from_srs_marctab AS (
+    SELECT instance_id::text, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc
+    FROM public.srs_marctab
+    WHERE field = '035' 
+      AND ord = 1 
+      AND sf IN ('a', 'z') 
+      AND content LIKE '(OCoLC)%'
+)
+SELECT oclc, COUNT(*) FROM oclc_from_srs_marctab WHERE oclc <> '' GROUP BY oclc HAVING COUNT(*) > 1 ORDER BY count DESC;
+
+
+-- grouped by matches
+WITH oclc_from_srs_marctab AS (
+  SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc
+  FROM public.srs_marctab
+  WHERE field = '035'
+    AND ord = 1
+    AND sf IN ('a', 'z')
+    AND content LIKE '(OCoLC)%'
+),
+oclc_duplicates AS (
+  SELECT oclc
+  FROM oclc_from_srs_marctab
+  WHERE oclc <> ''
+  GROUP BY oclc
+  HAVING COUNT(*) > 1
+)
+SELECT * FROM oclc_duplicates;
+
+
+-- order identifier
+WITH oclc_from_srs_marctab AS (
+  SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc
+  FROM public.srs_marctab
+  WHERE field = '035'
+    AND ord = 1
+    AND sf IN ('a', 'z')
+    AND content LIKE '(OCoLC)%'
+),
+oclc_duplicates AS (
+  SELECT oclc
+  FROM oclc_from_srs_marctab
+  WHERE oclc <> ''
+  GROUP BY oclc
+  HAVING COUNT(*) > 1
+)
+SELECT s.instance_id::text, s.oclc
+FROM oclc_from_srs_marctab s
+JOIN oclc_duplicates d ON s.oclc = d.oclc
+ORDER BY s.oclc;