Skip to content

Commit

Permalink
Add Formatted SQL for Issue #350
Browse files Browse the repository at this point in the history
  • Loading branch information
wwelling committed Sep 19, 2024
1 parent 1fa948d commit 5a366a7
Show file tree
Hide file tree
Showing 6 changed files with 400 additions and 0 deletions.
59 changes: 59 additions & 0 deletions duplicate-instance-report/sql/call_number.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
-- CALL NUMBER

-- - [ ] 1. The call numbers do not need to be normalized
-- - [ ] 2. An source instance is considered to match a target instance if any of the source instance's holding's call numbers match any of the target instance's holding's call numbers.
-- - [ ] 3. This comparison should be done using the call_number, call_number_prefix, call_number_suffix columns of the holdings_ext table in folio reporting.

-- matches
WITH call_number_from_holdings_ext AS (
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
FROM folio_reporting.holdings_ext
)
SELECT call_number, COUNT(*) FROM call_number_from_holdings_ext WHERE TRIM(call_number) <> '' GROUP BY call_number HAVING COUNT(*) > 1 ORDER BY count DESC;


-- grouped by matches
WITH call_number_from_holdings_ext AS (
SELECT TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
FROM folio_reporting.holdings_ext
),
call_number_duplicates AS (
SELECT call_number
FROM call_number_from_holdings_ext
WHERE TRIM(call_number) <> ''
GROUP BY call_number
HAVING COUNT(*) > 1
)
SELECT * FROM call_number_duplicates;


--
WITH call_number_from_holdings_ext AS (
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
FROM folio_reporting.holdings_ext
),
call_number_duplicates AS (
SELECT call_number
FROM call_number_from_holdings_ext
WHERE TRIM(call_number) <> ''
GROUP BY call_number
HAVING COUNT(*) > 1
)
SELECT s.instance_id, s.call_number
FROM call_number_from_holdings_ext s
JOIN call_number_duplicates d ON s.call_number = d.call_number
ORDER BY s.call_number;


-- try next
WITH call_number_duplicates AS (
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number
FROM folio_reporting.holdings_ext
WHERE TRIM(call_number) <> ''
GROUP BY instance_id, call_number
HAVING COUNT(*) > 1
)
SELECT s.instance_id, s.call_number
FROM call_number_from_holdings_ext s
JOIN call_number_duplicates d ON s.call_number = d.call_number
ORDER BY s.call_number;
121 changes: 121 additions & 0 deletions duplicate-instance-report/sql/duplicate_instances.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
oclc_from_srs_marctab AS (
SELECT instance_id, REGEXP_REPLACE(content, '[^0-9]', '', 'g') AS oclc
FROM public.srs_marctab
WHERE field = '035'
AND ord = 1
AND sf IN ('a', 'z')
AND content LIKE '(OCoLC)%'
),
oclc_duplicates AS (
SELECT oclc
FROM oclc_from_srs_marctab
WHERE oclc <> ''
GROUP BY oclc
HAVING COUNT(*) > 1
),
oclc_data AS (
SELECT s.instance_id::text, s.oclc
FROM oclc_from_srs_marctab s
JOIN oclc_duplicates d ON s.oclc = d.oclc
),
isbn_from_instance_identifiers AS (
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'ISBN'
),
isbn_duplicates AS (
SELECT isbn
FROM isbn_from_instance_identifiers
WHERE isbn <> ''
GROUP BY isbn
HAVING COUNT(*) > 1
),
isbn_data AS (
SELECT s.instance_id, s.isbn
FROM isbn_from_instance_identifiers s
JOIN isbn_duplicates d ON s.isbn = d.isbn
),
issn_from_srs_marctab AS (
SELECT instance_id, content AS issn
FROM public.srs_marctab
WHERE field = '022'
AND ord = 1
AND sf = 'a'
),
issn_duplicates AS (
SELECT issn
FROM issn_from_srs_marctab
WHERE issn <> ''
GROUP BY issn
HAVING COUNT(*) > 1
),
issn_data AS (
SELECT s.instance_id::text, s.issn
FROM issn_from_srs_marctab s
JOIN issn_duplicates d ON s.issn = d.issn
),
lccn_from_instance_identifiers AS (
SELECT instance_id, identifier AS lccn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'LCCN'
),
lccn_duplicates AS (
SELECT lccn
FROM lccn_from_instance_identifiers
WHERE lccn <> ''
GROUP BY lccn
HAVING COUNT(*) > 1
),
lccn_data AS (
SELECT s.instance_id, s.lccn
FROM lccn_from_instance_identifiers s
JOIN lccn_duplicates d ON s.lccn = d.lccn
),
call_number_from_holdings_ext AS (
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) IS NOT NULL AS call_number
FROM folio_reporting.holdings_ext
),
call_number_duplicates AS (
SELECT call_number
FROM call_number_from_holdings_ext
WHERE TRIM(call_number) <> ''
GROUP BY call_number
HAVING COUNT(*) > 1
),
call_number_data AS (
SELECT s.instance_id, s.call_number
FROM call_number_from_holdings_ext s
JOIN call_number_duplicates d ON s.call_number = d.call_number
),
duplicate_instances AS (
SELECT ie.instance_id, ie.instance_hrid, oclc.oclc, isbn.isbn, issn.issn, lccn.lccn, cn.call_number
FROM folio_reporting.instance_ext ie
LEFT JOIN oclc_data oclc ON ie.instance_id = oclc.instance_id
LEFT JOIN isbn_data isbn ON ie.instance_id = isbn.instance_id
LEFT JOIN issn_data issn ON ie.instance_id = issn.instance_id
LEFT JOIN lccn_data lccn ON ie.instance_id = lccn.instance_id
LEFT JOIN call_number_data cn ON ie.instance_id = cn.instance_id
WHERE oclc.oclc IS NOT NULL
OR isbn.isbn IS NOT NULL
OR issn.issn IS NOT NULL
OR lccn.lccn IS NOT NULL
OR cn.call_number IS NOT NULL
)
SELECT
di1.instance_hrid AS HRID,
di2.instance_hrid AS HRID2,
MAX(CASE WHEN (di1.OCLC IS NOT NULL AND di1.OCLC = di2.OCLC) THEN 'T' ELSE NULL END) AS OCLC,
MAX(CASE WHEN (di1.ISBN IS NOT NULL AND di1.ISBN = di2.ISBN) THEN 'T' ELSE NULL END) AS ISBN,
MAX(CASE WHEN (di1.ISSN IS NOT NULL AND di1.ISSN = di2.ISSN) THEN 'T' ELSE NULL END) AS ISSN,
MAX(CASE WHEN (di1.LCCN IS NOT NULL AND di1.LCCN = di2.LCCN) THEN 'T' ELSE NULL END) AS LCCN,
MAX(CASE WHEN (di1.CALL_NUMBER IS NOT NULL AND di1.CALL_NUMBER = di2.CALL_NUMBER) THEN 'T' ELSE NULL END) AS CALL_NUMBER
FROM
duplicate_instances di1
INNER JOIN
duplicate_instances di2 ON di1.instance_id <> di2.instance_id AND
(di1.oclc = di2.oclc OR
di1.isbn = di2.isbn OR
di1.issn = di2.issn OR
di1.lccn = di2.lccn OR
di1.call_number = di2.call_number)
GROUP BY di1.instance_hrid, di2.instance_hrid;
54 changes: 54 additions & 0 deletions duplicate-instance-report/sql/isbn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- ISBN

-- **ISBN Match**

-- - [ ] 1. The ISBN will need to be normalized as follows:
-- - [ ] a. Trim everything after and including the first white space
-- - [ ] b. Take the last 10 characters
-- - [ ] c. Of those last 10 characters, remove the last character
-- - [ ] d. `9780134553351 (foo) : $16.50` -> `9780134553351` -> `~978~ 013455335 ~1~` -> `013455335`
-- - [ ] 2. The Normalized ISBN of the source instance is considered a match if it is identical to the Normalized ISBN of the target instance.
-- - [ ] 3. The instance_identifiers table should be used for this comparison

-- matches
WITH isbn_from_instance_identifiers AS (
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'ISBN'
)
SELECT isbn, COUNT(*) FROM isbn_from_instance_identifiers WHERE isbn <> '' GROUP BY isbn HAVING COUNT(*) > 1;


-- grouped by matches
WITH isbn_from_instance_identifiers AS (
SELECT NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'ISBN'
),
isbn_duplicates AS (
SELECT isbn
FROM isbn_from_instance_identifiers
WHERE isbn <> ''
GROUP BY isbn
HAVING COUNT(*) > 1
)
SELECT * FROM isbn_duplicates;


--
WITH isbn_from_instance_identifiers AS (
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'ISBN'
),
isbn_duplicates AS (
SELECT isbn
FROM isbn_from_instance_identifiers
WHERE isbn <> ''
GROUP BY isbn
HAVING COUNT(*) > 1
)
SELECT s.instance_id, s.isbn
FROM isbn_from_instance_identifiers s
JOIN isbn_duplicates d ON s.isbn = d.isbn
ORDER BY s.isbn;
54 changes: 54 additions & 0 deletions duplicate-instance-report/sql/issn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
-- ISSN

-- - [ ] 1. The ISSNs do not need to be normalized
-- - [ ] 2. The ISSN is considered to be a match when the field:022 ord:0 sf:a or the source instance is identical the field:022 ord:0 sf:a of the target record.
-- - [ ] 3. The srs_marctab table should be used for this comparison

-- matches
WITH issn_from_srs_marctab AS (
SELECT instance_id::text, content AS issn
FROM public.srs_marctab
WHERE field = '022'
AND ord = 1
AND sf = 'a'
)
SELECT issn, COUNT(*) FROM issn_from_srs_marctab WHERE issn <> '' GROUP BY issn HAVING COUNT(*) > 1 ORDER BY count DESC;


-- grouped by matches
WITH issn_from_srs_marctab AS (
SELECT content AS issn
FROM public.srs_marctab
WHERE field = '022'
AND ord = 1
AND sf = 'a'
),
issn_duplicates AS (
SELECT issn
FROM issn_from_srs_marctab
WHERE issn <> ''
GROUP BY issn
HAVING COUNT(*) > 1
)
SELECT * FROM issn_duplicates;


--
WITH issn_from_srs_marctab AS (
SELECT instance_id, content AS issn
FROM public.srs_marctab
WHERE field = '022'
AND ord = 1
AND sf = 'a'
),
issn_duplicates AS (
SELECT issn
FROM issn_from_srs_marctab
WHERE issn <> ''
GROUP BY issn
HAVING COUNT(*) > 1
)
SELECT s.instance_id::text, s.issn
FROM issn_from_srs_marctab s
JOIN issn_duplicates d ON s.issn = d.issn
ORDER BY s.issn;
48 changes: 48 additions & 0 deletions duplicate-instance-report/sql/lccn.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
-- LCCN

-- - [ ] 1. The LCCN's do not need to be normalized
-- - [ ] 2. The LCCN is considered to be a match if the source LCCN is identical to the target LCCN
-- - [ ] 3. The instance_identifiers table should be used for this comparison

-- matches
WITH lccn_from_instance_identifiers AS (
SELECT instance_id, identifier AS lccn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'LCCN'
)
SELECT lccn, COUNT(*) FROM lccn_from_instance_identifiers WHERE lccn <> '' GROUP BY lccn HAVING COUNT(*) > 1 ORDER BY count DESC;


-- grouped by matches
WITH lccn_from_instance_identifiers AS (
SELECT identifier AS lccn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'LCCN'
),
lccn_duplicates AS (
SELECT lccn
FROM lccn_from_instance_identifiers
WHERE lccn <> ''
GROUP BY lccn
HAVING COUNT(*) > 1
)
SELECT * FROM lccn_duplicates;


--
WITH lccn_from_instance_identifiers AS (
SELECT instance_id, identifier AS lccn
FROM folio_reporting.instance_identifiers
WHERE identifier_type_name = 'LCCN'
),
lccn_duplicates AS (
SELECT lccn
FROM lccn_from_instance_identifiers
WHERE lccn <> ''
GROUP BY lccn
HAVING COUNT(*) > 1
)
SELECT s.instance_id, s.lccn
FROM lccn_from_instance_identifiers s
JOIN lccn_duplicates d ON s.lccn = d.lccn
ORDER BY s.lccn;
Loading

0 comments on commit 5a366a7

Please sign in to comment.