-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
386 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
-- CALL NUMBER | ||
|
||
-- - [ ] 1. The call numbers do not need to be normalized | ||
-- - [ ] 2. An source instance is considered to match a target instance if any of the source instance's holding's call numbers match any of the target instance's holding's call numbers. | ||
-- - [ ] 3. This comparison should be done using the call_number, call_number_prefix, call_number_suffix columns of the holdings_ext table in folio reporting. | ||
|
||
-- matches | ||
WITH call_number_from_holdings_ext AS ( | ||
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number | ||
FROM folio_reporting.holdings_ext | ||
) | ||
SELECT call_number, COUNT(*) FROM call_number_from_holdings_ext WHERE TRIM(call_number) <> '' GROUP BY call_number HAVING COUNT(*) > 1 ORDER BY count DESC; | ||
|
||
|
||
-- grouped by matches | ||
WITH call_number_from_holdings_ext AS ( | ||
SELECT TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number | ||
FROM folio_reporting.holdings_ext | ||
), | ||
call_number_duplicates AS ( | ||
SELECT call_number | ||
FROM call_number_from_holdings_ext | ||
WHERE TRIM(call_number) <> '' | ||
GROUP BY call_number | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT * FROM call_number_duplicates; | ||
|
||
|
||
-- order by identifier | ||
WITH call_number_from_holdings_ext AS ( | ||
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) AS call_number | ||
FROM folio_reporting.holdings_ext | ||
), | ||
call_number_duplicates AS ( | ||
SELECT call_number | ||
FROM call_number_from_holdings_ext | ||
WHERE TRIM(call_number) <> '' | ||
GROUP BY call_number | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT s.instance_id, s.call_number | ||
FROM call_number_from_holdings_ext s | ||
JOIN call_number_duplicates d ON s.call_number = d.call_number | ||
ORDER BY s.call_number; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
WITH oclc_from_srs_marctab AS ( | ||
SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc | ||
FROM public.srs_marctab | ||
WHERE field = '035' | ||
AND ord = 1 | ||
AND sf IN ('a', 'z') | ||
AND content LIKE '(OCoLC)%' | ||
), | ||
oclc_duplicates AS ( | ||
SELECT oclc | ||
FROM oclc_from_srs_marctab | ||
WHERE oclc <> '' | ||
GROUP BY oclc | ||
HAVING COUNT(*) > 1 | ||
), | ||
oclc_data AS ( | ||
SELECT s.instance_id::text, s.oclc | ||
FROM oclc_from_srs_marctab s | ||
JOIN oclc_duplicates d ON s.oclc = d.oclc | ||
), | ||
isbn_from_instance_identifiers AS ( | ||
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'ISBN' | ||
), | ||
isbn_duplicates AS ( | ||
SELECT isbn | ||
FROM isbn_from_instance_identifiers | ||
WHERE isbn <> '' | ||
GROUP BY isbn | ||
HAVING COUNT(*) > 1 | ||
), | ||
isbn_data AS ( | ||
SELECT s.instance_id, s.isbn | ||
FROM isbn_from_instance_identifiers s | ||
JOIN isbn_duplicates d ON s.isbn = d.isbn | ||
), | ||
issn_from_srs_marctab AS ( | ||
SELECT instance_id, content AS issn | ||
FROM public.srs_marctab | ||
WHERE field = '022' | ||
AND ord = 1 | ||
AND sf = 'a' | ||
), | ||
issn_duplicates AS ( | ||
SELECT issn | ||
FROM issn_from_srs_marctab | ||
WHERE issn <> '' | ||
GROUP BY issn | ||
HAVING COUNT(*) > 1 | ||
), | ||
issn_data AS ( | ||
SELECT s.instance_id::text, s.issn | ||
FROM issn_from_srs_marctab s | ||
JOIN issn_duplicates d ON s.issn = d.issn | ||
), | ||
lccn_from_instance_identifiers AS ( | ||
SELECT instance_id, identifier AS lccn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'LCCN' | ||
), | ||
lccn_duplicates AS ( | ||
SELECT lccn | ||
FROM lccn_from_instance_identifiers | ||
WHERE lccn <> '' | ||
GROUP BY lccn | ||
HAVING COUNT(*) > 1 | ||
), | ||
lccn_data AS ( | ||
SELECT s.instance_id, s.lccn | ||
FROM lccn_from_instance_identifiers s | ||
JOIN lccn_duplicates d ON s.lccn = d.lccn | ||
), | ||
call_number_from_holdings_ext AS ( | ||
SELECT instance_id, TRIM(CONCAT_WS(' ', call_number_prefix, call_number, call_number_suffix)) IS NOT NULL AS call_number | ||
FROM folio_reporting.holdings_ext | ||
), | ||
call_number_duplicates AS ( | ||
SELECT call_number | ||
FROM call_number_from_holdings_ext | ||
WHERE TRIM(call_number) <> '' | ||
GROUP BY call_number | ||
HAVING COUNT(*) > 1 | ||
), | ||
call_number_data AS ( | ||
SELECT s.instance_id, s.call_number | ||
FROM call_number_from_holdings_ext s | ||
JOIN call_number_duplicates d ON s.call_number = d.call_number | ||
), | ||
duplicate_instances AS ( | ||
SELECT ie.instance_id, ie.instance_hrid, oclc.oclc, isbn.isbn, issn.issn, lccn.lccn, cn.call_number | ||
FROM folio_reporting.instance_ext ie | ||
LEFT JOIN oclc_data oclc ON ie.instance_id = oclc.instance_id | ||
LEFT JOIN isbn_data isbn ON ie.instance_id = isbn.instance_id | ||
LEFT JOIN issn_data issn ON ie.instance_id = issn.instance_id | ||
LEFT JOIN lccn_data lccn ON ie.instance_id = lccn.instance_id | ||
LEFT JOIN call_number_data cn ON ie.instance_id = cn.instance_id | ||
WHERE oclc.oclc IS NOT NULL | ||
OR isbn.isbn IS NOT NULL | ||
OR issn.issn IS NOT NULL | ||
OR lccn.lccn IS NOT NULL | ||
OR cn.call_number IS NOT NULL | ||
) | ||
SELECT | ||
di1.instance_hrid AS HRID, | ||
di2.instance_hrid AS HRID2, | ||
MAX(CASE WHEN (di1.OCLC IS NOT NULL AND di1.OCLC = di2.OCLC) THEN 'T' ELSE NULL END) AS OCLC, | ||
MAX(CASE WHEN (di1.ISBN IS NOT NULL AND di1.ISBN = di2.ISBN) THEN 'T' ELSE NULL END) AS ISBN, | ||
MAX(CASE WHEN (di1.ISSN IS NOT NULL AND di1.ISSN = di2.ISSN) THEN 'T' ELSE NULL END) AS ISSN, | ||
MAX(CASE WHEN (di1.LCCN IS NOT NULL AND di1.LCCN = di2.LCCN) THEN 'T' ELSE NULL END) AS LCCN, | ||
MAX(CASE WHEN (di1.CALL_NUMBER IS NOT NULL AND di1.CALL_NUMBER = di2.CALL_NUMBER) THEN 'T' ELSE NULL END) AS CALL_NUMBER | ||
FROM | ||
duplicate_instances di1 | ||
INNER JOIN | ||
duplicate_instances di2 ON di1.instance_id <> di2.instance_id AND | ||
(di1.oclc = di2.oclc OR | ||
di1.isbn = di2.isbn OR | ||
di1.issn = di2.issn OR | ||
di1.lccn = di2.lccn OR | ||
di1.call_number = di2.call_number) | ||
GROUP BY di1.instance_hrid, di2.instance_hrid; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
-- ISBN | ||
|
||
-- **ISBN Match** | ||
|
||
-- - [ ] 1. The ISBN will need to be normalized as follows: | ||
-- - [ ] a. Trim everything after and including the first white space | ||
-- - [ ] b. Take the last 10 characters | ||
-- - [ ] c. Of those last 10 characters, remove the last character | ||
-- - [ ] d. `9780134553351 (foo) : $16.50` -> `9780134553351` -> `~978~ 013455335 ~1~` -> `013455335` | ||
-- - [ ] 2. The Normalized ISBN of the source instance is considered a match if it is identical to the Normalized ISBN of the target instance. | ||
-- - [ ] 3. The instance_identifiers table should be used for this comparison | ||
|
||
-- matches | ||
WITH isbn_from_instance_identifiers AS ( | ||
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'ISBN' | ||
) | ||
SELECT isbn, COUNT(*) FROM isbn_from_instance_identifiers WHERE isbn <> '' GROUP BY isbn HAVING COUNT(*) > 1; | ||
|
||
|
||
-- grouped by matches | ||
WITH isbn_from_instance_identifiers AS ( | ||
SELECT NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'ISBN' | ||
), | ||
isbn_duplicates AS ( | ||
SELECT isbn | ||
FROM isbn_from_instance_identifiers | ||
WHERE isbn <> '' | ||
GROUP BY isbn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT * FROM isbn_duplicates; | ||
|
||
|
||
-- order by identifier | ||
WITH isbn_from_instance_identifiers AS ( | ||
SELECT instance_id, NULLIF(LEFT(RIGHT(REGEXP_REPLACE(identifier, ' .*', ''), 10), 9), ':') AS isbn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'ISBN' | ||
), | ||
isbn_duplicates AS ( | ||
SELECT isbn | ||
FROM isbn_from_instance_identifiers | ||
WHERE isbn <> '' | ||
GROUP BY isbn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT s.instance_id, s.isbn | ||
FROM isbn_from_instance_identifiers s | ||
JOIN isbn_duplicates d ON s.isbn = d.isbn | ||
ORDER BY s.isbn; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
-- ISSN | ||
|
||
-- - [ ] 1. The ISSNs do not need to be normalized | ||
-- - [ ] 2. The ISSN is considered to be a match when the field:022 ord:0 sf:a or the source instance is identical the field:022 ord:0 sf:a of the target record. | ||
-- - [ ] 3. The srs_marctab table should be used for this comparison | ||
|
||
-- matches | ||
WITH issn_from_srs_marctab AS ( | ||
SELECT instance_id::text, content AS issn | ||
FROM public.srs_marctab | ||
WHERE field = '022' | ||
AND ord = 1 | ||
AND sf = 'a' | ||
) | ||
SELECT issn, COUNT(*) FROM issn_from_srs_marctab WHERE issn <> '' GROUP BY issn HAVING COUNT(*) > 1 ORDER BY count DESC; | ||
|
||
|
||
-- grouped by matches | ||
WITH issn_from_srs_marctab AS ( | ||
SELECT content AS issn | ||
FROM public.srs_marctab | ||
WHERE field = '022' | ||
AND ord = 1 | ||
AND sf = 'a' | ||
), | ||
issn_duplicates AS ( | ||
SELECT issn | ||
FROM issn_from_srs_marctab | ||
WHERE issn <> '' | ||
GROUP BY issn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT * FROM issn_duplicates; | ||
|
||
|
||
-- order by identifier | ||
WITH issn_from_srs_marctab AS ( | ||
SELECT instance_id, content AS issn | ||
FROM public.srs_marctab | ||
WHERE field = '022' | ||
AND ord = 1 | ||
AND sf = 'a' | ||
), | ||
issn_duplicates AS ( | ||
SELECT issn | ||
FROM issn_from_srs_marctab | ||
WHERE issn <> '' | ||
GROUP BY issn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT s.instance_id::text, s.issn | ||
FROM issn_from_srs_marctab s | ||
JOIN issn_duplicates d ON s.issn = d.issn | ||
ORDER BY s.issn; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
-- LCCN | ||
|
||
-- - [ ] 1. The LCCN's do not need to be normalized | ||
-- - [ ] 2. The LCCN is considered to be a match if the source LCCN is identical to the target LCCN | ||
-- - [ ] 3. The instance_identifiers table should be used for this comparison | ||
|
||
-- matches | ||
WITH lccn_from_instance_identifiers AS ( | ||
SELECT instance_id, identifier AS lccn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'LCCN' | ||
) | ||
SELECT lccn, COUNT(*) FROM lccn_from_instance_identifiers WHERE lccn <> '' GROUP BY lccn HAVING COUNT(*) > 1 ORDER BY count DESC; | ||
|
||
|
||
-- grouped by matches | ||
WITH lccn_from_instance_identifiers AS ( | ||
SELECT identifier AS lccn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'LCCN' | ||
), | ||
lccn_duplicates AS ( | ||
SELECT lccn | ||
FROM lccn_from_instance_identifiers | ||
WHERE lccn <> '' | ||
GROUP BY lccn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT * FROM lccn_duplicates; | ||
|
||
|
||
-- order by identifier | ||
WITH lccn_from_instance_identifiers AS ( | ||
SELECT instance_id, identifier AS lccn | ||
FROM folio_reporting.instance_identifiers | ||
WHERE identifier_type_name = 'LCCN' | ||
), | ||
lccn_duplicates AS ( | ||
SELECT lccn | ||
FROM lccn_from_instance_identifiers | ||
WHERE lccn <> '' | ||
GROUP BY lccn | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT s.instance_id, s.lccn | ||
FROM lccn_from_instance_identifiers s | ||
JOIN lccn_duplicates d ON s.lccn = d.lccn | ||
ORDER BY s.lccn; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
-- OCLC Matching | ||
|
||
-- **OCLC Match** | ||
|
||
-- - [ ] 1. The OCLC numbers will need to be normalized. | ||
-- - [ ] a. Only treat values with the `(OCLoC)` prefix as an OCLC number. Ignore all fields lacking this prefix. | ||
-- - [ ] b. All non numerical characters shoud be removed. | ||
-- - [ ] c. Trim all leading 0s | ||
-- - [ ] 2. A Normalized OCLC number is considered to be a match when: | ||
-- - [ ] a. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches the Normalized OCLC number from the field:035 ord:1 sf:a of the target record | ||
-- - [ ] b. The Normalized OCLC number from the field:035 ord:1 sf:a of the source record matches any of the Normalized OCLC numbers within the field:035 sf:z fields of the target record. | ||
-- - [ ] c. The srs_marctab table should be used for this comparison | ||
|
||
-- matches | ||
WITH oclc_from_srs_marctab AS ( | ||
SELECT instance_id::text, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc | ||
FROM public.srs_marctab | ||
WHERE field = '035' | ||
AND ord = 1 | ||
AND sf IN ('a', 'z') | ||
AND content LIKE '(OCoLC)%' | ||
) | ||
SELECT oclc, COUNT(*) FROM oclc_from_srs_marctab WHERE oclc <> '' GROUP BY oclc HAVING COUNT(*) > 1 ORDER BY count DESC; | ||
|
||
|
||
-- grouped by matches | ||
WITH oclc_from_srs_marctab AS ( | ||
SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc | ||
FROM public.srs_marctab | ||
WHERE field = '035' | ||
AND ord = 1 | ||
AND sf IN ('a', 'z') | ||
AND content LIKE '(OCoLC)%' | ||
), | ||
oclc_duplicates AS ( | ||
SELECT oclc | ||
FROM oclc_from_srs_marctab | ||
WHERE oclc <> '' | ||
GROUP BY oclc | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT * FROM oclc_duplicates; | ||
|
||
|
||
-- order identifier | ||
WITH oclc_from_srs_marctab AS ( | ||
SELECT instance_id, LTRIM(REGEXP_REPLACE(content, '[^0-9]', '', 'g'), '0') AS oclc | ||
FROM public.srs_marctab | ||
WHERE field = '035' | ||
AND ord = 1 | ||
AND sf IN ('a', 'z') | ||
AND content LIKE '(OCoLC)%' | ||
), | ||
oclc_duplicates AS ( | ||
SELECT oclc | ||
FROM oclc_from_srs_marctab | ||
WHERE oclc <> '' | ||
GROUP BY oclc | ||
HAVING COUNT(*) > 1 | ||
) | ||
SELECT s.instance_id::text, s.oclc | ||
FROM oclc_from_srs_marctab s | ||
JOIN oclc_duplicates d ON s.oclc = d.oclc | ||
ORDER BY s.oclc; |