Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Continuous integration: build the project with Maven on every push and pull request.
name: CI
on:
  - push
  - pull_request
jobs:
  build:
    name: Build
    strategy:
      # Keep the other matrix entries running when one of them fails.
      fail-fast: false
      matrix:
        jdk:
          - 21
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up JDK ${{ matrix.jdk }}
        uses: actions/setup-java@v4
        with:
          java-version: ${{ matrix.jdk }}
          distribution: liberica
          cache: maven
      - name: Build with Maven
        # -V prints the Maven and Java versions; -B runs Maven in batch (non-interactive) mode.
        run: mvn -V -B package
Original file line number Diff line number Diff line change
@@ -1,68 +1,68 @@
version 1.0
@totalColumns 42
/*-------------------------------------------------------------------------------
|Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs |
|Authors: Nicki Welch |
| David Underdown |
|Purpose: To capture metadata about the digitisation of the ADM 363 series |
| Primarily technical metadata, but with a minimal amount of |
| transcription to verify that the records may be publicly released |
| after receipt by The National Archives |
|Revision: 1.0 first release |
| 1.1 update as some official numbers only single digit |
| 1.2 allow M as official number prefix too |
| 1.3 further additions to prefixes, L, S, SS, SSX |
| 1.4 allow for asterisk and ? in official number |
| 1.5 further prefixes MX, KX, JX, and longer volume number |
| 1.6 add explicit check that checksum is not that for a 0 byte file |
| 1.7 Fix errors eg use correct not(), rather than isNot() |
| 1.8 Allow brackets etc in comments, range checking for birth year |
| ???? for birth year |
| 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) |
| Remove and in($resource_uri) from item: |
| resource_uri, change starts(...) to |
| regex("...") |
| 2.0 Allow LX as a prefix too |
|-------------------------------------------------------------------------------*/
// NOTE(review): the header above names ADM 363, but batch_code, series, file_path and resource_uri below all require ADM 362 - confirm which series this schema is for.
// Identification and file columns.
// batch_code: the literal prefix ADM362B followed by exactly three digits (10 characters in total).
batch_code: length(10) regex("^ADM362B([0-9]{3})$")
// department, series, piece: when file_path is not empty the value must also occur within file_path and resource_uri.
department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
// item: either empty, or a positive integer whose (piece,item,ordinal) combination is unique; checked against file_path only.
item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path))
// ordinal: must be empty when item is empty; otherwise the (piece,item,ordinal) combination must be unique.
ordinal: if($item/empty,empty,unique($piece,$item,$ordinal))
// file_uuid, file_path, file_checksum: must be empty on rows where ordinal is empty.
file_uuid: if($ordinal/empty,empty,uuid4 unique)
file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$"))
// The not(...) literal is the checksum of a 0 byte file (see revision 1.6 in the header); the value must also match the SHA-256 of the file at file_path.
file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256"))
// resource_uri: only constrained when ordinal is not empty (there is no else branch).
// NOTE(review): the dots in the host name are not escaped, so each one matches any character - confirm whether that is intended.
resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/362/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$"))
// Scan, image and processing columns.
// Every rule in this group has the same shape: the column must be empty when ordinal is empty, otherwise the third argument applies.
// scan_operator and scan_id: 1 to 12 characters; scan_id additionally allows underscores.
scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
scan_timestamp: if($ordinal/empty,empty,xDateTime)
// The image_* columns below are pinned to single literal values, apart from width and height which are positive integers.
image_resolution: if($ordinal/empty,empty,is("300"))
image_width: if($ordinal/empty,empty,positiveInteger)
image_height: if($ordinal/empty,empty,positiveInteger)
image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
image_format: if($ordinal/empty,empty,is("x-fmt/392"))
image_colour_space: if($ordinal/empty,empty,is("sRGB"))
process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
// Processing timestamps: each must be an xDateTime on rows that have an ordinal.
jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
uuid_timestamp: if($ordinal/empty,empty,xDateTime)
embed_timestamp: if($ordinal/empty,empty,xDateTime)
// Split, crop and deskew columns. All must be empty on rows where ordinal is empty.
// image_split is "yes" or "no"; its uuid, operator and timestamp are required when "yes" and must be "" otherwise.
image_split: if($ordinal/empty,empty,is("yes") or is("no"))
image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
// image_crop is "auto", "manual" or "none".
image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
// The operator is required only for a manual crop. The condition tests image_crop: image_split can only be "yes" or "no",
// so a test of image_split against "manual" could never succeed and would force this column to be "" on every row.
image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
// A crop timestamp is required for both "auto" and "manual" crops.
image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
// image_deskew is "yes" or "no"; its operator and timestamp are required when "yes" and must be "" otherwise.
image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
// QA and free-text columns: both are @optional, so an empty value is accepted.
QA-code: regex("^[0-9/,]{1,2}$") @optional
// NOTE(review): the character class contains an unescaped double quote inside the string literal - confirm the schema parser accepts this.
comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional
// Transcription columns.
// transcribed_volume_number is checked on rows where item is empty and must be "" on all other rows.
transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is(""))
// The birth date columns are checked on rows where ordinal is empty and item is not empty; on all other rows they must be "".
// NOTE(review): in the day regex ^ binds only to the first alternative and $ only to the last - harmless if the validator requires a full match, confirm.
transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is(""))
transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
// Year: a positive integer must lie in 1850-1914; anything else must match the regex (digits with ? placeholders, a single asterisk, or ????).
transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is(""))
version 1.0
@totalColumns 42
/*-------------------------------------------------------------------------------
|Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs |
|Authors: Nicki Welch |
| David Underdown |
|Purpose: To capture metadata about the digitisation of the ADM 363 series |
| Primarily technical metadata, but with a minimal amount of |
| transcription to verify that the records may be publicly released |
| after receipt by The National Archives |
|Revision: 1.0 first release |
| 1.1 update as some official numbers only single digit |
| 1.2 allow M as official number prefix too |
| 1.3 further additions to prefixes, L, S, SS, SSX |
| 1.4 allow for asterisk and ? in official number |
| 1.5 further prefixes MX, KX, JX, and longer volume number |
| 1.6 add explicit check that checksum is not that for a 0 byte file |
| 1.7 Fix errors eg use correct not(), rather than isNot() |
| 1.8 Allow brackets etc in comments, range checking for birth year |
| ???? for birth year |
| 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) |
| Remove and in($resource_uri) from item: |
| resource_uri, change starts(...) to |
| regex("...") |
| 2.0 Allow LX as a prefix too |
|-------------------------------------------------------------------------------*/
// NOTE(review): the header above names ADM 363, but batch_code, series, file_path and resource_uri below all require ADM 362 - confirm which series this schema is for.
// Identification and file columns.
// batch_code: the literal prefix ADM362B followed by exactly three digits (10 characters in total).
batch_code: length(10) regex("^ADM362B([0-9]{3})$")
// department, series, piece: when file_path is not empty the value must also occur within file_path and resource_uri.
department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri)))
series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri))
piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri))
// item: either empty, or a positive integer whose (piece,item,ordinal) combination is unique; checked against file_path only.
item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path))
// ordinal: must be empty when item is empty; otherwise the (piece,item,ordinal) combination must be unique.
ordinal: if($item/empty,empty,unique($piece,$item,$ordinal))
// file_uuid, file_path, file_checksum: must be empty on rows where ordinal is empty.
file_uuid: if($ordinal/empty,empty,uuid4 unique)
file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$"))
// The not(...) literal is the checksum of a 0 byte file (see revision 1.6 in the header); the value must also match the SHA-256 of the file at file_path.
file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256"))
// resource_uri: only constrained when ordinal is not empty (there is no else branch).
// NOTE(review): the dots in the host name are not escaped, so each one matches any character - confirm whether that is intended.
resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/362/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$"))
// Scan, image and processing columns.
// Every rule in this group has the same shape: the column must be empty when ordinal is empty, otherwise the third argument applies.
// scan_operator and scan_id: 1 to 12 characters; scan_id additionally allows underscores.
scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$"))
scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$"))
scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+"))
scan_timestamp: if($ordinal/empty,empty,xDateTime)
// The image_* columns below are pinned to single literal values, apart from width and height which are positive integers.
image_resolution: if($ordinal/empty,empty,is("300"))
image_width: if($ordinal/empty,empty,positiveInteger)
image_height: if($ordinal/empty,empty,positiveInteger)
image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour"))
image_format: if($ordinal/empty,empty,is("x-fmt/392"))
image_colour_space: if($ordinal/empty,empty,is("sRGB"))
process_location: if($ordinal/empty,empty,regex("[-\w\s,]+"))
// Processing timestamps: each must be an xDateTime on rows that have an ordinal.
jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime)
uuid_timestamp: if($ordinal/empty,empty,xDateTime)
embed_timestamp: if($ordinal/empty,empty,xDateTime)
// Split, crop and deskew columns. All must be empty on rows where ordinal is empty.
// image_split is "yes" or "no"; its uuid, operator and timestamp are required when "yes" and must be "" otherwise.
image_split: if($ordinal/empty,empty,is("yes") or is("no"))
image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is("")))
image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is("")))
// image_crop is "auto", "manual" or "none".
image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none"))
// The operator is required only for a manual crop. The condition tests image_crop: image_split can only be "yes" or "no",
// so a test of image_split against "manual" could never succeed and would force this column to be "" on every row.
image_crop_operator: if($ordinal/empty,empty,if($image_crop/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is("")))
// A crop timestamp is required for both "auto" and "manual" crops.
image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime))
// image_deskew is "yes" or "no"; its operator and timestamp are required when "yes" and must be "" otherwise.
image_deskew: if($ordinal/empty,empty,is("yes") or is("no"))
image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is("")))
image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is("")))
// QA and free-text columns: both are @optional, so an empty value is accepted.
QA-code: regex("^[0-9/,]{1,2}$") @optional
// NOTE(review): the character class contains an unescaped double quote inside the string literal - confirm the schema parser accepts this.
comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional
// Transcription columns.
// transcribed_volume_number is checked on rows where item is empty and must be "" on all other rows.
transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is(""))
// The birth date and official number columns are checked on rows where ordinal is empty and item is not empty; on all other rows they must be "".
// NOTE(review): in the day regex ^ binds only to the first alternative and $ only to the last - harmless if the validator requires a full match, confirm.
transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is(""))
transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is(""))
// Year: a positive integer must lie in 1850-1914; anything else must match the regex (digits with ? placeholders, a single asterisk, or ????).
transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is(""))
// Official number: an optional C/, D/ or P/ prefix, then a letter prefix and 1 to 6 characters from digits, / and ?; or six such characters with no prefix; or a single asterisk.
transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX)[/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is(""))
10 changes: 5 additions & 5 deletions src/main/resources/schemas/concat.csvs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
version 1.1
@totalColumns 3
// c1 and c2 carry no rules, so any value is accepted.
c1:
c2:
// c3 must equal the value of c1 immediately followed by the value of c2 from the same row.
c3: is(concat($c1,$c2))
version 1.1
@totalColumns 3
// c1 and c2 carry no rules, so any value is accepted.
c1:
c2:
// c3 must equal the value of c1 immediately followed by the value of c2 from the same row.
c3: is(concat($c1,$c2))
24 changes: 12 additions & 12 deletions src/main/resources/schemas/thunder-stone-sample-csvs.csvs
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
# Sample import schema for a pipe-delimited customer file.
# NOTE(review): presumably a Thunderstone import schema (per the file name); the meaning of the
# database/table keywords is assumed from their names - confirm against the tool's documentation.
database /tmp/testdb
table customer
# indicate csv format with a delimiter of |
csv |
# One field line per column, laid out as described by the column header comment below.
# Name Type Tag
field CustID varchar(10) 1
field Company varchar(80) 2
field Address varchar(80) 3
field City varchar(20) 4
field State varchar(10) 5
field Zip varchar(10) 6
field Country varchar(10) 7
# Sample import schema for a pipe-delimited customer file.
# NOTE(review): presumably a Thunderstone import schema (per the file name); the meaning of the
# database/table keywords is assumed from their names - confirm against the tool's documentation.
database /tmp/testdb
table customer
# indicate csv format with a delimiter of |
csv |
# One field line per column, laid out as described by the column header comment below.
# Name Type Tag
field CustID varchar(10) 1
field Company varchar(80) 2
field Address varchar(80) 3
field City varchar(20) 4
field State varchar(10) 5
field Zip varchar(10) 6
field Country varchar(10) 7
field Phone varchar(20) 8
10 changes: 5 additions & 5 deletions src/test/resources/mock-data/concat.csvs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
version 1.1
@totalColumns 3
// c1 and c2 carry no rules, so any value is accepted.
c1:
c2:
// c3 must equal the value of c1 immediately followed by the value of c2 from the same row.
c3: is(concat($c1,$c2))
version 1.1
@totalColumns 3
// c1 and c2 carry no rules, so any value is accepted.
c1:
c2:
// c3 must equal the value of c1 immediately followed by the value of c2 from the same row.
c3: is(concat($c1,$c2))
Loading