diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..1f82cf9 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,20 @@ +name: CI +on: [push, pull_request] +jobs: + build: + name: Build + strategy: + fail-fast: false + matrix: + jdk: [ 21 ] + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Set up JDK ${{ matrix.jdk }} + uses: actions/setup-java@v4 + with: + java-version: ${{ matrix.jdk }} + distribution: liberica + cache: maven + - name: Build with Maven + run: mvn -V -B package \ No newline at end of file diff --git a/src/main/resources/schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs b/src/main/resources/schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs index 7698428..c82a12f 100644 --- a/src/main/resources/schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs +++ b/src/main/resources/schemas/ADM_362-technical-acquisition-with-minimal-transcription.csvs @@ -1,68 +1,68 @@ -version 1.0 -@totalColumns 42 -/*------------------------------------------------------------------------------- -|Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs | -|Authors: Nicki Welch | -| David Underdown | -|Purpose: To capture metadata about the digitisation of the ADM 363 series | -| Primarily technical metadata, but with a minimal amount of | -| transcription to verify that the records may be publicly released | -| after receipt by The National Archives | -|Revision: 1.0 first release | -| 1.1 update as some official numbers only single digit | -| 1.2 allow M as official number prefix too | -| 1.3 further additions to prefixes, L, S, SS, SSX | -| 1.4 allow for asterisk and ? in official number | -| 1.5 further prefixes MX, KX, JX, and longer volume number | -| 1.6 add explicit check that checksum is not that for a 0 byte file | -| 1.7 Fix errors eg use correct not(), rather than isNot() | -| 1.8 Allow brackets etc in comments, range checking for birth year | -| ???? for birth year | -| 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) | -| Remove and in($resource_uri) from item: | -| resource_uri, change starts(...) to | -| regex("...") | -| 2.0 Allow LX as a prefix too | -|-------------------------------------------------------------------------------*/ -batch_code: length(10) regex("^ADM362B([0-9]{3})$") -department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) -series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) -piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) -item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) -ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) -file_uuid: if($ordinal/empty,empty,uuid4 unique) -file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) -file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) -resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/362/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) -scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) -scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) -scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) -scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) -scan_timestamp: if($ordinal/empty,empty,xDateTime) -image_resolution: if($ordinal/empty,empty,is("300")) -image_width: if($ordinal/empty,empty,positiveInteger) -image_height: if($ordinal/empty,empty,positiveInteger) -image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) -image_format: if($ordinal/empty,empty,is("x-fmt/392")) -image_colour_space: if($ordinal/empty,empty,is("sRGB")) -process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) -jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) -uuid_timestamp: if($ordinal/empty,empty,xDateTime) -embed_timestamp: if($ordinal/empty,empty,xDateTime) -image_split: if($ordinal/empty,empty,is("yes") or is("no")) -image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) -image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) -image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) -image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) -image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) -image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) -image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) -image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) -image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) -QA-code: regex("^[0-9/,]{1,2}$") @optional -comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional -transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) -transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) -transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) -transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) +version 1.0 +@totalColumns 42 +/*------------------------------------------------------------------------------- +|Schema: ADM_363-technical-acquisition-with-minimal-transcription.csvs | +|Authors: Nicki Welch | +| David Underdown | +|Purpose: To capture metadata about the digitisation of the ADM 363 series | +| Primarily technical metadata, but with a minimal amount of | +| transcription to verify that the records may be publicly released | +| after receipt by The National Archives | +|Revision: 1.0 first release | +| 1.1 update as some official numbers only single digit | +| 1.2 allow M as official number prefix too | +| 1.3 further additions to prefixes, L, S, SS, SSX | +| 1.4 allow for asterisk and ? in official number | +| 1.5 further prefixes MX, KX, JX, and longer volume number | +| 1.6 add explicit check that checksum is not that for a 0 byte file | +| 1.7 Fix errors eg use correct not(), rather than isNot() | +| 1.8 Allow brackets etc in comments, range checking for birth year | +| ???? for birth year | +| 1.9 Add piece check in ordinal: unique($piece,$item,$ordinal) | +| Remove and in($resource_uri) from item: | +| resource_uri, change starts(...) to | +| regex("...") | +| 2.0 Allow LX as a prefix too | +|-------------------------------------------------------------------------------*/ +batch_code: length(10) regex("^ADM362B([0-9]{3})$") +department: (is("ADM") if($file_path/notEmpty,in($file_path) and in($resource_uri))) +series: is("362") and if($file_path/notEmpty,in($file_path) and in($resource_uri)) +piece: range(1,69720) if($file_path/notEmpty,in($file_path) and in($resource_uri)) +item: ((positiveInteger unique($piece,$item,$ordinal)) or empty) if($file_path/notEmpty,in($file_path)) +ordinal: if($item/empty,empty,unique($piece,$item,$ordinal)) +file_uuid: if($ordinal/empty,empty,uuid4 unique) +file_path: uri if($ordinal/empty,empty,unique fileExists regex("^file:\/\/\/ADM_362\/[0-9]{1,5}\/[1-9][0-9]{0,4}\/[1-9][0-9]{0,4}_[0-9]{1,4}\.jp2$")) +file_checksum: if($ordinal/empty,empty,not("e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855") and checksum(file($file_path),"SHA-256")) +resource_uri: if($ordinal/notEmpty,uri and regex("^http://datagov.nationalarchives.gov.uk/66/ADM/362/[1-9][0-9]*/[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$")) +scan_operator: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z]{1,12}$")) +scan_id: if($ordinal/empty,empty,length(1,12) regex("^[0-9a-zA-Z_]{1,12}$")) +scan_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) +scan_native_format: if($ordinal/empty,empty,regex("[0-9\w\s,.:]+")) +scan_timestamp: if($ordinal/empty,empty,xDateTime) +image_resolution: if($ordinal/empty,empty,is("300")) +image_width: if($ordinal/empty,empty,positiveInteger) +image_height: if($ordinal/empty,empty,positiveInteger) +image_tonal_resolution: if($ordinal/empty,empty,is("24-bit colour")) +image_format: if($ordinal/empty,empty,is("x-fmt/392")) +image_colour_space: if($ordinal/empty,empty,is("sRGB")) +process_location: if($ordinal/empty,empty,regex("[-\w\s,]+")) +jp2_creation_timestamp: if($ordinal/empty,empty,xDateTime) +uuid_timestamp: if($ordinal/empty,empty,xDateTime) +embed_timestamp: if($ordinal/empty,empty,xDateTime) +image_split: if($ordinal/empty,empty,is("yes") or is("no")) +image_split_other_uuid: if($ordinal/empty,empty,if($image_split/is("yes"),uuid4,is(""))) +image_split_operator: if($ordinal/empty,empty,if($image_split/is("yes"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) +image_split_timestamp: if($ordinal/empty,empty,if($image_split/is("yes"),xDateTime,is(""))) +image_crop: if($ordinal/empty,empty,is("auto") or is("manual") or is("none")) +image_crop_operator: if($ordinal/empty,empty,if($image_split/is("manual"),length(1,12) and regex("^[0-9a-zA-Z]{1,12}$"),is(""))) +image_crop_timestamp: if($ordinal/empty,empty,if($image_crop/is("none"),empty,xDateTime)) +image_deskew: if($ordinal/empty,empty,is("yes") or is("no")) +image_deskew_operator: if($ordinal/empty,empty,if($image_deskew/is("yes"),regex("^[0-9a-zA-Z]{1,12}$"),is(""))) +image_deskew_timestamp: if($ordinal/empty,empty,if($image_deskew/is("yes"),xDateTime,is(""))) +QA-code: regex("^[0-9/,]{1,2}$") @optional +comments: regex("[-\w\s,\.\(\)\/'":\?]+") @optional +transcribed_volume_number: if($item/empty,regex("[0-9A-Z\-\s]{1,19}"),is("")) +transcribed_birth_date_day: if(($ordinal/empty and $item/notEmpty),regex("^\*|([0\?][1-9\?])|([1-2\?][0-9\?])|([3\?][0-1\?])$"),is("")) +transcribed_birth_date_month: if(($ordinal/empty and $item/notEmpty),is("*") or is("?") or is("January") or is("February") or is("March") or is("April") or is("May") or is("June") or is("July") or is("August") or is("September") or is("October") or is("November") or is("December"), is("")) +transcribed_birth_date_year: if(($ordinal/empty and $item/notEmpty),if(positiveInteger,range(1850,1914),regex("^1[7-9][0-9\?]{2}|\*|\?{4}$")),is("")) transcribed_official_number: if(($ordinal/empty and $item/notEmpty),regex("^(([CDP]\/)?([FJKLMS]|LX|MX|JX|KX|SS|SSX)[/?0-9]{1,6}|[/?1-9][/?0-9]{5}|\*)$"),is("")) \ No newline at end of file diff --git a/src/main/resources/schemas/concat.csvs b/src/main/resources/schemas/concat.csvs index fcde34a..8ca60ee 100644 --- a/src/main/resources/schemas/concat.csvs +++ b/src/main/resources/schemas/concat.csvs @@ -1,5 +1,5 @@ -version 1.1 -@totalColumns 3 -c1: -c2: -c3: is(concat($c1,$c2)) +version 1.1 +@totalColumns 3 +c1: +c2: +c3: is(concat($c1,$c2)) diff --git a/src/main/resources/schemas/thunder-stone-sample-csvs.csvs b/src/main/resources/schemas/thunder-stone-sample-csvs.csvs index 658a08d..9ecf89f 100644 --- a/src/main/resources/schemas/thunder-stone-sample-csvs.csvs +++ b/src/main/resources/schemas/thunder-stone-sample-csvs.csvs @@ -1,13 +1,13 @@ -database /tmp/testdb -table customer -# indicate csv format with a delimiter of | -csv | -# Name Type Tag -field CustID varchar(10) 1 -field Company varchar(80) 2 -field Address varchar(80) 3 -field City varchar(20) 4 -field State varchar(10) 5 -field Zip varchar(10) 6 -field Country varchar(10) 7 +database /tmp/testdb +table customer +# indicate csv format with a delimiter of | +csv | +# Name Type Tag +field CustID varchar(10) 1 +field Company varchar(80) 2 +field Address varchar(80) 3 +field City varchar(20) 4 +field State varchar(10) 5 +field Zip varchar(10) 6 +field Country varchar(10) 7 field Phone varchar(20) 8 \ No newline at end of file diff --git a/src/test/resources/mock-data/concat.csvs b/src/test/resources/mock-data/concat.csvs index fcde34a..8ca60ee 100644 --- a/src/test/resources/mock-data/concat.csvs +++ b/src/test/resources/mock-data/concat.csvs @@ -1,5 +1,5 @@ -version 1.1 -@totalColumns 3 -c1: -c2: -c3: is(concat($c1,$c2)) +version 1.1 +@totalColumns 3 +c1: +c2: +c3: is(concat($c1,$c2))