From e301bfcb356449aa9fcdf75c175460de5978dc49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mich=C3=A8le=20BARRE?=
Date: Wed, 10 Apr 2024 15:30:21 +1100
Subject: [PATCH] feat(data): check data

ref: #3
---
Note: the blob ids on the `index` lines below predate the edits made to
check_data.yml and duck.sql; regenerate the patch if exact ids are needed.

 .github/dependabot.yml           |  9 +++++
 .github/workflows/check_data.yml | 24 ++++++++++++
 .gitignore                       |  1 +
 duck.sql                         | 65 ++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+)
 create mode 100644 .github/dependabot.yml
 create mode 100644 .github/workflows/check_data.yml
 create mode 100644 .gitignore
 create mode 100644 duck.sql

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..9259011
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,9 @@
+version: 2
+
+updates:
+  # Maintain dependencies for GitHub Actions
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
+    target-branch: develop
diff --git a/.github/workflows/check_data.yml b/.github/workflows/check_data.yml
new file mode 100644
index 0000000..68db635
--- /dev/null
+++ b/.github/workflows/check_data.yml
@@ -0,0 +1,24 @@
+# This workflow will check out the code, install DuckDB then run data quality tests
+# For more information see: https://dev.to/optnc/effortless-data-quality-wduckdb-on-github-2mkb
+name: 🧪 Check data
+
+
+on: [pull_request, push]
+
+jobs:
+  test:
+    name: 🧪 Verify data
+    runs-on: ubuntu-latest
+    steps:
+      - name: 📥 Checkout code
+        uses: actions/checkout@v4
+
+      - name: ⚙️ Install DuckDB
+        uses: opt-nc/setup-duckdb-action@v1.0.7
+        with:
+          version: ${{ vars.VERSION_DUCKDB }}
+
+
+      - name: 🕵️‍♂️ Check data integrity
+        run: |
+          duckdb < duck.sql
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..485dee6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea
diff --git a/duck.sql b/duck.sql
new file mode 100644
index 0000000..391eaec
--- /dev/null
+++ b/duck.sql
@@ -0,0 +1,65 @@
+-- Data-quality checks for data/acronyms_optnc.csv (CI runs: duckdb < duck.sql).
+-- Each rule is declared as a constraint, so bad data makes an INSERT fail.
+
+-- Load acronyms data
+-- Rules: id_acronym is mandatory and upper case; description is unique.
+create
+or replace table acronyms (
+    id_acronym VARCHAR NOT NULL,
+    CHECK (id_acronym = UPPER(id_acronym)),
+    description VARCHAR UNIQUE
+);
+
+insert into acronyms (id_acronym, description)
+from
+    (
+        FROM read_csv('data/acronyms_optnc.csv',
+                header = true,
+                columns = {
+                    'id_acronym': 'VARCHAR',
+                    'description': 'VARCHAR'
+                })
+    ) t;
+
+from acronyms;
+
+-- Prepare test environment
+-- One counter numbers the rows as they come out of acronyms (presumably file
+-- order; relies on insertion order being preserved), the other numbers them
+-- in alphabetical order.
+CREATE SEQUENCE seq_original START 1;
+CREATE SEQUENCE seq_sorted START 1;
+
+create or replace temp table orig_table as
+    select nextval('seq_original') as index,
+    id_acronym from acronyms;
+
+create or replace temp table sorted_table as
+    select nextval('seq_sorted') as index,
+    id_acronym
+    from (select id_acronym from acronyms order by id_acronym);
+
+-- Check the resulting tables
+from orig_table;
+from sorted_table;
+
+-- Create the table that compares the sorted and original tables columns
+create or replace temp table test_sorted (
+    orig_id_acronym varchar,
+    orig_index integer,
+    sorted_index integer,
+    -- the magic part: a row whose position in acronyms differs from its
+    -- position in the sorted list violates this check
+    check (orig_index = sorted_index)
+);
+
+-- Populate the comparison table
+insert into test_sorted (orig_id_acronym, orig_index, sorted_index)
+select
+    orig_table.id_acronym as orig_id_acronym,
+    orig_table.index as orig_index,
+    sorted_table.index as sorted_index
+from orig_table
+inner join sorted_table
+    on orig_table.id_acronym = sorted_table.id_acronym
+order by orig_table.index;