From e301bfcb356449aa9fcdf75c175460de5978dc49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mich=C3=A8le=20BARRE?= <michele.barre@opt.nc>
Date: Wed, 10 Apr 2024 15:30:21 +1100
Subject: [PATCH] feat(data): check data ref: #3

---
 .github/dependabot.yml           |  9 +++++
 .github/workflows/check_data.yml | 24 ++++++++++++++
 .gitignore                       |  1 +
 duck.sql                         | 56 ++++++++++++++++++++++++++++++++
 4 files changed, 90 insertions(+)
 create mode 100644 .github/dependabot.yml
 create mode 100644 .github/workflows/check_data.yml
 create mode 100644 .gitignore
 create mode 100644 duck.sql

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..9259011
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,9 @@
+version: 2
+
+updates:
+  # Keep the GitHub Actions referenced by the workflows up to date
+  - package-ecosystem: github-actions
+    directory: "/"
+    schedule:
+      interval: daily
+    target-branch: develop
diff --git a/.github/workflows/check_data.yml b/.github/workflows/check_data.yml
new file mode 100644
index 0000000..68db635
--- /dev/null
+++ b/.github/workflows/check_data.yml
@@ -0,0 +1,24 @@
+# This workflow checks out the code, installs DuckDB, then runs the data quality tests.
+# For more information see: https://dev.to/optnc/effortless-data-quality-wduckdb-on-github-2mkb
+name: 🧪 Check data
+
+# Runs on every push and on every pull request.
+on: [pull_request, push]
+
+jobs:
+  test:
+    name: 🧪 Verify data
+    runs-on: ubuntu-latest
+    steps:
+      - name: 📥 Checkout code
+        uses: actions/checkout@v4
+
+      - name: ⚙️ Install DuckDB
+        uses: opt-nc/setup-duckdb-action@v1.0.7
+        with:
+          # DuckDB release to install, read from the VERSION_DUCKDB configuration variable
+          version: ${{ vars.VERSION_DUCKDB }}
+
+      - name: 🕵️‍♂️ Check data integrity
+        # -bail stops at the first failing statement so the log ends on the real cause
+        run: duckdb -bail < duck.sql
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..485dee6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea
diff --git a/duck.sql b/duck.sql
new file mode 100644
index 0000000..391eaec
--- /dev/null
+++ b/duck.sql
@@ -0,0 +1,56 @@
+-- Load acronyms data
+create or replace table acronyms (
+    -- acronym identifiers are mandatory and must be written in upper case
+    id_acronym  varchar not null check (id_acronym = upper(id_acronym)),
+    description varchar unique
+);
+
+-- Copy every row of the CSV file into the acronyms table, then display the result
+insert into acronyms (id_acronym, description)
+select
+    src.id_acronym,
+    src.description
+from read_csv(
+    'data/acronyms_optnc.csv',
+    header = true,
+    columns = {'id_acronym': 'VARCHAR', 'description': 'VARCHAR'}
+) as src;
+
+select id_acronym, description from acronyms;
+
+-- Prepare test environment: counters used to number the rows (re-runnable, like the tables)
+create or replace sequence seq_original start 1;
+create or replace sequence seq_sorted start 1;
+
+-- Number the acronyms as the acronyms table returns them, then again once sorted
+create or replace temp table orig_table as
+select nextval('seq_original') as index, a.id_acronym
+from acronyms as a;
+
+create or replace temp table sorted_table as
+select nextval('seq_sorted') as index, s.id_acronym
+from (select a.id_acronym from acronyms as a order by a.id_acronym) as s;
+
+-- Display both numbered tables
+select * from orig_table;
+select * from sorted_table;
+
+-- Comparison table: a row is accepted only when its original and sorted positions match
+create or replace temp table test_sorted (
+    orig_id_acronym varchar,
+    orig_index      integer,
+    -- the check below rejects any row whose two positions differ
+    sorted_index    integer check (orig_index = sorted_index)
+);
+-- Populate the comparison table by pairing each acronym's original and sorted index.
+-- An acronym that is out of order makes the check on test_sorted fail, which aborts
+-- the insert with an error.
+insert into test_sorted (orig_id_acronym, orig_index, sorted_index)
+select
+    orig_table.id_acronym as orig_id_acronym,
+    orig_table.index as orig_index,
+    sorted_table.index as sorted_index
+from orig_table
+inner join sorted_table
+    on orig_table.id_acronym = sorted_table.id_acronym
+order by orig_table.index;