Skip to content

Commit

Permalink
feat(data): check data
Browse files Browse the repository at this point in the history
ref: #3
  • Loading branch information
mbarre committed Apr 10, 2024
1 parent 323caf5 commit e301bfc
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Dependabot configuration: keeps the GitHub Actions referenced by this
# repository's workflows up to date.
version: 2

updates:
  # Maintain dependencies for GitHub Actions
  - package-ecosystem: "github-actions"
    # Directory Dependabot scans for this ecosystem
    directory: "/"
    schedule:
      # Look for new versions every day
      interval: "daily"
    # Update pull requests are opened against this branch
    target-branch: develop
24 changes: 24 additions & 0 deletions .github/workflows/check_data.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# This workflow checks out the code, installs DuckDB, then runs the data quality tests.
# For more information see: https://dev.to/optnc/effortless-data-quality-wduckdb-on-github-2mkb
name: 🧪 Check data

# Triggered by every push and every pull request
on: [pull_request, push]

jobs:
  test:
    name: 🧪 Verify data
    runs-on: ubuntu-latest
    steps:
      - name: 📥 Checkout code
        uses: actions/checkout@v4

      - name: ⚙️ Install DuckDB
        # NOTE(review): the action reference below looks mangled by e-mail
        # obfuscation in this capture; restore the real `owner/repo@version`
        # pin from the repository before using this file.
        uses: opt-nc/[email protected]
        with:
          # DuckDB version to install, read from the VERSION_DUCKDB configuration variable
          version: ${{ vars.VERSION_DUCKDB }}

      - name: 🕵️‍♂️ Check data integrity
        # Feeds duck.sql to the DuckDB CLI on stdin.
        # NOTE(review): presumably relies on the CLI exiting non-zero when a
        # statement (e.g. a violated CHECK constraint) fails — confirm.
        run: |
          duckdb < duck.sql
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# IDE project settings directory (presumably JetBrains) — kept out of version control
.idea
56 changes: 56 additions & 0 deletions duck.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
-- Load acronyms data
-- The table itself carries the first data-quality rules:
--   * id_acronym is mandatory and must be stored in upper case
--   * a given description may appear only once
CREATE OR REPLACE TABLE acronyms (
    id_acronym VARCHAR NOT NULL,
    CHECK (id_acronym = UPPER(id_acronym)),
    description VARCHAR UNIQUE
);

-- Read the CSV with an explicit two-column text schema; any row violating a
-- constraint above makes the INSERT fail.
INSERT INTO acronyms (id_acronym, description)
SELECT
    id_acronym,
    description
FROM read_csv(
    'data/acronyms_optnc.csv',
    header = true,
    columns = {
        'id_acronym': 'VARCHAR',
        'description': 'VARCHAR'
    }
);

-- Display the loaded rows
SELECT
    id_acronym,
    description
FROM acronyms;

-- Prepare test environment
-- Two counters, each starting at 1, number the rows of the two helper tables.
-- OR REPLACE makes this idempotent like the tables below: re-running the
-- script in the same database no longer fails on an existing sequence, and
-- the numbering restarts at 1.
CREATE OR REPLACE SEQUENCE seq_original START 1;
CREATE OR REPLACE SEQUENCE seq_sorted START 1;

-- Acronyms numbered in the order the table scan returns them.
-- NOTE(review): assumes the scan yields rows in CSV/insertion order — confirm.
CREATE OR REPLACE TEMP TABLE orig_table AS
SELECT
    nextval('seq_original') AS index,
    id_acronym
FROM acronyms;

-- Acronyms numbered after sorting them by id_acronym.
CREATE OR REPLACE TEMP TABLE sorted_table AS
SELECT
    nextval('seq_sorted') AS index,
    sorted_acronyms.id_acronym
FROM (
    SELECT id_acronym
    FROM acronyms
    ORDER BY id_acronym
) AS sorted_acronyms;

-- Check the resulting tables
SELECT
    orig_table.index,
    orig_table.id_acronym
FROM orig_table;

SELECT
    sorted_table.index,
    sorted_table.id_acronym
FROM sorted_table;

-- Create the table that compares each acronym's position in the original
-- order with its position in the sorted order.
CREATE OR REPLACE TEMP TABLE test_sorted (
    orig_id_acronym VARCHAR,
    orig_index INTEGER,
    sorted_index INTEGER,
    -- The actual test: a row whose two positions differ is rejected, so the
    -- INSERT below fails as soon as the source data is not sorted.
    CHECK (orig_index = sorted_index)
);

-- Populate the comparison table.
-- Rows are matched on id_acronym AND on their occurrence number within that
-- id_acronym: joining on id_acronym alone would pair every duplicate with
-- every other duplicate and trip the CHECK even when the data is sorted.
INSERT INTO test_sorted (orig_id_acronym, orig_index, sorted_index)
SELECT
    o.id_acronym AS orig_id_acronym,
    o.index AS orig_index,
    s.index AS sorted_index
FROM (
    SELECT
        orig_table.id_acronym,
        orig_table.index,
        row_number() OVER (
            PARTITION BY orig_table.id_acronym
            ORDER BY orig_table.index
        ) AS occurrence
    FROM orig_table
) AS o
INNER JOIN (
    SELECT
        sorted_table.id_acronym,
        sorted_table.index,
        row_number() OVER (
            PARTITION BY sorted_table.id_acronym
            ORDER BY sorted_table.index
        ) AS occurrence
    FROM sorted_table
) AS s
    ON o.id_acronym = s.id_acronym
    AND o.occurrence = s.occurrence
ORDER BY o.index;

0 comments on commit e301bfc

Please sign in to comment.