Skip to content

Commit

Permalink
Add tracker-radar clickhouse schema
Browse files Browse the repository at this point in the history
  • Loading branch information
sammacbeth committed Apr 4, 2023
1 parent 9073db5 commit 43392d9
Show file tree
Hide file tree
Showing 6 changed files with 272 additions and 0 deletions.
89 changes: 89 additions & 0 deletions clickhouse/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Tracker Radar Clickhouse Database

This folder contains scripts and schemas for inserting Tracker Radar data into Clickhouse. This
allows for easy longitudial analysis of the data we publish on Github.

## Setup

If starting from scratch (no existing database), we first need to create the database and table/view
schemas:
```bash
cat clickhouse/schema.sql | clickhouse-client -h clickhouse --multiquery
```

## Data import

Given a checkout of [tracker-radar](https://github.com/duckduckgo/tracker-radar), we can import data
for a given tag as follows:
```bash
bash ./clickhouse/insert-tag.sh /path/to/tracker-radar 2023.03
```
In this case, we're importing the `2023.03` tag. After the import, you should be able to read the imported data in Clickhouse:
```
use-clickhouse1.duckduckgo.com :) select count(*) from domain_json where tag = '2023.03';
SELECT count(*)
FROM domain_json
WHERE tag = '2023.03'
Query id: b330bfd4-dfcf-449e-8a17-eb3151552163
┌─count()─┐
│ 56921 │
└─────────┘
1 rows in set. Elapsed: 0.135 sec. Processed 56.92 thousand rows, 910.74 KB (420.42 thousand rows/s., 6.73 MB/s.)
use-clickhouse1.duckduckgo.com :) select count(*) from entity_json where tag = '2023.03';
SELECT count(*)
FROM entity_json
WHERE tag = '2023.03'
Query id: f6d77443-5667-467e-af6f-b63e80d3abbb
┌─count()─┐
│ 18966 │
└─────────┘
1 rows in set. Elapsed: 0.134 sec. Processed 18.97 thousand rows, 303.46 KB (141.11 thousand rows/s., 2.26 MB/s.)
```

## Querying the data

### Example queries

Get the longitundal prevalence of a tracker domain in a specific region:
```sql
SELECT
tag, region, domain, owner.1 as owner, prevalence
FROM domain_summary
WHERE region ='GB' AND domain = 'google-analytics.com'
ORDER BY tag DESC;
```

Get the longitundal prevalence of an entity:
```sql
SELECT
tag, name, prevalence.3 as total_prevalence
FROM entity_summary
WHERE name = 'Microsoft Corporation'
ORDER BY tag DESC;
```

Find some high prevalence resources loaded by a site:
```sql
SELECT
tag, region, domain, rule, type, prevalence
FROM domain_resources
WHERE region = 'US' AND has(exampleSites, 'theguardian.com')
ORDER BY prevalence DESC;
```

Check who sets the `_ga` cookie apart from `google-analytics.com`:
```sql
SELECT *
FROM firstPartyCookies
WHERE domain != 'google-analytics.com' AND cookieName = '_ga'
ORDER BY prevalence DESC limit 10;
```
24 changes: 24 additions & 0 deletions clickhouse/format-domain-data.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import fs from 'fs'
import { join } from 'path'

const tag = process.argv[2];
const trDir = process.argv[3];

let regions = fs.readdirSync(join(trDir, 'domains'));
if (!regions.includes('US')) {
regions = ['']
}
for (const region of regions) {
const baseDir = join(trDir, 'domains', region)
const domainFiles = fs.readdirSync(baseDir);

for (const file of domainFiles) {
const data = JSON.parse(fs.readFileSync(join(baseDir, file), 'utf-8'))
// format rules
data.resources = data.resources.map(res => {
res.rule = res.rule.replace(/\\/g, "");
return res;
})
console.log([tag, region, file, JSON.stringify(data)].join('\t'))
}
}
13 changes: 13 additions & 0 deletions clickhouse/format-entity-data.mjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
import fs from 'fs'
import { join } from 'path'

const tag = process.argv[2];
const trDir = process.argv[3];

const baseDir = join(trDir, 'entities')
const entityFiles = fs.readdirSync(baseDir);

for (const file of entityFiles) {
const data = JSON.parse(fs.readFileSync(join(baseDir, file), 'utf-8'))
console.log([tag, file, JSON.stringify(data)].join('\t'))
}
12 changes: 12 additions & 0 deletions clickhouse/import-all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
set -ex

TRI=`pwd`
TR_PATH=$1
cd $TR_PATH

for TAG in $(git tag -l | sort -r)
do
cd $TRI
bash clickhouse/insert-data.sh $TR_PATH $TAG
done
28 changes: 28 additions & 0 deletions clickhouse/insert-data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
set -ex

REPO_PATH=`pwd`
TR_PATH=$1
TAG=$2

cd $TR_PATH
git checkout $TAG
cd $REPO_PATH

node clickhouse/format-domain-data.mjs $TAG $TR_PATH | clickhouse-client \
--host clickhouse \
--query "
INSERT INTO tracker_radar.domain_json
SELECT tag, region, filename, data
FROM input('tag String, region String, filename String, data String')
FORMAT TabSeparated
"

node clickhouse/format-entity-data.mjs $TAG $TR_PATH | clickhouse-client \
--host clickhouse \
--query "
INSERT INTO tracker_radar.entity_json
SELECT tag, filename, data
FROM input('tag String, filename String, data String')
FORMAT TabSeparated
"
106 changes: 106 additions & 0 deletions clickhouse/schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
CREATE DATABASE IF NOT EXISTS tracker_radar;
USE tracker_radar;

CREATE TABLE IF NOT EXISTS domain_json
(
tag String,
region String,
filename String,
data String
)
ENGINE = MergeTree()
PRIMARY KEY (tag, region, filename);

CREATE TABLE IF NOT EXISTS entity_json
(
tag String,
filename String,
data String
)
ENGINE = MergeTree()
PRIMARY KEY (tag, filename);

CREATE VIEW IF NOT EXISTS domain_summary
AS
SELECT
tag,
region,
JSONExtractString(data, 'domain') AS domain,
JSONExtract(data, 'owner', 'Tuple(name String, displayName String, privacyPolicy String, url String)') AS owner,
JSONExtract(data, 'source', 'Array(String)') AS source,
JSONExtractFloat(data, 'prevalence') AS prevalence,
JSONExtractUInt(data, 'sites') AS sites,
JSONExtract(data, 'subdomains', 'Array(String)') AS subdomains,
JSONExtractUInt(data, 'fingerprinting') AS fingerprinting,
JSONExtract(data, 'categories', 'Array(String)') AS categories,
JSONExtractFloat(data, 'cookies') AS cookies,
JSONExtract(data, 'nameservers', 'Array(String)') AS nameservers,
JSONExtract(data, 'cnames', 'Array(Tuple(original String, resolved String))') AS cnames,
JSONExtract(data, 'topInitiators', 'Array(Tuple(domain String, prevalence Float32))') AS topInitiators,
JSONExtractKeysAndValues(data, 'types', 'UInt32') AS types
FROM domain_json;

CREATE MATERIALIZED VIEW domain_resources ENGINE = MergeTree() ORDER BY (tag, region, domain) POPULATE
AS
SELECT
tag,
region,
domain,
JSONExtractString(resource, 'rule') AS rule,
JSONExtractFloat(resource, 'cookies') AS cookies,
JSONExtractUInt(resource, 'fingerprinting') AS fingerprinting,
JSONExtract(resource, 'subdomains', 'Array(String)') AS subdomains,
JSONExtractUInt(resource, 'sites') AS sites,
JSONExtractFloat(resource, 'prevalence') AS prevalence,
JSONExtractString(resource, 'type') AS type,
JSONExtract(resource, 'exampleSites', 'Array(String)') AS exampleSites,
JSONExtract(resource, 'responseHashes', 'Array(String)') AS responseHashes,
JSONExtractKeysAndValues(resource, 'firstPartyCookies', 'Tuple(ttl UInt32, length UInt16, prevalence Float32, uniqueness Float32)') AS firstPartyCookies,
JSONExtractKeysAndValues(resource, 'firstPartyCookiesSent', 'Float32') AS firstPartyCookiesSent,
JSONExtractKeysAndValues(resource, 'apis', 'UInt32') AS apis
FROM (
SELECT tag, region, JSONExtractString(data, 'domain') AS domain, JSONExtractArrayRaw(data, 'resources') AS resource
FROM domain_json
) ARRAY JOIN resource;

-- CREATE MATERIALIZED VIEW firstPartyCookies ENGINE = MergeTree() ORDER BY (tag, region, domain, rule, cookieName) POPULATE
CREATE VIEW firstPartyCookies
AS
SELECT
tag, region, domain, rule, type,
firstPartyCookies.1 AS cookieName,
firstPartyCookies.2.1 AS ttl,
firstPartyCookies.2.2 AS length,
firstPartyCookies.2.3 AS prevalence,
firstPartyCookies.2.4 AS uniqueness
FROM domain_resources
ARRAY JOIN firstPartyCookies;

-- CREATE MATERIALIZED VIEW firstPartyCookiesSent ENGINE = MergeTree() ORDER BY (tag, region, domain, rule, cookieName) POPULATE
CREATE VIEW firstPartyCookiesSent
AS
SELECT
tag, region, domain, rule, type,
firstPartyCookiesSent.1 AS cookieName,
firstPartyCookiesSent.2 AS prevalence
FROM domain_resources
ARRAY JOIN firstPartyCookiesSent;

CREATE VIEW IF NOT EXISTS entity_summary
AS
SELECT
tag,
JSONExtractString(data, 'name') AS name,
JSONExtractString(data, 'displayName') AS displayName,
JSONExtract(data, 'properties', 'Array(String)') AS properties,
JSONExtract(data, 'prevalence', 'Tuple(tracking Float32, nonTracker Float32, total Float32)') AS prevalence
FROM entity_json

CREATE VIEW resource_apis
AS
SELECT
tag, region, domain, rule, type, fingerprinting,
apis.1 AS api,
apis.2 AS count
FROM domain_resources
ARRAY JOIN apis;

0 comments on commit 43392d9

Please sign in to comment.