-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
9073db5
commit 43392d9
Showing
6 changed files
with
272 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# Tracker Radar Clickhouse Database | ||
|
||
This folder contains scripts and schemas for inserting Tracker Radar data into Clickhouse. This | ||
allows for easy longitudial analysis of the data we publish on Github. | ||
|
||
## Setup | ||
|
||
If starting from scratch (no existing database), we first need to create the database and table/view | ||
schemas: | ||
```bash | ||
cat clickhouse/schema.sql | clickhouse-client -h clickhouse --multiquery | ||
``` | ||
|
||
## Data import | ||
|
||
Given a checkout of [tracker-radar](https://github.com/duckduckgo/tracker-radar), we can import data | ||
for a given tag as follows: | ||
```bash | ||
bash ./clickhouse/insert-tag.sh /path/to/tracker-radar 2023.03 | ||
``` | ||
In this case, we're importing the `2023.03` tag. After the import, you should be able to read the imported data in Clickhouse: | ||
``` | ||
use-clickhouse1.duckduckgo.com :) select count(*) from domain_json where tag = '2023.03'; | ||
SELECT count(*) | ||
FROM domain_json | ||
WHERE tag = '2023.03' | ||
Query id: b330bfd4-dfcf-449e-8a17-eb3151552163 | ||
┌─count()─┐ | ||
│ 56921 │ | ||
└─────────┘ | ||
1 rows in set. Elapsed: 0.135 sec. Processed 56.92 thousand rows, 910.74 KB (420.42 thousand rows/s., 6.73 MB/s.) | ||
use-clickhouse1.duckduckgo.com :) select count(*) from entity_json where tag = '2023.03'; | ||
SELECT count(*) | ||
FROM entity_json | ||
WHERE tag = '2023.03' | ||
Query id: f6d77443-5667-467e-af6f-b63e80d3abbb | ||
┌─count()─┐ | ||
│ 18966 │ | ||
└─────────┘ | ||
1 rows in set. Elapsed: 0.134 sec. Processed 18.97 thousand rows, 303.46 KB (141.11 thousand rows/s., 2.26 MB/s.) | ||
``` | ||
|
||
## Querying the data | ||
|
||
### Example queries | ||
|
||
Get the longitundal prevalence of a tracker domain in a specific region: | ||
```sql | ||
SELECT | ||
tag, region, domain, owner.1 as owner, prevalence | ||
FROM domain_summary | ||
WHERE region ='GB' AND domain = 'google-analytics.com' | ||
ORDER BY tag DESC; | ||
``` | ||
|
||
Get the longitundal prevalence of an entity: | ||
```sql | ||
SELECT | ||
tag, name, prevalence.3 as total_prevalence | ||
FROM entity_summary | ||
WHERE name = 'Microsoft Corporation' | ||
ORDER BY tag DESC; | ||
``` | ||
|
||
Find some high prevalence resources loaded by a site: | ||
```sql | ||
SELECT | ||
tag, region, domain, rule, type, prevalence | ||
FROM domain_resources | ||
WHERE region = 'US' AND has(exampleSites, 'theguardian.com') | ||
ORDER BY prevalence DESC; | ||
``` | ||
|
||
Check who sets the `_ga` cookie apart from `google-analytics.com`: | ||
```sql | ||
SELECT * | ||
FROM firstPartyCookies | ||
WHERE domain != 'google-analytics.com' AND cookieName = '_ga' | ||
ORDER BY prevalence DESC limit 10; | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
import fs from 'fs' | ||
import { join } from 'path' | ||
|
||
const tag = process.argv[2]; | ||
const trDir = process.argv[3]; | ||
|
||
let regions = fs.readdirSync(join(trDir, 'domains')); | ||
if (!regions.includes('US')) { | ||
regions = [''] | ||
} | ||
for (const region of regions) { | ||
const baseDir = join(trDir, 'domains', region) | ||
const domainFiles = fs.readdirSync(baseDir); | ||
|
||
for (const file of domainFiles) { | ||
const data = JSON.parse(fs.readFileSync(join(baseDir, file), 'utf-8')) | ||
// format rules | ||
data.resources = data.resources.map(res => { | ||
res.rule = res.rule.replace(/\\/g, ""); | ||
return res; | ||
}) | ||
console.log([tag, region, file, JSON.stringify(data)].join('\t')) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
import fs from 'fs' | ||
import { join } from 'path' | ||
|
||
const tag = process.argv[2]; | ||
const trDir = process.argv[3]; | ||
|
||
const baseDir = join(trDir, 'entities') | ||
const entityFiles = fs.readdirSync(baseDir); | ||
|
||
for (const file of entityFiles) { | ||
const data = JSON.parse(fs.readFileSync(join(baseDir, file), 'utf-8')) | ||
console.log([tag, file, JSON.stringify(data)].join('\t')) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash | ||
set -ex | ||
|
||
TRI=`pwd` | ||
TR_PATH=$1 | ||
cd $TR_PATH | ||
|
||
for TAG in $(git tag -l | sort -r) | ||
do | ||
cd $TRI | ||
bash clickhouse/insert-data.sh $TR_PATH $TAG | ||
done |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/bin/bash | ||
set -ex | ||
|
||
REPO_PATH=`pwd` | ||
TR_PATH=$1 | ||
TAG=$2 | ||
|
||
cd $TR_PATH | ||
git checkout $TAG | ||
cd $REPO_PATH | ||
|
||
node clickhouse/format-domain-data.mjs $TAG $TR_PATH | clickhouse-client \ | ||
--host clickhouse \ | ||
--query " | ||
INSERT INTO tracker_radar.domain_json | ||
SELECT tag, region, filename, data | ||
FROM input('tag String, region String, filename String, data String') | ||
FORMAT TabSeparated | ||
" | ||
|
||
node clickhouse/format-entity-data.mjs $TAG $TR_PATH | clickhouse-client \ | ||
--host clickhouse \ | ||
--query " | ||
INSERT INTO tracker_radar.entity_json | ||
SELECT tag, filename, data | ||
FROM input('tag String, filename String, data String') | ||
FORMAT TabSeparated | ||
" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
CREATE DATABASE IF NOT EXISTS tracker_radar; | ||
USE tracker_radar; | ||
|
||
CREATE TABLE IF NOT EXISTS domain_json | ||
( | ||
tag String, | ||
region String, | ||
filename String, | ||
data String | ||
) | ||
ENGINE = MergeTree() | ||
PRIMARY KEY (tag, region, filename); | ||
|
||
CREATE TABLE IF NOT EXISTS entity_json | ||
( | ||
tag String, | ||
filename String, | ||
data String | ||
) | ||
ENGINE = MergeTree() | ||
PRIMARY KEY (tag, filename); | ||
|
||
CREATE VIEW IF NOT EXISTS domain_summary | ||
AS | ||
SELECT | ||
tag, | ||
region, | ||
JSONExtractString(data, 'domain') AS domain, | ||
JSONExtract(data, 'owner', 'Tuple(name String, displayName String, privacyPolicy String, url String)') AS owner, | ||
JSONExtract(data, 'source', 'Array(String)') AS source, | ||
JSONExtractFloat(data, 'prevalence') AS prevalence, | ||
JSONExtractUInt(data, 'sites') AS sites, | ||
JSONExtract(data, 'subdomains', 'Array(String)') AS subdomains, | ||
JSONExtractUInt(data, 'fingerprinting') AS fingerprinting, | ||
JSONExtract(data, 'categories', 'Array(String)') AS categories, | ||
JSONExtractFloat(data, 'cookies') AS cookies, | ||
JSONExtract(data, 'nameservers', 'Array(String)') AS nameservers, | ||
JSONExtract(data, 'cnames', 'Array(Tuple(original String, resolved String))') AS cnames, | ||
JSONExtract(data, 'topInitiators', 'Array(Tuple(domain String, prevalence Float32))') AS topInitiators, | ||
JSONExtractKeysAndValues(data, 'types', 'UInt32') AS types | ||
FROM domain_json; | ||
|
||
CREATE MATERIALIZED VIEW domain_resources ENGINE = MergeTree() ORDER BY (tag, region, domain) POPULATE | ||
AS | ||
SELECT | ||
tag, | ||
region, | ||
domain, | ||
JSONExtractString(resource, 'rule') AS rule, | ||
JSONExtractFloat(resource, 'cookies') AS cookies, | ||
JSONExtractUInt(resource, 'fingerprinting') AS fingerprinting, | ||
JSONExtract(resource, 'subdomains', 'Array(String)') AS subdomains, | ||
JSONExtractUInt(resource, 'sites') AS sites, | ||
JSONExtractFloat(resource, 'prevalence') AS prevalence, | ||
JSONExtractString(resource, 'type') AS type, | ||
JSONExtract(resource, 'exampleSites', 'Array(String)') AS exampleSites, | ||
JSONExtract(resource, 'responseHashes', 'Array(String)') AS responseHashes, | ||
JSONExtractKeysAndValues(resource, 'firstPartyCookies', 'Tuple(ttl UInt32, length UInt16, prevalence Float32, uniqueness Float32)') AS firstPartyCookies, | ||
JSONExtractKeysAndValues(resource, 'firstPartyCookiesSent', 'Float32') AS firstPartyCookiesSent, | ||
JSONExtractKeysAndValues(resource, 'apis', 'UInt32') AS apis | ||
FROM ( | ||
SELECT tag, region, JSONExtractString(data, 'domain') AS domain, JSONExtractArrayRaw(data, 'resources') AS resource | ||
FROM domain_json | ||
) ARRAY JOIN resource; | ||
|
||
-- CREATE MATERIALIZED VIEW firstPartyCookies ENGINE = MergeTree() ORDER BY (tag, region, domain, rule, cookieName) POPULATE | ||
CREATE VIEW firstPartyCookies | ||
AS | ||
SELECT | ||
tag, region, domain, rule, type, | ||
firstPartyCookies.1 AS cookieName, | ||
firstPartyCookies.2.1 AS ttl, | ||
firstPartyCookies.2.2 AS length, | ||
firstPartyCookies.2.3 AS prevalence, | ||
firstPartyCookies.2.4 AS uniqueness | ||
FROM domain_resources | ||
ARRAY JOIN firstPartyCookies; | ||
|
||
-- CREATE MATERIALIZED VIEW firstPartyCookiesSent ENGINE = MergeTree() ORDER BY (tag, region, domain, rule, cookieName) POPULATE | ||
CREATE VIEW firstPartyCookiesSent | ||
AS | ||
SELECT | ||
tag, region, domain, rule, type, | ||
firstPartyCookiesSent.1 AS cookieName, | ||
firstPartyCookiesSent.2 AS prevalence | ||
FROM domain_resources | ||
ARRAY JOIN firstPartyCookiesSent; | ||
|
||
CREATE VIEW IF NOT EXISTS entity_summary | ||
AS | ||
SELECT | ||
tag, | ||
JSONExtractString(data, 'name') AS name, | ||
JSONExtractString(data, 'displayName') AS displayName, | ||
JSONExtract(data, 'properties', 'Array(String)') AS properties, | ||
JSONExtract(data, 'prevalence', 'Tuple(tracking Float32, nonTracker Float32, total Float32)') AS prevalence | ||
FROM entity_json | ||
|
||
CREATE VIEW resource_apis | ||
AS | ||
SELECT | ||
tag, region, domain, rule, type, fingerprinting, | ||
apis.1 AS api, | ||
apis.2 AS count | ||
FROM domain_resources | ||
ARRAY JOIN apis; |