Skip to content

Commit

Permalink
rules upgraded
Browse files Browse the repository at this point in the history
  • Loading branch information
max-ostapenko committed Dec 31, 2024
1 parent 0637a43 commit 1b4838d
Show file tree
Hide file tree
Showing 21 changed files with 111 additions and 49 deletions.
6 changes: 2 additions & 4 deletions sql/.sqlfluff
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[sqlfluff]
dialect = bigquery
# Comma separated list of rules to exclude, or None
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07,ST08
exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02,RF03,RF04,ST01,ST02,ST05,ST06,ST07,ST08,ST11
# AL01 - We don't always alias tables with AS ("FROM table1 AS tb1" instead of "FROM table1 tb1"). We do for columns but not for tables.
# AL04 - Requires unique table aliases, so it complains when selecting from two 2021_07_01 tables: the implicit alias is the table name (not fully qualified), which is the same for both.
# AL07 - Avoid aliases in from and join - why?
Expand All @@ -23,6 +23,7 @@ exclude_rules = AL01,AL04,AL07,AL09,AM03,AM05,CP02,CP03,CV02,LT05,LT09,RF01,RF02
# ST06 - Insists on a certain order of SELECT targets (wildcards (*) first, then simple columns, then calculations) - why?
# ST07 - Prefers explicit join keys (ON) instead of USING - why?
# ST08 - Sometimes clearer to include brackets for complex COUNT(DISTINCT) cases
# ST11 - Doesn't consider wildcards in SELECT. Issue: https://github.com/sqlfluff/sqlfluff/issues/6511

large_file_skip_byte_limit = 40000
# CPU processes to use while linting.
Expand Down Expand Up @@ -65,6 +66,3 @@ preferred_quoted_literal_style = single_quotes

[sqlfluff:rules:references.special_chars]
additional_allowed_characters = "-."

[sqlfluff:layout:type:start_bracket]
spacing_before = single:inline
6 changes: 4 additions & 2 deletions sql/2024/privacy/cookies_top_first_party_names.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@ WITH pages AS (
client,
root_page,
custom_metrics,
COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains
COUNT(DISTINCT net.host(root_page)) OVER (PARTITION BY client) AS total_domains
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
), cookies AS (
),

cookies AS (
SELECT
client,
cookie,
Expand Down
4 changes: 3 additions & 1 deletion sql/2024/privacy/cookies_top_third_party_domains.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ WITH pages AS (
COUNT(DISTINCT page) OVER (PARTITION BY client) AS total_pages
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
), cookies AS (
),

cookies AS (
SELECT
client,
page,
Expand Down
4 changes: 3 additions & 1 deletion sql/2024/privacy/cookies_top_third_party_names.sql
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@ WITH pages AS (
client,
root_page,
custom_metrics,
COUNT(DISTINCT net.host(root_page)) OVER(PARTITION BY client) AS total_domains
COUNT(DISTINCT net.host(root_page)) OVER (PARTITION BY client) AS total_domains
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
),

cookies AS (
SELECT
client,
Expand All @@ -19,6 +20,7 @@ cookies AS (
FROM pages,
UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
client,
COUNT(DISTINCT firstparty_host) AS domain_count,
Expand Down
3 changes: 3 additions & 0 deletions sql/2024/privacy/easylist-tracker-detection.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ WITH easylist_data AS (
SELECT string_field_0
FROM `httparchive.almanac.easylist_adservers`
),

requests_data AS (
SELECT url
FROM `httparchive.all.requests`
WHERE
date = '2024-06-01' AND
is_root_page = TRUE
),

block_status AS (
SELECT
r.url,
Expand All @@ -32,6 +34,7 @@ block_status AS (
ON CheckDomainInURL(r.url, e.string_field_0) = 1
GROUP BY r.url
)

SELECT
COUNT(0) AS blocked_url_count
FROM block_status
Expand Down
1 change: 1 addition & 0 deletions sql/2024/privacy/fingerprinting_most_common_apis.sql
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ WITH pages AS (
UNNEST(getFingerprintingTypes(JSON_EXTRACT(custom_metrics, '$.privacy.fingerprinting.counts'))) AS fingerprinting_type
WHERE date = '2024-06-01'
)

SELECT
client,
fingerprinting_type,
Expand Down
16 changes: 12 additions & 4 deletions sql/2024/privacy/most_common_bounce_domains.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ WITH redirect_requests AS (
type NOT IN ('css', 'image', 'font', 'video', 'audio') AND
ROUND(INT64(summary.status) / 100) = 3 AND
index <= 2
), navigation_redirect AS (
),

navigation_redirect AS (
-- Find the first navigation redirect
SELECT
client,
Expand All @@ -28,7 +30,9 @@ WITH redirect_requests AS (
index = 1 AND
LOWER(response_header.name) = 'location' AND
NET.REG_DOMAIN(response_header.value) != NET.REG_DOMAIN(page)
), bounce_redirect AS (
),

bounce_redirect AS (
-- Find the second navigation redirect
SELECT
client,
Expand All @@ -41,7 +45,9 @@ WITH redirect_requests AS (
WHERE
index = 2 AND
LOWER(response_header.name) = 'location'
), bounce_sequences AS (
),

bounce_sequences AS (
-- Combine the first and second navigation redirects
SELECT
nav.client,
Expand All @@ -58,7 +64,9 @@ WITH redirect_requests AS (
GROUP BY
nav.client,
bounce_hostname
), pages_total AS (
),

pages_total AS (
SELECT
client,
COUNT(DISTINCT page) AS total_pages
Expand Down
6 changes: 4 additions & 2 deletions sql/2024/privacy/most_common_client_hints.sql
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ WITH response_headers AS (
date = '2024-06-01' AND
is_root_page = TRUE AND
is_main_document = TRUE
), meta_tags AS (
),

meta_tags AS (
SELECT
client,
page,
Expand All @@ -27,7 +29,7 @@ WITH response_headers AS (
WHERE
date = '2024-06-01' AND
is_root_page = TRUE
),
),
UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node
WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL
)
Expand Down
18 changes: 13 additions & 5 deletions sql/2024/privacy/most_common_cname_domains.sql
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,19 @@ try {
# https://github.com/AdguardTeam/cname-trackers/blob/master/script/src/cloaked-trackers.json
WITH adguard_trackers AS (
SELECT
domain AS domain
domain
FROM UNNEST(['cz.affilbox.cz', 'pl02.prolitteris.2cnt.net', 'a8.net', 'mm.actionlink.jp', 'mr-in.com', 'ebis.ne.jp', '0i0i0i0.com', 'ads.bid', 'at-o.net', 'actonservice.com', 'actonsoftware.com', '2o7.net', 'data.adobedc.net', 'sc.adobedc.net', 'sc.omtrdc.net', 'adocean.pl', 'aquaplatform.com', 'cdn18685953.ahacdn.me', 'thirdparty.bnc.lt', 'api.clickaine.com', 'tagcommander.com', 'track.sp.crdl.io', 'dnsdelegation.io', 'storetail.io', 'e.customeriomail.com', 'dataunlocker.com', 'monopoly-drain.ga', 'friendly-community.tk', 'nc0.co', 'customer.etracker.com', 'eulerian.net', 'extole.com', 'extole.io', 'fathomdns.com', 'genieespv.jp', 'ad-cloud.jp', 'goatcounter.com', 'heleric.com', 'iocnt.net', 'affex.org', 'k.keyade.com', 'ghochv3eng.trafficmanager.net', 'online-metrix.net', 'logly.co.jp', 'mailgun.org', 'ab1n.net', 'ntv.io', 'ntvpforever.com', 'postrelease.com', 'non.li', 'tracking.bp01.net', 't.eloqua.com', 'oghub.io', 'go.pardot.com', 'parsely.com', 'custom.plausible.io', 'popcashjs.b-cdn.net', 'rdtk.io', 'sailthru.com', 'exacttarget.com', 'a351fec2c318c11ea9b9b0a0ae18fb0b-1529426863.eu-central-1.elb.amazonaws.com', 'a5e652663674a11e997c60ac8a4ec150-1684524385.eu-central-1.elb.amazonaws.com', 'a88045584548111e997c60ac8a4ec150-1610510072.eu-central-1.elb.amazonaws.com', 'afc4d9aa2a91d11e997c60ac8a4ec150-2082092489.eu-central-1.elb.amazonaws.com', 'e.truedata.co', 'utiq-aws.net', 'webtrekk.net', 'wt-eu02.net', 'ak-is2.net', 'wizaly.com']) AS domain
), whotracksme AS (
),

whotracksme AS (
SELECT DISTINCT
domain,
category
FROM `httparchive.almanac.whotracksme`
WHERE date = '2024-06-01'
), cnames AS (
),

cnames AS (
SELECT
client,
cnames.cname,
Expand All @@ -44,14 +48,18 @@ WITH adguard_trackers AS (
client,
cnames.cname,
page
), pages_total AS (
),

pages_total AS (
SELECT
client,
COUNT(DISTINCT page) AS total_pages
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
GROUP BY client
), cname_stats AS (
),

cname_stats AS (
SELECT
client,
NET.REG_DOMAIN(cname) AS cname,
Expand Down
4 changes: 3 additions & 1 deletion sql/2024/privacy/most_common_countries_for_iab_tcf_v2.sql
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@ WITH totals AS (
date = '2024-06-01' AND
JSON_TYPE(custom_metrics.privacy.iab_tcf_v2.data) = 'object'
GROUP BY client
), cmps AS (
),

cmps AS (
SELECT
client,
--ANY_VALUE(custom_metrics.privacy.iab_tcf_v2.data) AS example,
Expand Down
5 changes: 4 additions & 1 deletion sql/2024/privacy/most_common_tracker_categories.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ WITH whotracksme AS (
FROM httparchive.almanac.whotracksme
WHERE date = '2024-06-01'
),

totals AS (
SELECT
client,
Expand All @@ -16,6 +17,7 @@ totals AS (
date = '2024-06-01'
GROUP BY client
),

tracker_categories AS (
SELECT
client,
Expand All @@ -26,11 +28,12 @@ tracker_categories AS (
ON (
NET.HOST(url) = domain OR
ENDS_WITH(NET.HOST(url), CONCAT('.', domain))
)
)
WHERE
date = '2024-06-01' AND
NET.REG_DOMAIN(page) != NET.REG_DOMAIN(url) -- third party
),

aggregated AS (
SELECT
client,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ WITH ara_features AS (
is_root_page = TRUE AND
ara LIKE 'destination%'
)

SELECT
client,
rank_group,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ WITH totals AS (
SELECT
client,
technology.technology,
total_websites AS total_websites,
total_websites,
COUNT(DISTINCT page) AS number_of_websites,
COUNT(DISTINCT page) / total_websites AS percent_of_websites
FROM httparchive.crawl.pages
Expand Down
6 changes: 4 additions & 2 deletions sql/2024/privacy/number_of_websites_with_client_hints.sql
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ WITH response_headers AS (
WHERE
date = '2024-06-01' AND
is_main_document = TRUE
), meta_tags AS (
),

meta_tags AS (
SELECT
client,
page,
Expand All @@ -23,7 +25,7 @@ WITH response_headers AS (
JSON_VALUE(custom_metrics, '$.almanac') AS metrics
FROM `httparchive.all.pages`
WHERE date = '2024-06-01'
),
),
UNNEST(JSON_QUERY_ARRAY(metrics, '$.meta-nodes.nodes')) meta_node
WHERE JSON_VALUE(meta_node, '$.http-equiv') IS NOT NULL
)
Expand Down
5 changes: 3 additions & 2 deletions sql/2024/privacy/number_of_websites_with_nb_trackers.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ WITH whotracksme AS (
FROM almanac.whotracksme
WHERE date = '2024-06-01'
),

totals AS (
SELECT
client,
Expand Down Expand Up @@ -40,7 +41,7 @@ FROM (
GROUP BY
client,
page
)
)
JOIN
totals
USING (client)
Expand Down Expand Up @@ -81,7 +82,7 @@ FROM (
GROUP BY
client,
page
)
)
JOIN
totals
USING (client)
Expand Down
41 changes: 25 additions & 16 deletions sql/2024/privacy/number_of_websites_with_referrerpolicy.sql
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ WITH referrer_policy_custom_metrics AS (
date = '2024-06-01' AND
is_root_page = TRUE
),

referrer_policy_headers AS (
SELECT
client,
Expand Down Expand Up @@ -45,28 +46,36 @@ FROM (
client,
COUNT(DISTINCT page) AS number_of_pages,
COUNT(DISTINCT IF(
meta_policy IS NOT NULL,
page, NULL)) AS number_of_pages_with_entire_document_policy_meta,
COUNT(DISTINCT IF(
header_policy IS NOT NULL,
page, NULL)) AS number_of_pages_with_entire_document_policy_header,
meta_policy IS NOT NULL,
page, NULL
)) AS number_of_pages_with_entire_document_policy_meta,
COUNT(DISTINCT IF(
meta_policy IS NOT NULL OR
header_policy IS NOT NULL,
page, NULL)
page, NULL
)) AS number_of_pages_with_entire_document_policy_header,
COUNT(
DISTINCT IF(
meta_policy IS NOT NULL OR
header_policy IS NOT NULL,
page, NULL
)
) AS number_of_pages_with_entire_document_policy,
COUNT(DISTINCT IF(
individual_requests,
page, NULL)) AS number_of_pages_with_any_individual_requests,
individual_requests,
page, NULL
)) AS number_of_pages_with_any_individual_requests,
COUNT(DISTINCT IF(
link_relations,
page, NULL)) AS number_of_pages_with_any_link_relations,
COUNT(DISTINCT IF(
meta_policy IS NOT NULL OR
header_policy IS NOT NULL OR
individual_requests OR
link_relations,
page, NULL)
page, NULL
)) AS number_of_pages_with_any_link_relations,
COUNT(
DISTINCT IF(
meta_policy IS NOT NULL OR
header_policy IS NOT NULL OR
individual_requests OR
link_relations,
page, NULL
)
) AS number_of_pages_with_any_referrer_policy
FROM
referrer_policy_custom_metrics
Expand Down
Loading

0 comments on commit 1b4838d

Please sign in to comment.