Skip to content

Commit b9cd7ca

Browse files
authored
Merge pull request #129 from HTTPArchive/htmlElementPopularity
Html element popularity
2 parents 31ca574 + b7ae4a7 commit b9cd7ca

File tree

1 file changed

+35
-0
lines changed

1 file changed

+35
-0
lines changed
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#standardSQL
2+
-- getElements(payload): returns the top-level keys of the object stored,
-- as a JSON-encoded string, in the `_element_count` property of `payload`.
--
--   payload  JSON text of a page record. Its `_element_count` property is
--            parsed a second time, so it is expected to hold JSON text itself
--            (presumably an element-name -> count map; verify against the
--            data source).
--   returns  ARRAY<STRING> of the object's own keys, or an empty array when
--            the parsed value is an array, is not an object, or when any
--            step throws (malformed JSON, missing property, null value).
CREATE TEMPORARY FUNCTION getElements(payload STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS '''
  try {
    var page = JSON.parse(payload);
    var counts = JSON.parse(page._element_count);

    // Only non-array objects have meaningful keys to report. A null value
    // passes this check but makes Object.keys throw, which the catch below
    // turns into an empty result.
    var hasKeys = typeof counts == 'object' && !Array.isArray(counts);
    return hasKeys ? Object.keys(counts) : [];
  } catch (err) {
    // Any parse or lookup failure means "no elements" rather than a failed query.
    return [];
  }
''';
13+
14+
-- For each client (the wildcard table suffix) and each key returned by
-- getElements(payload), report:
--   pages        distinct URLs whose payload contains that key
--   total        number of rows in that client's pages table
--   pct          pages / total
--   sample_urls  up to five distinct URLs, space-separated
-- Only keys seen on at least 10 distinct URLs are kept.
SELECT
  _TABLE_SUFFIX AS client,
  element,
  COUNT(DISTINCT url) AS pages,
  total,
  COUNT(DISTINCT url) / total AS pct,
  -- Sort before applying LIMIT so the same five sample URLs are chosen on
  -- every run; without ORDER BY the picked URLs are arbitrary.
  ARRAY_TO_STRING(ARRAY_AGG(DISTINCT url ORDER BY url LIMIT 5), ' ') AS sample_urls
FROM
  `httparchive.pages.${YYYY_MM_DD}_*`
INNER JOIN
  -- Row count per client; used as the denominator for pct.
  (
    SELECT
      _TABLE_SUFFIX,
      COUNT(0) AS total
    FROM
      `httparchive.pages.${YYYY_MM_DD}_*`
    GROUP BY
      _TABLE_SUFFIX
  )
USING (_TABLE_SUFFIX)
-- Correlated unnest: one row per (page, key). Pages for which getElements
-- returns an empty array contribute no rows.
CROSS JOIN
  UNNEST(getElements(payload)) AS element
GROUP BY
  client,
  total,
  element
HAVING
  COUNT(DISTINCT url) >= 10
ORDER BY
  pct DESC,
  client,
  -- Tie-breaker so elements with identical pct within a client come back in
  -- a stable order.
  element

0 commit comments

Comments
 (0)