Skip to content

Commit b90332f

Browse files
Privacy 2024 queries - CCPA, fingerprinting, cookies (#3720)
* CCPA metrics * fingerprinting metrics * cookie metrics * lint
1 parent 266fa78 commit b90332f

7 files changed

+41
-0
lines changed
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
-- CCPA link phrases by client and rank bucket: number of distinct root pages
-- on which each entry of `$.privacy.ccpa_link.CCPALinkPhrases` appears.
WITH pages AS (
  SELECT
    client,
    rank_grouping,
    page,
    -- JSON_QUERY_ARRAY keeps each element JSON-encoded (string values retain
    -- their surrounding double quotes).
    JSON_QUERY_ARRAY(custom_metrics, '$.privacy.ccpa_link.CCPALinkPhrases') AS ccpa_link_phrases
  FROM
    `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.01 PERCENT)
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2024-06-01' AND
    is_root_page = TRUE AND
    -- Buckets are cumulative: a page is emitted once for every threshold
    -- that is greater than or equal to its rank.
    rank <= rank_grouping
)

SELECT
  client,
  rank_grouping,
  link_phrase,
  COUNT(DISTINCT page) AS num_pages
FROM
  pages
CROSS JOIN
  UNNEST(ccpa_link_phrases) AS link_phrase
GROUP BY
  client,
  rank_grouping,
  link_phrase
ORDER BY
  rank_grouping,
  client,
  num_pages DESC

sql/2024/privacy/ccpa_prevalence.sql

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-- CCPA link prevalence: per client and rank bucket, the number of distinct
-- root pages for each value of `$.privacy.ccpa_link.hasCCPALink`.
WITH pages AS (
  SELECT
    client,
    rank_grouping,
    page,
    -- JSON_VALUE yields the scalar as a STRING, or NULL when the path is
    -- missing or not a scalar; NULL therefore forms its own group below.
    JSON_VALUE(custom_metrics, '$.privacy.ccpa_link.hasCCPALink') AS has_ccpa_link
  FROM
    `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.0025 PERCENT)
  CROSS JOIN
    UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
  WHERE
    date = '2024-06-01' AND
    is_root_page = TRUE AND
    -- Buckets are cumulative: a page is emitted once for every threshold
    -- that is greater than or equal to its rank.
    rank <= rank_grouping
)

SELECT
  client,
  rank_grouping,
  has_ccpa_link,
  COUNT(DISTINCT page) AS num_pages
FROM
  pages
GROUP BY
  client,
  rank_grouping,
  has_ccpa_link
ORDER BY
  rank_grouping,
  client,
  has_ccpa_link
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
-- Most common cookie names, by number of domains on which they appear. Goal is
-- to identify common trackers that use first-party cookies across sites.
--
-- A cookie is treated as first-party when its domain equals the host of the
-- root page or is a parent domain of that host.

WITH pages AS (
  SELECT
    client,
    root_page,
    custom_metrics
  FROM
    `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
  WHERE
    date = '2024-06-01'
),

cookies AS (
  SELECT
    client,
    JSON_VALUE(cookie, '$.name') AS cookie_name,
    -- Cookie domains are commonly written with a leading dot (".example.com").
    -- NOTE(review): whether NET.HOST preserves that dot is not confirmed here;
    -- LTRIM makes the comparison below correct either way.
    LTRIM(NET.HOST(JSON_VALUE(cookie, '$.domain')), '.') AS cookie_host,
    NET.HOST(root_page) AS firstparty_host
  FROM
    pages
  CROSS JOIN
    UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  COUNT(DISTINCT firstparty_host) AS domain_count,
  cookie_name
FROM
  cookies
WHERE
  -- Literal, label-aligned suffix match. The previous `LIKE '%' || cookie_host`
  -- interpreted `_` and `%` inside the host as wildcards and also matched
  -- unrelated hosts that merely share a string suffix
  -- (e.g. "myexample.com" vs "example.com").
  firstparty_host = cookie_host OR
  ENDS_WITH(firstparty_host, CONCAT('.', cookie_host))
GROUP BY
  client,
  cookie_name
ORDER BY
  domain_count DESC,
  client DESC,
  -- Tie-breaker so that LIMIT returns a deterministic set of rows.
  cookie_name
LIMIT 500
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
-- Most common third-party cookie hosts, by number of distinct pages on which
-- they set a cookie.
--
-- A cookie is treated as third-party when its domain is neither the host of
-- the root page nor a parent domain of that host.

WITH pages AS (
  SELECT
    page,
    client,
    root_page,
    custom_metrics
  FROM
    `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.00001 PERCENT)
  WHERE
    date = '2024-06-01'
),

cookies AS (
  SELECT
    client,
    page,
    -- Cookie domains are commonly written with a leading dot (".example.com").
    -- NOTE(review): whether NET.HOST preserves that dot is not confirmed here;
    -- LTRIM makes the comparison below correct either way and reports
    -- ".example.com" and "example.com" as a single host.
    LTRIM(NET.HOST(JSON_VALUE(cookie, '$.domain')), '.') AS cookie_host,
    NET.HOST(root_page) AS firstparty_host
  FROM
    pages
  CROSS JOIN
    UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.cookies')) AS cookie
)

SELECT
  client,
  cookie_host,
  COUNT(DISTINCT page) AS page_count
FROM
  cookies
WHERE
  -- Negation of a literal, label-aligned suffix match. The previous
  -- `NOT LIKE '%' || cookie_host` interpreted `_` and `%` inside the host as
  -- wildcards and wrongly classified unrelated hosts sharing a string suffix
  -- (e.g. "myexample.com" vs "example.com") as first-party.
  -- Rows with a NULL cookie_host are still excluded, as before.
  NOT (
    firstparty_host = cookie_host OR
    ENDS_WITH(firstparty_host, CONCAT('.', cookie_host))
  )
GROUP BY
  client,
  cookie_host
ORDER BY
  page_count DESC,
  client,
  -- Tie-breaker so that LIMIT returns a deterministic set of rows.
  cookie_host
LIMIT 500
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
-- Returns the keys of the `privacy.fingerprinting.counts` object found in a
-- JSON-encoded custom-metrics string.
--
-- input:   JSON document as a STRING (may be NULL).
-- returns: the key names, or an empty array when the input is NULL, is not
--          valid JSON, or does not contain the expected object.
CREATE TEMP FUNCTION getFingerprintingTypes(input STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
try {
  // `?.` right after JSON.parse covers inputs that parse to null;
  // the try/catch covers malformed JSON, so one bad row cannot abort the query.
  return Object.keys(JSON.parse(input)?.privacy?.fingerprinting?.counts || {});
} catch (e) {
  return [];
}
""";
4+
5+
-- Fingerprinting types by client: number of distinct pages on which each key
-- returned by getFingerprintingTypes(custom_metrics) is present.
WITH pages AS (
  SELECT
    client,
    page,
    fingerprinting_type
  FROM
    `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.001 PERCENT)
  CROSS JOIN
    -- One output row per (page, type); pages for which the function returns
    -- an empty array produce no rows.
    UNNEST(getFingerprintingTypes(custom_metrics)) AS fingerprinting_type
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  fingerprinting_type,
  COUNT(DISTINCT page) AS page_count
FROM
  pages
GROUP BY
  client,
  fingerprinting_type
ORDER BY
  page_count DESC
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
-- Top 100 entries of `$.privacy.fingerprinting.likelyFingerprintingScripts`
-- per client, ranked by the number of distinct pages they appear on.
SELECT
  client,
  -- JSON_QUERY_ARRAY keeps each element JSON-encoded (string values retain
  -- their surrounding double quotes).
  script,
  COUNT(DISTINCT page) AS page_count
FROM
  `httparchive.all.pages` -- TABLESAMPLE SYSTEM (0.001 PERCENT)
CROSS JOIN
  UNNEST(JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')) AS script
WHERE
  date = '2024-06-01'
GROUP BY
  client,
  script
ORDER BY
  page_count DESC
LIMIT 100;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
-- Distribution of the number of entries in
-- `$.privacy.fingerprinting.likelyFingerprintingScripts` per page, by client.
WITH script_counts AS (
  SELECT
    client,
    page,
    -- NULL when the JSON path does not resolve to an array; those pages are
    -- grouped under a NULL script_count.
    ARRAY_LENGTH(
      JSON_QUERY_ARRAY(custom_metrics, '$.privacy.fingerprinting.likelyFingerprintingScripts')
    ) AS script_count
  FROM
    `httparchive.all.pages`
  WHERE
    date = '2024-06-01'
)

SELECT
  client,
  script_count,
  COUNT(DISTINCT page) AS page_count
FROM
  script_counts
GROUP BY
  script_count,
  client
ORDER BY
  script_count ASC;

0 commit comments

Comments
 (0)