Skip to content

Commit 7a80150

Browse files
yohhaantunetheweb
andauthored
Cookies sql 2024 (#3741)
* query for intermediate table * Add top first and 3rd party * CHIPS * edits * Top registrable domains setting 3rd party cookies * websites with most cookies * add stats cookie table * Format and lint queries * Format and lint queries * Stats * more stats * Lint fix * Update stats_cookies_table.sql * 10k to 100k * edit * new final queries * Format and lint queries * Format and lint queries * adding __Secure- and __Host- prefixes * lint * Update sql/2024/cookies/prevalence_type_attributes_per_rank.sql Co-authored-by: Barry Pollard <[email protected]> * New almanac table * Update SQL queries for Cookies 2024 chapter after creation of the httparchive.almanac.cookies table * Fix linting issue * more linting issues --------- Co-authored-by: Barry Pollard <[email protected]>
1 parent 187a571 commit 7a80150

19 files changed

+435
-0
lines changed
+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
-- Extract to the `httparchive.almanac.cookies `table the cookies that were set
2+
-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried
3+
-- more efficiently in consecutive queries without having to reextract it every
4+
-- time
5+
6+
7+
-- Code used by @tunetheweb to create the table
8+
-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262
9+
10+
-- CREATE TABLE `httparchive.almanac.cookies`
11+
-- (
12+
-- date DATE,
13+
-- client STRING,
14+
-- page STRING,
15+
-- root_page STRING,
16+
-- rank INTEGER,
17+
-- startedDateTime STRING,
18+
-- firstPartyCookie BOOL,
19+
-- name STRING,
20+
-- domain STRING,
21+
-- path STRING,
22+
-- expires STRING,
23+
-- size STRING,
24+
-- httpOnly STRING,
25+
-- secure STRING,
26+
-- session STRING,
27+
-- sameSite STRING,
28+
-- sameParty STRING,
29+
-- partitionKey STRING,
30+
-- partitionKeyOpaque STRING
31+
-- )
32+
-- PARTITION BY date
33+
-- CLUSTER BY
34+
-- client, rank, page
35+
-- AS
36+
-- ...
37+
38+
39+
INSERT INTO `httparchive.almanac.cookies`
40+
WITH intermediate_cookie AS (
41+
SELECT
42+
date,
43+
client,
44+
page,
45+
root_page,
46+
rank,
47+
JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime,
48+
cookie
49+
FROM
50+
`httparchive.all.pages`,
51+
UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie
52+
WHERE
53+
date = '2024-06-01'
54+
)
55+
56+
SELECT
57+
date,
58+
client,
59+
page,
60+
root_page,
61+
rank,
62+
startedDateTime,
63+
ENDS_WITH(NET.HOST(page), NET.REG_DOMAIN(JSON_VALUE(cookie, '$.domain'))) AS firstPartyCookie,
64+
JSON_VALUE(cookie, '$.name') AS name,
65+
JSON_VALUE(cookie, '$.domain') AS domain,
66+
JSON_VALUE(cookie, '$.path') AS path,
67+
JSON_VALUE(cookie, '$.expires') AS expires,
68+
JSON_VALUE(cookie, '$.size') AS size,
69+
JSON_VALUE(cookie, '$.httpOnly') AS httpOnly,
70+
JSON_VALUE(cookie, '$.secure') AS secure,
71+
JSON_VALUE(cookie, '$.session') AS session,
72+
JSON_VALUE(cookie, '$.sameSite') AS sameSite,
73+
JSON_VALUE(cookie, '$.sameParty') AS sameParty,
74+
JSON_VALUE(cookie, '$.partitionKey') AS partitionKey,
75+
JSON_VALUE(cookie, '$.partitionKeyOpaque') AS partitionKeyOpaque
76+
FROM intermediate_cookie
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
-- Extract the top 20 first party cookies seen across websites that are
2+
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
3+
-- partitioned, as CHIPS is meant for a 3rd party context...
4+
-- Before running query: edit date and client
5+
6+
SELECT
7+
name,
8+
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
9+
FROM `httparchive.almanac.cookies`
10+
WHERE
11+
date = '2024-06-01' AND
12+
client = 'desktop' AND
13+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
14+
firstPartyCookie = TRUE AND
15+
partitionKey IS NOT NULL
16+
GROUP BY name
17+
ORDER BY percentWebsites DESC
18+
LIMIT 20
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- Extract the top 20 first party cookies seen across websites that are
2+
-- partitioned. Note: it is a bit weird that 1st party cookies would also be
3+
-- partitioned, as CHIPS is meant for a 3rd party context...
4+
-- Before running query: edit date and client
5+
6+
SELECT
7+
name,
8+
domain,
9+
COUNT(DISTINCT NET.HOST(page)) / (SELECT (COUNT(DISTINCT NET.HOST(page))) FROM `httparchive.almanac.cookies`) AS percentWebsites
10+
FROM `httparchive.almanac.cookies`
11+
WHERE
12+
date = '2024-06-01' AND
13+
client = 'desktop' AND
14+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
15+
firstPartyCookie = FALSE AND
16+
partitionKey IS NOT NULL
17+
GROUP BY name, domain
18+
ORDER BY percentWebsites DESC
19+
LIMIT 20

sql/2024/cookies/README.md

+6
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,9 @@
1818
[~google-doc]: https://docs.google.com/document/d/1o2AgdsDq_x3OvthZF7Kb50rUKMVLn7UANT9Stz7ku2I/edit#heading=h.ymg495uvm3yx
1919
[~google-sheets]: https://docs.google.com/spreadsheets/d/1wDGnUkO0rgcU5_V6hmUrhm1pq60VU2XbeMHgYJEEaSM/edit#gid=454016814
2020
[~chapter-markdown]: https://github.com/HTTPArchive/almanac.httparchive.org/tree/main/src/content/en/2024/cookies.md
21+
22+
## Note about SQL queries
23+
24+
First execute [`0_create_cookies.sql`](0_create_cookies.sql) to export the
25+
results of the <DATE> crawl specified into the `httparchive.almanac.cookies`
26+
table that will then be used in other SQL queries.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- Extract stats around the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
WITH cookies_age AS (
6+
SELECT
7+
firstPartyCookie,
8+
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
9+
FROM `httparchive.almanac.cookies`
10+
WHERE
11+
date = '2024-06-01' AND
12+
client = 'desktop' AND
13+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
14+
firstPartyCookie IS NOT NULL AND
15+
CAST(expires AS FLOAT64) >= 0
16+
)
17+
18+
SELECT
19+
firstPartyCookie,
20+
MIN(age) AS min,
21+
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
22+
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
23+
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
24+
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
25+
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
26+
MAX(age) AS max
27+
FROM cookies_age
28+
GROUP BY firstPartyCookie
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Extract stats around the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
WITH cookies_age AS (
6+
SELECT
7+
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
8+
FROM `httparchive.almanac.cookies`
9+
WHERE
10+
date = '2024-06-01' AND
11+
client = 'desktop' AND
12+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
13+
firstPartyCookie IS NOT NULL AND
14+
CAST(expires AS FLOAT64) >= 0
15+
)
16+
17+
SELECT
18+
MIN(age) AS min,
19+
APPROX_QUANTILES(age, 100)[OFFSET(25)] AS p25,
20+
APPROX_QUANTILES(age, 100)[OFFSET(50)] AS median,
21+
APPROX_QUANTILES(age, 100)[OFFSET(75)] AS p75,
22+
APPROX_QUANTILES(age, 100)[OFFSET(90)] AS p90,
23+
APPROX_QUANTILES(age, 100)[OFFSET(99)] AS p99,
24+
MAX(age) AS max
25+
FROM cookies_age
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
-- Extract the expire date (age rounded to closest number of days)
2+
-- (only positive values, non session cookies)
3+
-- Before running query: edit date and client
4+
5+
WITH cookies_age AS (
6+
SELECT
7+
ROUND((CAST(expires AS FLOAT64) - CAST(startedDateTime AS FLOAT64)) / (24 * 3600), 0) AS age
8+
FROM `httparchive.almanac.cookies`
9+
WHERE
10+
date = '2024-06-01' AND
11+
client = 'desktop' AND
12+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
13+
firstPartyCookie IS NOT NULL AND
14+
CAST(expires AS FLOAT64) >= 0
15+
)
16+
17+
SELECT
18+
age,
19+
COUNT(0) AS nbCookies
20+
FROM cookies_age
21+
GROUP BY age
22+
ORDER BY age ASC

sql/2024/cookies/nb_cookies_cdf.sql

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
-- Extract the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
WITH nb_cookies_per_website AS (
5+
SELECT
6+
firstPartyCookie,
7+
NET.HOST(page) AS pageFirstPartyHost,
8+
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
9+
FROM `httparchive.almanac.cookies`
10+
WHERE
11+
date = '2024-06-01' AND
12+
client = 'desktop' AND
13+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
14+
firstPartyCookie IS NOT NULL
15+
GROUP BY firstPartyCookie, pageFirstPartyHost
16+
)
17+
18+
SELECT
19+
firstPartyCookie,
20+
distinctNbCookies,
21+
COUNT(DISTINCT pageFirstPartyHost) AS nbWebsites
22+
FROM nb_cookies_per_website
23+
GROUP BY firstPartyCookie, distinctNbCookies
24+
ORDER BY firstPartyCookie, distinctNbCookies ASC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- Extract stats about the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
WITH nb_cookies_per_website AS (
5+
SELECT
6+
firstPartyCookie,
7+
NET.HOST(page) AS pageFirstPartyHost,
8+
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
9+
FROM `httparchive.almanac.cookies`
10+
WHERE
11+
date = '2024-06-01' AND
12+
client = 'desktop' AND
13+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
14+
firstPartyCookie IS NOT NULL
15+
GROUP BY firstPartyCookie, pageFirstPartyHost
16+
)
17+
18+
SELECT
19+
firstPartyCookie,
20+
MIN(distinctNbCookies) AS min,
21+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
22+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
23+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
24+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
25+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
26+
MAX(distinctNbCookies) AS max
27+
FROM nb_cookies_per_website
28+
GROUP BY firstPartyCookie
+25
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Extract stats about the nb of cookies
2+
-- Before running query: edit date and client
3+
4+
WITH nb_cookies_per_website AS (
5+
SELECT
6+
NET.HOST(page) AS pageFirstPartyHost,
7+
COUNT(DISTINCT CONCAT(name, domain)) AS distinctNbCookies
8+
FROM `httparchive.almanac.cookies`
9+
WHERE
10+
date = '2024-06-01' AND
11+
client = 'desktop' AND
12+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
13+
firstPartyCookie IS NOT NULL
14+
GROUP BY pageFirstPartyHost
15+
)
16+
17+
SELECT
18+
MIN(distinctNbCookies) AS min,
19+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(25)] AS p25,
20+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(50)] AS median,
21+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(75)] AS p75,
22+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(90)] AS p90,
23+
APPROX_QUANTILES(distinctNbCookies, 100)[OFFSET(99)] AS p99,
24+
MAX(distinctNbCookies) AS max
25+
FROM nb_cookies_per_website
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
-- Prevalence of cookies type and attributes per type of cookie (1st/3rd party)
2+
-- Before running query: edit date and client
3+
4+
SELECT
5+
firstPartyCookie,
6+
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
7+
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
8+
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
9+
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
10+
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
11+
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
12+
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
13+
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
14+
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
15+
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque,
16+
SUM(IF(STARTS_WITH(name, '__Host-'), 1, 0)) / COUNT(0) AS hostPrefix,
17+
SUM(IF(STARTS_WITH(name, '__Secure-'), 1, 0)) / COUNT(0) AS securePrefix
18+
FROM `httparchive.almanac.cookies`
19+
WHERE
20+
date = '2024-06-01' AND
21+
client = 'desktop' AND
22+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
23+
firstPartyCookie IS NOT NULL -- just in case
24+
GROUP BY
25+
firstPartyCookie
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
-- Prevalence of cookies type and attributes per rank grouping
2+
-- Before running query: edit date and client
3+
4+
SELECT
5+
rank_grouping,
6+
SUM(IF(firstPartyCookie = TRUE, 1, 0)) / COUNT(0) AS firstParty,
7+
SUM(IF(firstPartyCookie = FALSE, 1, 0)) / COUNT(0) AS thirdParty,
8+
SUM(IF(httpOnly = 'true', 1, 0)) / COUNT(0) AS httpOnly,
9+
SUM(IF(secure = 'true', 1, 0)) / COUNT(0) AS secure,
10+
SUM(IF(session = 'true', 1, 0)) / COUNT(0) AS session,
11+
SUM(IF(sameParty = 'true', 1, 0)) / COUNT(0) AS sameParty,
12+
SUM(IF(sameSite = 'Lax', 1, 0)) / COUNT(0) AS sameSiteLax,
13+
SUM(IF(sameSite = 'None', 1, 0)) / COUNT(0) AS sameSiteNone,
14+
SUM(IF(sameSite = 'Strict', 1, 0)) / COUNT(0) AS sameSiteStrict,
15+
SUM(IF(sameSite IS NULL, 1, 0)) / COUNT(0) AS sameSiteNull,
16+
SUM(IF(partitionKey IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKey,
17+
SUM(IF(partitionKeyOpaque IS NOT NULL, 1, 0)) / COUNT(0) AS partitionKeyOpaque
18+
FROM `httparchive.almanac.cookies`,
19+
UNNEST([1000, 10000, 100000, 1000000, 10000000, 100000000]) AS rank_grouping
20+
WHERE
21+
rank <= rank_grouping AND
22+
date = '2024-06-01' AND
23+
client = 'desktop' AND
24+
firstPartyCookie IS NOT NULL -- just in case
25+
GROUP BY
26+
rank_grouping
27+
ORDER BY
28+
rank_grouping

sql/2024/cookies/size_cookies_cdf.sql

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
-- Extract the size of the cookies
2+
-- Before running query: edit date and client
3+
4+
SELECT
5+
CAST(size AS INT) AS sizeCookie,
6+
COUNT(0) AS nbCookies
7+
FROM `httparchive.almanac.cookies`
8+
WHERE
9+
date = '2024-06-01' AND
10+
client = 'desktop' AND
11+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
12+
firstPartyCookie IS NOT NULL
13+
GROUP BY sizeCookie
14+
ORDER BY sizeCookie ASC
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
-- Extract stats around the size of the cookies
2+
-- Before running query: edit date and client
3+
4+
SELECT
5+
firstPartyCookie,
6+
MIN(CAST(size AS INT)) AS min,
7+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
8+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
9+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
10+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
11+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
12+
MAX(CAST(size AS INT)) AS max
13+
FROM `httparchive.almanac.cookies`
14+
WHERE
15+
date = '2024-06-01' AND
16+
client = 'desktop' AND
17+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
18+
firstPartyCookie IS NOT NULL
19+
GROUP BY firstPartyCookie
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
-- Extract stats around the size of the cookies
2+
-- Before running query: edit date and client
3+
4+
SELECT
5+
MIN(CAST(size AS INT)) AS min,
6+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(25)] AS p25,
7+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(50)] AS median,
8+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(75)] AS p75,
9+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(90)] AS p90,
10+
APPROX_QUANTILES(CAST(size AS INT), 100)[OFFSET(99)] AS p99,
11+
MAX(CAST(size AS INT)) AS max
12+
FROM `httparchive.almanac.cookies`
13+
WHERE
14+
date = '2024-06-01' AND
15+
client = 'desktop' AND
16+
rank <= 1000000 AND --2024 results were mainly extracted for top 1M cookies, feel free to remove this and expand in future
17+
firstPartyCookie IS NOT NULL

0 commit comments

Comments
 (0)