|
| 1 | +-- Extract to the `httparchive.almanac.cookies` table the cookies that were set |
| 2 | +-- during the <DATE> crawl on <CLIENT>. Data in this table can then be queried |
| 3 | +-- more efficiently in subsequent queries without having to re-extract it every |
| 4 | +-- time. |
| 5 | + |
| 6 | + |
| 7 | +-- Code used by @tunetheweb to create the table |
| 8 | +-- see https://github.com/HTTPArchive/almanac.httparchive.org/pull/3741#discussion_r1823153262 |
| 9 | + |
| 10 | +-- CREATE TABLE `httparchive.almanac.cookies` |
| 11 | +-- ( |
| 12 | +-- date DATE, |
| 13 | +-- client STRING, |
| 14 | +-- page STRING, |
| 15 | +-- root_page STRING, |
| 16 | +-- rank INTEGER, |
| 17 | +-- startedDateTime STRING, |
| 18 | +-- firstPartyCookie BOOL, |
| 19 | +-- name STRING, |
| 20 | +-- domain STRING, |
| 21 | +-- path STRING, |
| 22 | +-- expires STRING, |
| 23 | +-- size STRING, |
| 24 | +-- httpOnly STRING, |
| 25 | +-- secure STRING, |
| 26 | +-- session STRING, |
| 27 | +-- sameSite STRING, |
| 28 | +-- sameParty STRING, |
| 29 | +-- partitionKey STRING, |
| 30 | +-- partitionKeyOpaque STRING |
| 31 | +-- ) |
| 32 | +-- PARTITION BY date |
| 33 | +-- CLUSTER BY |
| 34 | +-- client, rank, page |
| 35 | +-- AS |
| 36 | +-- ... |
| 37 | + |
| 38 | + |
| 39 | +-- Populate `httparchive.almanac.cookies` with one row per cookie found in the |
| 40 | +-- `$.cookies` array of `custom_metrics` for every page crawled on 2024-06-01. |
| 41 | +-- Pages whose `$.cookies` array is missing or empty contribute no rows. |
| 42 | +-- NOTE: this is a plain INSERT, so re-running it for the same date appends |
| 43 | +-- duplicate rows; remove the existing rows for that date before reloading. |
| 44 | +INSERT INTO `httparchive.almanac.cookies` ( |
| 45 | +  -- Explicit column list: the insert no longer depends on the positional order |
| 46 | +  -- of the target table's columns. |
| 47 | +  date, |
| 48 | +  client, |
| 49 | +  page, |
| 50 | +  root_page, |
| 51 | +  rank, |
| 52 | +  startedDateTime, |
| 53 | +  firstPartyCookie, |
| 54 | +  name, |
| 55 | +  domain, |
| 56 | +  path, |
| 57 | +  expires, |
| 58 | +  size, |
| 59 | +  httpOnly, |
| 60 | +  secure, |
| 61 | +  session, |
| 62 | +  sameSite, |
| 63 | +  sameParty, |
| 64 | +  partitionKey, |
| 65 | +  partitionKeyOpaque |
| 66 | +) |
| 67 | +-- One row per (page, cookie): flattens the JSON cookie array of each page. |
| 68 | +WITH intermediate_cookie AS ( |
| 69 | +  SELECT |
| 70 | +    date, |
| 71 | +    client, |
| 72 | +    page, |
| 73 | +    root_page, |
| 74 | +    rank, |
| 75 | +    JSON_VALUE(summary, '$.startedDateTime') AS startedDateTime, |
| 76 | +    cookie |
| 77 | +  FROM |
| 78 | +    `httparchive.all.pages` |
| 79 | +  -- Correlated unnest written as an explicit join instead of a comma join. |
| 80 | +  CROSS JOIN |
| 81 | +    UNNEST(JSON_EXTRACT_ARRAY(custom_metrics, '$.cookies')) AS cookie |
| 82 | +  WHERE |
| 83 | +    date = DATE '2024-06-01' |
| 84 | +) |
| 85 | + |
| 86 | +SELECT |
| 87 | +  date, |
| 88 | +  client, |
| 89 | +  page, |
| 90 | +  root_page, |
| 91 | +  rank, |
| 92 | +  startedDateTime, |
| 93 | +  -- First party when the page and the cookie domain share the same registrable |
| 94 | +  -- domain. Equality is used instead of a suffix match on the page host so |
| 95 | +  -- that e.g. a page on `notexample.com` is not treated as first party for a |
| 96 | +  -- cookie whose domain is `example.com`. NULL when either side has no |
| 97 | +  -- registrable domain. |
| 98 | +  NET.REG_DOMAIN(page) = NET.REG_DOMAIN(JSON_VALUE(cookie, '$.domain')) AS firstPartyCookie, |
| 99 | +  -- Cookie attributes are stored as the raw scalar strings from the JSON. |
| 100 | +  JSON_VALUE(cookie, '$.name') AS name, |
| 101 | +  JSON_VALUE(cookie, '$.domain') AS domain, |
| 102 | +  JSON_VALUE(cookie, '$.path') AS path, |
| 103 | +  JSON_VALUE(cookie, '$.expires') AS expires, |
| 104 | +  JSON_VALUE(cookie, '$.size') AS size, |
| 105 | +  JSON_VALUE(cookie, '$.httpOnly') AS httpOnly, |
| 106 | +  JSON_VALUE(cookie, '$.secure') AS secure, |
| 107 | +  JSON_VALUE(cookie, '$.session') AS session, |
| 108 | +  JSON_VALUE(cookie, '$.sameSite') AS sameSite, |
| 109 | +  JSON_VALUE(cookie, '$.sameParty') AS sameParty, |
| 110 | +  JSON_VALUE(cookie, '$.partitionKey') AS partitionKey, |
| 111 | +  JSON_VALUE(cookie, '$.partitionKeyOpaque') AS partitionKeyOpaque |
| 112 | +FROM intermediate_cookie |
0 commit comments