@@ -8,83 +8,74 @@ WITH redirect_requests AS (
8
8
index,
9
9
response_headers,
10
10
page
11
- FROM ` httparchive.all .requests`
11
+ FROM ` httparchive.crawl .requests`
12
12
WHERE
13
13
date = ' 2024-06-01' AND
14
14
is_root_page = TRUE AND
15
15
type NOT IN (' css' , ' image' , ' font' , ' video' , ' audio' ) AND
16
- LEFT(JSON_VALUE (summary, ' $ .status' ), 1 ) = ' 3 ' AND
16
+ -- Keep exactly the 3xx status range: integer division maps 300-399 to 3
+ DIV(INT64(summary.status), 100) = 3 AND
17
17
index <= 2
18
18
), navigation_redirect AS (
19
19
-- First hop: the redirect request with index = 1 whose Location header points to a different registrable domain than the page
20
20
SELECT
21
21
client,
22
22
url,
23
23
page,
24
- headers .value AS navigation_redirect_location
24
+ response_header .value AS navigation_redirect_location
25
25
-- UNNEST yields one row per response header; the WHERE clause keeps only the Location header
FROM redirect_requests,
26
- UNNEST(response_headers) AS headers
26
+ UNNEST(response_headers) AS response_header
27
27
WHERE
28
28
index = 1 AND
29
- LOWER (headers .name ) = ' location' AND
30
- NET .REG_DOMAIN (page ) != NET .REG_DOMAIN (headers . value )
29
+ LOWER (response_header .name ) = ' location' AND
30
+ NET .REG_DOMAIN (response_header . value ) != NET .REG_DOMAIN (page )
31
31
), bounce_redirect AS (
32
32
-- Second hop: the redirect request with index = 2 that carries a Location header; its full response_headers array is kept alongside the redirect target
33
33
SELECT
34
34
client,
35
35
url,
36
36
page,
37
- headers .value AS bounce_redirect_location,
37
+ response_header .value AS bounce_redirect_location,
38
38
response_headers
39
39
FROM redirect_requests,
40
- UNNEST(response_headers) AS headers
40
+ UNNEST(response_headers) AS response_header
41
41
WHERE
42
42
index = 2 AND
43
- LOWER (headers .name ) = ' location' AND
44
- NET .REG_DOMAIN (headers .value ) = NET .REG_DOMAIN (page)
45
- ), bounce_redirect_with_cookies AS (
46
- -- Find the cookies set during the second navigation redirect
47
- SELECT
48
- client,
49
- url,
50
- page,
51
- bounce_redirect_location
52
- -- response_headers.value AS bounce_tracking_cookies
53
- FROM bounce_redirect,
54
- UNNEST(response_headers) AS response_headers
55
- WHERE
56
- LOWER (response_headers .name ) = ' set-cookie'
43
+ LOWER (response_header .name ) = ' location'
57
44
), bounce_sequences AS (
58
45
-- Pair each first-hop redirect with a second-hop request whose url equals the first hop's redirect target, then count distinct pages per client and registrable domain of that target
59
46
SELECT
60
47
nav .client ,
61
- nav .page ,
62
- nav .url AS navigation_url,
63
- nav .navigation_redirect_location ,
64
- bounce .bounce_redirect_location
48
+ NET .REG_DOMAIN (navigation_redirect_location) AS bounce_hostname,
49
+ COUNT (DISTINCT nav .page ) AS number_of_pages
65
50
-- ARRAY_AGG(bounce.bounce_tracking_cookies) AS bounce_tracking_cookies
66
51
FROM navigation_redirect AS nav
67
- LEFT JOIN bounce_redirect_with_cookies AS bounce
52
+ LEFT JOIN bounce_redirect AS bounce
68
53
ON
69
54
nav .client = bounce .client AND
70
55
nav .page = bounce .page AND
71
56
nav .navigation_redirect_location = bounce .url
72
57
-- Drops first-hop redirects without a matching second hop (unmatched LEFT JOIN rows are NULL here)
WHERE bounce_redirect_location IS NOT NULL
73
58
GROUP BY
74
59
nav .client ,
75
- page,
76
- navigation_url,
77
- navigation_redirect_location,
78
- bounce_redirect_location
60
+ bounce_hostname
61
+ ), pages_total AS (
-- Denominator for the share calculation: distinct root pages per client on the same crawl date
62
+ SELECT
63
+ client,
64
+ COUNT (DISTINCT page) AS total_pages
65
+ FROM ` httparchive.crawl.pages`
66
+ WHERE date = ' 2024-06-01' AND
67
+ is_root_page
68
+ GROUP BY client
79
69
)
80
70
81
71
-- Report, per client and bounce domain, the number of pages and that count as a ratio (0-1) of the client's total root pages; keeps the 100 rows with the most pages across all clients
82
72
SELECT
83
73
client,
84
- NET . HOST (navigation_redirect_location) AS bounce_hostname,
85
- COUNT (DISTINCT page) AS number_of_pages
86
- -- ARRAY_AGG(page LIMIT 2) AS page_examples
74
+ bounce_hostname,
75
+ number_of_pages,
76
+ number_of_pages / total_pages AS pct_pages
87
77
FROM bounce_sequences
88
- GROUP BY client, bounce_hostname
78
+ JOIN pages_total
79
+ USING (client)
89
80
ORDER BY number_of_pages DESC
90
81
LIMIT 100
0 commit comments