Skip to content

Commit de56957

Browse files
committed
automatically add robots=none meta-tag to every saved document
1 parent 4a6dadc commit de56957

File tree

9 files changed

+144
-78
lines changed

9 files changed

+144
-78
lines changed

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ authors = [
1010
"Andriy Rakhnin <[email protected]>",
1111
]
1212
edition = "2021"
13-
description = "CLI tool for saving web pages as a single HTML file"
13+
description = "CLI tool and library for saving web pages as a single HTML file"
1414
homepage = "https://github.com/Y2Z/monolith"
1515
repository = "https://github.com/Y2Z/monolith"
1616
readme = "README.md"

src/core.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@ use url::Url;
1515
use crate::cache::Cache;
1616
use crate::cookies::Cookie;
1717
use crate::html::{
18-
add_favicon, create_metadata_tag, get_base_url, get_charset, get_title, has_favicon,
19-
html_to_dom, serialize_document, set_base_url, set_charset, walk_and_embed_assets,
18+
add_favicon, create_metadata_tag, get_base_url, get_charset, get_robots, get_title,
19+
has_favicon, html_to_dom, serialize_document, set_base_url, set_charset, set_robots,
20+
walk_and_embed_assets,
2021
};
2122
use crate::url::{clean_url, create_data_url, get_referer_url, parse_data_url, resolve_url};
2223

@@ -263,6 +264,13 @@ pub fn create_monolithic_document_from_data(
263264
}
264265
}
265266

267+
// Append noindex META-tag
268+
if let meta_robots_content_value = get_robots(&dom.document).unwrap_or_default() {
269+
if meta_robots_content_value.trim().is_empty() || meta_robots_content_value != "none" {
270+
dom = set_robots(dom, "none");
271+
}
272+
}
273+
266274
// Save using specified charset, if given
267275
if let Some(custom_encoding) = options.encoding.clone() {
268276
document_encoding = custom_encoding;

src/html.rs

Lines changed: 63 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,20 @@ pub fn get_parent_node(child: &Handle) -> Handle {
300300
parent.and_then(|node| node.upgrade()).unwrap()
301301
}
302302

303+
pub fn get_robots(handle: &Handle) -> Option<String> {
304+
for meta_node in find_nodes(handle, vec!["html", "head", "meta"]).iter() {
305+
// Only the first base tag matters (we ignore the rest, if there's any)
306+
if get_node_attr(meta_node, "name")
307+
.unwrap_or_default()
308+
.eq_ignore_ascii_case("robots")
309+
{
310+
return get_node_attr(meta_node, "content");
311+
}
312+
}
313+
314+
None
315+
}
316+
303317
pub fn get_title(node: &Handle) -> Option<String> {
304318
for title_node in find_nodes(node, vec!["html", "head", "title"]).iter() {
305319
for child_node in title_node.children.borrow().iter() {
@@ -436,7 +450,7 @@ pub fn parse_srcset(srcset: &str) -> Vec<SrcSetItem> {
436450
srcset_items
437451
}
438452

439-
pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
453+
pub fn set_base_url(document: &Handle, base_href_value: String) -> RcDom {
440454
let mut buf: Vec<u8> = Vec::new();
441455
serialize(
442456
&mut buf,
@@ -450,14 +464,14 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
450464
if let Some(head_node) = get_child_node_by_name(&html_node, "head") {
451465
// Check if BASE node already exists in the DOM tree
452466
if let Some(base_node) = get_child_node_by_name(&head_node, "base") {
453-
set_node_attr(&base_node, "href", Some(desired_base_href));
467+
set_node_attr(&base_node, "href", Some(base_href_value));
454468
} else {
455469
let base_node = create_element(
456470
&dom,
457471
QualName::new(None, ns!(), LocalName::from("base")),
458472
vec![Attribute {
459473
name: QualName::new(None, ns!(), LocalName::from("href")),
460-
value: format_tendril!("{}", desired_base_href),
474+
value: format_tendril!("{}", base_href_value),
461475
}],
462476
);
463477

@@ -470,10 +484,10 @@ pub fn set_base_url(document: &Handle, desired_base_href: String) -> RcDom {
470484
dom
471485
}
472486

473-
pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
487+
pub fn set_charset(dom: RcDom, charset: String) -> RcDom {
474488
for meta_node in find_nodes(&dom.document, vec!["html", "head", "meta"]).iter() {
475489
if get_node_attr(meta_node, "charset").is_some() {
476-
set_node_attr(meta_node, "charset", Some(desired_charset));
490+
set_node_attr(meta_node, "charset", Some(charset));
477491
return dom;
478492
}
479493

@@ -485,7 +499,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
485499
set_node_attr(
486500
meta_node,
487501
"content",
488-
Some(format!("text/html;charset={}", desired_charset)),
502+
Some(format!("text/html;charset={}", charset)),
489503
);
490504
return dom;
491505
}
@@ -498,7 +512,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
498512
QualName::new(None, ns!(), LocalName::from("meta")),
499513
vec![Attribute {
500514
name: QualName::new(None, ns!(), LocalName::from("charset")),
501-
value: format_tendril!("{}", desired_charset),
515+
value: format_tendril!("{}", charset),
502516
}],
503517
);
504518

@@ -508,6 +522,7 @@ pub fn set_charset(dom: RcDom, desired_charset: String) -> RcDom {
508522
.children
509523
.borrow_mut()
510524
.push(meta_charset_node.clone());
525+
break;
511526
}
512527
}
513528

@@ -551,6 +566,47 @@ pub fn set_node_attr(node: &Handle, attr_name: &str, attr_value: Option<String>)
551566
};
552567
}
553568

569+
pub fn set_robots(dom: RcDom, content_value: &str) -> RcDom {
570+
for meta_node in find_nodes(&dom.document, vec!["html", "head", "meta"]).iter() {
571+
if get_node_attr(meta_node, "name")
572+
.unwrap_or_default()
573+
.eq_ignore_ascii_case("robots")
574+
{
575+
set_node_attr(meta_node, "content", Some(content_value.to_string()));
576+
return dom;
577+
}
578+
}
579+
580+
// Manually append robots META node to HEAD
581+
{
582+
let meta_charset_node: Handle = create_element(
583+
&dom,
584+
QualName::new(None, ns!(), LocalName::from("meta")),
585+
vec![
586+
Attribute {
587+
name: QualName::new(None, ns!(), LocalName::from("name")),
588+
value: format_tendril!("robots"),
589+
},
590+
Attribute {
591+
name: QualName::new(None, ns!(), LocalName::from("content")),
592+
value: format_tendril!("{}", content_value),
593+
},
594+
],
595+
);
596+
597+
// Insert newly created META charset node into HEAD
598+
for head_node in find_nodes(&dom.document, vec!["html", "head"]).iter() {
599+
head_node
600+
.children
601+
.borrow_mut()
602+
.push(meta_charset_node.clone());
603+
break;
604+
}
605+
}
606+
607+
dom
608+
}
609+
554610
pub fn serialize_document(dom: RcDom, document_encoding: String, options: &Options) -> Vec<u8> {
555611
let mut buf: Vec<u8> = Vec::new();
556612

tests/cli/base_url.rs

Lines changed: 8 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,8 @@ mod passing {
2828
// STDOUT should contain newly added base URL
2929
assert_eq!(
3030
String::from_utf8_lossy(&out.stdout),
31-
"<html><head>\
32-
<base href=\"http://localhost:30701/\"></base>\
33-
</head><body>Hello, World!</body></html>\n"
31+
r#"<html><head><base href="http://localhost:30701/"></base><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
32+
"#
3433
);
3534

3635
// Exit code should be 0
@@ -52,9 +51,8 @@ mod passing {
5251
// STDOUT should contain newly added base URL
5352
assert_eq!(
5453
String::from_utf8_lossy(&out.stdout),
55-
"<html><head>\
56-
<base href=\"http://localhost:30701/\">\
57-
</head><body>Hello, World!</body></html>\n"
54+
r#"<html><head><base href="http://localhost:30701/"><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
55+
"#
5856
);
5957

6058
// Exit code should be 0
@@ -78,9 +76,8 @@ mod passing {
7876
// STDOUT should contain newly added base URL
7977
assert_eq!(
8078
String::from_utf8_lossy(&out.stdout),
81-
"<html><head>\
82-
<base href=\"http://localhost/\">\
83-
</head><body>Hello, World!</body></html>\n"
79+
r#"<html><head><base href="http://localhost/"><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
80+
"#
8481
);
8582

8683
// Exit code should be 0
@@ -104,9 +101,8 @@ mod passing {
104101
// STDOUT should contain newly added base URL
105102
assert_eq!(
106103
String::from_utf8_lossy(&out.stdout),
107-
"<html><head>\
108-
<base href=\"\">\
109-
</head><body>Hello, World!</body></html>\n"
104+
r#"<html><head><base href=""><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
105+
"#
110106
);
111107

112108
// Exit code should be 0

tests/cli/basic.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ mod passing {
6767
// STDOUT should contain HTML created out of STDIN
6868
assert_eq!(
6969
String::from_utf8_lossy(&out.stdout),
70-
r#"<html><head></head><body>Hello from STDIN
70+
r#"<html><head><meta name="robots" content="none"></meta></head><body>Hello from STDIN
7171
</body></html>
7272
"#
7373
);
@@ -116,7 +116,7 @@ mod passing {
116116
@import url("data:text/css;base64,Ym9keXtiYWNrZ3JvdW5kLWNvbG9yOiMwMDA7Y29sb3I6I2ZmZn0K");
117117
118118
</style>
119-
</head><body></body></html>
119+
<meta name="robots" content="none"></meta></head><body></body></html>
120120
"##
121121
);
122122

tests/cli/data_url.rs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ mod passing {
2929
// STDOUT should contain isolated HTML
3030
assert_eq!(
3131
String::from_utf8_lossy(&out.stdout),
32-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="default-src 'unsafe-eval' 'unsafe-inline' data:;"></meta></head><body>Hello, World!</body></html>
32+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="default-src 'unsafe-eval' 'unsafe-inline' data:;"></meta><meta name="robots" content="none"></meta></head><body>Hello, World!</body></html>
3333
"#
3434
);
3535

@@ -53,7 +53,7 @@ mod passing {
5353
// STDOUT should contain HTML with no CSS
5454
assert_eq!(
5555
String::from_utf8_lossy(&out.stdout),
56-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="style-src 'none';"></meta><style></style></head><body>Hello</body></html>
56+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="style-src 'none';"></meta><style></style><meta name="robots" content="none"></meta></head><body>Hello</body></html>
5757
"#
5858
);
5959

@@ -77,7 +77,7 @@ mod passing {
7777
// STDOUT should contain HTML with no web fonts
7878
assert_eq!(
7979
String::from_utf8_lossy(&out.stdout),
80-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="font-src 'none';"></meta><style></style></head><body>Hi</body></html>
80+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="font-src 'none';"></meta><style></style><meta name="robots" content="none"></meta></head><body>Hi</body></html>
8181
"#
8282
);
8383

@@ -101,7 +101,7 @@ mod passing {
101101
// STDOUT should contain HTML with no iframes
102102
assert_eq!(
103103
String::from_utf8_lossy(&out.stdout),
104-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="frame-src 'none'; child-src 'none';"></meta></head><body><iframe src=""></iframe>Hi</body></html>
104+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="frame-src 'none'; child-src 'none';"></meta><meta name="robots" content="none"></meta></head><body><iframe src=""></iframe>Hi</body></html>
105105
"#
106106
);
107107

@@ -126,7 +126,7 @@ mod passing {
126126
assert_eq!(
127127
String::from_utf8_lossy(&out.stdout),
128128
format!(
129-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="img-src data:;"></meta></head><body><img src="{empty_image}">Hi</body></html>
129+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="img-src data:;"></meta><meta name="robots" content="none"></meta></head><body><img src="{empty_image}">Hi</body></html>
130130
"#,
131131
empty_image = EMPTY_IMAGE_DATA_URL,
132132
)
@@ -152,7 +152,7 @@ mod passing {
152152
// STDOUT should contain HTML with no JS
153153
assert_eq!(
154154
String::from_utf8_lossy(&out.stdout),
155-
r#"<html><head><meta http-equiv="Content-Security-Policy" content="script-src 'none';"></meta><script></script></head><body>Hi</body></html>
155+
r#"<html><head><meta http-equiv="Content-Security-Policy" content="script-src 'none';"></meta><script></script><meta name="robots" content="none"></meta></head><body>Hi</body></html>
156156
"#
157157
);
158158

@@ -204,7 +204,8 @@ mod failing {
204204
// STDOUT should contain HTML without contents of local JS file
205205
assert_eq!(
206206
String::from_utf8_lossy(&out.stdout),
207-
"<html><head><script></script></head><body></body></html>\n"
207+
r#"<html><head><script></script><meta name="robots" content="none"></meta></head><body></body></html>
208+
"#
208209
);
209210

210211
// Exit code should be 0

tests/cli/local_files.rs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ mod passing {
5757
<title>Local HTML file</title>
5858
<link href="data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNmZmY7Cn0K" rel="stylesheet" type="text/css">
5959
<link rel="stylesheet" type="text/css">
60-
</head>
60+
<meta name="robots" content="none"></meta></head>
6161
6262
<body>
6363
<img alt="">
@@ -107,7 +107,7 @@ document.body.style.color = "red";
107107
<title>Local HTML file</title>
108108
<link rel="stylesheet" type="text/css">
109109
<link rel="stylesheet" type="text/css">
110-
</head>
110+
<meta name="robots" content="none"></meta></head>
111111
112112
<body>
113113
<img src="{empty_image}" alt="">
@@ -166,7 +166,7 @@ document.body.style.color = "red";
166166
<title>Local HTML file</title>
167167
<link rel="stylesheet" type="text/css">
168168
<link rel="stylesheet" type="text/css">
169-
</head>
169+
<meta name="robots" content="none"></meta></head>
170170
171171
<body>
172172
<img src="{empty_image}" alt="">
@@ -209,7 +209,7 @@ document.body.style.color = "red";
209209
// STDOUT should contain HTML with date URL for background-image in it
210210
assert_eq!(
211211
String::from_utf8_lossy(&out.stdout),
212-
r##"<html><head></head><body><div style="background-image: url(&quot;&quot;)"></div>
212+
r##"<html><head><meta name="robots" content="none"></meta></head><body><div style="background-image: url(&quot;&quot;)"></div>
213213
</body></html>
214214
"##
215215
);
@@ -241,7 +241,7 @@ document.body.style.color = "red";
241241
// STDOUT should contain HTML with one symbol extracted from SVG file
242242
assert_eq!(
243243
String::from_utf8_lossy(&out.stdout),
244-
r##"<html><head></head><body>
244+
r##"<html><head><meta name="robots" content="none"></meta></head><body>
245245
<button class="tm-votes-lever__button" data-test-id="votes-lever-upvote-button" title="Like" type="button">
246246
<svg class="tm-svg-img tm-votes-lever__icon" height="24" width="24">
247247
<title>Like</title>
@@ -283,7 +283,7 @@ document.body.style.color = "red";
283283
// STDOUT should contain HTML with data URL of SVG file
284284
assert_eq!(
285285
String::from_utf8_lossy(&out.stdout),
286-
r##"<html><head></head><body>
286+
r##"<html><head><meta name="robots" content="none"></meta></head><body>
287287
<svg height="24" width="24">
288288
<image href="" width="24" height="24">
289289
</image></svg>
@@ -348,7 +348,7 @@ document.body.style.color = "red";
348348
<title>Local HTML file</title>
349349
<link href="data:text/css;base64,Ym9keSB7CiAgICBiYWNrZ3JvdW5kLWNvbG9yOiAjMDAwOwogICAgY29sb3I6ICNGRkY7Cn0K" rel="stylesheet" type="text/css" crossorigin="anonymous">
350350
<link href="style.css" rel="stylesheet" type="text/css" crossorigin="anonymous">
351-
</head>
351+
<meta name="robots" content="none"></meta></head>
352352
353353
<body>
354354
<p>This page should have black background and white foreground, but only when served via http: (not via file:)</p>

0 commit comments

Comments
 (0)