Skip to content

Commit 906511f

Browse files
Ignore empty or missing hrefs in Simple HTML (#10276)
## Summary

Closes #7735.

## Test Plan

`cargo run pip install -f https://whl.smartgic.io/ ggwave --python-platform linux` (fails prior to this PR; passes after)
1 parent d1a5a27 commit 906511f

File tree

1 file changed

+62
-11
lines changed

1 file changed

+62
-11
lines changed

crates/uv-client/src/html.rs

+62-11
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::str::FromStr;
22

3-
use tl::{HTMLTag, Parser};
3+
use tl::HTMLTag;
44
use tracing::{instrument, warn};
55
use url::Url;
66

@@ -44,7 +44,12 @@ impl SimpleHtml {
4444
.iter()
4545
.filter_map(|node| node.as_tag())
4646
.filter(|link| link.name().as_bytes() == b"a")
47-
.map(|link| Self::parse_anchor(link, dom.parser()))
47+
.map(|link| Self::parse_anchor(link))
48+
.filter_map(|result| match result {
49+
Ok(None) => None,
50+
Ok(Some(file)) => Some(Ok(file)),
51+
Err(err) => Some(Err(err)),
52+
})
4853
.collect::<Result<Vec<_>, _>>()?;
4954
// While it has not been positively observed, we sort the files
5055
// to ensure we have a defined ordering. Otherwise, if we rely on
@@ -70,14 +75,18 @@ impl SimpleHtml {
7075
}
7176

7277
/// Parse a [`File`] from an `<a>` tag.
73-
fn parse_anchor(link: &HTMLTag, parser: &Parser) -> Result<File, Error> {
78+
///
79+
/// Returns `None` if the `<a>` tag doesn't have an `href` attribute, or if the `href` is empty.
80+
fn parse_anchor(link: &HTMLTag) -> Result<Option<File>, Error> {
7481
// Extract the href.
75-
let href = link
82+
let Some(href) = link
7683
.attributes()
7784
.get("href")
7885
.flatten()
7986
.filter(|bytes| !bytes.as_bytes().is_empty())
80-
.ok_or(Error::MissingHref(link.inner_text(parser).to_string()))?;
87+
else {
88+
return Ok(None);
89+
};
8190
let href = std::str::from_utf8(href.as_bytes())?;
8291

8392
// Extract the hash, which should be in the fragment.
@@ -158,7 +167,7 @@ impl SimpleHtml {
158167
None
159168
};
160169

161-
Ok(File {
170+
Ok(Some(File {
162171
core_metadata,
163172
dist_info_metadata: None,
164173
data_dist_info_metadata: None,
@@ -169,7 +178,7 @@ impl SimpleHtml {
169178
url: decoded.to_string(),
170179
size: None,
171180
upload_time: None,
172-
})
181+
}))
173182
}
174183
}
175184

@@ -628,8 +637,29 @@ mod tests {
628637
<!--TIMESTAMP 1703347410-->
629638
";
630639
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
631-
let result = SimpleHtml::parse(text, &base).unwrap_err();
632-
insta::assert_snapshot!(result, @"Missing href attribute on anchor link: `Jinja2-3.1.2-py3-none-any.whl`");
640+
let result = SimpleHtml::parse(text, &base).unwrap();
641+
insta::assert_debug_snapshot!(result, @r###"
642+
SimpleHtml {
643+
base: BaseUrl(
644+
Url {
645+
scheme: "https",
646+
cannot_be_a_base: false,
647+
username: "",
648+
password: None,
649+
host: Some(
650+
Domain(
651+
"download.pytorch.org",
652+
),
653+
),
654+
port: None,
655+
path: "/whl/jinja2/",
656+
query: None,
657+
fragment: None,
658+
},
659+
),
660+
files: [],
661+
}
662+
"###);
633663
}
634664

635665
#[test]
@@ -645,8 +675,29 @@ mod tests {
645675
<!--TIMESTAMP 1703347410-->
646676
"#;
647677
let base = Url::parse("https://download.pytorch.org/whl/jinja2/").unwrap();
648-
let result = SimpleHtml::parse(text, &base).unwrap_err();
649-
insta::assert_snapshot!(result, @"Missing href attribute on anchor link: `Jinja2-3.1.2-py3-none-any.whl`");
678+
let result = SimpleHtml::parse(text, &base).unwrap();
679+
insta::assert_debug_snapshot!(result, @r###"
680+
SimpleHtml {
681+
base: BaseUrl(
682+
Url {
683+
scheme: "https",
684+
cannot_be_a_base: false,
685+
username: "",
686+
password: None,
687+
host: Some(
688+
Domain(
689+
"download.pytorch.org",
690+
),
691+
),
692+
port: None,
693+
path: "/whl/jinja2/",
694+
query: None,
695+
fragment: None,
696+
},
697+
),
698+
files: [],
699+
}
700+
"###);
650701
}
651702

652703
#[test]

0 commit comments

Comments
 (0)