1
1
use std:: str:: FromStr ;
2
2
3
- use tl:: { HTMLTag , Parser } ;
3
+ use tl:: HTMLTag ;
4
4
use tracing:: { instrument, warn} ;
5
5
use url:: Url ;
6
6
@@ -44,7 +44,12 @@ impl SimpleHtml {
44
44
. iter ( )
45
45
. filter_map ( |node| node. as_tag ( ) )
46
46
. filter ( |link| link. name ( ) . as_bytes ( ) == b"a" )
47
- . map ( |link| Self :: parse_anchor ( link, dom. parser ( ) ) )
47
+ . map ( |link| Self :: parse_anchor ( link) )
48
+ . filter_map ( |result| match result {
49
+ Ok ( None ) => None ,
50
+ Ok ( Some ( file) ) => Some ( Ok ( file) ) ,
51
+ Err ( err) => Some ( Err ( err) ) ,
52
+ } )
48
53
. collect :: < Result < Vec < _ > , _ > > ( ) ?;
49
54
// While it has not been positively observed, we sort the files
50
55
// to ensure we have a defined ordering. Otherwise, if we rely on
@@ -70,14 +75,18 @@ impl SimpleHtml {
70
75
}
71
76
72
77
/// Parse a [`File`] from an `<a>` tag.
73
- fn parse_anchor ( link : & HTMLTag , parser : & Parser ) -> Result < File , Error > {
78
+ ///
79
+ /// Returns `None` if the `<a>` tag doesn't have an `href` attribute.
80
+ fn parse_anchor ( link : & HTMLTag ) -> Result < Option < File > , Error > {
74
81
// Extract the href.
75
- let href = link
82
+ let Some ( href) = link
76
83
. attributes ( )
77
84
. get ( "href" )
78
85
. flatten ( )
79
86
. filter ( |bytes| !bytes. as_bytes ( ) . is_empty ( ) )
80
- . ok_or ( Error :: MissingHref ( link. inner_text ( parser) . to_string ( ) ) ) ?;
87
+ else {
88
+ return Ok ( None ) ;
89
+ } ;
81
90
let href = std:: str:: from_utf8 ( href. as_bytes ( ) ) ?;
82
91
83
92
// Extract the hash, which should be in the fragment.
@@ -158,7 +167,7 @@ impl SimpleHtml {
158
167
None
159
168
} ;
160
169
161
- Ok ( File {
170
+ Ok ( Some ( File {
162
171
core_metadata,
163
172
dist_info_metadata : None ,
164
173
data_dist_info_metadata : None ,
@@ -169,7 +178,7 @@ impl SimpleHtml {
169
178
url : decoded. to_string ( ) ,
170
179
size : None ,
171
180
upload_time : None ,
172
- } )
181
+ } ) )
173
182
}
174
183
}
175
184
@@ -628,8 +637,29 @@ mod tests {
628
637
<!--TIMESTAMP 1703347410-->
629
638
" ;
630
639
let base = Url :: parse ( "https://download.pytorch.org/whl/jinja2/" ) . unwrap ( ) ;
631
- let result = SimpleHtml :: parse ( text, & base) . unwrap_err ( ) ;
632
- insta:: assert_snapshot!( result, @"Missing href attribute on anchor link: `Jinja2-3.1.2-py3-none-any.whl`" ) ;
640
+ let result = SimpleHtml :: parse ( text, & base) . unwrap ( ) ;
641
+ insta:: assert_debug_snapshot!( result, @r###"
642
+ SimpleHtml {
643
+ base: BaseUrl(
644
+ Url {
645
+ scheme: "https",
646
+ cannot_be_a_base: false,
647
+ username: "",
648
+ password: None,
649
+ host: Some(
650
+ Domain(
651
+ "download.pytorch.org",
652
+ ),
653
+ ),
654
+ port: None,
655
+ path: "/whl/jinja2/",
656
+ query: None,
657
+ fragment: None,
658
+ },
659
+ ),
660
+ files: [],
661
+ }
662
+ "### ) ;
633
663
}
634
664
635
665
#[ test]
@@ -645,8 +675,29 @@ mod tests {
645
675
<!--TIMESTAMP 1703347410-->
646
676
"# ;
647
677
let base = Url :: parse ( "https://download.pytorch.org/whl/jinja2/" ) . unwrap ( ) ;
648
- let result = SimpleHtml :: parse ( text, & base) . unwrap_err ( ) ;
649
- insta:: assert_snapshot!( result, @"Missing href attribute on anchor link: `Jinja2-3.1.2-py3-none-any.whl`" ) ;
678
+ let result = SimpleHtml :: parse ( text, & base) . unwrap ( ) ;
679
+ insta:: assert_debug_snapshot!( result, @r###"
680
+ SimpleHtml {
681
+ base: BaseUrl(
682
+ Url {
683
+ scheme: "https",
684
+ cannot_be_a_base: false,
685
+ username: "",
686
+ password: None,
687
+ host: Some(
688
+ Domain(
689
+ "download.pytorch.org",
690
+ ),
691
+ ),
692
+ port: None,
693
+ path: "/whl/jinja2/",
694
+ query: None,
695
+ fragment: None,
696
+ },
697
+ ),
698
+ files: [],
699
+ }
700
+ "### ) ;
650
701
}
651
702
652
703
#[ test]
0 commit comments