|
| 1 | +# Script that parses wheel index (e.g. https://download.pytorch.org/whl/test/torch/), |
| 2 | +# fetches and validates binary size for the files that match the given regex. |
| 3 | + |
| 4 | +import requests |
| 5 | +import re |
| 6 | +from collections import namedtuple |
| 7 | +import click |
| 8 | +from bs4 import BeautifulSoup |
| 9 | +from urllib.parse import urljoin |
| 10 | + |
# A single entry of the wheel index: the link text (wheel file name) and its absolute URL.
Wheel = namedtuple("Wheel", "name url")
| 12 | + |
| 13 | + |
def parse_index(html: str,
                base_url: str,
                include_regex: str = "",
                exclude_regex: str = "",
                latest_version_only=False) -> list[Wheel]:
    """
    Parse the html page of a wheel index and return the matching wheels.

    :param html: html page
    :param base_url: base url of the page; relative hrefs are resolved against it
    :param include_regex: regex to filter the wheel names. If empty, all wheels are included
    :param exclude_regex: regex to exclude the matching wheel names. If empty, no wheels are excluded
    :param latest_version_only: if True, return the wheels of the latest version only
    :return: list of wheels
    :raises RuntimeError: if latest_version_only is set and the version prefix
        cannot be extracted from one of the selected wheel names
    """
    soup = BeautifulSoup(html, "html.parser")

    wheels = []
    for a in soup.find_all("a"):
        wheel_name = a.text
        if include_regex and not re.search(include_regex, wheel_name):
            continue
        if exclude_regex and re.search(exclude_regex, wheel_name):
            continue
        wheels.append(Wheel(name=wheel_name, url=urljoin(base_url, a.get("href"))))

    if not wheels or not latest_version_only:
        return wheels

    def version_key(prefix: str) -> list:
        # re.split with a capturing group alternates text / digit runs, so odd
        # positions are always numbers. Comparing them as ints makes
        # "2.10.0" sort after "2.9.0", which plain string comparison gets wrong.
        parts = re.split(r"(\d+)", prefix)
        return [int(part) if i % 2 else part for i, part in enumerate(parts)]

    # The version prefix is everything up to the second '+'/'-' sign,
    # i.e. "<package>-<version>" without local version or platform tags.
    prefix_pattern = re.compile(r"^([^-+]+[-+][^-+]+)[-+]")
    prefixed_wheels = []
    for wheel in wheels:
        match = prefix_pattern.search(wheel.name)
        if match is None:
            raise RuntimeError(f"Failed to get version prefix of {wheel.name}. "
                               "Please check the include/exclude regex or don't use --only-latest-version")
        prefixed_wheels.append((match.group(1), wheel))

    latest_version = max((prefix for prefix, _ in prefixed_wheels), key=version_key)
    print(f"Latest version prefix: {latest_version}")

    # Keep only the wheels whose prefix is exactly the latest one; a plain
    # startswith() could also match a different version sharing the same text.
    return [wheel for prefix, wheel in prefixed_wheels if prefix == latest_version]
| 55 | + |
| 56 | + |
def get_binary_size(file_url: str) -> int:
    """
    Get the binary size of the given file from the Content-Length header
    of a HEAD request (the file itself is not downloaded).

    :param file_url: url of the file
    :return: binary size in bytes
    :raises requests.HTTPError: if the server answers with an error status
    :raises RuntimeError: if the response carries no Content-Length header
    """
    # requests.head() does not follow redirects by default; without following
    # them the Content-Length of the redirect response would be reported.
    # The timeout keeps an unresponsive server from hanging the run forever.
    response = requests.head(file_url, allow_redirects=True, timeout=60)
    # An error page has its own Content-Length, so fail instead of measuring it.
    response.raise_for_status()
    content_length = response.headers.get("Content-Length")
    if content_length is None:
        raise RuntimeError(f"No Content-Length header returned for {file_url}")
    return int(content_length)
| 64 | + |
| 65 | + |
@click.command(
    help="Validate the binary sizes of the given wheel index."
)
@click.option("--url", help="url of the wheel index",
              default="https://download.pytorch.org/whl/nightly/torch/")
@click.option("--include", help="regex to filter the wheel names. Only the matching wheel names will be checked.",
              default="")
@click.option("--exclude", help="regex to exclude wheel names. Matching wheel names will NOT be checked.",
              default="")
@click.option("--threshold", help="threshold in MB, optional", default=0)
@click.option("--only-latest-version", help="only validate the latest version",
              is_flag=True, show_default=True, default=False)
def main(url, include, exclude, threshold, only_latest_version):
    """
    Fetch the wheel index, print the size of every selected wheel and fail
    on the first wheel that is larger than the threshold.

    :param url: url of the wheel index
    :param include: regex the wheel names must match; empty selects all wheels
    :param exclude: regex of wheel names to skip; empty skips none
    :param threshold: maximum allowed size in MB; 0 disables the check
    :param only_latest_version: if True, only wheels of the latest version are checked
    :raises RuntimeError: if a wheel exceeds the threshold
    """
    page = requests.get(url, timeout=60)
    # Do not try to parse an error page as a wheel index.
    page.raise_for_status()
    wheels = parse_index(page.text, url, include, exclude, only_latest_version)
    for wheel in wheels:
        print(f"Validating {wheel.url}...")
        size_mb = get_binary_size(wheel.url) / 1024 / 1024
        print(f"{wheel.name}: {size_mb:.2f} MB")
        # The threshold is given in MB, so compare it with the size in MB
        # rather than with the raw byte count.
        if threshold and size_mb > threshold:
            raise RuntimeError(
                f"Binary size of {wheel.name} {size_mb:.2f} MB exceeds the threshold {threshold} MB")


# Run the command line interface only when executed as a script.
if __name__ == "__main__":
    main()
0 commit comments