From fa918bb4f99aac5014ce386481acab24757832b5 Mon Sep 17 00:00:00 2001
From: Dave Turner
Date: Mon, 23 Jun 2025 15:22:08 +0100
Subject: [PATCH] Add method to sanitize urls

This method strips auth, path, query and fragment parts of the URL.
---
 tests/test_urls.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 xocto/urls.py      | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/tests/test_urls.py b/tests/test_urls.py
index 914c7083..2004a348 100644
--- a/tests/test_urls.py
+++ b/tests/test_urls.py
@@ -161,3 +161,53 @@ def test_setting_destination_and_upload(self):
             "MAM/TO_OE",
             "MAM/TO_OE/pending",
         )
+
+
+class TestSanitizeURL:
+    def test_removes_query_params(self):
+        """
+        Removes query params from the URL.
+        """
+        url = "https://example.com/path?query=param"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url == "https://example.com/"
+
+    def test_removes_fragment(self):
+        """
+        Removes fragment from the URL.
+        """
+        url = "https://example.com/path#fragment"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url == "https://example.com/"
+
+    def test_removes_auth(self):
+        """
+        Removes auth from the URL.
+        """
+        url = "https://user:test@localhost:8080/path"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url == "https://localhost:8080/"
+
+    def test_removes_path(self):
+        """
+        Removes path from the URL.
+        """
+        url = "https://example.com/path/"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url == "https://example.com/"
+
+    def test_keeps_url_without_scheme(self):
+        """
+        Keeps the host of a URL that has no scheme.
+        """
+        url = "example.com/path"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url == "example.com/"
+
+    def test_returns_none_without_host(self):
+        """
+        Returns None when the URL has no host.
+        """
+        url = "/path/only"
+        sanitized_url = urls.sanitize_url(url)
+        assert sanitized_url is None
diff --git a/xocto/urls.py b/xocto/urls.py
index e6c5c400..77e362ec 100644
--- a/xocto/urls.py
+++ b/xocto/urls.py
@@ -3,6 +3,9 @@
 import os
 from urllib import parse
 
+from urllib3 import exceptions as urllib3_exceptions
+from urllib3 import util as urllib3_util
+
 
 def pop_url_query_param(url: str, key: str) -> tuple[str, str | None]:
     """
@@ -144,3 +147,36 @@ def _fix_url_scheme(*, old_url: str, new_url: str) -> str:
         segments = new_url.split(":", maxsplit=1)
         new_url = segments[0] + "://" + segments[1]
     return new_url
+
+
+def sanitize_url(url: str) -> str | None:
+    """
+    Sanitize a URL by removing its auth, path, query and fragment parts.
+
+    Only the scheme (if present), host and port (if present) are kept, followed by a trailing
+    slash. Input without a scheme is parsed as starting with the host, so `example.com/path`
+    becomes `example.com/`. `None` is returned if the URL cannot be parsed or has no host.
+
+    E.g.
+    >>> sanitize_url('https://user:pass@localhost:8080/path?query#fragment')
+    'https://localhost:8080/'
+    >>> sanitize_url('ftp://example.com:21')
+    'ftp://example.com:21/'
+    >>> sanitize_url('example.com/path')
+    'example.com/'
+    >>> sanitize_url('/path/only') is None
+    True
+    """
+    try:
+        parsed = urllib3_util.parse_url(url)
+    except urllib3_exceptions.LocationParseError:
+        return None
+
+    if parsed.host is None:
+        return None
+
+    # Rebuild from the individual parts (never from the input string) so that the auth section
+    # cannot leak into the result.
+    scheme_prefix = f"{parsed.scheme}://" if parsed.scheme is not None else ""
+    port_suffix = f":{parsed.port}" if parsed.port is not None else ""
+    return f"{scheme_prefix}{parsed.host}{port_suffix}/"