Skip to content

Commit 856eb84

Browse files
Avoid calling http_head for non-HTTP URLs (#7062)
Avoid calling http_head for non-HTTP URLs
1 parent e83d6fa commit 856eb84

File tree

1 file changed

+43
-42
lines changed

1 file changed

+43
-42
lines changed

src/datasets/utils/file_utils.py

+43-42
Original file line number | Diff line number | Diff line change
@@ -562,49 +562,50 @@ def get_from_cache(
562562
# s3fs uses "ETag", gcsfs uses "etag"
563563
etag = (response.get("ETag", None) or response.get("etag", None)) if use_etag else None
564564
connected = True
565-
try:
566-
response = http_head(
567-
url,
568-
allow_redirects=True,
569-
proxies=proxies,
570-
timeout=etag_timeout,
571-
max_retries=max_retries,
572-
headers=headers,
573-
)
574-
if response.status_code == 200: # ok
575-
etag = response.headers.get("ETag") if use_etag else None
576-
for k, v in response.cookies.items():
577-
# In some edge cases, we need to get a confirmation token
578-
if k.startswith("download_warning") and "drive.google.com" in url:
579-
url += "&confirm=" + v
580-
cookies = response.cookies
581-
connected = True
582-
# Fix Google Drive URL to avoid Virus scan warning
583-
if "drive.google.com" in url and "confirm=" not in url:
584-
url += "&confirm=t"
585-
# In some edge cases, head request returns 400 but the connection is actually ok
586-
elif (
587-
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
588-
or (response.status_code == 405 and "drive.google.com" in url)
589-
or (
590-
response.status_code == 403
591-
and (
592-
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
593-
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
594-
)
595-
)
596-
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
597-
):
598-
connected = True
599-
logger.info(f"Couldn't get ETag version for url {url}")
600-
elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None:
601-
raise ConnectionError(
602-
f"Unauthorized for URL {url}. Please use the parameter `token=True` after logging in with `huggingface-cli login`"
565+
else:
566+
try:
567+
response = http_head(
568+
url,
569+
allow_redirects=True,
570+
proxies=proxies,
571+
timeout=etag_timeout,
572+
max_retries=max_retries,
573+
headers=headers,
603574
)
604-
except (OSError, requests.exceptions.Timeout) as e:
605-
# not connected
606-
head_error = e
607-
pass
575+
if response.status_code == 200: # ok
576+
etag = response.headers.get("ETag") if use_etag else None
577+
for k, v in response.cookies.items():
578+
# In some edge cases, we need to get a confirmation token
579+
if k.startswith("download_warning") and "drive.google.com" in url:
580+
url += "&confirm=" + v
581+
cookies = response.cookies
582+
connected = True
583+
# Fix Google Drive URL to avoid Virus scan warning
584+
if "drive.google.com" in url and "confirm=" not in url:
585+
url += "&confirm=t"
586+
# In some edge cases, head request returns 400 but the connection is actually ok
587+
elif (
588+
(response.status_code == 400 and "firebasestorage.googleapis.com" in url)
589+
or (response.status_code == 405 and "drive.google.com" in url)
590+
or (
591+
response.status_code == 403
592+
and (
593+
re.match(r"^https?://github.com/.*?/.*?/releases/download/.*?/.*?$", url)
594+
or re.match(r"^https://.*?s3.*?amazonaws.com/.*?$", response.url)
595+
)
596+
)
597+
or (response.status_code == 403 and "ndownloader.figstatic.com" in url)
598+
):
599+
connected = True
600+
logger.info(f"Couldn't get ETag version for url {url}")
601+
elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None:
602+
raise ConnectionError(
603+
f"Unauthorized for URL {url}. Please use the parameter `token=True` after logging in with `huggingface-cli login`"
604+
)
605+
except (OSError, requests.exceptions.Timeout) as e:
606+
# not connected
607+
head_error = e
608+
pass
608609

609610
# connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
610611
# try to get the last downloaded one

0 commit comments

Comments
 (0)