Skip to content

Commit 2228037

Browse files
authored
Fix data download if UCI is temporarily down (#592)
1 parent 6ea4dd3 commit 2228037

File tree

1 file changed

+8
-31
lines changed

1 file changed

+8
-31
lines changed

ch06/01_main-chapter-code/gpt_class_finetune.py

+8-31
Original file line numberDiff line numberDiff line change
@@ -21,34 +21,15 @@
2121
from previous_chapters import GPTModel, load_weights_into_gpt
2222

2323

24-
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=False):
24+
def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
2525
if data_file_path.exists():
2626
print(f"{data_file_path} already exists. Skipping download and extraction.")
2727
return
2828

29-
if test_mode: # Try multiple times since CI sometimes has connectivity issues
30-
max_retries = 5
31-
delay = 5 # delay between retries in seconds
32-
for attempt in range(max_retries):
33-
try:
34-
# Downloading the file
35-
with urllib.request.urlopen(url, timeout=10) as response:
36-
with open(zip_path, "wb") as out_file:
37-
out_file.write(response.read())
38-
break # if download is successful, break out of the loop
39-
except urllib.error.URLError as e:
40-
print(f"Attempt {attempt + 1} failed: {e}")
41-
if attempt < max_retries - 1:
42-
time.sleep(delay) # wait before retrying
43-
else:
44-
print("Failed to download file after several attempts.")
45-
return # exit if all retries fail
46-
47-
else: # Code as it appears in the chapter
48-
# Downloading the file
49-
with urllib.request.urlopen(url) as response:
50-
with open(zip_path, "wb") as out_file:
51-
out_file.write(response.read())
29+
# Downloading the file
30+
with urllib.request.urlopen(url) as response:
31+
with open(zip_path, "wb") as out_file:
32+
out_file.write(response.read())
5233

5334
# Unzipping the file
5435
with zipfile.ZipFile(zip_path, "r") as zip_ref:
@@ -277,15 +258,11 @@ def plot_values(epochs_seen, examples_seen, train_values, val_values, label="los
277258
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
278259

279260
try:
280-
download_and_unzip_spam_data(
281-
url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
282-
)
261+
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
283262
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
284263
print(f"Primary URL failed: {e}. Trying backup URL...")
285-
backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
286-
download_and_unzip_spam_data(
287-
backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode
288-
)
264+
url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
265+
download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
289266

290267
df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
291268
balanced_df = create_balanced_dataset(df)

0 commit comments

Comments (0)