|
21 | 21 | from previous_chapters import GPTModel, load_weights_into_gpt
|
22 | 22 |
|
23 | 23 |
|
24 |
| -def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path, test_mode=False): |
| 24 | +def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path): |
25 | 25 | if data_file_path.exists():
|
26 | 26 | print(f"{data_file_path} already exists. Skipping download and extraction.")
|
27 | 27 | return
|
28 | 28 |
|
29 |
| - if test_mode: # Try multiple times since CI sometimes has connectivity issues |
30 |
| - max_retries = 5 |
31 |
| - delay = 5 # delay between retries in seconds |
32 |
| - for attempt in range(max_retries): |
33 |
| - try: |
34 |
| - # Downloading the file |
35 |
| - with urllib.request.urlopen(url, timeout=10) as response: |
36 |
| - with open(zip_path, "wb") as out_file: |
37 |
| - out_file.write(response.read()) |
38 |
| - break # if download is successful, break out of the loop |
39 |
| - except urllib.error.URLError as e: |
40 |
| - print(f"Attempt {attempt + 1} failed: {e}") |
41 |
| - if attempt < max_retries - 1: |
42 |
| - time.sleep(delay) # wait before retrying |
43 |
| - else: |
44 |
| - print("Failed to download file after several attempts.") |
45 |
| - return # exit if all retries fail |
46 |
| - |
47 |
| - else: # Code as it appears in the chapter |
48 |
| - # Downloading the file |
49 |
| - with urllib.request.urlopen(url) as response: |
50 |
| - with open(zip_path, "wb") as out_file: |
51 |
| - out_file.write(response.read()) |
| 29 | + # Downloading the file |
| 30 | + with urllib.request.urlopen(url) as response: |
| 31 | + with open(zip_path, "wb") as out_file: |
| 32 | + out_file.write(response.read()) |
52 | 33 |
|
53 | 34 | # Unzipping the file
|
54 | 35 | with zipfile.ZipFile(zip_path, "r") as zip_ref:
|
@@ -277,15 +258,11 @@ def plot_values(epochs_seen, examples_seen, train_values, val_values, label="los
|
277 | 258 | data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"
|
278 | 259 |
|
279 | 260 | try:
|
280 |
| - download_and_unzip_spam_data( |
281 |
| - url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode |
282 |
| - ) |
| 261 | + download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) |
283 | 262 | except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
|
284 | 263 | print(f"Primary URL failed: {e}. Trying backup URL...")
|
285 |
| - backup_url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" |
286 |
| - download_and_unzip_spam_data( |
287 |
| - backup_url, zip_path, extracted_path, data_file_path, test_mode=args.test_mode |
288 |
| - ) |
| 264 | + url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip" |
| 265 | + download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) |
289 | 266 |
|
290 | 267 | df = pd.read_csv(data_file_path, sep="\t", header=None, names=["Label", "Text"])
|
291 | 268 | balanced_df = create_balanced_dataset(df)
|
|
0 commit comments