|
| 1 | +# pylint: disable=import-error |
| 2 | +import requests |
| 3 | +import pandas as pd |
| 4 | +from google.cloud import bigquery |
| 5 | + |
| 6 | + |
def extract_domains_from_file(file_path):
    """Parse an EasyList filter file and return the list of domains.

    Useful lines have the form ``||domain^``; this removes the ``||``
    prefix and the trailing ``^`` to recover the bare domain. Comment
    and header lines (starting with ``!``) are skipped.

    Args:
        file_path: Path to the downloaded EasyList text file.

    Returns:
        List of domain strings; empty if the file cannot be read
        (errors are printed, not raised — best-effort by design).
    """
    domains = []
    try:
        with open(file_path, "r") as file:
            for line in file:
                line = line.strip()
                # Skip blanks and EasyList comment/header lines ("! ...")
                # so they don't end up in the output as fake domains.
                if not line or line.startswith("!"):
                    continue
                # Remove the '||' *prefix* and '^' *suffix* exactly once.
                # (lstrip("||")/rstrip("^") would strip character sets —
                # any run of leading pipes / trailing carets.)
                if line.startswith("||"):
                    line = line[2:]
                if line.endswith("^"):
                    line = line[:-1]
                if line:  # Ensure something is left after stripping
                    domains.append(line)
    except FileNotFoundError:
        print(f"Error: The file {file_path} does not exist.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return domains
| 21 | + |
| 22 | + |
def save_domains_to_csv(domains, csv_file_path):
    """Write *domains* to *csv_file_path* as a one-column CSV.

    The output is a single ``Domain`` header row followed by one domain
    per line. Failures are printed rather than raised (best-effort).
    """
    try:
        # Single-column frame: header "Domain", one row per entry,
        # no index column in the output.
        pd.DataFrame(domains, columns=["Domain"]).to_csv(
            csv_file_path, index=False
        )
    except Exception as e:
        print(f"An error occurred while writing to CSV: {e}")
| 31 | + |
| 32 | + |
def upload_csv_to_bigquery(csv_file_path,
                           table_id="httparchive.almanac.easylist_adservers"):
    """Load a local CSV file into a BigQuery table.

    Requires the GOOGLE_APPLICATION_CREDENTIALS environment variable to
    point at a service-account key with write access to the dataset.

    Args:
        csv_file_path: Path of the local CSV file to upload.
        table_id: Fully-qualified destination table
            (``project.dataset.table``). Defaults to the previously
            hard-coded target, so existing callers are unchanged.

    Raises:
        Exception: whatever ``load_job.result()`` surfaces if the load
            job fails (e.g. google.cloud exceptions).
    """
    client = bigquery.Client()

    # CSV input with one header row; let BigQuery infer the schema.
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.CSV,
        skip_leading_rows=1,  # Adjust if your CSV doesn't have a header row
        autodetect=True,  # Automatically infer schema
    )

    # Stream the file into a load job against the destination table.
    with open(csv_file_path, "rb") as source_file:
        load_job = client.load_table_from_file(
            source_file, table_id, job_config=job_config
        )

    # Block until the job finishes; raises if the load failed.
    load_job.result()
| 53 | + |
| 54 | + |
# URL to the text file containing the EasyList ad-server rules
url = "https://raw.githubusercontent.com/easylist/easylist/master/" \
    "easylist/easylist_adservers.txt"
file_path = "easylist_adservers.txt"
# Path to the output CSV file
csv_file_path = "easylist_adservers.csv"

# Download the file and save it locally. The timeout keeps a stalled
# connection from hanging the script forever, and raise_for_status()
# prevents an HTTP error page from being silently saved and parsed as
# if it were the filter list.
response = requests.get(url, timeout=30)
response.raise_for_status()
with open(file_path, "wb") as file:
    file.write(response.content)

# Extract domains
domains = extract_domains_from_file(file_path)

# Save domains to CSV
save_domains_to_csv(domains, csv_file_path)

# upload domains to BQ
upload_csv_to_bigquery(csv_file_path)

print(f"Domains have been saved to {csv_file_path}")
0 commit comments