-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathofac_scraper.py
98 lines (80 loc) · 3.66 KB
/
ofac_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import logging
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
class OfacWebsiteScraper:
    """Headless-Chrome scraper that reads the SHA-256 checksum of the OFAC SDN list.

    The checksum is published in the "File Signatures" accordion of the OFAC
    sanctions list page. The page is rendered client-side, so a real browser
    (driven through Selenium) is used instead of a plain HTTP request.

    The instance owns a browser process; call :meth:`close` when done, or use
    the instance as a context manager (``with OfacWebsiteScraper() as s: ...``).
    """

    # Page that hosts the SDN list downloads and their file signatures.
    SDN_LIST_URL = "https://sanctionslist.ofac.treas.gov/Home/SdnList"
    # DOM ids of the "File Signatures" accordion header and its content panel.
    FILE_SIGNATURES_BUTTON_ID = "accordion__heading-raa-1"
    FILE_SIGNATURES_PANEL_ID = "accordion__panel-raa-1"
    # Label that precedes the checksum value inside the panel text.
    SHA256_LABEL = "SHA-256: "
    # Seconds to let the accordion finish expanding before reading its text.
    PANEL_SETTLE_SECONDS = 3

    def __init__(self, chromedriver_path: str = "/usr/bin/chromedriver"):
        """Start a headless Chrome session.

        Args:
            chromedriver_path: Filesystem path of the ChromeDriver executable.
                Defaults to the system-installed driver; nothing is downloaded.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run without a visible window
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        # Use the ChromeDriver binary at the given path (no automatic download).
        self.service = Service(chromedriver_path)
        self.driver = webdriver.Chrome(service=self.service, options=chrome_options)

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Quit the browser on leaving the ``with`` block; never suppress errors."""
        self.close()

    def open_website(self, url: str) -> None:
        """Navigate the browser to *url*."""
        self.driver.get(url)

    def wait_for_element(self, by, value, timeout=30):
        """Block until an element matching the locator is present in the DOM.

        Args:
            by: Selenium locator strategy (e.g. ``By.ID``).
            value: Locator value to search for.
            timeout: Maximum number of seconds to wait.

        Returns:
            The located element.

        Raises:
            TimeoutException: If the element does not appear within *timeout*.
        """
        return WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )

    def click_element(self, by, value) -> None:
        """Find the element matching the locator and click it."""
        self.driver.find_element(by, value).click()

    def get_element_text(self, by, value) -> str:
        """Return the rendered text of the element matching the locator."""
        return self.driver.find_element(by, value).text

    @staticmethod
    def parse_sha256_checksum(checksums_content: str) -> str:
        """Extract the SHA-256 value from the File Signatures panel text.

        The value is whatever follows the first ``"SHA-256: "`` label up to the
        end of that line, with surrounding whitespace (including a stray
        carriage return) removed.

        Args:
            checksums_content: Full text of the checksums panel.

        Returns:
            The checksum string.

        Raises:
            ValueError: If the label is missing or no value follows it.
        """
        _, label, remainder = checksums_content.partition(
            OfacWebsiteScraper.SHA256_LABEL
        )
        checksum = remainder.split("\n", 1)[0].strip() if label else ""
        if not checksum:
            raise ValueError("SHA-256 checksum not found")
        return checksum

    def _fetch_sha256_once(self) -> str:
        """Perform a single load-click-read cycle and return the parsed checksum."""
        self.open_website(self.SDN_LIST_URL)
        print("Website opened")
        # Wait until the 'File Signatures' button with the known ID is present
        self.wait_for_element(By.ID, self.FILE_SIGNATURES_BUTTON_ID)
        print("File Signatures button found")
        # Expand the accordion so the checksums get rendered
        self.click_element(By.ID, self.FILE_SIGNATURES_BUTTON_ID)
        print("File Signatures button clicked")
        # Wait for the checksums panel to exist, then give it a moment to
        # finish expanding: presence in the DOM does not guarantee its text
        # has been rendered yet.
        self.wait_for_element(By.ID, self.FILE_SIGNATURES_PANEL_ID)
        time.sleep(self.PANEL_SETTLE_SECONDS)
        print("Checksums panel found")
        checksums_content = self.get_element_text(
            By.ID, self.FILE_SIGNATURES_PANEL_ID
        )
        print("Checksums content extracted")
        return self.parse_sha256_checksum(checksums_content)

    def get_sha256_checksum(self, max_retries: int = 10, retry_delay: float = 10) -> str:
        """Return the SHA-256 checksum shown on the OFAC SDN list page.

        Each attempt reloads the page. Only Selenium timeouts are retried; a
        page that loads but lacks the checksum fails immediately.

        Args:
            max_retries: Total number of attempts (must be at least 1).
            retry_delay: Seconds to wait between failed attempts.

        Returns:
            The SHA-256 checksum string.

        Raises:
            ValueError: If *max_retries* is less than 1, or the panel text does
                not contain a SHA-256 value.
            TimeoutException: If every attempt timed out.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        for attempt in range(1, max_retries + 1):
            print(
                f"Attempting to get SHA-256 checksum (attempt {attempt}/{max_retries})..."
            )
            try:
                return self._fetch_sha256_once()
            except TimeoutException as err:
                if attempt == max_retries:
                    # Out of attempts: fail right away (no pointless sleep) and
                    # keep the underlying timeout as the cause for debugging.
                    print(f"Timeout occurred on attempt {attempt}/{max_retries}.")
                    raise TimeoutException(
                        "Max retries reached. The website is not responding."
                    ) from err
                print(
                    f"Timeout occurred on attempt {attempt}/{max_retries}. Retrying in {retry_delay} seconds..."
                )
                time.sleep(retry_delay)

    def close(self) -> None:
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()
if __name__ == "__main__":
    # Script entry point: fetch the current SDN list SHA-256 checksum and
    # write it to stdout.
    ofac_scraper = OfacWebsiteScraper()
    try:
        print(ofac_scraper.get_sha256_checksum())
    finally:
        # Always shut the browser down, even when scraping failed.
        ofac_scraper.close()