Skip to content

Commit 4a7d572

Browse files
qs: fetch vt hashes: only cache the hashes, otherwise GB/day fetched
1 parent 9400430 commit 4a7d572

File tree

1 file changed

+31
-24
lines changed

1 file changed

+31
-24
lines changed

floss/qs/scripts/fetch_vt_hashes.py

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@
7777
import pathlib
7878
import argparse
7979
import datetime
80-
from typing import Any, Iterator
80+
from typing import Any, Iterator, List
8181

8282
import requests
8383
import virustotal3.errors
@@ -206,6 +206,14 @@ def file_feed(api_key, time, timeout=None):
206206

207207

208208
def fetch_feed(api_key: str, ts: datetime.datetime) -> Iterator[Any]:
    """Yield the parsed JSON records of the feed batch for timestamp *ts*.

    The batch is requested through ``file_feed`` using the formatted
    timestamp, read in full, decoded as UTF-8, and then parsed one line at
    a time with ``json.loads``. Empty lines are skipped.
    """
    raw = file_feed(api_key, format_timestamp(ts)).read()
    text = raw.decode("utf-8")

    for record in text.split("\n"):
        # Splitting on "\n" leaves empty strings (e.g. after a trailing
        # newline); those are not JSON documents, so skip them.
        if record:
            yield json.loads(record)
214+
215+
216+
def fetch_feed_hashes(api_key: str, ts: datetime.datetime) -> List[str]:
209217
ts_key = format_timestamp(ts)
210218

211219
dir = pathlib.Path(get_default_cache_directory())
@@ -216,18 +224,29 @@ def fetch_feed(api_key: str, ts: datetime.datetime) -> Iterator[Any]:
216224
with shelve.open(str(p)) as db:
217225
if ts_key not in db:
218226
try:
219-
db[ts_key] = gzip.compress(file_feed(api_key, ts_key).read())
227+
hashes = []
228+
for line in fetch_feed(API_KEY, ts):
229+
try:
230+
if line.get("type") != "file":
231+
continue
232+
233+
if "magic" not in line.get("attributes", {}) or "sha256" not in line.get("attributes", {}):
234+
continue
235+
236+
magic = line["attributes"]["magic"]
237+
if any(map(lambda prefix: magic.startswith(prefix), ["PE32", "ELF", "MS-DOS", "Mach-O", "COM"])):
238+
hashes.append(line["attributes"]["sha256"])
239+
except Exception as e:
240+
logger.warning("error: %s", str(e), exc_info=True)
241+
continue
242+
243+
db[ts_key] = hashes
220244
except Exception as e:
221245
logger.warning("error: %s", str(e), exc_info=True)
222-
return
223-
224-
feed = gzip.decompress(db[ts_key]).decode("utf-8")
225-
226-
for line in feed.split("\n"):
227-
if not line:
228-
continue
229-
yield json.loads(line)
246+
return []
230247

248+
return db[ts_key]
249+
231250

232251
def main():
233252
parser = argparse.ArgumentParser(
@@ -261,20 +280,8 @@ def main():
261280
current = start
262281
while format_timestamp(current) < format_timestamp(end):
263282
logger.info("fetching feed: %s", current.isoformat())
264-
for line in fetch_feed(API_KEY, current):
265-
try:
266-
if line.get("type") != "file":
267-
continue
268-
269-
if "magic" not in line.get("attributes", {}) or "sha256" not in line.get("attributes", {}):
270-
continue
271-
272-
magic = line["attributes"]["magic"]
273-
if any(map(lambda prefix: magic.startswith(prefix), ["PE32", "ELF", "MS-DOS", "Mach-O", "COM"])):
274-
print(line["attributes"]["sha256"])
275-
except Exception as e:
276-
logger.warning("error: %s", str(e), exc_info=True)
277-
continue
283+
for hash in fetch_feed_hashes(API_KEY, current):
284+
print(hash)
278285

279286
current += datetime.timedelta(minutes=1)
280287

0 commit comments

Comments
 (0)