-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathoreilly.py
executable file
·64 lines (57 loc) · 2.29 KB
/
oreilly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env python3
import os
import shutil
from urllib.parse import parse_qs, urljoin, urlsplit, urlunsplit

import requests
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
SITE_BASE = "http://www.oreilly.com/"
SITE_REPORTS = SITE_BASE + "free/reports.html"
EXTENSIONS = ["pdf", "mobi", "epub"]
OUTPUT_DIR = "output"
REQUEST_TIMEOUT = 30  # seconds; a stalled connection must not hang the run
CHUNK_SIZE = 64 * 1024  # bytes written per iteration while streaming a book


def fetch_anchors(url, **criteria):
    """Fetch *url* and return the ``<a>`` tags on it that match *criteria*.

    Only anchor tags are parsed (through a ``SoupStrainer``); *criteria* is
    forwarded unchanged to ``find_all``.
    """
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    anchors = BeautifulSoup(
        page.text, "html.parser", parse_only=SoupStrainer("a")
    )
    return anchors.find_all("a", **criteria)


def section_name(section_url):
    """Return the first path segment of *section_url*.

    The segment is used as the per-section sub-directory of ``OUTPUT_DIR``.
    """
    return urlsplit(section_url).path.lstrip("/").split("/")[0]


def normalize_report_uri(uri, base=SITE_BASE):
    """Return the absolute, query-free URI of a report landing page.

    *uri* is resolved against *base*, so protocol-relative (``//host/...``)
    and relative links become absolute.  The query string and fragment are
    always dropped.  When the query carries a ``topic`` parameter whose value
    is not already one of the path segments, that value is inserted as the
    first path segment.
    """
    scheme, netloc, path, query, _fragment = urlsplit(urljoin(base, uri))
    # parse_qs copes with any parameter order (e.g. "intcmp=x&topic=data").
    topic = parse_qs(query).get("topic", [""])[0]
    if topic and topic not in path.split("/"):
        path = "/" + topic + "/" + path.lstrip("/")
    return urlunsplit((scheme, netloc, path, "", ""))


def candidate_book_uris(report_uri):
    """Return ``(extension, uri)`` pairs to try for *report_uri*.

    Each candidate lives in a ``files/`` directory next to the landing page
    and reuses the page's file name with its extension replaced by one of
    ``EXTENSIONS``.
    """
    directory, _, page_name = report_uri.rpartition("/")
    stem = page_name.rsplit(".", 1)[0]
    return [
        (ext, "{}/files/{}.{}".format(directory, stem, ext))
        for ext in EXTENSIONS
    ]


def download_book(book_uri, book_file):
    """Stream *book_uri* into *book_file*; return True if it was written.

    A non-200 response is reported and skipped instead of being saved as if
    it were the book.  Network errors raised by ``requests`` propagate.
    """
    # Write to a side file first: an interrupted run must not leave a
    # truncated book behind that later runs would treat as already downloaded.
    partial_file = book_file + ".part"
    with requests.get(
        book_uri, stream=True, timeout=REQUEST_TIMEOUT
    ) as response:
        if response.status_code != 200:
            print("Skipping {}: HTTP {}".format(
                book_uri, response.status_code))
            return False
        with open(partial_file, "wb") as out_file:
            # iter_content undoes gzip/deflate transfer encodings, which
            # copying from response.raw would not.
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                out_file.write(chunk)
    os.replace(partial_file, book_file)
    return True


def main():
    """Download every report format not yet present under ``OUTPUT_DIR``.

    Walks the "see more" section links on ``SITE_REPORTS``, then the popover
    links on each section page, and stores each available format as
    ``OUTPUT_DIR/<section>/<title>/<title>.<ext>``.
    """
    for section_link in fetch_anchors(SITE_REPORTS, class_="btn see-more"):
        section_href = section_link.get("href")
        if not section_href:
            continue
        section_url = urljoin(SITE_REPORTS, section_href)
        section = section_name(section_url)

        report_links = fetch_anchors(
            section_url, attrs={"data-toggle": "popover"}
        )
        for report_link in report_links:
            title = report_link.get("title")
            href = report_link.get("href")
            if not title or not href:
                # Anchors lacking either attribute cannot be named or fetched.
                continue
            # "/" would otherwise split the title into nested directories.
            title = title.replace("/", "-")
            report_uri = normalize_report_uri(href, base=section_url)
            book_dir = "{}/{}/{}".format(OUTPUT_DIR, section, title)

            for ext, book_uri in candidate_book_uris(report_uri):
                book_file = "{}/{}.{}".format(book_dir, title, ext)
                if os.path.exists(book_file):
                    # Already downloaded: skip the network probe entirely.
                    continue
                # Availability is probed with an OPTIONS request; any status
                # other than 200 means this format is skipped.
                probe = requests.options(book_uri, timeout=REQUEST_TIMEOUT)
                if probe.status_code != 200:
                    continue
                os.makedirs(book_dir, exist_ok=True)
                print("Downloading {}.{} from {}".format(
                    title, ext, book_uri))
                download_book(book_uri, book_file)
    print("Up To Date !")


if __name__ == "__main__":
    main()