get2.py
elif job == 'get2': # When completed, run: cat purged.csv allsofar.csv > wayback2_final1.csv (instead of purged.csv, I actually used post-check2-latest-greatest-csv-purged-of-all-badness.csv)
# TODO: should check whether the user set things up properly. See `harvest` for details.
# Set up requests session and output csv file
s = requests.Session()
outfile = open('output.csv', 'w')
w = csv.writer(outfile)
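# NOTE: output.csv is opened in 'w' mode, so each run starts a fresh file; earlier runs are stitched back together afterwards with the cat command noted above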
# Get list of urls to scrape
urls = fileinput.input(['scrapethis.txt'])
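# NOTE: fileinput yields each line with its trailing newline still attached; stripping it (url.strip()) before requesting would be safer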
# get2 continues below this if block
else:
print('no job. Take day off.')
sys.exit(1)
# Only get1 and get2 get here
# Manually skip past what was already done
# TODO: handle the case where we want to re-run with the same startat. New figures are only respected after running `./harvest`
#startat = 11698
with open('startat.txt', 'r') as f: # Initially do: `echo 0 > startat.txt`
startat = int(f.read())
with open('laststartat.txt', 'w') as f:
f.write(str(startat))
# Skip the urls already done
c = 0
while c < startat:
if job == 'get1':
url = urls.pop().text
#url = next(urls).text
else:
# get2
url = next(urls)
print(f'{c}: Skipping: {url}')
c += 1
# get2 only
if job == 'get2':
proc_urls(urls)
sys.exit()
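# get2 exits above after proc_urls, so everything below this point runs only for get1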
global c # count of urls; some possibly already skipped
# Loop through all urls
success = True
while True:
if success:
# Get next url if prev was successful (or on very first url), otherwise keep it the same to try again
try:
url = next(urls)
except StopIteration:
break
# Reset retrydelay if prev was successful, otherwise let it increment to wait longer and longer each time
retrydelay = 0
# Default success to False; it only becomes True again if the request actually succeeds below
success = False
# Wait (rate-limit) if hitting the live site
livewait()
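# livewait() is presumably defined earlier in this script; the name suggests it sleeps only when scraping the live site rather than the archive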
# Get the page
try:
response = s.get(url)
except requests.exceptions.ConnectionError as e:
print(f'Connection Error in s.get call: {e.__doc__}') # A Connection error occurred
try:
print(e.message) # not all exceptions have this
except AttributeError:
# e has no attr message
pass
print('-----')
print(e) # ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
print('-----')
if retrydelay > 0:
print(f'Waiting {retrydelay} seconds before retrying... ', end='')
time.sleep(retrydelay)
print('Done!')
retrydelay += retrydelay_incr
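# retrydelay_incr is assumed to be set earlier in the script, outside this section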
#raise
continue
except Exception as e:
print(f'Unknown Error in s.get call: {e.__doc__}')
try:
print(e.message) # not all exceptions have this
except AttributeError:
# e has no attr message
pass
print('-----')
print(e)
print('-----')
raise
# Process response
if response.status_code == 200:
# request successful
if re.search(r'Unusual Traffic Activity', response.text):
# but got captcha
print(f'{c}: Captcha: {url}')
sys.exit()
else:
# Got the html
success = True
else:
# request unsuccessful
if response.status_code == 404:
# Not found
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 504:
# Time out
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 429:
# Too many requests
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 500:
# Internal server error
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 523:
# Unknown error
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 520:
# Unknown error
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 503:
# Service unavailable
print(f'{c}: {response.status_code} {response.reason}: {url}')
elif response.status_code == 524:
# No reason given??? Happened when captcha'd out and sent to the real postcode.my site. Wait and try again
# Happened again on the real postcode.my site, but without a captcha
print(f'{c}: {response.status_code} {response.reason}: {url}')
print(response)
elif response.status_code == 502:
# Bad gateway
print(f'{c}: {response.status_code} {response.reason}: {url}')
else:
# Unknown failed request, so alert the programmer
print(response.status_code)
print(response.reason)
print(response)
sys.exit()
# We tried to get the webpage and may or may not have succeeded
if success:
# Assume success and try to get the tables
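# pd.read_html returns a list of DataFrames, one per <table> in the page, and raises ValueError when it finds no tables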
try:
tbls = pd.read_html(response.text)
except ValueError as e:
# Got no tables error. Was error on postcode.my at the time: https://web.archive.org/web/20131205013002/http://postcode.my/negeri-sembilan-kuala-klawang-taman-irama-71600.html
print(e)
# log it to scrape manually
with open('failed2.txt', 'a') as f:
f.write(url + '\n')
# save html for analysis
with open('no_tables.html', 'w') as f:
f.write(response.text)
# continue with next url
#continue
sys.exit(1)
row = {}
tcount = 0
for t in tbls:
# each t is a pandas dataframe
tcount += 1
try:
t = dict(t.values) # convert from Item, Description to "proper" dict (i.e. Location, ... , Longitude)
except ValueError as e:
# fails for the bad table (adsbygoogle) because dict() needs exactly 2 values per row (one key, one value), but that table has 4 columns
# ah, it's the search form, so just skip
#with open('adsby.txt', 'w') as f:
# f.write(str(t))
#with open('adsby_info.txt', 'w') as f:
# f.write(t.info())
#with open('adsby_values.txt', 'w') as f:
# f.write(str(t.values))
# t = t.values
sys.exit()
#continue
# Throw out bad keys like adsbygoogle. Keep only good ones like Location, State, etc
fixtable(t)
# Add data in t to collection in row
try:
#row.update(t.values)
row.update(t)
except ValueError as e:
print(e)
print('I think this was for when it was t.values, so it should not happen now')
sys.exit()
# Error seen: "dictionary update sequence element #0 has length 4; 2 is required"
# The offending table was the search form (a 3 rows x 4 columns DataFrame of mostly NaNs
# with 'Find' and 'Keyword' cells), so just skip it
print(f'Bad: {t}')
fname = re.sub('/', '+', f'{url}-{tcount}')
# Log the table for debugging (can del if always just the search form)
with open(f'{fname}.tbl', 'w') as f:
f.write(str(t))
# Log the html to parse manually later
with open(f'{fname}.html', 'w') as f: # added the '2'; means skip, because hopefully getting latlong now
f.write(response.text)
#continue
sys.exit(1)
# check if still need to get latlong
if 'Latitude' not in row.keys(): # is checking only Latitude good enough?
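# getlatlong() presumably pulls Latitude/Longitude out of the raw html into row and returns True on success (defined outside this section)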
if getlatlong(row, response.text):
print(f'NEW LATLONG {c}:\t{row}')
with open('newlatlong.txt', 'a') as f:
f.write(str(row) + '\n')
# debug
print("since shouldn't happen anymore")
sys.exit(1)
else:
# Still failed to get lat long
print('still no latlong')
#continue
sys.exit(1)
# latlong is 0.0000: https://web.archive.org/web/20140724100239/http://postcode.my/melaka-melaka-jabatan-anti-malaria-75584.html
try:
if float(row['Latitude']) == 0 and float(row['Longitude']) == 0:
print(f'latlong zeroes:\t{row}')
#continue
sys.exit(1)
except KeyError:
# lat or long not even in row
sys.exit(1)
#continue
# latlong might be empty: https://web.archive.org/web/20150619125202/http://postcode.my/melaka-melaka-jabatan-perangkaan-75514.html
# print(f'bad latlong: {row}')
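# pandas parses an empty cell as float NaN, so a blank Latitude shows up here as a float rather than a string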
if isinstance(row['Latitude'], float): # need to check Longitude too?
if isnan(row['Latitude']):
print(f'blank latlong: {row}')
#continue
sys.exit(1)
# Successfully found scrape data. Show what we found on stdout
print(f'{c}: {row}')
# Write csv header if needed
if c == 0:
w.writerow(row.keys())
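# (header comes from the first row's keys, which assumes every page yields the same keys in the same order)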
# Write the row to the output file
w.writerow(row.values())
# Advance the counter
c += 1
# Save c to pick up where left off next run
with open('startat.txt', 'w') as f:
f.write(str(c))
else:
# Failed to get the page. Previously: exit to alert the programmer; now pages that fail like this are logged below instead
print(f"Tried for {url} and didn't succeed.")
if re.search(r'Unusual Traffic Activity', response.text):
print('reset captcha please')
os.system('dunstify Postcode "Reset captcha!"')
sys.exit()
# Not unusual traffic, so just log it
with open('failed.txt', 'a') as f:
f.write(url + '\n')
#sys.exit()
print('All done!')