Skip to content

Commit 8bf1378

Browse files
authored
Merge pull request #87 from soxoj/trello-weibo
Added Weibo and Trello, transforms improvements
2 parents 687bb4c + ca6ae1b commit 8bf1378

File tree

6 files changed

+99
-22
lines changed

6 files changed

+99
-22
lines changed

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# Changelog
22

33
## [Unreleased]
4+
* non-json transforms
5+
* added Weibo and Trello
46
* added username-from-email extractor
57
* added Yelp
68

socid_extractor/cli.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,15 @@ def get_site_response(url, cookies=None, headers={}):
2222

2323

2424
def run():
25-
parser = argparse.ArgumentParser(description=f'Extract accounts\' identifiers from pages. {len(schemes)} sites (methods) are supported.')
25+
parser = argparse.ArgumentParser(
26+
description=f'Extract accounts\' identifiers from pages. {len(schemes)} sites (methods) are supported.',
27+
prog='socid_extractor',
28+
)
2629
parser.add_argument('--url', help='url to parse')
2730
parser.add_argument('--cookies', default='', help='cookies to make http requests with auth')
2831
parser.add_argument('-v', '--verbose', action='store_true', help='display verbose information')
2932
parser.add_argument('-d', '--debug', action='store_true', help='display debug information')
30-
parser.add_argument('--file', action='store_true', help='load from file instead of URL')
33+
parser.add_argument('--file', help='file to parse')
3134
parser.add_argument('--activation', type=str, help='use certain type of request activation')
3235

3336
args = parser.parse_args()

socid_extractor/main.py

+38-20
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
HEADERS = {
1111
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
12-
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
12+
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
1313
}
1414

1515
PROCESS_ERRORS = (AttributeError, KeyError, IndexError, TypeError)
@@ -52,6 +52,31 @@ def mutate_url(url):
5252
return mutate_results
5353

5454

55+
def transform(scheme_data, extracted_data):
    """Pass extracted data through the scheme's optional transform pipeline.

    :param scheme_data: scheme description; its optional ``'transforms'``
        entry is a sequence of one-argument callables.
    :param extracted_data: value handed to the first transform.
    :return: the output of the last transform, or ``extracted_data``
        unchanged when the scheme declares no transforms.

    Each callable receives the previous callable's result. When a step
    raises one of ``PROCESS_ERRORS`` the error is logged at debug level,
    the running value is replaced with ``{}`` and the remaining steps are
    still applied to that placeholder.
    """
    current = extracted_data
    # ``or ()`` keeps a falsy 'transforms' entry (e.g. None) a no-op.
    for step in scheme_data.get('transforms', []) or ():
        logging.debug(step)
        try:
            current = step(current)
        except PROCESS_ERRORS as err:
            logging.debug(f'Transform error: {err}')
            current = {}
        logging.debug(current)
    return current
67+
68+
69+
def map_fields(scheme_data, transformed_data):
    """Build the result dict by applying every field getter of a scheme.

    :param scheme_data: scheme description; ``scheme_data['fields']`` maps
        an output field name to a one-argument callable.
    :param transformed_data: object passed to each field callable.
    :return: dict of field name to string value. A getter result of
        ``None``, ``[]`` or ``{}`` is stored as ``''``; anything else is
        stored as ``str(result)``.

    A field whose getter raises one of ``PROCESS_ERRORS`` is logged at
    debug level and left out of the returned dict.
    """
    empty_results = (None, [], {})
    mapped = {}
    for field_name, getter in scheme_data['fields'].items():
        try:
            raw = getter(transformed_data)
            mapped[field_name] = '' if raw in empty_results else str(raw)
        except PROCESS_ERRORS as err:
            logging.debug(f'Unable to extact field {field_name}: {err}')
    return mapped
78+
79+
5580
def extract(page):
5681
for scheme_name, scheme_data in schemes.items():
5782
flags = scheme_data['flags']
@@ -80,21 +105,11 @@ def extract(page):
80105

81106
if scheme_data.get('extract_json', False):
82107
extracted = regexp_group.group(1)
83-
84108
logging.debug('Extracted: %s', extracted)
85109

86-
transforms = scheme_data.get('transforms', [])
87-
if transforms:
88-
for t in transforms:
89-
logging.debug(t)
90-
try:
91-
extracted = t(extracted)
92-
except PROCESS_ERRORS as e:
93-
logging.debug(f'Transform error: {e}')
94-
extracted = {}
95-
logging.debug(extracted)
110+
transformed = transform(scheme_data, extracted)
96111

97-
json_data = json.loads(extracted)
112+
json_data = json.loads(transformed)
98113

99114
if json_data == {}:
100115
logging.debug('Unabled to extract json!')
@@ -107,14 +122,17 @@ def extract(page):
107122
with open('debug_extracted.json', 'w') as f:
108123
f.write(loaded_json_str)
109124

110-
for name, get_field in scheme_data['fields'].items():
111-
try:
112-
value = get_field(json_data)
113-
values[name] = str(value) if value not in (None, [], {}) else ''
114-
except PROCESS_ERRORS as e:
115-
logging.debug(f'Unable to extact field {name}: {e}')
125+
values = map_fields(scheme_data, json_data)
116126
else:
117-
values = regexp_group.groupdict()
127+
groupdict = regexp_group.groupdict()
128+
if groupdict:
129+
values = groupdict
130+
else:
131+
extracted = regexp_group.group(1)
132+
logging.debug('Extracted: %s', extracted)
133+
134+
transformed_data = transform(scheme_data, extracted)
135+
values = map_fields(scheme_data, transformed_data)
118136

119137
if use_html_parser:
120138
soup = bs(page, 'html.parser')

socid_extractor/schemes.py

+33
Original file line numberDiff line numberDiff line change
@@ -1343,5 +1343,38 @@
13431343
'location': lambda x: x.find('div', {'class': 'user-profile_info'}).find('h3').contents[0].split(' ', 1)[1],
13441344
'image': lambda x: x.find('div', {'class': 'user-profile_avatar'}).find('img').get('src'),
13451345
}
1346+
},
1347+
'Trello API': {
1348+
'flags': ['"aaId"', '"trophies":'],
1349+
'regex': r'^([\s\S]+)$',
1350+
'extract_json': True,
1351+
'fields': {
1352+
'id': lambda x: x['id'],
1353+
'username': lambda x: x['username'],
1354+
'fullname': lambda x: x['fullName'],
1355+
'email': lambda x: x['email'],
1356+
'image': lambda x: x['avatarUrl'] + '/170.png',
1357+
'bio': lambda x: x['bio'],
1358+
'type': lambda x: x['memberType'],
1359+
'gravatar_email_md5_hash': lambda x: x['gravatarHash'],
1360+
'is_verified': lambda x: x['confirmed'],
1361+
}
1362+
},
1363+
'Weibo': {
1364+
'flags': ['$CONFIG[\'timeweibo\']'],
1365+
'regex': r'\$CONFIG = {};[\r\n]([\s\S]+?)</script>',
1366+
'transforms': [
1367+
lambda x: re.split('[\r\n]', x),
1368+
lambda x: [r.split("'") for r in x if r],
1369+
lambda x: {r[1]: r[-2] for r in x},
1370+
],
1371+
'fields': {
1372+
'weibo_id': lambda x: x['oid'],
1373+
'fullname': lambda x: x['onick'],
1374+
'nickname': lambda x: x['nick'],
1375+
'image': lambda x: x['avatar_large'],
1376+
'gender': lambda x: x['sex'],
1377+
'language': lambda x: x['lang'],
1378+
}
13461379
}
13471380
}

tests/reformat.sh

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
#!/bin/bash
2+
# Rewrite, in place, every line of test_e2e.py that has the form
# `<name>: <value>` (name made of letters, digits and underscores) into an
# `assert info.get("<name>") == "<value>"` statement.
# NOTE(review): `-i ''` looks like the BSD/macOS in-place form; GNU sed
# parses a separate '' argument differently - confirm the target platform.
# The path is relative, so this presumably has to be run from tests/.
sed -i '' -r 's/^([0-9a-zA-Z_]+): (.+)$/ assert info.get("\1") == "\2"/g' test_e2e.py

tests/test_e2e.py

+19
Original file line numberDiff line numberDiff line change
@@ -972,3 +972,22 @@ def test_yelp_userid():
972972
assert info.get('fullname') == 'Dima "ZOMG" M.'
973973
assert info.get('location') == 'Brooklyn, NY'
974974
assert info.get('image') == 'https://s3-media0.fl.yelpcdn.com/photo/bGiNMDL6FZAtPpMfljRGtg/ls.jpg'
975+
976+
977+
def test_trello():
978+
info = extract(parse('https://trello.com/1/Members/xFubuki')[0])
979+
980+
assert info.get("id") == "5e78cae55d711a6382e239c1"
981+
assert info.get("username") == "xfubuki"
982+
assert info.get("fullname") == "xFubuki"
983+
assert info.get("image") == "https://trello-members.s3.amazonaws.com/5e78cae55d711a6382e239c1/d9c5264e657de6175f14a9067126873f/170.png"
984+
assert info.get("type") == "normal"
985+
assert info.get("is_verified") == "True"
986+
987+
988+
def test_weibo():
989+
headers = {"cookie": "SUB=_2AkMXyuc_f8NxqwJRmP8SyWPrbo13zAvEieKhlhbkJRMxHRl-123", "cache-control": "no-cache"}
990+
info = extract(parse('https://weibo.com/clairekuo?is_all=1', headers=headers, timeout=10)[0])
991+
992+
assert info.get("weibo_id") == "1733299783"
993+
assert info.get("fullname") == "郭靜Claire"

0 commit comments

Comments
 (0)