9
9
10
10
# Header name -> value pairs: a desktop Chrome User-Agent string and a
# browser-style "accept" value.
# NOTE(review): presumably sent with outgoing HTTP requests; the call site is
# not visible in this part of the file -- confirm.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3729.169 Safari/537.36'
    ),
    'accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.9'
    ),
}
14
14
15
15
# Exception types that are caught and logged (rather than propagated) when a
# transform callable or a field getter fails.
PROCESS_ERRORS = (
    AttributeError,
    KeyError,
    IndexError,
    TypeError,
)
@@ -52,6 +52,31 @@ def mutate_url(url):
52
52
return mutate_results
53
53
54
54
55
def transform(scheme_data, extracted_data):
    """Run the scheme's transform callables over the extracted data.

    The callables found under ``scheme_data['transforms']`` are applied in
    order, each one receiving the result of the previous one. When a callable
    raises one of ``PROCESS_ERRORS`` the error is logged at debug level, the
    running value is reset to an empty dict, and processing continues with
    the remaining callables.

    Args:
        scheme_data: mapping that may contain a ``'transforms'`` iterable of
            one-argument callables.
        extracted_data: initial value handed to the first callable.

    Returns:
        The value produced by the last callable, or ``extracted_data``
        unchanged when the scheme has no (or an empty) ``'transforms'`` entry.
    """
    current = extracted_data
    # A missing, None or empty 'transforms' entry means "nothing to apply".
    for step in scheme_data.get('transforms') or ():
        logging.debug(step)
        try:
            current = step(current)
        except PROCESS_ERRORS as error:
            logging.debug(f'Transform error: {error} ')
            current = {}
        logging.debug(current)
    return current
67
+
68
+
69
def map_fields(scheme_data, transformed_data):
    """Build a ``{field name: string value}`` mapping from transformed data.

    Every entry of ``scheme_data['fields']`` maps a field name to a getter
    callable; each getter is called with ``transformed_data``.

    Args:
        scheme_data: mapping with a ``'fields'`` entry of
            ``name -> callable(transformed_data)``.
        transformed_data: object passed unchanged to every getter.

    Returns:
        dict: one entry per field whose getter succeeded. Values are
        ``str(value)``, except that ``None``, ``[]`` and ``{}`` become ``''``.
        Fields whose getter (or string conversion) raised one of
        ``PROCESS_ERRORS`` are omitted and the failure is logged at debug
        level.

    Raises:
        KeyError: if ``scheme_data`` has no ``'fields'`` entry.
    """
    values = {}
    for name, get_field in scheme_data['fields'].items():
        try:
            value = get_field(transformed_data)
            # Empty results are normalised to '' instead of 'None'/'[]'/'{}'.
            values[name] = str(value) if value not in (None, [], {}) else ''
        except PROCESS_ERRORS as e:
            # Lazy %-args: the message is only formatted if debug is enabled.
            logging.debug('Unable to extract field %s: %s', name, e)
    return values
78
+
79
+
55
80
def extract (page ):
56
81
for scheme_name , scheme_data in schemes .items ():
57
82
flags = scheme_data ['flags' ]
@@ -80,21 +105,11 @@ def extract(page):
80
105
81
106
if scheme_data .get ('extract_json' , False ):
82
107
extracted = regexp_group .group (1 )
83
-
84
108
logging .debug ('Extracted: %s' , extracted )
85
109
86
- transforms = scheme_data .get ('transforms' , [])
87
- if transforms :
88
- for t in transforms :
89
- logging .debug (t )
90
- try :
91
- extracted = t (extracted )
92
- except PROCESS_ERRORS as e :
93
- logging .debug (f'Transform error: { e } ' )
94
- extracted = {}
95
- logging .debug (extracted )
110
+ transformed = transform (scheme_data , extracted )
96
111
97
- json_data = json .loads (extracted )
112
+ json_data = json .loads (transformed )
98
113
99
114
if json_data == {}:
100
115
logging .debug ('Unabled to extract json!' )
@@ -107,14 +122,17 @@ def extract(page):
107
122
with open ('debug_extracted.json' , 'w' ) as f :
108
123
f .write (loaded_json_str )
109
124
110
- for name , get_field in scheme_data ['fields' ].items ():
111
- try :
112
- value = get_field (json_data )
113
- values [name ] = str (value ) if value not in (None , [], {}) else ''
114
- except PROCESS_ERRORS as e :
115
- logging .debug (f'Unable to extact field { name } : { e } ' )
125
+ values = map_fields (scheme_data , json_data )
116
126
else :
117
- values = regexp_group .groupdict ()
127
+ groupdict = regexp_group .groupdict ()
128
+ if groupdict :
129
+ values = groupdict
130
+ else :
131
+ extracted = regexp_group .group (1 )
132
+ logging .debug ('Extracted: %s' , extracted )
133
+
134
+ transformed_data = transform (scheme_data , extracted )
135
+ values = map_fields (scheme_data , transformed_data )
118
136
119
137
if use_html_parser :
120
138
soup = bs (page , 'html.parser' )
0 commit comments