Skip to content

Commit a405fe1

Browse files
committed
4827 WIP !!!
1 parent a53dd7e commit a405fe1

File tree

6 files changed

+107
-78
lines changed

6 files changed

+107
-78
lines changed

airbyte-integrations/bases/source-acceptance-test/source_acceptance_test/tests/test_core.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ def _validate_schema(records, configured_catalog):
155155
for stream_name, errors in streams_errors.items():
156156
errors = map(str, errors.values())
157157
str_errors = f"\n{bar}\n".join(errors)
158-
logging.error(f"The {stream_name} stream has the following schema errors:\n{str_errors}")
158+
logging.error(f"\n3The {stream_name} stream has the following schema errors:\n{str_errors}")
159159

160160
if streams_errors:
161161
pytest.fail(f"Please check your json_schema in selected streams {tuple(streams_errors.keys())}.")

airbyte-integrations/connectors/source-facebook-marketing/acceptance-test-config.yml

+3-2
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,12 @@ tests:
1515
- config_path: "secrets/config.json"
1616
configured_catalog_path: "integration_tests/configured_catalog.json"
1717
# FB serializes numeric fields as strings
18-
validate_schema: no
18+
# validate_schema: no
19+
timeout_seconds: 600
1920
incremental:
2021
- config_path: "secrets/config.json"
2122
configured_catalog_path: "integration_tests/configured_catalog_without_insights.json"
22-
future_state_path: "integration_tests/abnormal_state.json"
23+
future_state_path: "integration_tests/future_state.json"
2324
full_refresh:
2425
- config_path: "secrets/config.json"
2526
configured_catalog_path: "integration_tests/configured_catalog.json"
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,5 @@
11
{
22
"streams": [
3-
{
4-
"stream": {
5-
"name": "campaigns",
6-
"json_schema": {},
7-
"supported_sync_modes": ["full_refresh", "incremental"],
8-
"source_defined_cursor": true,
9-
"default_cursor_field": ["updated_time"],
10-
"source_defined_primary_key": [["id"]],
11-
"namespace": null
12-
},
13-
"sync_mode": "incremental",
14-
"cursor_field": null,
15-
"destination_sync_mode": "append",
16-
"primary_key": null
17-
},
183
{
194
"stream": {
205
"name": "ad_sets",
@@ -30,36 +15,6 @@
3015
"destination_sync_mode": "append",
3116
"primary_key": null
3217
},
33-
{
34-
"stream": {
35-
"name": "ads",
36-
"json_schema": {},
37-
"supported_sync_modes": ["full_refresh", "incremental"],
38-
"source_defined_cursor": true,
39-
"default_cursor_field": ["updated_time"],
40-
"source_defined_primary_key": [["id"]],
41-
"namespace": null
42-
},
43-
"sync_mode": "incremental",
44-
"cursor_field": null,
45-
"destination_sync_mode": "append",
46-
"primary_key": null
47-
},
48-
{
49-
"stream": {
50-
"name": "ad_creatives",
51-
"json_schema": {},
52-
"supported_sync_modes": ["full_refresh"],
53-
"source_defined_cursor": null,
54-
"default_cursor_field": null,
55-
"source_defined_primary_key": [["id"]],
56-
"namespace": null
57-
},
58-
"sync_mode": "full_refresh",
59-
"cursor_field": null,
60-
"destination_sync_mode": "append",
61-
"primary_key": null
62-
},
6318
{
6419
"stream": {
6520
"name": "ads_insights",
@@ -74,21 +29,6 @@
7429
"cursor_field": ["date_start"],
7530
"destination_sync_mode": "append",
7631
"primary_key": null
77-
},
78-
{
79-
"stream": {
80-
"name": "ads_insights_age_and_gender",
81-
"json_schema": {},
82-
"supported_sync_modes": ["full_refresh", "incremental"],
83-
"source_defined_cursor": true,
84-
"default_cursor_field": ["date_start"],
85-
"source_defined_primary_key": null,
86-
"namespace": null
87-
},
88-
"sync_mode": "incremental",
89-
"cursor_field": ["date_start"],
90-
"destination_sync_mode": "append",
91-
"primary_key": null
9232
}
9333
]
9434
}
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,42 @@
11
{
22
"campaigns": {
3-
"updated_time": "2021-07-25T13:34:26Z",
3+
"updated_time": "2121-07-25T13:34:26Z",
44
"include_deleted": true
55
},
66
"ad_creatives": {
7-
"updated_time": "2021-07-25T13:34:26Z",
7+
"updated_time": "2121-07-25T13:34:26Z",
88
"include_deleted": true
99
},
1010
"ad_sets": {
11-
"updated_time": "2021-07-25T13:34:26Z",
11+
"updated_time": "2121-07-25T13:34:26Z",
1212
"include_deleted": true
1313
},
1414
"ads": {
15-
"updated_time": "2021-07-25T13:34:26Z",
15+
"updated_time": "2121-07-25T13:34:26Z",
1616
"include_deleted": true
1717
},
1818
"ads_insights": {
19-
"date_start": "2021-07-25T13:34:26Z",
19+
"date_start": "2121-07-25T13:34:26Z",
2020
"include_deleted": true
2121
},
2222
"ads_insights_age_and_gender": {
23-
"date_start": "2021-07-25T13:34:26Z",
23+
"date_start": "2121-07-25T13:34:26Z",
2424
"include_deleted": true
2525
},
2626
"ads_insights_country": {
27-
"date_start": "2021-07-25T13:34:26Z",
27+
"date_start": "2121-07-25T13:34:26Z",
2828
"include_deleted": true
2929
},
3030
"ads_insights_dma": {
31-
"date_start": "2021-07-25T13:34:26Z",
31+
"date_start": "2121-07-25T13:34:26Z",
3232
"include_deleted": true
3333
},
3434
"ads_insights_platfrom_and_device": {
35-
"date_start": "2021-07-25T13:34:26Z",
35+
"date_start": "2121-07-25T13:34:26Z",
3636
"include_deleted": true
3737
},
3838
"ads_insights_region": {
39-
"date_start": "2021-07-25T13:34:26Z",
39+
"date_start": "2121-07-25T13:34:26Z",
4040
"include_deleted": true
4141
}
4242
}

airbyte-integrations/connectors/source-facebook-marketing/source_facebook_marketing/schemas/ads.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@
148148
"action.type": {
149149
"type": ["null", "array"],
150150
"items": {
151-
"type": ["null", "string"]
151+
"type": ["null", "array"]
152152
}
153153
},
154154
"post.wall": {

airbyte-integrations/connectors/source-facebook-marketing/source_facebook_marketing/streams.py

+92-4
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,78 @@ def read_records(
107107
) -> Iterable[Mapping[str, Any]]:
108108
"""Main read method used by CDK"""
109109
for record in self._read_records(params=self.request_params(stream_state=stream_state)):
110-
yield self._extend_record(record, fields=self.fields)
110+
yield self.transform(self._extend_record(record, fields=self.fields))
111+
112+
# for i in range(3):
113+
# yield self.transform(
114+
# {
115+
# 'tracking_specs': [
116+
# {'action.type': ['offsite_conversion'], 'fb_pixel': ['2667253716886462']},
117+
# {'action.type': ['attention_event'], 'creative': ['23846815595220398']},
118+
# {'action.type': ['post_engagement'], 'page': ['112704783733939'], 'post': ['244953057175777']},
119+
# {'action.type': ['link_click'], 'post': ['244953057175777'], 'post.wall': ['112704783733939']},
120+
# {'action.type': ['dwell'], 'creative': ['23846815595220398']}
121+
# ],
122+
# 'updated_time': '2021-02-15T08:49:56-0800'
123+
# }
124+
# )
125+
126+
def transform(self, record: Mapping[str, Any]) -> Mapping[str, Any]:
    """Coerce the values of ``record`` to the types declared in the stream's JSON schema.

    The schema is obtained from ``self.get_json_schema()`` and its
    ``properties`` mapping is handed to ``self.lol_dict`` together with the
    record. The record is expected to be updated in place by ``lol_dict``;
    the very same object is returned so the call can be used inline in a
    ``yield`` expression.

    :param record: record to normalize
    :return: the same ``record`` object that was passed in
    """
    schema = self.get_json_schema()
    # Deliberately no per-record logging: the previous error-level dump of
    # every record was debugging residue that flooded the log output.
    # A schema without "properties" simply means there is nothing to coerce.
    self.lol_dict(record, schema.get("properties", {}))
    return record
135+
136+
def get_python_type(self, _types):
    """Map JSON-schema type name(s) to a tuple of Python types.

    The result is always a tuple so it can be passed directly as the second
    argument of ``isinstance``. ``"null"`` has no Python class counterpart
    here and is dropped, so a schema type of ``"null"`` or ``["null"]``
    yields an empty tuple.

    :param _types: a single JSON-schema type name or a list of them
    :return: tuple of Python types, in the order the names were given
    :raises KeyError: if a name is not a known JSON-schema type
    """
    types_mapping = {
        "string": str,
        "number": float,
        "integer": int,
        "object": dict,
        "array": list,
        "boolean": bool,
    }

    # Normalize the single-name form to a list; calling tuple() on a type
    # object (the previous behaviour) raises TypeError.
    names = _types if isinstance(_types, list) else [_types]
    return tuple(types_mapping[name] for name in names if name != "null")
151+
152+
def lol_dict(self, record, schema):
    """Coerce the values of ``record`` in place to the types declared in ``schema``.

    ``schema`` is a JSON-schema ``properties`` mapping (field name -> field
    schema). Keys of ``record`` that are missing from ``schema`` are left
    untouched, as are ``None`` values. Nested dicts are processed
    recursively against the field's ``properties``; lists are processed
    element by element against the field's ``items`` schema. Any other
    value that is not an instance of the declared type(s) is converted by
    calling the first declared Python type on it.

    :param record: dict to normalize; modified in place
    :param schema: ``properties`` mapping describing ``record``
    :return: None
    """
    # Iterate over a snapshot because values are reassigned while looping.
    for key, value in list(record.items()):
        field_schema = schema.get(key)
        # None is a legitimate "null" value and must not become "None" / crash int().
        if not field_schema or value is None:
            continue

        if isinstance(value, dict):
            self.lol_dict(record=value, schema=field_schema.get("properties", {}))
        elif isinstance(value, list) and "type" in field_schema.get("items", {}):
            item_schema = field_schema["items"]
            item_types = self.get_python_type(item_schema["type"])
            for index, item in enumerate(value):
                if item is None or list in item_types:
                    # TODO: lists of lists are not coerced yet.
                    continue
                if dict in item_types:
                    if isinstance(item, dict):
                        self.lol_dict(record=item, schema=item_schema.get("properties", {}))
                elif item_types and not isinstance(item, item_types):
                    # Replace only the offending element, not the whole list.
                    value[index] = item_types[0](item)

        # Top-level check of the value itself against the field's declared type.
        expected = self.get_python_type(field_schema["type"]) if "type" in field_schema else ()
        if expected and not isinstance(value, expected):
            record[key] = expected[0](value)
171+
172+
173+
# def lol_list(self, record, list_records, schema):
174+
# for list_item in list_records:
175+
# if list in self.get_python_type(schema[key]["items"]["type"]):
176+
# # TODO If you have list of lists then add `if` below
177+
# pass
178+
# elif dict in self.get_python_type(schema[key]["items"]["type"]):
179+
# self.lol_dict(record=record_list_item, schema=schema[key]["items"]["properties"])
180+
# elif not isinstance(record_list_item, self.get_python_type(schema[key]["items"]["type"])):
181+
# record[key] = self.get_python_type(schema[key]["items"]["type"])[0](record_list_item)
111182

112183
def _read_records(self, params: Mapping[str, Any]) -> Iterable:
113184
"""Wrapper around query to backoff errors.
@@ -295,7 +366,7 @@ class AdsInsights(FBMarketingIncrementalStream):
295366
MAX_WAIT_TO_START = pendulum.duration(minutes=5)
296367
MAX_WAIT_TO_FINISH = pendulum.duration(minutes=30)
297368
MAX_ASYNC_SLEEP = pendulum.duration(minutes=5)
298-
MAX_ASYNC_JOBS = 3
369+
MAX_ASYNC_JOBS = 10
299370
INSIGHTS_RETENTION_PERIOD = pendulum.duration(days=37 * 30)
300371

301372
action_breakdowns = ALL_ACTION_BREAKDOWNS
@@ -305,6 +376,23 @@ class AdsInsights(FBMarketingIncrementalStream):
305376

306377
breakdowns = []
307378

379+
fields_to_transform = (
380+
(int, ("clicks", "impressions", "reach", "unique_clicks", )),
381+
(float, ("frequency", "social_spend", "spend", "wish_bid", )),
382+
383+
384+
(list, (
385+
("actions", (
386+
(int, ("1d_click", "7d_click", "28d_click", )),
387+
(float, ("value", ))
388+
)),
389+
("unique_actions", (
390+
(int, ("1d_click", "7d_click", "28d_click",)),
391+
(float, ("value",))
392+
)),
393+
)),
394+
)
395+
308396
def __init__(self, buffer_days, days_per_job, **kwargs):
309397
super().__init__(**kwargs)
310398
self.lookback_window = pendulum.duration(days=buffer_days)
@@ -322,7 +410,7 @@ def read_records(
322410
# because we query `lookback_window` days before the actual cursor, we might get records older than the cursor
323411

324412
for obj in result.get_result():
325-
yield obj.export_all_data()
413+
yield self.transform(obj.export_all_data())
326414

327415
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
328416
"""Slice by date periods and schedule async job for each period, run at most MAX_ASYNC_JOBS jobs at the same time.
@@ -353,7 +441,7 @@ def wait_for_job(self, job) -> AdReportRun:
353441
job = job.api_get()
354442
job_progress_pct = job["async_percent_completion"]
355443
job_id = job["report_run_id"]
356-
self.logger.info(f"ReportRunId {job_id} is {job_progress_pct}% complete")
444+
self.logger.info(f"ReportRunId {job_id} is {job_progress_pct}% complete ({job['async_status']})")
357445
runtime = pendulum.now() - start_time
358446

359447
if job["async_status"] == "Job Completed":

0 commit comments

Comments
 (0)