2
2
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
3
3
#
4
4
5
- import time
6
5
import urllib .parse as urlparse
7
6
from abc import ABC
8
- from collections import deque
9
7
from datetime import datetime
10
8
from typing import Any , Iterable , Iterator , List , Mapping , MutableMapping , Optional , Sequence
11
9
21
19
from facebook_business .api import FacebookAdsApiBatch , FacebookRequest , FacebookResponse
22
20
from facebook_business .exceptions import FacebookRequestError
23
21
from source_facebook_marketing .api import API
22
+ from source_facebook_marketing .async_job_manager import InsightsAsyncJobManager
24
23
25
- from .async_job import AsyncJob
26
- from .common import FacebookAPIException , JobException , batch , deep_merge , retry_pattern
24
+ from .common import FacebookAPIException , batch , deep_merge , retry_pattern
27
25
28
26
backoff_policy = retry_pattern (backoff .expo , FacebookRequestError , max_tries = 5 , factor = 5 )
29
27
@@ -291,8 +289,8 @@ class AdsInsights(FBMarketingIncrementalStream):
291
289
]
292
290
293
291
MAX_ASYNC_SLEEP = pendulum .duration (minutes = 5 )
294
- MAX_ASYNC_JOBS = 10
295
- INSIGHTS_RETENTION_PERIOD = pendulum . duration ( days = 37 * 30 )
292
+ MAX_ASYNC_JOBS = 1000
293
+ INSIGHTS_RETENTION_PERIOD_MONTHES = 37
296
294
297
295
action_breakdowns = ALL_ACTION_BREAKDOWNS
298
296
level = "ad"
@@ -314,7 +312,7 @@ def __init__(
314
312
315
313
super ().__init__ (** kwargs )
316
314
self .lookback_window = pendulum .duration (days = buffer_days )
317
- self ._days_per_job = days_per_job
315
+ self ._days_per_job = 5 # days_per_job
318
316
self ._fields = fields
319
317
self .action_breakdowns = action_breakdowns or self .action_breakdowns
320
318
self .breakdowns = breakdowns or self .breakdowns
@@ -336,7 +334,7 @@ def read_records(
336
334
stream_state : Mapping [str , Any ] = None ,
337
335
) -> Iterable [Mapping [str , Any ]]:
338
336
"""Waits for current job to finish (slice) and yield its result"""
339
- job = self . wait_for_job ( stream_slice ["job" ])
337
+ job = stream_slice ["job" ]
340
338
# because we query `lookback_window` days before actual cursor we might get records older then cursor
341
339
342
340
for obj in job .get_result ():
@@ -349,50 +347,37 @@ def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Ite
349
347
2. we should run as many job as possible before checking for result
350
348
3. we shouldn't proceed to consumption of the next job before previous succeed
351
349
"""
352
- stream_state = stream_state or {}
353
- running_jobs = deque ()
354
- date_ranges = list (self ._date_ranges (stream_state = stream_state ))
355
- for params in date_ranges :
356
- params = deep_merge (params , self .request_params (stream_state = stream_state ))
357
- job = AsyncJob (api = self ._api , params = params )
358
- job .start ()
359
- running_jobs .append (job )
360
- if len (running_jobs ) >= self .MAX_ASYNC_JOBS :
361
- yield {"job" : running_jobs .popleft ()}
362
-
363
- while running_jobs :
364
- yield {"job" : running_jobs .popleft ()}
365
-
366
- @retry_pattern (backoff .expo , JobException , max_tries = 10 , factor = 5 )
367
- def wait_for_job (self , job : AsyncJob ) -> AsyncJob :
368
- if job .failed :
369
- job .restart ()
370
-
371
- factor = 2
372
- sleep_seconds = factor
373
- while not job .completed :
374
- self .logger .info (f"{ job } : sleeping { sleep_seconds } seconds while waiting for completion" )
375
- time .sleep (sleep_seconds )
376
- if sleep_seconds < self .MAX_ASYNC_SLEEP .in_seconds ():
377
- sleep_seconds *= factor
378
-
379
- return job
380
350
381
- def request_params (self , stream_state : Mapping [str , Any ], ** kwargs ) -> MutableMapping [str , Any ]:
382
- params = super ().request_params (stream_state = stream_state , ** kwargs )
383
- params = deep_merge (
384
- params ,
385
- {
386
- "level" : self .level ,
387
- "action_breakdowns" : self .action_breakdowns ,
388
- "breakdowns" : self .breakdowns ,
389
- "fields" : self .fields ,
390
- "time_increment" : self .time_increment ,
391
- "action_attribution_windows" : self .action_attribution_windows ,
392
- },
351
+ job_params = self .request_params (stream_state = stream_state )
352
+ job_manager = InsightsAsyncJobManager (
353
+ api = self ._api ,
354
+ job_params = job_params ,
355
+ start_days_per_job = 5 ,
356
+ from_date = self .get_start_date (stream_state ),
357
+ to_date = self ._end_date ,
393
358
)
359
+ job_manager .add_async_jobs ()
394
360
395
- return params
361
+ while not job_manager .done ():
362
+ yield {"job" : job_manager .get_next_completed_job ()}
363
+
364
def get_start_date(self, stream_state: Mapping[str, Any]) -> pendulum.Date:
    """Resolve the date from which insights should be requested.

    When a saved cursor is present in ``stream_state``, start from it minus
    the lookback window (recently emitted insight data may still change, so
    it is re-fetched). Otherwise fall back to the configured start date.
    Either way, the result is clamped so it never precedes the retention
    horizon derived from the configured end date.
    """
    cursor_value = (stream_state or {}).get(self.cursor_field)
    if cursor_value:
        candidate = pendulum.parse(cursor_value) - self.lookback_window
    else:
        candidate = self._start_date
    oldest_allowed = self._end_date.subtract(months=self.INSIGHTS_RETENTION_PERIOD_MONTHES)
    return max(oldest_allowed, candidate)
371
+
372
def request_params(self, stream_state: Mapping[str, Any], **kwargs) -> MutableMapping[str, Any]:
    """Return the query parameters describing what an insights job fetches.

    ``stream_state`` and any extra keyword arguments are accepted for
    signature compatibility only; they do not influence the result.
    """
    params: MutableMapping[str, Any] = {}
    params["level"] = self.level
    params["action_breakdowns"] = self.action_breakdowns
    params["breakdowns"] = self.breakdowns
    params["fields"] = self.fields
    params["time_increment"] = self.time_increment
    params["action_attribution_windows"] = self.action_attribution_windows
    return params
396
381
397
382
def _state_filter (self , stream_state : Mapping [str , Any ]) -> Mapping [str , Any ]:
398
383
"""Works differently for insights, so remove it"""
@@ -435,26 +420,6 @@ def _schema_for_breakdowns(self) -> Mapping[str, Any]:
435
420
436
421
return {breakdown : schemas [breakdown ] for breakdown in self .breakdowns }
437
422
438
- def _date_ranges (self , stream_state : Mapping [str , Any ]) -> Iterator [dict ]:
439
- """Iterate over period between start_date/state and now
440
-
441
- Notes: Facebook freezes insight data 28 days after it was generated, which means that all data
442
- from the past 28 days may have changed since we last emitted it, so we retrieve it again.
443
- """
444
- state_value = stream_state .get (self .cursor_field )
445
- if state_value :
446
- start_date = pendulum .parse (state_value ) - self .lookback_window
447
- else :
448
- start_date = self ._start_date
449
- end_date = self ._end_date
450
- start_date = max (end_date - self .INSIGHTS_RETENTION_PERIOD , start_date )
451
-
452
- for since in pendulum .period (start_date , end_date ).range ("days" , self ._days_per_job ):
453
- until = min (since .add (days = self ._days_per_job - 1 ), end_date ) # -1 because time_range is inclusive
454
- yield {
455
- "time_range" : {"since" : since .to_date_string (), "until" : until .to_date_string ()},
456
- }
457
-
458
423
459
424
class AdsInsightsAgeAndGender (AdsInsights ):
460
425
breakdowns = ["age" , "gender" ]
0 commit comments