@@ -3,10 +3,8 @@
 import asyncio
 import logging
 import math
-from dataclasses import dataclass
-from datetime import timedelta
+from collections.abc import Iterable
 from queue import Queue
-from time import sleep
 from typing import TYPE_CHECKING, Any, TypedDict
 
 from apify_shared.utils import filter_out_none_values_recursively, ignore_docs, parse_date_fields
@@ -17,7 +15,7 @@
 from apify_client.clients.base import ResourceClient, ResourceClientAsync
 
 if TYPE_CHECKING:
-    from collections.abc import Iterable
+    from datetime import timedelta
 
     from apify_shared.consts import StorageGeneralAccess
 
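Review note: `Iterable` moves out of the `TYPE_CHECKING` block because it is now evaluated at runtime in the queue type subscripts below, while `timedelta` moves the other way, since after this change it appears only in annotations. A minimal sketch of why the runtime import is needed:

    from collections.abc import Iterable
    from queue import Queue

    # Subscripting Queue is evaluated at import time, so Iterable must be a real
    # runtime name; a TYPE_CHECKING-only import would raise NameError here.
    queue = Queue[Iterable[dict]]()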
@@ -43,19 +41,6 @@ class BatchAddRequestsResult(TypedDict):
     unprocessedRequests: list[dict]
 
 
-@dataclass
-class AddRequestsBatch:
-    """Batch of requests to add to the request queue.
-
-    Args:
-        requests: List of requests to be added to the request queue.
-        num_of_retries: Number of times this batch has been retried.
-    """
-
-    requests: Iterable[dict]
-    num_of_retries: int = 0
-
-
 class RequestQueueClient(ResourceClient):
     """Sub-client for manipulating a single request queue."""
 
@@ -301,8 +286,8 @@ def batch_add_requests(
         *,
         forefront: bool = False,
         max_parallel: int = 1,
-        max_unprocessed_requests_retries: int = 3,
-        min_delay_between_unprocessed_requests_retries: timedelta = timedelta(milliseconds=500),
+        max_unprocessed_requests_retries: int | None = None,
+        min_delay_between_unprocessed_requests_retries: timedelta | None = None,
     ) -> BatchAddRequestsResult:
         """Add requests to the request queue in batches.
@@ -316,13 +301,17 @@ def batch_add_requests(
             max_parallel: Specifies the maximum number of parallel tasks for API calls. This is only applicable
                 to the async client. For the sync client, this value must be set to 1, as parallel execution
                 is not supported.
-            max_unprocessed_requests_retries: Number of retry attempts for unprocessed requests.
-            min_delay_between_unprocessed_requests_retries: Minimum delay between retry attempts for unprocessed
-                requests.
+            max_unprocessed_requests_retries: Deprecated argument. Will be removed in the next major release.
+            min_delay_between_unprocessed_requests_retries: Deprecated argument. Will be removed in the next major release.
 
         Returns:
             Result containing lists of processed and unprocessed requests.
         """
+        if max_unprocessed_requests_retries:
+            logger.warning('`max_unprocessed_requests_retries` is deprecated and not used anymore.')
+        if min_delay_between_unprocessed_requests_retries:
+            logger.warning('`min_delay_between_unprocessed_requests_retries` is deprecated and not used anymore.')
+
         if max_parallel != 1:
             raise NotImplementedError('max_parallel is only supported in async client')
 
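Review note: both retry parameters now default to `None` and merely emit a deprecation warning when supplied. A minimal migration sketch for callers, assuming a placeholder token and queue ID:

    from apify_client import ApifyClient

    client = ApifyClient(token='my-token')  # placeholder token
    rq = client.request_queue('my-queue-id')  # placeholder queue ID

    # Drop the deprecated keyword arguments; the call is otherwise unchanged.
    result = rq.batch_add_requests(
        [{'url': 'https://example.com', 'uniqueKey': 'https://example.com'}],
    )
    print(result['processedRequests'], result['unprocessedRequests'])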
@@ -339,38 +328,30 @@ def batch_add_requests(
         )
 
         # Put the batches into the queue for processing.
-        queue = Queue[AddRequestsBatch]()
+        queue = Queue[Iterable[dict]]()
 
-        for b in batches:
-            queue.put(AddRequestsBatch(b))
+        for batch in batches:
+            queue.put(batch)
 
         processed_requests = list[dict]()
         unprocessed_requests = list[dict]()
 
         # Process all batches in the queue sequentially.
         while not queue.empty():
-            batch = queue.get()
+            request_batch = queue.get()
 
             # Send the batch to the API.
             response = self.http_client.call(
                 url=self._url('requests/batch'),
                 method='POST',
                 params=request_params,
-                json=list(batch.requests),
+                json=list(request_batch),
                 timeout_secs=_MEDIUM_TIMEOUT,
             )
 
-            # Retry if the request failed and the retry limit has not been reached.
-            if not response.is_success and batch.num_of_retries < max_unprocessed_requests_retries:
-                batch.num_of_retries += 1
-                sleep(min_delay_between_unprocessed_requests_retries.total_seconds())
-                queue.put(batch)
-
-            # Otherwise, add the processed/unprocessed requests to their respective lists.
-            else:
-                response_parsed = parse_date_fields(pluck_data(response.json()))
-                processed_requests.extend(response_parsed.get('processedRequests', []))
-                unprocessed_requests.extend(response_parsed.get('unprocessedRequests', []))
+            response_parsed = parse_date_fields(pluck_data(response.json()))
+            processed_requests.extend(response_parsed.get('processedRequests', []))
+            unprocessed_requests.extend(response_parsed.get('unprocessedRequests', []))
 
         return {
             'processedRequests': processed_requests,
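Review note: with the internal retry loop gone, whatever the API reports back as `unprocessedRequests` is returned to the caller as-is. Callers that relied on the old behaviour can retry on their side; a hedged sketch (the helper and constants are illustrative, not part of the client):

    import time

    MAX_ATTEMPTS = 3  # illustrative, mirrors the old default of 3 retries
    RETRY_DELAY_SECS = 0.5  # illustrative, mirrors the old 500 ms minimum delay

    def add_requests_with_retries(rq_client, requests: list[dict]) -> dict:
        """Re-submit whatever the API left unprocessed, up to MAX_ATTEMPTS."""
        pending = list(requests)
        processed: list[dict] = []
        for _ in range(MAX_ATTEMPTS):
            result = rq_client.batch_add_requests(pending)
            processed.extend(result['processedRequests'])
            pending = result['unprocessedRequests']
            if not pending:
                break
            time.sleep(RETRY_DELAY_SECS)
        return {'processedRequests': processed, 'unprocessedRequests': pending}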
@@ -667,14 +648,12 @@ async def delete_request_lock(
 
     async def _batch_add_requests_worker(
         self,
-        queue: asyncio.Queue[AddRequestsBatch],
+        queue: asyncio.Queue[Iterable[dict]],
         request_params: dict,
-        max_unprocessed_requests_retries: int,
-        min_delay_between_unprocessed_requests_retries: timedelta,
    ) -> BatchAddRequestsResult:
        """Worker function to process a batch of requests.
 
-        This worker will process batches from the queue, retrying requests that fail until the retry limit is reached.
+        This worker will process batches from the queue.
 
         Return result containing lists of processed and unprocessed requests by the worker.
         """
@@ -684,7 +663,7 @@ async def _batch_add_requests_worker(
         while True:
             # Get the next batch from the queue.
             try:
-                batch = await queue.get()
+                request_batch = await queue.get()
             except asyncio.CancelledError:
                 break
 
@@ -694,25 +673,13 @@ async def _batch_add_requests_worker(
                     url=self._url('requests/batch'),
                     method='POST',
                     params=request_params,
-                    json=list(batch.requests),
+                    json=list(request_batch),
                     timeout_secs=_MEDIUM_TIMEOUT,
                 )
 
                 response_parsed = parse_date_fields(pluck_data(response.json()))
-
-                # Retry if the request failed and the retry limit has not been reached.
-                if not response.is_success and batch.num_of_retries < max_unprocessed_requests_retries:
-                    batch.num_of_retries += 1
-                    await asyncio.sleep(min_delay_between_unprocessed_requests_retries.total_seconds())
-                    await queue.put(batch)
-
-                # Otherwise, add the processed/unprocessed requests to their respective lists.
-                else:
-                    processed_requests.extend(response_parsed.get('processedRequests', []))
-                    unprocessed_requests.extend(response_parsed.get('unprocessedRequests', []))
-
-            except Exception as exc:
-                logger.warning(f'Error occurred while processing a batch of requests: {exc}')
+                processed_requests.extend(response_parsed.get('processedRequests', []))
+                unprocessed_requests.extend(response_parsed.get('unprocessedRequests', []))
 
             finally:
                 # Mark the batch as done whether it succeeded or failed.
@@ -729,8 +696,8 @@ async def batch_add_requests(
         *,
         forefront: bool = False,
         max_parallel: int = 5,
-        max_unprocessed_requests_retries: int = 3,
-        min_delay_between_unprocessed_requests_retries: timedelta = timedelta(milliseconds=500),
+        max_unprocessed_requests_retries: int | None = None,
+        min_delay_between_unprocessed_requests_retries: timedelta | None = None,
     ) -> BatchAddRequestsResult:
         """Add requests to the request queue in batches.
@@ -744,15 +711,19 @@ async def batch_add_requests(
             max_parallel: Specifies the maximum number of parallel tasks for API calls. This is only applicable
                 to the async client. For the sync client, this value must be set to 1, as parallel execution
                 is not supported.
-            max_unprocessed_requests_retries: Number of retry attempts for unprocessed requests.
-            min_delay_between_unprocessed_requests_retries: Minimum delay between retry attempts for unprocessed
-                requests.
+            max_unprocessed_requests_retries: Deprecated argument. Will be removed in the next major release.
+            min_delay_between_unprocessed_requests_retries: Deprecated argument. Will be removed in the next major release.
 
         Returns:
             Result containing lists of processed and unprocessed requests.
         """
+        if max_unprocessed_requests_retries:
+            logger.warning('`max_unprocessed_requests_retries` is deprecated and not used anymore.')
+        if min_delay_between_unprocessed_requests_retries:
+            logger.warning('`min_delay_between_unprocessed_requests_retries` is deprecated and not used anymore.')
+
         tasks = set[asyncio.Task]()
-        queue: asyncio.Queue[AddRequestsBatch] = asyncio.Queue()
+        queue: asyncio.Queue[Iterable[dict]] = asyncio.Queue()
         request_params = self._params(clientKey=self.client_key, forefront=forefront)
 
         # Compute the payload size limit to ensure it doesn't exceed the maximum allowed size.
@@ -766,15 +737,13 @@ async def batch_add_requests(
         )
 
         for batch in batches:
-            await queue.put(AddRequestsBatch(batch))
+            await queue.put(batch)
 
         # Start a required number of worker tasks to process the batches.
         for i in range(max_parallel):
             coro = self._batch_add_requests_worker(
                 queue,
                 request_params,
-                max_unprocessed_requests_retries,
-                min_delay_between_unprocessed_requests_retries,
             )
             task = asyncio.create_task(coro, name=f'batch_add_requests_worker_{i}')
             tasks.add(task)
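Review note: the async client keeps its worker pool; `max_parallel` still controls how many workers drain the shared batch queue, and only the per-batch retry plumbing is removed. A minimal usage sketch of the async variant, with placeholder token and queue ID:

    import asyncio

    from apify_client import ApifyClientAsync

    async def main() -> None:
        client = ApifyClientAsync(token='my-token')  # placeholder token
        rq = client.request_queue('my-queue-id')  # placeholder queue ID
        result = await rq.batch_add_requests(
            [{'url': f'https://example.com/{i}', 'uniqueKey': f'https://example.com/{i}'} for i in range(1000)],
            max_parallel=5,  # five workers send batches concurrently
        )
        print(len(result['processedRequests']), 'requests processed')

    asyncio.run(main())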