2
2
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
3
3
#
4
4
5
- import os
6
5
import time
7
6
from abc import ABC , abstractmethod
8
7
from copy import deepcopy
9
8
from typing import Any , Iterable , List , Mapping , MutableMapping , Optional , Union
10
9
from urllib import parse
11
10
12
11
import requests
13
- import vcr
14
12
from airbyte_cdk .models import SyncMode
15
13
from airbyte_cdk .sources .streams .http import HttpStream , HttpSubStream
16
14
from requests .exceptions import HTTPError
17
- from vcr .cassette import Cassette
18
-
19
-
20
- def request_cache () -> Cassette :
21
- """
22
- Builds VCR instance.
23
- It deletes file everytime we create it, normally should be called only once.
24
- We can't use NamedTemporaryFile here because yaml serializer doesn't work well with empty files.
25
- """
26
- filename = "request_cache.yml"
27
- try :
28
- os .remove (filename )
29
- except FileNotFoundError :
30
- pass
31
-
32
- return vcr .use_cassette (str (filename ), record_mode = "new_episodes" , serializer = "yaml" )
33
15
34
16
35
17
class GithubStream (HttpStream , ABC ):
36
- cache = request_cache ()
37
18
url_base = "https://api.github.com/"
38
19
39
- # To prevent dangerous behavior, the `vcr` library prohibits the use of nested caching.
40
- # Here's an example of dangerous behavior:
41
- # cache = Cassette.use('whatever')
42
- # with cache:
43
- # with cache:
44
- # pass
45
- #
46
- # Therefore, we will only use `cache` for the top-level stream, so as not to cause possible difficulties.
47
- top_level_stream = True
48
-
49
20
primary_key = "id"
21
+ use_cache = True
50
22
51
23
# GitHub pagination could be from 1 to 100.
52
24
page_size = 100
@@ -100,11 +72,7 @@ def backoff_time(self, response: requests.Response) -> Union[int, float]:
100
72
101
73
def read_records (self , stream_slice : Mapping [str , any ] = None , ** kwargs ) -> Iterable [Mapping [str , Any ]]:
102
74
try :
103
- if self .top_level_stream :
104
- with self .cache :
105
- yield from super ().read_records (stream_slice = stream_slice , ** kwargs )
106
- else :
107
- yield from super ().read_records (stream_slice = stream_slice , ** kwargs )
75
+ yield from super ().read_records (stream_slice = stream_slice , ** kwargs )
108
76
except HTTPError as e :
109
77
error_msg = str (e )
110
78
@@ -422,6 +390,7 @@ class PullRequests(SemiIncrementalGithubStream):
422
390
"""
423
391
424
392
page_size = 50
393
+ first_read_override_key = "first_read_override"
425
394
426
395
def __init__ (self , ** kwargs ):
427
396
super ().__init__ (** kwargs )
@@ -431,7 +400,7 @@ def read_records(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iter
431
400
"""
432
401
Decide whether this is a first read or not based on the presence of the state object
433
402
"""
434
- self ._first_read = not bool (stream_state )
403
+ self ._first_read = not bool (stream_state ) or stream_state . get ( self . first_read_override_key , False )
435
404
yield from super ().read_records (stream_state = stream_state , ** kwargs )
436
405
437
406
def path (self , stream_slice : Mapping [str , Any ] = None , ** kwargs ) -> str :
@@ -459,7 +428,7 @@ def is_sorted_descending(self) -> bool:
459
428
"""
460
429
Depending on whether there is any state, we read the stream in ascending or descending order.
461
430
"""
462
- return self ._first_read
431
+ return not self ._first_read
463
432
464
433
465
434
class CommitComments (SemiIncrementalGithubStream ):
@@ -686,23 +655,42 @@ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
686
655
# Pull request substreams
687
656
688
657
689
- class PullRequestSubstream (HttpSubStream , GithubStream , ABC ):
690
- top_level_stream = False
658
class PullRequestSubstream(HttpSubStream, SemiIncrementalGithubStream, ABC):
    """
    Base class for streams that are read once per pull request produced by a parent `PullRequests` stream.
    """

    # Overrides the `use_cache = True` default declared on GithubStream.
    use_cache = False

    def __init__(self, parent: PullRequests, **kwargs):
        super().__init__(parent=parent, **kwargs)

    def stream_slices(
        self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
    ) -> Iterable[Optional[Mapping[str, Any]]]:
        """
        Emit one slice per parent pull request, holding its number and repository.

        The state handed to the parent is a private copy flagged with
        `PullRequests.first_read_override_key`, so the parent PullRequests stream
        always fetches its records in ascending order.
        """
        overridden_state = deepcopy(stream_state) or {}
        overridden_state[PullRequests.first_read_override_key] = True

        for parent_slice in super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=overridden_state):
            pull_request = parent_slice["parent"]
            yield {
                "pull_request_number": pull_request["number"],
                "repository": pull_request["repository"],
            }

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: List[str] = None,
        stream_slice: Mapping[str, Any] = None,
        stream_state: Mapping[str, Any] = None,
    ) -> Iterable[Mapping[str, Any]]:
        """
        The slices already pin down which pull requests to read, so the start_point_map and
        cursor_field logic of SemiIncrementalGithubStream.read_records is bypassed: the call is
        delegated to the read_records implementation that follows SemiIncrementalGithubStream in the MRO.
        """
        next_in_mro = super(SemiIncrementalGithubStream, self)
        yield from next_in_mro.read_records(
            sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
        )
693
+
706
694
707
695
class PullRequestStats (PullRequestSubstream ):
708
696
"""
@@ -731,19 +719,29 @@ class Reviews(PullRequestSubstream):
731
719
API docs: https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request
732
720
"""
733
721
722
+ cursor_field = "submitted_at"
723
+
734
724
def path(
    self,
    stream_state: Optional[Mapping[str, Any]] = None,
    stream_slice: Optional[Mapping[str, Any]] = None,
    next_page_token: Optional[Mapping[str, Any]] = None,
) -> str:
    """
    Build the API path that lists the reviews of a single pull request.

    :param stream_state: not used by this method.
    :param stream_slice: mapping that must provide the "repository" and "pull_request_number" keys.
    :param next_page_token: not used by this method.
    :return: path of the form "repos/<repository>/pulls/<pull_request_number>/reviews".
    """
    # The path segments must be joined without any whitespace, otherwise the resulting URL is malformed.
    return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_request_number']}/reviews"
738
728
729
+ # Set the parent stream state's cursor field before fetching its records
730
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
    """
    Produce the slices from a private copy of the state in which, for every configured
    repository that already has a value under this stream's cursor field, that value is
    also stored under the parent stream's cursor field.
    """
    state_for_parent = deepcopy(stream_state) or {}
    for repo_name in self.repositories:
        if repo_name not in state_for_parent:
            continue
        repo_state = state_for_parent[repo_name]
        if self.cursor_field in repo_state:
            repo_state[self.parent.cursor_field] = repo_state[self.cursor_field]
    yield from super().stream_slices(stream_state=state_for_parent, **kwargs)
736
+
739
737
740
738
# Reactions streams
741
739
742
740
743
741
class ReactionStream (GithubStream , ABC ):
744
742
745
743
parent_key = "id"
746
- top_level_stream = False
744
+ use_cache = False
747
745
748
746
def __init__ (self , ** kwargs ):
749
747
self ._stream_kwargs = deepcopy (kwargs )
0 commit comments