Skip to content

Commit 5b6b48c

Browse files
authored
🎉 Source GitHub: Use CDK caching and convert PR-related streams to incremental (#7250)
* Source GitHub: Use CDK caching and convert PR-related streams to incremental * Remove extra change * Consolidate * Address comments * Fix integration test config * Fix merge * Update sample state * Bump release version * Bump version * Address feedback * Bump version * Fix formatting
1 parent 678cfbe commit 5b6b48c

File tree

10 files changed

+82
-51
lines changed

10 files changed

+82
-51
lines changed

‎airbyte-config/init/src/main/resources/seed/source_definitions.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -217,7 +217,7 @@
217217
- name: GitHub
218218
sourceDefinitionId: ef69ef6e-aa7f-4af1-a01d-ef775033524e
219219
dockerRepository: airbyte/source-github
220-
dockerImageTag: 0.2.9
220+
dockerImageTag: 0.2.10
221221
documentationUrl: https://docs.airbyte.io/integrations/sources/github
222222
icon: github.svg
223223
sourceType: api

‎airbyte-integrations/connectors/source-github/Dockerfile

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ RUN pip install .
1212
ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
1313
ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
1414

15-
LABEL io.airbyte.version=0.2.9
15+
LABEL io.airbyte.version=0.2.10
1616
LABEL io.airbyte.name=airbyte/source-github

‎airbyte-integrations/connectors/source-github/acceptance-test-config.yml

+2
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,11 @@ tests:
2525
issue_milestones: ["airbytehq/integration-test", "updated_at"]
2626
issues: ["airbytehq/integration-test", "updated_at"]
2727
projects: ["airbytehq/integration-test", "updated_at"]
28+
pull_request_stats: ["airbytehq/integration-test", "updated_at"]
2829
pull_requests: ["airbytehq/integration-test", "updated_at"]
2930
releases: ["airbytehq/integration-test", "created_at"]
3031
review_comments: ["airbytehq/integration-test", "updated_at"]
32+
reviews: ["airbytehq/integration-test", "submitted_at"]
3133
stargazers: ["airbytehq/integration-test", "starred_at"]
3234
full_refresh:
3335
- config_path: "secrets/config.json"

‎airbyte-integrations/connectors/source-github/integration_tests/abnormal_state.json

+10
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@
3939
"updated_at": "2121-06-28T17:24:51Z"
4040
}
4141
},
42+
"pull_request_stats": {
43+
"airbytehq/integration-test": {
44+
"updated_at": "2121-06-29T02:04:57Z"
45+
}
46+
},
4247
"pull_requests": {
4348
"airbytehq/integration-test": {
4449
"updated_at": "2121-06-28T23:36:35Z"
@@ -54,6 +59,11 @@
5459
"updated_at": "2121-06-23T23:57:07Z"
5560
}
5661
},
62+
"reviews": {
63+
"airbytehq/integration-test": {
64+
"submitted_at": "2121-06-29T02:04:57Z"
65+
}
66+
},
5767
"stargazers": {
5868
"airbytehq/integration-test": {
5969
"starred_at": "2121-06-29T02:04:57Z"

‎airbyte-integrations/connectors/source-github/integration_tests/configured_catalog.json

+12-6
Original file line numberDiff line numberDiff line change
@@ -198,11 +198,14 @@
198198
"stream": {
199199
"name": "pull_request_stats",
200200
"json_schema": {},
201-
"supported_sync_modes": ["full_refresh"],
201+
"supported_sync_modes": ["full_refresh", "incremental"],
202+
"source_defined_cursor": true,
203+
"default_cursor_field": ["updated_at"],
202204
"source_defined_primary_key": [["id"]]
203205
},
204-
"sync_mode": "full_refresh",
205-
"destination_sync_mode": "overwrite"
206+
"sync_mode": "incremental",
207+
"destination_sync_mode": "append",
208+
"cursor_field": ["updated_at"]
206209
},
207210
{
208211
"stream": {
@@ -257,11 +260,14 @@
257260
"stream": {
258261
"name": "reviews",
259262
"json_schema": {},
260-
"supported_sync_modes": ["full_refresh"],
263+
"supported_sync_modes": ["full_refresh", "incremental"],
264+
"source_defined_cursor": true,
265+
"default_cursor_field": ["submitted_at"],
261266
"source_defined_primary_key": [["id"]]
262267
},
263-
"sync_mode": "full_refresh",
264-
"destination_sync_mode": "overwrite"
268+
"sync_mode": "incremental",
269+
"destination_sync_mode": "append",
270+
"cursor_field": ["submitted_at"]
265271
},
266272
{
267273
"stream": {

‎airbyte-integrations/connectors/source-github/integration_tests/sample_state.json

+10
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,11 @@
2929
"created_at": "2021-06-23T23:57:07Z"
3030
}
3131
},
32+
"pull_request_stats": {
33+
"airbytehq/integration-test": {
34+
"updated_at": "2021-08-30T12:01:15Z"
35+
}
36+
},
3237
"pull_requests": {
3338
"airbytehq/integration-test": {
3439
"updated_at": "2021-06-28T23:36:35Z"
@@ -53,5 +58,10 @@
5358
"airbytehq/integration-test": {
5459
"created_at": "2021-06-30T10:04:41Z"
5560
}
61+
},
62+
"reviews": {
63+
"airbytehq/integration-test": {
64+
"submitted_at": "2021-08-30T12:01:15Z"
65+
}
5666
}
5767
}

‎airbyte-integrations/connectors/source-github/source_github/schemas/pull_request_stats.json

+4
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@
4949
},
5050
"changed_files": {
5151
"type": ["null", "integer"]
52+
},
53+
"updated_at": {
54+
"type": ["null", "string"],
55+
"format": "date-time"
5256
}
5357
}
5458
}

‎airbyte-integrations/connectors/source-github/source_github/source.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -179,12 +179,12 @@ def streams(self, config: Mapping[str, Any]) -> List[Stream]:
179179
Organizations(**organization_args),
180180
Projects(**repository_args_with_start_date),
181181
PullRequestCommentReactions(**repository_args_with_start_date),
182-
PullRequestStats(parent=pull_requests_stream, **repository_args),
182+
PullRequestStats(parent=pull_requests_stream, **repository_args_with_start_date),
183183
PullRequests(**repository_args_with_start_date),
184184
Releases(**repository_args_with_start_date),
185185
Repositories(**organization_args),
186186
ReviewComments(**repository_args_with_start_date),
187-
Reviews(parent=pull_requests_stream, **repository_args),
187+
Reviews(parent=pull_requests_stream, **repository_args_with_start_date),
188188
Stargazers(**repository_args_with_start_date),
189189
Tags(**repository_args),
190190
Teams(**organization_args),

‎airbyte-integrations/connectors/source-github/source_github/streams.py

+39-41
Original file line numberDiff line numberDiff line change
@@ -2,51 +2,23 @@
22
# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
33
#
44

5-
import os
65
import time
76
from abc import ABC, abstractmethod
87
from copy import deepcopy
98
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Union
109
from urllib import parse
1110

1211
import requests
13-
import vcr
1412
from airbyte_cdk.models import SyncMode
1513
from airbyte_cdk.sources.streams.http import HttpStream, HttpSubStream
1614
from requests.exceptions import HTTPError
17-
from vcr.cassette import Cassette
18-
19-
20-
def request_cache() -> Cassette:
21-
"""
22-
Builds VCR instance.
23-
It deletes file everytime we create it, normally should be called only once.
24-
We can't use NamedTemporaryFile here because yaml serializer doesn't work well with empty files.
25-
"""
26-
filename = "request_cache.yml"
27-
try:
28-
os.remove(filename)
29-
except FileNotFoundError:
30-
pass
31-
32-
return vcr.use_cassette(str(filename), record_mode="new_episodes", serializer="yaml")
3315

3416

3517
class GithubStream(HttpStream, ABC):
36-
cache = request_cache()
3718
url_base = "https://api.github.com/"
3819

39-
# To prevent dangerous behavior, the `vcr` library prohibits the use of nested caching.
40-
# Here's an example of dangerous behavior:
41-
# cache = Cassette.use('whatever')
42-
# with cache:
43-
# with cache:
44-
# pass
45-
#
46-
# Therefore, we will only use `cache` for the top-level stream, so as not to cause possible difficulties.
47-
top_level_stream = True
48-
4920
primary_key = "id"
21+
use_cache = True
5022

5123
# GitHub pagination could be from 1 to 100.
5224
page_size = 100
@@ -100,11 +72,7 @@ def backoff_time(self, response: requests.Response) -> Union[int, float]:
10072

10173
def read_records(self, stream_slice: Mapping[str, any] = None, **kwargs) -> Iterable[Mapping[str, Any]]:
10274
try:
103-
if self.top_level_stream:
104-
with self.cache:
105-
yield from super().read_records(stream_slice=stream_slice, **kwargs)
106-
else:
107-
yield from super().read_records(stream_slice=stream_slice, **kwargs)
75+
yield from super().read_records(stream_slice=stream_slice, **kwargs)
10876
except HTTPError as e:
10977
error_msg = str(e)
11078

@@ -422,6 +390,7 @@ class PullRequests(SemiIncrementalGithubStream):
422390
"""
423391

424392
page_size = 50
393+
first_read_override_key = "first_read_override"
425394

426395
def __init__(self, **kwargs):
427396
super().__init__(**kwargs)
@@ -431,7 +400,7 @@ def read_records(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iter
431400
"""
432401
Decide if this a first read or not by the presence of the state object
433402
"""
434-
self._first_read = not bool(stream_state)
403+
self._first_read = not bool(stream_state) or stream_state.get(self.first_read_override_key, False)
435404
yield from super().read_records(stream_state=stream_state, **kwargs)
436405

437406
def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
@@ -459,7 +428,7 @@ def is_sorted_descending(self) -> bool:
459428
"""
460429
Depending if there any state we read stream in ascending or descending order.
461430
"""
462-
return self._first_read
431+
return not self._first_read
463432

464433

465434
class CommitComments(SemiIncrementalGithubStream):
@@ -686,23 +655,42 @@ def path(self, stream_slice: Mapping[str, Any] = None, **kwargs) -> str:
686655
# Pull request substreams
687656

688657

689-
class PullRequestSubstream(HttpSubStream, GithubStream, ABC):
690-
top_level_stream = False
658+
class PullRequestSubstream(HttpSubStream, SemiIncrementalGithubStream, ABC):
659+
use_cache = False
691660

692661
def __init__(self, parent: PullRequests, **kwargs):
693662
super().__init__(parent=parent, **kwargs)
694663

695664
def stream_slices(
696665
self, sync_mode: SyncMode, cursor_field: List[str] = None, stream_state: Mapping[str, Any] = None
697666
) -> Iterable[Optional[Mapping[str, Any]]]:
698-
parent_stream_slices = super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=stream_state)
699-
667+
"""
668+
Override the parent PullRequests stream configuration to always fetch records in ascending order
669+
"""
670+
parent_state = deepcopy(stream_state) or {}
671+
parent_state[PullRequests.first_read_override_key] = True
672+
parent_stream_slices = super().stream_slices(sync_mode=sync_mode, cursor_field=cursor_field, stream_state=parent_state)
700673
for parent_stream_slice in parent_stream_slices:
701674
yield {
702675
"pull_request_number": parent_stream_slice["parent"]["number"],
703676
"repository": parent_stream_slice["parent"]["repository"],
704677
}
705678

679+
def read_records(
680+
self,
681+
sync_mode: SyncMode,
682+
cursor_field: List[str] = None,
683+
stream_slice: Mapping[str, Any] = None,
684+
stream_state: Mapping[str, Any] = None,
685+
) -> Iterable[Mapping[str, Any]]:
686+
"""
687+
We've already determined the list of pull requests to run the stream against.
688+
Skip the start_point_map and cursor_field logic in SemiIncrementalGithubStream.read_records.
689+
"""
690+
yield from super(SemiIncrementalGithubStream, self).read_records(
691+
sync_mode=sync_mode, cursor_field=cursor_field, stream_slice=stream_slice, stream_state=stream_state
692+
)
693+
706694

707695
class PullRequestStats(PullRequestSubstream):
708696
"""
@@ -731,19 +719,29 @@ class Reviews(PullRequestSubstream):
731719
API docs: https://docs.github.com/en/rest/reference/pulls#list-reviews-for-a-pull-request
732720
"""
733721

722+
cursor_field = "submitted_at"
723+
734724
def path(
735725
self, stream_state: Mapping[str, Any] = None, stream_slice: Mapping[str, Any] = None, next_page_token: Mapping[str, Any] = None
736726
) -> str:
737727
return f"repos/{stream_slice['repository']}/pulls/{stream_slice['pull_request_number']}/reviews"
738728

729+
# Set the parent stream state's cursor field before fetching its records
730+
def stream_slices(self, stream_state: Mapping[str, Any] = None, **kwargs) -> Iterable[Optional[Mapping[str, Any]]]:
731+
parent_state = deepcopy(stream_state) or {}
732+
for repository in self.repositories:
733+
if repository in parent_state and self.cursor_field in parent_state[repository]:
734+
parent_state[repository][self.parent.cursor_field] = parent_state[repository][self.cursor_field]
735+
yield from super().stream_slices(stream_state=parent_state, **kwargs)
736+
739737

740738
# Reactions streams
741739

742740

743741
class ReactionStream(GithubStream, ABC):
744742

745743
parent_key = "id"
746-
top_level_stream = False
744+
use_cache = False
747745

748746
def __init__(self, **kwargs):
749747
self._stream_kwargs = deepcopy(kwargs)

‎docs/integrations/sources/github.md

+1
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ Your token should have at least the `repo` scope. Depending on which streams you
9292

9393
| Version | Date | Pull Request | Subject |
9494
| :--- | :--- | :--- | :--- |
95+
| 0.2.10 | 2022-01-03 | [7250](https://github.com/airbytehq/airbyte/pull/7250) | Use CDK caching and convert PR-related streams to incremental |
9596
| 0.2.9 | 2021-12-29 | [9179](https://github.com/airbytehq/airbyte/pull/9179) | Use default retry delays on server error responses |
9697
| 0.2.8 | 2021-12-07 | [8524](https://github.com/airbytehq/airbyte/pull/8524) | Update connector fields title/description |
9798
| 0.2.7 | 2021-12-06 | [8518](https://github.com/airbytehq/airbyte/pull/8518) | Add connection retry with Github |

0 commit comments

Comments
 (0)