Skip to content

Commit ef0ecc3

Browse files
midavadim and girarda authored
🎉 Source Mixpanel low code migration (#36724)
Co-authored-by: Alexandre Girard <[email protected]>
1 parent c2d133d commit ef0ecc3

File tree

22 files changed

+904
-1000
lines changed

22 files changed

+904
-1000
lines changed

airbyte-integrations/connectors/source-mixpanel/integration_tests/abnormal_state.json

+3-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
"type": "STREAM",
44
"stream": {
55
"stream_state": {
6+
"36152117": { "date": "2030-01-01" },
67
"41833532": { "date": "2030-01-01" },
7-
"36152117": { "date": "2030-01-01" }
8+
"41833755": { "date": "2030-01-01" },
9+
"41833700": { "date": "2030-01-01" }
810
},
911
"stream_descriptor": { "name": "funnels" }
1012
}

airbyte-integrations/connectors/source-mixpanel/metadata.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ data:
1111
connectorSubtype: api
1212
connectorType: source
1313
definitionId: 12928b32-bf0a-4f1e-964f-07e12e37153a
14-
dockerImageTag: 2.2.2
14+
dockerImageTag: 2.3.0
1515
dockerRepository: airbyte/source-mixpanel
1616
documentationUrl: https://docs.airbyte.com/integrations/sources/mixpanel
1717
githubIssueLabel: source-mixpanel
@@ -58,5 +58,5 @@ data:
5858
supportLevel: certified
5959
tags:
6060
- language:python
61-
- cdk:python
61+
- cdk:low-code
6262
metadataSpecVersion: "1.0"

airbyte-integrations/connectors/source-mixpanel/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",]
33
build-backend = "poetry.core.masonry.api"
44

55
[tool.poetry]
6-
version = "2.2.2"
6+
version = "2.3.0"
77
name = "source-mixpanel"
88
description = "Source implementation for Mixpanel."
99
authors = [ "Airbyte <[email protected]>",]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
# Copyright (c) 2024 Airbyte, Inc., all rights reserved.
2+
3+
import time
4+
from dataclasses import dataclass
5+
from typing import Any, Iterable, List, Mapping, MutableMapping, Optional
6+
7+
import dpath.util
8+
import requests
9+
from airbyte_cdk.models import AirbyteMessage, SyncMode, Type
10+
from airbyte_cdk.sources.declarative.extractors import DpathExtractor
11+
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString
12+
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import LegacyToPerPartitionStateMigration
13+
from airbyte_cdk.sources.declarative.models import DatetimeBasedCursor
14+
from airbyte_cdk.sources.declarative.partition_routers import SubstreamPartitionRouter
15+
from airbyte_cdk.sources.declarative.requesters import HttpRequester
16+
from airbyte_cdk.sources.declarative.requesters.paginators.strategies.page_increment import PageIncrement
17+
from airbyte_cdk.sources.declarative.schema import JsonFileSchemaLoader
18+
from airbyte_cdk.sources.declarative.schema.json_file_schema_loader import _default_file_path
19+
from airbyte_cdk.sources.declarative.transformations import RecordTransformation
20+
from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState
21+
22+
from .source import SourceMixpanel
23+
from .streams.engage import EngageSchema
24+
25+
26+
class MixpanelHttpRequester(HttpRequester):
    """HTTP requester shared by the Mixpanel declarative streams.

    On top of the base requester it:
    - sends a fixed ``Accept: application/json`` header,
    - adds the ``project_id`` query parameter when one is configured,
    - flattens nested pagination parameters before they are sent,
    - waits between consecutive requests according to ``reqs_per_hour_limit``.
    """

    # Requests allowed per hour; a falsy value (0) disables the waiting entirely.
    reqs_per_hour_limit = 60
    # The first request issued by an instance is sent without waiting.
    is_first_request = True

    def get_request_headers(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """Return the static headers attached to every request."""
        return {"Accept": "application/json"}

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> MutableMapping[str, Any]:
        """Return ``{"project_id": ...}`` when ``credentials.project_id`` is configured, otherwise ``{}``."""
        project_id = self.config.get("credentials", {}).get("project_id")
        return {"project_id": project_id} if project_id else {}

    def _request_params(
        self,
        stream_state: Optional[StreamState],
        stream_slice: Optional[StreamSlice],
        next_page_token: Optional[Mapping[str, Any]],
        extra_params: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """
        Flatten extra_params if it contains pagination information

        When ``extra_params["page"]`` is itself a mapping, its entries are lifted to the
        top level in place of the ``page`` key. The caller's mapping is never modified:
        the flattening is done on a copy. A non-mapping ``page`` value is passed through
        unchanged.
        """
        next_page_token = None  # reset it, pagination data is in extra_params
        if extra_params:
            page = extra_params.get("page")
            if isinstance(page, Mapping):
                # Work on a copy: `extra_params` is typed as a (possibly immutable) Mapping
                # and belongs to the caller.
                flattened = {key: value for key, value in extra_params.items() if key != "page"}
                flattened.update(page)
                extra_params = flattened
        return super()._request_params(stream_state, stream_slice, next_page_token, extra_params)

    def send_request(self, **kwargs) -> Optional[requests.Response]:
        """Send the request, sleeping beforehand on every call except the first one.

        The wait is ``3600 / reqs_per_hour_limit`` seconds and is skipped completely when
        ``reqs_per_hour_limit`` is falsy.
        """
        if self.reqs_per_hour_limit:
            if self.is_first_request:
                self.is_first_request = False
            else:
                # we skip this block, if self.reqs_per_hour_limit = 0,
                # in all other cases wait for X seconds to match API limitations
                # https://help.mixpanel.com/hc/en-us/articles/115004602563-Rate-Limits-for-Export-API-Endpoints#api-export-endpoint-rate-limits
                delay = 3600 / self.reqs_per_hour_limit
                self.logger.info(
                    f"Sleep for {delay} seconds to match API limitations after reading from {self.name}"
                )
                time.sleep(delay)

        return super().send_request(**kwargs)
81+
82+
83+
class AnnotationsHttpRequester(MixpanelHttpRequester):
    """Requester variant that contributes no query parameters of its own."""

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> MutableMapping[str, Any]:
        """Always return an empty mapping, so the parent's ``project_id`` parameter is not added."""
        no_params: MutableMapping[str, Any] = {}
        return no_params
92+
93+
94+
class FunnelsHttpRequester(MixpanelHttpRequester):
    """Requester variant that always asks for day-level granularity."""

    def get_request_params(
        self,
        *,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> MutableMapping[str, Any]:
        """Return the parent's query parameters with ``unit`` set to ``"day"``."""
        inherited = super().get_request_params(
            stream_state=stream_state,
            stream_slice=stream_slice,
            next_page_token=next_page_token,
        )
        inherited.update(unit="day")
        return inherited
105+
106+
107+
class CohortMembersSubstreamPartitionRouter(SubstreamPartitionRouter):
    """Partition router that turns the slice's cohort id into a request-body filter."""

    def get_request_body_json(
        self,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
        next_page_token: Optional[Mapping[str, Any]] = None,
    ) -> Mapping[str, Any]:
        """Build the JSON body restricting the query to the cohort named by ``stream_slice["id"]``.

        The filter value is a JSON-encoded string, not a nested object.
        """
        # https://developer.mixpanel.com/reference/engage-query
        cohort_filter = f'{{"id":{stream_slice["id"]}}}'
        return {"filter_by_cohort": cohort_filter}
117+
118+
119+
class EngageTransformation(RecordTransformation):
    """Reshape a raw Engage record into a flat mapping."""

    def transform(
        self,
        record: Record,
        config: Optional[Config] = None,
        stream_state: Optional[StreamState] = None,
        stream_slice: Optional[StreamSlice] = None,
    ) -> Record:
        """
        Flatten the record in place and return it:
        - '$distinct_id' is renamed to 'distinct_id'
        - every entry of '$properties' is moved to the top level
        - a leading '$' is stripped from property names

        Raises KeyError when '$distinct_id' or '$properties' is absent.
        """
        record["distinct_id"] = record.pop("$distinct_id")
        for raw_name, value in record.pop("$properties").items():
            # Just remove leading '$' for 'reserved' mixpanel properties name, example:
            # from API: '$browser'
            # to stream: 'browser'
            target_name = raw_name[1:] if raw_name.startswith("$") else raw_name
            record[target_name] = value

        return record
143+
144+
145+
class RevenueDpathExtractor(DpathExtractor):
    """Extractor that turns a date-keyed mapping into one record per date."""

    def extract_records(self, response: requests.Response) -> List[Mapping[str, Any]]:
        """
        Emit one record per date key, skipping the '$overall' aggregate.

        Each output record is the per-date mapping with the key added under "date".

        response.json() example:
        {
            'computed_at': '2021-07-03T12:43:48.889421+00:00',
            'results': {
                '$overall': {       <-- should be skipped
                    'amount': 0.0,
                    'count': 124,
                    'paid_count': 0
                },
                '2021-06-01': {
                    'amount': 0.0,
                    'count': 124,
                    'paid_count': 0
                },
                '2021-06-02': {
                    'amount': 0.0,
                    'count': 124,
                    'paid_count': 0
                },
                ...
            },
            'session_id': '162...',
            'status': 'ok'
        }
        """
        extracted = super().extract_records(response)
        return [
            {"date": date_key, **per_date[date_key]}
            for per_date in extracted
            for date_key in per_date
            if date_key != "$overall"
        ]
179+
180+
181+
class FunnelsDpathExtractor(DpathExtractor):
    """Extractor that turns a date-keyed mapping into one record per date."""

    def extract_records(self, response: requests.Response) -> List[Mapping[str, Any]]:
        """
        Emit one record per key of every mapping returned by the parent extractor.

        Each mapping is expected to be keyed by date, with a mapping as value. The value
        is copied into the output record and its key is stored under "date". No key is
        filtered out.

        Illustrative shape of an extracted mapping (field names inside each date are
        placeholders; NOTE(review): confirm against the Funnels API response):
        {
            '2021-06-01': {...},
            '2021-06-02': {...},
            ...
        }
        becomes
        [
            {'date': '2021-06-01', ...},
            {'date': '2021-06-02', ...},
            ...
        ]
        """
        extracted = super().extract_records(response)
        return [
            {"date": date_key, **per_date[date_key]}
            for per_date in extracted
            for date_key in per_date
        ]
209+
210+
211+
class FunnelsSubstreamPartitionRouter(SubstreamPartitionRouter):
    """Substream router that emits one slice per parent record and carries the funnel name along."""

    def stream_slices(self) -> Iterable[StreamSlice]:
        """
        Yield one slice per parent record that contains the configured parent key.

        Each slice has:
        - partition: ``{partition_field: <value found at parent_key>}`` only; no
          'parent_slice' entry is added, to stay compatible with
          LegacyToPerPartitionStateMigration
        - cursor_slice: ``{"funnel_name": <parent record's "name">}``

        Parent records that lack the parent key are skipped, as are non-record messages.
        Nothing is yielded when no parent stream is configured.
        """
        for parent_stream_config in self.parent_stream_configs or []:
            parent_stream = parent_stream_config.stream
            parent_field = parent_stream_config.parent_key.eval(self.config)  # type: ignore # parent_key is always casted to an interpolated string
            partition_field = parent_stream_config.partition_field.eval(self.config)  # type: ignore # partition_field is always casted to an interpolated string
            for parent_stream_slice in parent_stream.stream_slices(
                sync_mode=SyncMode.full_refresh, cursor_field=None, stream_state=None
            ):
                for parent_record in parent_stream.read_records(
                    sync_mode=SyncMode.full_refresh, cursor_field=None, stream_slice=parent_stream_slice, stream_state=None
                ):
                    # Skip non-records (eg AirbyteLogMessage)
                    if isinstance(parent_record, AirbyteMessage):
                        if parent_record.type != Type.RECORD:
                            continue
                        parent_record = parent_record.record.data
                    elif isinstance(parent_record, Record):
                        parent_record = parent_record.data

                    try:
                        partition_value = dpath.util.get(parent_record, parent_field)
                    except KeyError:
                        # The parent record has no value at the parent key: no slice for it.
                        continue

                    yield StreamSlice(
                        partition={partition_field: partition_value},
                        cursor_slice={"funnel_name": parent_record.get("name")},
                    )
254+
255+
256+
@dataclass
class EngagePaginationStrategy(PageIncrement):
    """
    Engage stream uses 2 params for pagination:
    session_id - returned after first request
    page - incremental page number
    """

    # Total number of results, remembered from the response that reported it.
    # Deliberately left un-annotated so that @dataclass does not turn it into an init field.
    # The class-level default keeps the first call from failing when no `total` was reported.
    _total = None

    def next_page_token(self, response, last_records: List[Mapping[str, Any]]) -> Optional[Mapping[str, Any]]:
        """
        Determine the parameters of the next page request, if there is one.

        Attributes:
            response: HTTP response whose JSON body is read for `page`, `total` and `session_id`
            last_records: records parsed from the response (not used by this strategy)

        Returns:
            ``{"session_id": ..., "page": <next page number>}`` while the remembered total
            exceeds ``page_size * (page + 1)``; otherwise ``None``, after which the
            remembered total is cleared.
        """
        decoded_response = response.json()
        page_number = decoded_response.get("page")
        total = decoded_response.get("total")  # exist only on first page
        if total:
            self._total = total

        has_more = bool(self._total) and page_number is not None and self._total > self.page_size * (page_number + 1)
        if has_more:
            return {"session_id": decoded_response.get("session_id"), "page": page_number + 1}

        self._total = None
        return None
283+
284+
285+
class EngageJsonFileSchemaLoader(JsonFileSchemaLoader):
    """Engage schema combines static and dynamic approaches"""

    # Cached result of get_json_schema(); empty until the first successful load.
    schema: Mapping[str, Any]

    def __post_init__(self, parameters: Mapping[str, Any]):
        """Resolve the schema file path (falling back to the default one) and start with an empty cache."""
        self.file_path = InterpolatedString.create(self.file_path or _default_file_path(), parameters=parameters)
        self.schema = {}

    def get_json_schema(self) -> Mapping[str, Any]:
        """
        Return the static JSON schema extended with the properties reported by the API.

        The result is cached on the instance to reduce the number of API calls, because
        get_json_schema() is called for each extracted record.
        """
        if self.schema:
            return self.schema

        schema = super().get_json_schema()
        schema["additionalProperties"] = self.config.get("select_properties_by_default", True)

        # JSON-schema fragment used for each property type reported by the API
        types = {
            "boolean": {"type": ["null", "boolean"]},
            "number": {"type": ["null", "number"], "multipleOf": 1e-20},
            # no format specified as values can be "2021-12-16T00:00:00", "1638298874", "15/08/53895"
            "datetime": {"type": ["null", "string"]},
            "object": {"type": ["null", "object"], "additionalProperties": True},
            "list": {"type": ["null", "array"], "required": False, "items": {}},
            "string": {"type": ["null", "string"]},
        }

        stream_kwargs = {
            "authenticator": SourceMixpanel.get_authenticator(self.config),
            "region": self.config.get("region"),
        }
        project_id = self.config.get("credentials", {}).get("project_id")
        if project_id:
            stream_kwargs["project_id"] = project_id

        # read existing Engage schema from API
        for entry in EngageSchema(**stream_kwargs).read_records(sync_mode=SyncMode.full_refresh):
            name: str = entry["name"]
            declared_type: str = entry["type"]
            if name.startswith("$"):
                # Just remove leading '$' for 'reserved' mixpanel properties name, example:
                # from API: '$browser'
                # to stream: 'browser'
                name = name[1:]
            # Do not overwrite 'standard' hard-coded properties, add 'custom' properties
            if name in schema["properties"]:
                continue
            schema["properties"][name] = types.get(declared_type, {"type": ["null", "string"]})

        self.schema = schema
        return schema

0 commit comments

Comments
 (0)