@@ -4,22 +4,25 @@
 
 import csv
 import ctypes
-import io
 import math
+import os
 import time
 from abc import ABC
+from contextlib import closing
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union
 
+import pandas as pd
 import pendulum
 import requests  # type: ignore[import]
 from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.streams.http import HttpStream
 from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
+from numpy import nan
 from pendulum import DateTime  # type: ignore[attr-defined]
 from requests import codes, exceptions
 
 from .api import UNSUPPORTED_FILTERING_STREAMS, Salesforce
-from .exceptions import SalesforceException
+from .exceptions import SalesforceException, TmpFileIOError
 from .rate_limiting import default_backoff_handler
 
 # https://stackoverflow.com/a/54517228

@@ -136,17 +139,17 @@ def path(self, next_page_token: Mapping[str, Any] = None, **kwargs: Any) -> str:
     transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization | TransformConfig.DefaultSchemaNormalization)
 
     @default_backoff_handler(max_tries=5, factor=15)
-    def _send_http_request(self, method: str, url: str, json: dict = None):
+    def _send_http_request(self, method: str, url: str, json: dict = None, stream: bool = False):
         headers = self.authenticator.get_auth_header()
-        response = self._session.request(method, url=url, headers=headers, json=json)
+        response = self._session.request(method, url=url, headers=headers, json=json, stream=stream)
         if response.status_code not in [200, 204]:
             self.logger.error(f"error body: {response.text}, sobject options: {self.sobject_options}")
             response.raise_for_status()
         return response
 
     def create_stream_job(self, query: str, url: str) -> Optional[str]:
         """
-        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.htm
+        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.html
         """
         json = {"operation": "queryAll", "query": query, "contentType": "CSV", "columnDelimiter": "COMMA", "lineEnding": "LF"}
         try:

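For readers unfamiliar with streaming in `requests`, here is a minimal standalone sketch (not part of this diff) of the pattern the new `stream` flag enables: the response body is consumed in fixed-size chunks rather than buffered in memory all at once. The URL and output file name below are placeholders.

```python
from contextlib import closing

import requests

# Placeholder endpoint: any URL returning a large body would do here.
url = "https://example.com/large-export.csv"

with closing(requests.get(url, stream=True)) as response:
    response.raise_for_status()
    with open("export.csv", "wb") as out:
        # iter_content yields the body piece by piece instead of loading it whole
        for chunk in response.iter_content(chunk_size=1024):
            out.write(chunk)
```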
@@ -210,7 +213,7 @@ def wait_for_job(self, url: str) -> str:
                 # this is only job metadata without payload
                 error_message = job_info.get("errorMessage")
                 if not error_message:
-                    # not all failed response can have "errorMessage" and we need to print full response body
+                    # not all failed responses have "errorMessage", so we need to show the full response body
                     error_message = job_info
                 self.logger.error(f"JobStatus: {job_status}, sobject options: {self.sobject_options}, error message: '{error_message}'")
 

@@ -257,13 +260,47 @@ def filter_null_bytes(self, s: str):
             self.logger.warning("Filter 'null' bytes from string, size reduced %d -> %d chars", len(s), len(res))
         return res
 
-    def download_data(self, url: str) -> Iterable[Tuple[int, Mapping[str, Any]]]:
-        job_data = self._send_http_request("GET", f"{url}/results")
-        decoded_content = self.filter_null_bytes(job_data.content.decode("utf-8"))
-        fp = io.StringIO(decoded_content, newline="")
-        csv_data = csv.DictReader(fp, dialect="unix")
-        for n, row in enumerate(csv_data, 1):
-            yield n, row
+    def download_data(self, url: str, chunk_size: float = 1024) -> os.PathLike:
+        """
+        Retrieves binary data result from a successfully `executed_job`, using chunks, to avoid local memory limitations.
+        @ url: string - the url of the `executed_job`
+        @ chunk_size: float - the buffer size for each chunk to fetch from the stream, in bytes, default: 1024 bytes
+
+        Returns the file path of the downloaded binary data, saved temporarily.
+        """
+        # set the filepath for the binary data from the response
+        tmp_file = os.path.realpath(os.path.basename(url))
+        with closing(self._send_http_request("GET", f"{url}/results", stream=True)) as response:
+            with open(tmp_file, "w") as data_file:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    data_file.writelines(self.filter_null_bytes(chunk.decode("utf-8")))
+        # check that the file exists
+        if os.path.isfile(tmp_file):
+            return tmp_file
+        else:
+            raise TmpFileIOError(f"An IO error occurred while verifying the binary data. Stream: {self.name}, file {tmp_file} doesn't exist.")
+
+    def read_with_chunks(self, path: str = None, chunk_size: int = 100) -> Iterable[Tuple[int, Mapping[str, Any]]]:
+        """
+        Reads the downloaded binary data in chunks of lines, with the chunk size set by `chunk_size`.
+        @ path: string - the path to the temporarily downloaded binary data.
+        @ chunk_size: int - the number of lines to read at a time, default: 100 lines.
+        """
+        try:
+            with open(path, "r", encoding="utf-8") as data:
+                chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix")
+                for chunk in chunks:
+                    chunk = chunk.replace({nan: None}).to_dict(orient="records")
+                    for n, row in enumerate(chunk, 1):
+                        yield n, row
+        except pd.errors.EmptyDataError as e:
+            self.logger.info(f"Empty data received. {e}")
+            yield from []
+        except IOError as ioe:
+            raise TmpFileIOError(f"An IO error occurred while reading the tmp data. Called: {path}. Stream: {self.name}", ioe)
+        finally:
+            # remove the binary tmp file after the data is read
+            os.remove(path)
 
     def abort_job(self, url: str):
         data = {"state": "Aborted"}

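For reference, a self-contained sketch of the chunked CSV-reading technique that `read_with_chunks` relies on: pandas' `chunksize` turns `read_csv` into an iterator of small DataFrames, and `replace({nan: None})` normalizes missing cells before the rows are converted to dicts. The sample file, column names, and chunk size below are illustrative only.

```python
import csv
import os
import tempfile

import pandas as pd
from numpy import nan

# Build a small sample CSV standing in for a downloaded Salesforce result.
fd, path = tempfile.mkstemp(suffix=".csv")
with os.fdopen(fd, "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["Id", "Name"])
    writer.writerow(["A-1", "Acme"])
    writer.writerow(["A-2", ""])  # empty cell is read as NaN, then mapped to None

try:
    # chunksize makes read_csv return an iterator of DataFrames instead of one big frame
    for chunk in pd.read_csv(path, chunksize=1, dialect="unix"):
        for row in chunk.replace({nan: None}).to_dict(orient="records"):
            print(row)
finally:
    os.remove(path)  # mirror the connector's cleanup of its tmp file
```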
@@ -292,7 +329,6 @@ def request_params(
 
         if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
             query += f"ORDER BY {self.primary_key} ASC LIMIT {self.page_size}"
-
         return {"q": query}
 
     def read_records(

@@ -325,7 +361,7 @@ def read_records(
 
             count = 0
             record: Mapping[str, Any] = {}
-            for count, record in self.download_data(url=job_full_url):
+            for count, record in self.read_with_chunks(self.download_data(url=job_full_url)):
                 yield record
             self.delete_job(url=job_full_url)
 
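Conceptually, the composed call in `read_records` now behaves like the sketch below: one function streams the payload to a temporary file, and a generator reads it back lazily and removes the file once iteration finishes. `fetch_to_tmp_file` and `iter_rows` are hypothetical stand-ins for `download_data` and `read_with_chunks`, not connector code.

```python
import os
import tempfile


def fetch_to_tmp_file() -> str:
    """Stand-in for download_data: write the 'downloaded' CSV to a tmp file and return its path."""
    fd, path = tempfile.mkstemp(suffix=".csv")
    with os.fdopen(fd, "w") as f:
        f.write("Id,Name\nA-1,Acme\nA-2,Initech\n")
    return path


def iter_rows(path: str):
    """Stand-in for read_with_chunks: yield numbered rows lazily, then remove the tmp file."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            header = f.readline().rstrip("\n").split(",")
            for n, line in enumerate(f, 1):
                yield n, dict(zip(header, line.rstrip("\n").split(",")))
    finally:
        os.remove(path)


# The download happens once up front; rows are consumed lazily afterwards,
# so only a small window of the file is held in memory at any time.
for count, record in iter_rows(fetch_to_tmp_file()):
    print(count, record)
```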