
Commit 644ace4

🐛 [On call / 171] Source Salesforce: fixed a bug where Bulk fetch took all the memory of Kubernetes pods (#11692)
1 parent 1613904 commit 644ace4
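
This commit replaces the connector's previous behaviour of decoding an entire Bulk job result in memory with a two-step pattern: stream the CSV payload into a temporary file, then read it back a chunk of rows at a time with pandas. The sketch below illustrates the pattern in isolation; it is not the connector's code, and the helper names, temp-file handling and buffer sizes are illustrative only (the real implementation is `download_data` and `read_with_chunks` in the streams.py diff further down).

```python
import os
import tempfile
from contextlib import closing
from typing import Any, Iterable, Mapping

import pandas as pd
import requests
from numpy import nan


def download_to_tmp_file(url: str, chunk_size: int = 1024) -> str:
    """Stream the HTTP response to disk in small chunks so memory usage stays flat."""
    fd, tmp_path = tempfile.mkstemp(suffix=".csv")
    with closing(requests.get(url, stream=True)) as response, os.fdopen(fd, "w") as tmp:
        for chunk in response.iter_content(chunk_size=chunk_size):
            tmp.write(chunk.decode("utf-8"))
    return tmp_path


def read_in_chunks(path: str, chunk_size: int = 100) -> Iterable[Mapping[str, Any]]:
    """Yield records a hundred rows at a time instead of materialising the whole CSV."""
    try:
        for frame in pd.read_csv(path, chunksize=chunk_size, dialect="unix"):
            yield from frame.replace({nan: None}).to_dict(orient="records")
    finally:
        os.remove(path)  # drop the temporary file once it has been fully consumed
```

In the connector these two steps back `BulkSalesforceStream.read_records`, and the new memory test added in this commit asserts that peak usage stays within a few MiB even for 200k records.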

File tree

10 files changed: +134 −47 lines


airbyte-config/init/src/main/resources/seed/source_definitions.yaml

+1 −1

@@ -689,7 +689,7 @@
 - name: Salesforce
   sourceDefinitionId: b117307c-14b6-41aa-9422-947e34922962
   dockerRepository: airbyte/source-salesforce
-  dockerImageTag: 1.0.2
+  dockerImageTag: 1.0.3
   documentationUrl: https://docs.airbyte.io/integrations/sources/salesforce
   icon: salesforce.svg
   sourceType: api

airbyte-config/init/src/main/resources/seed/source_specs.yaml

+1 −1

@@ -7298,7 +7298,7 @@
   supportsNormalization: false
   supportsDBT: false
   supported_destination_sync_modes: []
-- dockerImage: "airbyte/source-salesforce:1.0.2"
+- dockerImage: "airbyte/source-salesforce:1.0.3"
   spec:
     documentationUrl: "https://docs.airbyte.com/integrations/sources/salesforce"
     connectionSpecification:
airbyte-integrations/connectors/source-salesforce/Dockerfile

@@ -1,29 +1,17 @@
-FROM python:3.9.11-alpine3.15 as base
-FROM base as builder
+FROM python:3.9-slim
 
+# Bash is installed for more convenient debugging.
+RUN apt-get update && apt-get install -y bash && rm -rf /var/lib/apt/lists/*
 
-RUN apk --no-cache upgrade \
-    && pip install --upgrade pip \
-    && apk --no-cache add tzdata build-base
+ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 
 WORKDIR /airbyte/integration_code
+COPY source_salesforce ./source_salesforce
 COPY setup.py ./
-RUN pip install --prefix=/install .
-
-
-FROM base
-COPY --from=builder /install /usr/local
-# add default timezone settings
-COPY --from=builder /usr/share/zoneinfo/Etc/UTC /etc/localtime
-RUN echo "Etc/UTC" > /etc/timezone
-
-WORKDIR /airbyte/integration_code
 COPY main.py ./
-COPY source_salesforce ./source_salesforce
+RUN pip install .
 
-
-ENV AIRBYTE_ENTRYPOINT "python /airbyte/integration_code/main.py"
 ENTRYPOINT ["python", "/airbyte/integration_code/main.py"]
 
-LABEL io.airbyte.version=1.0.2
+LABEL io.airbyte.version=1.0.3
 LABEL io.airbyte.name=airbyte/source-salesforce

airbyte-integrations/connectors/source-salesforce/README.md

+2 −1

@@ -101,7 +101,8 @@ Customize `acceptance-test-config.yml` file to configure tests. See [Source Acce
 If your connector requires to create or destroy resources for use during acceptance tests create fixtures for it and place them inside integration_tests/acceptance.py.
 To run your integration tests with acceptance tests, from the connector root, run
 ```
-python -m pytest integration_tests -p integration_tests.acceptance
+docker build . --no-cache -t airbyte/source-salesforce:dev \
+&& python -m pytest -p source_acceptance_test.plugin
 ```
 To run your integration tests with docker
 

airbyte-integrations/connectors/source-salesforce/setup.py

+2 −2

@@ -5,9 +5,9 @@
 
 from setuptools import find_packages, setup
 
-MAIN_REQUIREMENTS = ["airbyte-cdk", "vcrpy==4.1.1"]
+MAIN_REQUIREMENTS = ["airbyte-cdk", "vcrpy==4.1.1", "pandas"]
 
-TEST_REQUIREMENTS = ["pytest~=6.1", "source-acceptance-test", "requests_mock", "pytest-timeout"]
+TEST_REQUIREMENTS = ["pytest~=6.1", "requests_mock", "source-acceptance-test", "pytest-timeout"]
 
 setup(
     name="source_salesforce",

airbyte-integrations/connectors/source-salesforce/source_salesforce/exceptions.py

+15

@@ -3,6 +3,16 @@
 #
 
 
+from airbyte_cdk.logger import AirbyteLogger
+
+
+class Error(Exception):
+    """Base Error class for other exceptions"""
+
+    # Define the instance of the Native Airbyte Logger
+    logger = AirbyteLogger()
+
+
 class SalesforceException(Exception):
     """
     Default Salesforce exception.
@@ -13,3 +23,8 @@ class TypeSalesforceException(SalesforceException):
     """
     We use this exception for unknown input data types for Salesforce.
     """
+
+
+class TmpFileIOError(Error):
+    def __init__(self, msg: str, err: str = None):
+        self.logger.fatal(f"{msg}. Error: {err}")
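
A hypothetical usage sketch of the new exception (the real call sites are in the streams.py diff below). The path and message here are invented; the point is that `TmpFileIOError.__init__` emits a fatal log through the shared `AirbyteLogger` defined in exceptions.py before the exception propagates.

```python
import os

from source_salesforce.exceptions import TmpFileIOError

tmp_file = "/tmp/7504W00000bkgnpQAA.csv"  # illustrative path only
if not os.path.isfile(tmp_file):
    # Constructing the exception already logs a fatal message via the logger.
    raise TmpFileIOError(f"The IO/Error occurred while verifying binary data, file {tmp_file} doesn't exist.")
```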

airbyte-integrations/connectors/source-salesforce/source_salesforce/streams.py

+51 −15

@@ -4,22 +4,25 @@
 
 import csv
 import ctypes
-import io
 import math
+import os
 import time
 from abc import ABC
+from contextlib import closing
 from typing import Any, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Type, Union
 
+import pandas as pd
 import pendulum
 import requests  # type: ignore[import]
 from airbyte_cdk.models import SyncMode
 from airbyte_cdk.sources.streams.http import HttpStream
 from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
+from numpy import nan
 from pendulum import DateTime  # type: ignore[attr-defined]
 from requests import codes, exceptions
 
 from .api import UNSUPPORTED_FILTERING_STREAMS, Salesforce
-from .exceptions import SalesforceException
+from .exceptions import SalesforceException, TmpFileIOError
 from .rate_limiting import default_backoff_handler
 
 # https://stackoverflow.com/a/54517228
@@ -136,17 +139,17 @@ def path(self, next_page_token: Mapping[str, Any] = None, **kwargs: Any) -> str:
     transformer = TypeTransformer(TransformConfig.CustomSchemaNormalization | TransformConfig.DefaultSchemaNormalization)
 
     @default_backoff_handler(max_tries=5, factor=15)
-    def _send_http_request(self, method: str, url: str, json: dict = None):
+    def _send_http_request(self, method: str, url: str, json: dict = None, stream: bool = False):
         headers = self.authenticator.get_auth_header()
-        response = self._session.request(method, url=url, headers=headers, json=json)
+        response = self._session.request(method, url=url, headers=headers, json=json, stream=stream)
         if response.status_code not in [200, 204]:
             self.logger.error(f"error body: {response.text}, sobject options: {self.sobject_options}")
             response.raise_for_status()
         return response
 
     def create_stream_job(self, query: str, url: str) -> Optional[str]:
         """
-        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.htm
+        docs: https://developer.salesforce.com/docs/atlas.en-us.api_asynch.meta/api_asynch/create_job.html
         """
         json = {"operation": "queryAll", "query": query, "contentType": "CSV", "columnDelimiter": "COMMA", "lineEnding": "LF"}
         try:
@@ -210,7 +213,7 @@ def wait_for_job(self, url: str) -> str:
                     # this is only job metadata without payload
                     error_message = job_info.get("errorMessage")
                     if not error_message:
-                        # not all failed response can have "errorMessage" and we need to print full response body
+                        # not all failed response can have "errorMessage" and we need to show full response body
                         error_message = job_info
                     self.logger.error(f"JobStatus: {job_status}, sobject options: {self.sobject_options}, error message: '{error_message}'")
 
@@ -257,13 +260,47 @@ def filter_null_bytes(self, s: str):
         self.logger.warning("Filter 'null' bytes from string, size reduced %d -> %d chars", len(s), len(res))
         return res
 
-    def download_data(self, url: str) -> Iterable[Tuple[int, Mapping[str, Any]]]:
-        job_data = self._send_http_request("GET", f"{url}/results")
-        decoded_content = self.filter_null_bytes(job_data.content.decode("utf-8"))
-        fp = io.StringIO(decoded_content, newline="")
-        csv_data = csv.DictReader(fp, dialect="unix")
-        for n, row in enumerate(csv_data, 1):
-            yield n, row
+    def download_data(self, url: str, chunk_size: float = 1024) -> os.PathLike:
+        """
+        Retrieves binary data result from successfully `executed_job`, using chunks, to avoid local memory limitations.
+        @ url: string - the url of the `executed_job`
+        @ chunk_size: float - the buffer size for each chunk to fetch from stream, in bytes, default: 1024 bytes
+
+        Returns the string with file path of downloaded binary data. Saved temporarily.
+        """
+        # set filepath for binary data from response
+        tmp_file = os.path.realpath(os.path.basename(url))
+        with closing(self._send_http_request("GET", f"{url}/results", stream=True)) as response:
+            with open(tmp_file, "w") as data_file:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    data_file.writelines(self.filter_null_bytes(chunk.decode("utf-8")))
+        # check the file exists
+        if os.path.isfile(tmp_file):
+            return tmp_file
+        else:
+            raise TmpFileIOError(f"The IO/Error occurred while verifying binary data. Stream: {self.name}, file {tmp_file} doesn't exist.")
+
+    def read_with_chunks(self, path: str = None, chunk_size: int = 100) -> Iterable[Tuple[int, Mapping[str, Any]]]:
+        """
+        Reads the downloaded binary data, using lines chunks, set by `chunk_size`.
+        @ path: string - the path to the downloaded temporarily binary data.
+        @ chunk_size: int - the number of lines to read at a time, default: 100 lines / time.
+        """
+        try:
+            with open(path, "r", encoding="utf-8") as data:
+                chunks = pd.read_csv(data, chunksize=chunk_size, iterator=True, dialect="unix")
+                for chunk in chunks:
+                    chunk = chunk.replace({nan: None}).to_dict(orient="records")
+                    for n, row in enumerate(chunk, 1):
+                        yield n, row
+        except pd.errors.EmptyDataError as e:
+            self.logger.info(f"Empty data received. {e}")
+            yield from []
+        except IOError as ioe:
+            raise TmpFileIOError(f"The IO/Error occurred while reading tmp data. Called: {path}. Stream: {self.name}", ioe)
+        finally:
+            # remove binary tmp file, after data is read
+            os.remove(path)
 
     def abort_job(self, url: str):
         data = {"state": "Aborted"}
@@ -292,7 +329,6 @@ def request_params(
 
         if self.primary_key and self.name not in UNSUPPORTED_FILTERING_STREAMS:
             query += f"ORDER BY {self.primary_key} ASC LIMIT {self.page_size}"
-
         return {"q": query}
 
     def read_records(
@@ -325,7 +361,7 @@ def read_records(
 
             count = 0
             record: Mapping[str, Any] = {}
-            for count, record in self.download_data(url=job_full_url):
+            for count, record in self.read_with_chunks(self.download_data(url=job_full_url)):
                 yield record
             self.delete_job(url=job_full_url)
 
airbyte-integrations/connectors/source-salesforce/unit_tests/api_test.py

+9 −8

@@ -2,6 +2,7 @@
 # Copyright (c) 2021 Airbyte, Inc., all rights reserved.
 #
 
+
 import csv
 import io
 import re
@@ -71,7 +72,7 @@ def test_stream_has_no_state_bulk_api_should_be_used(stream_config, stream_api):
     assert isinstance(stream, BulkSalesforceStream)
 
 
-@pytest.mark.parametrize("item_number", [0, 15, 2000, 2324, 193434])
+@pytest.mark.parametrize("item_number", [0, 15, 2000, 2324, 3000])
 def test_bulk_sync_pagination(item_number, stream_config, stream_api):
     stream: BulkIncrementalSalesforceStream = generate_stream("Account", stream_config, stream_api)
     test_ids = [i for i in range(1, item_number)]
@@ -203,12 +204,12 @@ def test_download_data_filter_null_bytes(stream_config, stream_api):
 
     with requests_mock.Mocker() as m:
         m.register_uri("GET", f"{job_full_url}/results", content=b"\x00")
-        res = list(stream.download_data(url=job_full_url))
+        res = list(stream.read_with_chunks(stream.download_data(url=job_full_url)))
         assert res == []
 
         m.register_uri("GET", f"{job_full_url}/results", content=b'"Id","IsDeleted"\n\x00"0014W000027f6UwQAI","false"\n\x00\x00')
-        res = list(stream.download_data(url=job_full_url))
-        assert res == [(1, {"Id": "0014W000027f6UwQAI", "IsDeleted": "false"})]
+        res = list(stream.read_with_chunks(stream.download_data(url=job_full_url)))
+        assert res == [(1, {"Id": "0014W000027f6UwQAI", "IsDeleted": False})]
 
 
 def test_check_connection_rate_limit(stream_config):
@@ -406,9 +407,9 @@ def test_csv_reader_dialect_unix():
     url = "https://fake-account.salesforce.com/services/data/v52.0/jobs/query/7504W00000bkgnpQAA"
 
     data = [
-        {"Id": "1", "Name": '"first_name" "last_name"'},
-        {"Id": "2", "Name": "'" + 'first_name"\n' + "'" + 'last_name\n"'},
-        {"Id": "3", "Name": "first_name last_name"},
+        {"Id": 1, "Name": '"first_name" "last_name"'},
+        {"Id": 2, "Name": "'" + 'first_name"\n' + "'" + 'last_name\n"'},
+        {"Id": 3, "Name": "first_name last_name"},
     ]
 
     with io.StringIO("", newline="") as csvfile:
@@ -420,7 +421,7 @@
 
     with requests_mock.Mocker() as m:
         m.register_uri("GET", url + "/results", text=text)
-        result = [dict(i[1]) for i in stream.download_data(url)]
+        result = [dict(i[1]) for i in stream.read_with_chunks(stream.download_data(url))]
         assert result == data
 
 
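
The updated expectations in these tests ("false" becomes `False`, string ids become integers) follow from replacing `csv.DictReader` with pandas: `DictReader` returns every field as a string, while the pandas CSV parser infers column dtypes. A quick standalone check of that behaviour (illustrative only, not part of the test suite):

```python
import io

import pandas as pd

frame = pd.read_csv(io.StringIO('"Id","IsDeleted"\n"0014W000027f6UwQAI","false"\n'))
print(frame["Id"].tolist(), frame["IsDeleted"].tolist())  # ['0014W000027f6UwQAI'] [False]
```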

airbyte-integrations/connectors/source-salesforce/unit_tests/test_memory.py

@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2021 Airbyte, Inc., all rights reserved.
+#
+
+
+import tracemalloc
+
+import pytest
+import requests_mock
+from conftest import generate_stream
+from source_salesforce.streams import BulkIncrementalSalesforceStream
+
+
+@pytest.mark.parametrize(
+    "n_records, first_size, first_peak",
+    (
+        (1000, 0.4, 1),
+        (10000, 1, 2),
+        (100000, 4, 7),
+        (200000, 7, 12),
+    ),
+    ids=[
+        "1k records",
+        "10k records",
+        "100k records",
+        "200k records",
+    ],
+)
+def test_memory_download_data(stream_config, stream_api, n_records, first_size, first_peak):
+    job_full_url: str = "https://fase-account.salesforce.com/services/data/v52.0/jobs/query/7504W00000bkgnpQAA"
+    stream: BulkIncrementalSalesforceStream = generate_stream("Account", stream_config, stream_api)
+    content = b'"Id","IsDeleted"'
+    for _ in range(n_records):
+        content += b'"0014W000027f6UwQAI","false"\n'
+
+    with requests_mock.Mocker() as m:
+        m.register_uri("GET", f"{job_full_url}/results", content=content)
+        tracemalloc.start()
+        for x in stream.read_with_chunks(stream.download_data(url=job_full_url)):
+            pass
+        fs, fp = tracemalloc.get_traced_memory()
+        first_size_in_mb, first_peak_in_mb = fs / 1024**2, fp / 1024**2
+
+        assert first_size_in_mb < first_size
+        assert first_peak_in_mb < first_peak
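
The new test bounds peak memory with `tracemalloc`. The same measurement pattern, reduced to a standalone sketch (the iterator and printed units are arbitrary and purely illustrative):

```python
import tracemalloc


def consume(records):
    # Drain an iterator without keeping references, mimicking how the test drains read_with_chunks.
    for _ in records:
        pass


tracemalloc.start()
consume(iter(range(1_000_000)))
current, peak = tracemalloc.get_traced_memory()
tracemalloc.stop()
print(f"current={current / 1024 ** 2:.2f} MiB, peak={peak / 1024 ** 2:.2f} MiB")
```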

docs/integrations/sources/salesforce.md

+1

@@ -122,6 +122,7 @@ Now that you have set up the Salesforce source connector, check out the followin
 
 | Version | Date | Pull Request | Subject |
 |:--------|:-----------| :--- |:---------------------------------------------------------------------------------------------------------------------------------|
+| 1.0.3 | 2022-04-04 | [11692](https://github.com/airbytehq/airbyte/pull/11692) | Optimised memory usage for `BULK` API calls |
 | 1.0.2 | 2022-03-01 | [10751](https://github.com/airbytehq/airbyte/pull/10751) | Fix broken link anchor in connector configuration |
 | 1.0.1 | 2022-02-27 | [10679](https://github.com/airbytehq/airbyte/pull/10679) | Reorganize input parameter order on the UI |
 | 1.0.0 | 2022-02-27 | [10516](https://github.com/airbytehq/airbyte/pull/10516) | Speed up schema discovery by using parallelism |
