
✨ [source-google-sheets] add row_batch_size as an input parameter with higher increase #35404


Merged
@@ -10,7 +10,7 @@ data:
connectorSubtype: file
connectorType: source
definitionId: 71607ba1-c0ac-4799-8049-7f4b90dd50f7
-dockerImageTag: 0.5.0
+dockerImageTag: 0.5.1
dockerRepository: airbyte/source-google-sheets
documentationUrl: https://docs.airbyte.com/integrations/sources/google-sheets
githubIssueLabel: source-google-sheets
@@ -3,7 +3,7 @@ requires = [ "poetry-core>=1.0.0",]
build-backend = "poetry.core.masonry.api"

[tool.poetry]
version = "0.5.0"
version = "0.5.1"
name = "source-google-sheets"
description = "Source implementation for Google Sheets."
authors = [ "Airbyte <[email protected]>",]
@@ -21,7 +21,7 @@ class Backoff:
@classmethod
def increase_row_batch_size(cls, details):
if details["exception"].status_code == status_codes.TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
-cls.row_batch_size = cls.row_batch_size + 10
+cls.row_batch_size = cls.row_batch_size + 100
logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")

@staticmethod
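
For context, a minimal sketch of the escalation behavior the changed line produces, assuming the default batch size of 200 and the 1000-row cap visible in the condition above. `Backoff` and `increase_row_batch_size` mirror the diff; `TOO_MANY_REQUESTS` and `FakeRateLimitError` are invented stand-ins for the connector's `status_codes` constant and the `requests.HTTPError` used in the tests:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

TOO_MANY_REQUESTS = 429  # stand-in for status_codes.TOO_MANY_REQUESTS


class Backoff:
    row_batch_size = 200  # seeded from config.get("batch_size", 200) in _read

    @classmethod
    def increase_row_batch_size(cls, details):
        # Grow by 100 rows per rate-limit hit (was 10 before this PR), capped at 1000.
        if details["exception"].status_code == TOO_MANY_REQUESTS and cls.row_batch_size < 1000:
            cls.row_batch_size = cls.row_batch_size + 100
            logger.info(f"Increasing number of records fetching due to rate limits. Current value: {cls.row_batch_size}")


class FakeRateLimitError(Exception):
    status_code = TOO_MANY_REQUESTS  # invented helper; the real code raises requests.HTTPError


for _ in range(12):
    Backoff.increase_row_batch_size({"exception": FakeRateLimitError()})
print(Backoff.row_batch_size)  # 1000 -- grows 200 -> 300 -> ... -> 1000, then holds
```

With the previous +10 step, climbing from the default 200 to the 1000-row cap took 80 throttled requests; the +100 step gets there in 8, which is the "higher increase" the PR title refers to.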
@@ -149,6 +149,7 @@ def _read(
catalog: ConfiguredAirbyteCatalog,
) -> Generator[AirbyteMessage, None, None]:
client = GoogleSheetsClient(self.get_credentials(config))
+client.Backoff.row_batch_size = config.get("batch_size", 200)

sheet_to_column_name = Helpers.parse_sheet_and_column_names_from_catalog(catalog)
stream_name_to_stream = {stream.stream.name: stream for stream in catalog.streams}
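
A hedged illustration of the wiring above: the new line seeds the class-level batch size from the optional `batch_size` config key before the read begins, falling back to 200 when the key is absent. The config dicts below are invented for illustration:

```python
# Invented example configs; only "batch_size" matters for this change.
config_with_batch = {"spreadsheet_id": "<your-spreadsheet-link>", "batch_size": 500}
config_without_batch = {"spreadsheet_id": "<your-spreadsheet-link>"}

# Mirrors the added line in _read.
assert config_with_batch.get("batch_size", 200) == 500
assert config_without_batch.get("batch_size", 200) == 200
```

Because `row_batch_size` lives on the `Backoff` class, the configured value also becomes the floor from which the +100 escalation grows toward the 1000-row cap.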
@@ -8,6 +8,21 @@ connectionSpecification:
- credentials
additionalProperties: true
properties:
+batch_size:
+  type: integer
+  title: Row Batch Size
+  description: >-
+    Default value is 200.
+    An integer representing the row batch size for each request sent to the Google Sheets API.
+    The row batch size is the number of rows processed from the Google Sheet per request; for example, the default value of 200
+    processes rows 1-201, then 201-401, and so on.
+    According to the <a href='https://developers.google.com/sheets/api/limits'>Google Sheets API limits documentation</a>,
+    it is possible to send up to 300 requests per minute, but each individual request must complete in under 180 seconds,
+    or it returns a timeout error. With this in mind, consider your network speed and the
+    number of columns in the Google Sheet when choosing a batch_size value.
+    The default value should cover most cases, but if the Google Sheet has 100,000 or more records,
+    consider increasing the batch_size value.
+  default: 200
spreadsheet_id:
type: string
title: Spreadsheet Link
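
To make the sizing guidance in the description concrete, a back-of-the-envelope calculation (the 300 requests/minute figure comes from the limits page linked above; the sheet size is invented):

```python
QUOTA_PER_MINUTE = 300  # per the Google Sheets API limits page linked above

for rows, batch in [(100_000, 200), (100_000, 500), (100_000, 1000)]:
    requests_needed = -(-rows // batch)  # ceiling division
    minutes_at_quota = requests_needed / QUOTA_PER_MINUTE
    print(f"{rows:,} rows at batch_size={batch}: {requests_needed} requests (~{minutes_at_quota:.1f} min of quota)")

# 100,000 rows at batch_size=200: 500 requests (~1.7 min of quota)
# 100,000 rows at batch_size=500: 200 requests (~0.7 min of quota)
# 100,000 rows at batch_size=1000: 100 requests (~0.3 min of quota)
```

Larger batches trade fewer requests against the 180-second per-request processing ceiling, which is why the description asks you to factor in network speed and column count.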
@@ -24,8 +24,8 @@ def test_backoff_increase_row_batch_size():
e = requests.HTTPError("error")
e.status_code = 429
client.Backoff.increase_row_batch_size({"exception": e})
-assert client.Backoff.row_batch_size == 210
-assert client._create_range("spreadsheet_id", 0) == "spreadsheet_id!0:210"
+assert client.Backoff.row_batch_size == 300
+assert client._create_range("spreadsheet_id", 0) == "spreadsheet_id!0:300"
client.Backoff.row_batch_size = 1000
client.Backoff.increase_row_batch_size({"exception": e})
assert client.Backoff.row_batch_size == 1000
@@ -57,12 +57,12 @@ def test_client_get_values_on_backoff(caplog):
e = requests.HTTPError("error")
e.status_code = 429
client_google_sheets.Backoff.increase_row_batch_size({"exception": e})
-assert client_google_sheets.Backoff.row_batch_size == 220
+assert client_google_sheets.Backoff.row_batch_size == 310
client_google_sheets.get_values(
sheet="sheet",
row_cursor=0,
spreadsheetId="spreadsheet_id",
majorDimension="ROWS",
)

assert "Fetching range sheet!0:220" in caplog.text
assert "Fetching range sheet!0:310" in caplog.text
docs/integrations/sources/google-sheets.md (14 changes: 12 additions & 2 deletions)
@@ -97,8 +97,18 @@ If your spreadsheet is viewable by anyone with its link, no further action is ne
- To authenticate your Google account via OAuth, select **Authenticate via Google (OAuth)** from the dropdown and enter your Google application's client ID, client secret, and refresh token.
<!-- /env:oss -->
6. For **Spreadsheet Link**, enter the link to the Google spreadsheet. To get the link, go to the Google spreadsheet you want to sync, click **Share** in the top right corner, and click **Copy Link**.
-7. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (ie Postgres, MySQL). Set to false by default.
-8. Click **Set up source** and wait for the tests to complete.
+7. For **Row Batch Size**, enter an integer representing the row batch size used when processing a Google Sheet. The default value is 200.
+   The batch size is the number of rows fetched per request sent to the Google Sheets API;
+   for example, the default value of 200 processes rows 1-201, then 201-401, and so on.
+   According to the [Google Sheets API limits documentation](https://developers.google.com/sheets/api/limits),
+   it is possible to send up to 300 requests per minute, but each individual request must complete in under 180 seconds,
+   or it returns a timeout error. With this in mind, consider your network speed and the
+   number of columns in the Google Sheet when choosing a batch_size value.
+   The default value should cover most cases, but if the Google Sheet has 100,000 or more records,
+   consider increasing the batch_size value.
+8. (Optional) You may enable the option to **Convert Column Names to SQL-Compliant Format**. Enabling this option will allow the connector to convert column names to a standardized, SQL-friendly format. For example, a column name of `Café Earnings 2022` will be converted to `cafe_earnings_2022`. We recommend enabling this option if your target destination is SQL-based (e.g., Postgres, MySQL). Set to false by default.
+9. Click **Set up source** and wait for the tests to complete.

<HideInUI>
