Commit bf3aecc

Merge branch 'master' into akash/wass-algo
2 parents 37a9c41 + 68903b4 commit bf3aecc

62 files changed: +1868 additions, -643 deletions


.github/workflows/connectors_up_to_date.yml

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ on:
     inputs:
       connectors-options:
         description: "Options to pass to the 'airbyte-ci connectors' command group."
-        default: "--concurrency=10 --language=python --language=low-code"
+        default: "--concurrency=10 --language=python --language=low-code --language=manifest-only"
       auto-merge:
         description: "Whether to auto-merge the PRs created by the action."
         default: "false"

airbyte-cdk/python/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,8 @@
 # Changelog
 
+## 3.4.0
+file-based cdk: add config option to limit number of files for schema discover
+
 ## 3.3.0
 CDK: add incomplete status to availability check during read
 

airbyte-cdk/python/airbyte_cdk/sources/declarative/partition_routers/substream_partition_router.py

Lines changed: 68 additions & 32 deletions
@@ -1,16 +1,18 @@
 #
 # Copyright (c) 2023 Airbyte, Inc., all rights reserved.
 #
-
+import logging
 from dataclasses import InitVar, dataclass
 from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Mapping, Optional, Union
 
 import dpath
-from airbyte_cdk.models import AirbyteMessage, SyncMode, Type
+from airbyte_cdk.models import AirbyteMessage
+from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString
 from airbyte_cdk.sources.declarative.partition_routers.partition_router import PartitionRouter
 from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType
 from airbyte_cdk.sources.types import Config, Record, StreamSlice, StreamState
+from airbyte_cdk.utils import AirbyteTracedException
 
 if TYPE_CHECKING:
     from airbyte_cdk.sources.declarative.declarative_stream import DeclarativeStream

@@ -131,40 +133,70 @@ def stream_slices(self) -> Iterable[StreamSlice]:
             parent_field = parent_stream_config.parent_key.eval(self.config)  # type: ignore # parent_key is always casted to an interpolated string
             partition_field = parent_stream_config.partition_field.eval(self.config)  # type: ignore # partition_field is always casted to an interpolated string
             incremental_dependency = parent_stream_config.incremental_dependency
-            for parent_stream_slice in parent_stream.stream_slices(
-                sync_mode=SyncMode.full_refresh, cursor_field=None, stream_state=None
-            ):
-                parent_partition = parent_stream_slice.partition if parent_stream_slice else {}
-
-                # we need to read all records for slice to update the parent stream cursor
-                stream_slices_for_parent = []
-
-                # only stream_slice param is used in the declarative stream, stream state is set in PerPartitionCursor set_initial_state
-                for parent_record in parent_stream.read_records(
-                    sync_mode=SyncMode.full_refresh, cursor_field=None, stream_slice=parent_stream_slice, stream_state=None
-                ):
-                    # Skip non-records (eg AirbyteLogMessage)
-                    if isinstance(parent_record, AirbyteMessage):
-                        if parent_record.type == Type.RECORD:
-                            parent_record = parent_record.record.data
-                        else:
-                            continue
-                    elif isinstance(parent_record, Record):
-                        parent_record = parent_record.data
-                    try:
-                        partition_value = dpath.get(parent_record, parent_field)
-                    except KeyError:
-                        pass
-                    else:
-                        stream_slices_for_parent.append(
-                            StreamSlice(partition={partition_field: partition_value, "parent_slice": parent_partition}, cursor_slice={})
-                        )
-
-                # update the parent state, as parent stream read all record for current slice and state is already updated
-                if incremental_dependency:
-                    self._parent_state[parent_stream.name] = parent_stream.state
-
-                yield from stream_slices_for_parent
+
+            stream_slices_for_parent = []
+            previous_associated_slice = None
+
+            # read_stateless() assumes the parent is not concurrent. This is currently okay since the concurrent CDK does
+            # not support either substreams or RFR, but something that needs to be considered once we do
+            for parent_record in parent_stream.read_only_records():
+                parent_partition = None
+                parent_associated_slice = None
+                # Skip non-records (eg AirbyteLogMessage)
+                if isinstance(parent_record, AirbyteMessage):
+                    self.logger.warning(
+                        f"Parent stream {parent_stream.name} returns records of type AirbyteMessage. This SubstreamPartitionRouter is not able to checkpoint incremental parent state."
+                    )
+                    if parent_record.type == MessageType.RECORD:
+                        parent_record = parent_record.record.data
+                    else:
+                        continue
+                elif isinstance(parent_record, Record):
+                    parent_partition = parent_record.associated_slice.partition if parent_record.associated_slice else {}
+                    parent_associated_slice = parent_record.associated_slice
+                    parent_record = parent_record.data
+                elif not isinstance(parent_record, Mapping):
+                    # The parent_record should only take the form of a Record, AirbyteMessage, or Mapping. Anything else is invalid
+                    raise AirbyteTracedException(message=f"Parent stream returned records as invalid type {type(parent_record)}")
+                try:
+                    partition_value = dpath.get(parent_record, parent_field)
+                except KeyError:
+                    pass
+                else:
+                    if incremental_dependency:
+                        if previous_associated_slice is None:
+                            previous_associated_slice = parent_associated_slice
+                        elif previous_associated_slice != parent_associated_slice:
+                            # Update the parent state, as parent stream read all record for current slice and state
+                            # is already updated.
+                            #
+                            # When the associated slice of the current record of the parent stream changes, this
+                            # indicates the parent stream has finished processing the current slice and has moved onto
+                            # the next. When this happens, we should update the partition router's current state and
+                            # flush the previous set of collected records and start a new set
+                            #
+                            # Note: One tricky aspect to take note of here is that parent_stream.state will actually
+                            # fetch state of the stream of the previous record's slice NOT the current record's slice.
+                            # This is because in the retriever, we only update stream state after yielding all the
+                            # records. And since we are in the middle of the current slice, parent_stream.state is
+                            # still set to the previous state.
+                            self._parent_state[parent_stream.name] = parent_stream.state
+                            yield from stream_slices_for_parent
+
+                            # Reset stream_slices_for_parent after we've flushed parent records for the previous parent slice
+                            stream_slices_for_parent = []
+                            previous_associated_slice = parent_associated_slice
+                    stream_slices_for_parent.append(
+                        StreamSlice(
+                            partition={partition_field: partition_value, "parent_slice": parent_partition or {}}, cursor_slice={}
+                        )
+                    )
+
+            # A final parent state update and yield of records is needed, so we don't skip records for the final parent slice
+            if incremental_dependency:
+                self._parent_state[parent_stream.name] = parent_stream.state
+
+            yield from stream_slices_for_parent
 
     def set_initial_state(self, stream_state: StreamState) -> None:
         """

@@ -215,3 +247,7 @@ def get_stream_state(self) -> Optional[Mapping[str, StreamState]]:
         }
         """
         return self._parent_state
+
+    @property
+    def logger(self) -> logging.Logger:
+        return logging.getLogger("airbyte.SubstreamPartitionRouter")
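
To make the new control flow above easier to follow, here is a minimal, self-contained sketch of the flush-on-slice-change pattern the rewritten stream_slices() relies on: partition values are buffered per parent slice, and the buffer is flushed (together with a parent-state update in the real router) whenever the parent's associated slice changes, plus once at the end. The ParentRecord type and the iterate_partitions function below are illustrative stand-ins, not CDK classes.

from dataclasses import dataclass
from typing import Any, Dict, Iterable, List, Optional


@dataclass(frozen=True)
class ParentRecord:
    data: Dict[str, Any]
    associated_slice: Optional[str]  # identifier of the parent slice this record belongs to


def iterate_partitions(records: Iterable[ParentRecord], parent_key: str) -> Iterable[List[Dict[str, Any]]]:
    """Buffer partition values per parent slice and flush the buffer whenever the slice changes."""
    buffered: List[Dict[str, Any]] = []
    previous_slice: Optional[str] = None
    for record in records:
        if previous_slice is None:
            previous_slice = record.associated_slice
        elif record.associated_slice != previous_slice:
            # The parent has moved on to a new slice, so its state now covers the previous
            # slice; in the router this is where self._parent_state would be updated.
            yield buffered
            buffered = []
            previous_slice = record.associated_slice
        buffered.append({"partition": record.data[parent_key], "parent_slice": record.associated_slice})
    # Final flush so partitions from the last parent slice are not dropped.
    yield buffered


records = [
    ParentRecord({"id": 1}, "2024-01"),
    ParentRecord({"id": 2}, "2024-01"),
    ParentRecord({"id": 3}, "2024-02"),
]
for batch in iterate_partitions(records, "id"):
    print(batch)  # first the two "2024-01" partitions, then the "2024-02" partition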

airbyte-cdk/python/airbyte_cdk/sources/file_based/config/file_based_stream_config.py

Lines changed: 6 additions & 0 deletions
@@ -64,6 +64,12 @@ class FileBasedStreamConfig(BaseModel):
         description="When enabled, syncs will not validate or structure records against the stream's schema.",
         default=False,
     )
+    recent_n_files_to_read_for_schema_discovery: Optional[int] = Field(
+        title="Files To Read For Schema Discover",
+        description="The number of resent files which will be used to discover the schema for this stream.",
+        default=None,
+        gt=0,
+    )
 
     @validator("input_schema", pre=True)
     def validate_input_schema(cls, v: Optional[str]) -> Optional[str]:
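
For context, this is roughly how the new limit might be set in a file-based stream's configuration. The stream entry below is hypothetical (field names other than recent_n_files_to_read_for_schema_discovery follow the model above, but the exact required fields depend on the connector); because the field is declared with gt=0, a zero or negative value is rejected at validation time.

# Hypothetical stream entry for a file-based source config.
stream_config = {
    "name": "invoices",
    "globs": ["invoices/*.csv"],
    "format": {"filetype": "csv"},
    # Only the 10 most recent files will be read when discovering this stream's schema.
    "recent_n_files_to_read_for_schema_discovery": 10,
}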

airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/abstract_file_based_stream.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ class AbstractFileBasedStream(Stream):
       files in the stream.
     - A DiscoveryPolicy that controls the number of concurrent requests sent to the source
       during discover, and the number of files used for schema discovery.
-    - A dictionary of FileType:Parser that holds all of the file types that can be handled
+    - A dictionary of FileType:Parser that holds all the file types that can be handled
       by the stream.
     """
 

@@ -70,7 +70,7 @@ def list_files(self) -> List[RemoteFile]:
         List all files that belong to the stream.
 
         The output of this method is cached so we don't need to list the files more than once.
-        This means we won't pick up changes to the files during a sync. This meethod uses the
+        This means we won't pick up changes to the files during a sync. This method uses the
         get_files method which is implemented by the concrete stream class.
         """
         return list(self.get_files())

airbyte-cdk/python/airbyte_cdk/sources/file_based/stream/default_file_based_stream.py

Lines changed: 30 additions & 20 deletions
@@ -191,30 +191,40 @@ def _get_raw_json_schema(self) -> JsonSchema:
             return schemaless_schema
         else:
             files = self.list_files()
-            total_n_files = len(files)
-
-            if total_n_files == 0:
-                self.logger.warning(msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream.")
-                return schemaless_schema
-
-            max_n_files_for_schema_inference = self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
-            if total_n_files > max_n_files_for_schema_inference:
-                # Use the most recent files for schema inference, so we pick up schema changes during discovery.
-                files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:max_n_files_for_schema_inference]
-                self.logger.warn(
-                    msg=f"Refusing to infer schema for all {total_n_files} files; using {max_n_files_for_schema_inference} files."
-                )
-
-            inferred_schema = self.infer_schema(files)
-
-            if not inferred_schema:
-                raise InvalidSchemaError(
-                    FileBasedSourceError.INVALID_SCHEMA_ERROR,
-                    details=f"Empty schema. Please check that the files are valid for format {self.config.format}",
-                    stream=self.name,
-                )
-
-            schema = {"type": "object", "properties": inferred_schema}
+            first_n_files = len(files)
+
+            if self.config.recent_n_files_to_read_for_schema_discovery:
+                self.logger.info(
+                    msg=(
+                        f"Only first {self.config.recent_n_files_to_read_for_schema_discovery} files will be used to infer schema "
+                        f"for stream {self.name} due to limitation in config."
+                    )
+                )
+                first_n_files = self.config.recent_n_files_to_read_for_schema_discovery
+
+            if first_n_files == 0:
+                self.logger.warning(msg=f"No files were identified in the stream {self.name}. Setting default schema for the stream.")
+                return schemaless_schema
+
+            max_n_files_for_schema_inference = self._discovery_policy.get_max_n_files_for_schema_inference(self.get_parser())
+
+            if first_n_files > max_n_files_for_schema_inference:
+                # Use the most recent files for schema inference, so we pick up schema changes during discovery.
+                self.logger.warning(msg=f"Refusing to infer schema for {first_n_files} files; using {max_n_files_for_schema_inference} files.")
+                first_n_files = max_n_files_for_schema_inference
+
+            files = sorted(files, key=lambda x: x.last_modified, reverse=True)[:first_n_files]
+
+            inferred_schema = self.infer_schema(files)
+
+            if not inferred_schema:
+                raise InvalidSchemaError(
+                    FileBasedSourceError.INVALID_SCHEMA_ERROR,
+                    details=f"Empty schema. Please check that the files are valid for format {self.config.format}",
+                    stream=self.name,
+                )
+
+            schema = {"type": "object", "properties": inferred_schema}
 
         return schema
 
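
The selection logic above reduces to: cap the file count (via the new config option and/or the discovery policy), then keep only the most recently modified files. A standalone sketch follows, with RemoteFile as an illustrative stand-in for the CDK's remote file type.

from dataclasses import dataclass
from datetime import datetime
from typing import List


@dataclass
class RemoteFile:
    uri: str
    last_modified: datetime


def select_files_for_schema_inference(files: List[RemoteFile], first_n_files: int) -> List[RemoteFile]:
    # Newest files first, so schema changes in recently added files are picked up during discovery.
    return sorted(files, key=lambda f: f.last_modified, reverse=True)[:first_n_files]


files = [
    RemoteFile("a.csv", datetime(2024, 1, 1)),
    RemoteFile("b.csv", datetime(2024, 3, 1)),
    RemoteFile("c.csv", datetime(2024, 2, 1)),
]
print([f.uri for f in select_files_for_schema_inference(files, 2)])  # ['b.csv', 'c.csv']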

airbyte-cdk/python/airbyte_cdk/sources/streams/core.py

Lines changed: 32 additions & 5 deletions
@@ -10,7 +10,7 @@
 from typing import Any, Dict, Iterable, List, Mapping, MutableMapping, Optional, Tuple, Union
 
 import airbyte_cdk.sources.utils.casing as casing
-from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, SyncMode
+from airbyte_cdk.models import AirbyteMessage, AirbyteStream, ConfiguredAirbyteStream, DestinationSyncMode, SyncMode
 from airbyte_cdk.models import Type as MessageType
 from airbyte_cdk.sources.streams.checkpoint import (
     CheckpointMode,

@@ -24,7 +24,7 @@
 
 # list of all possible HTTP methods which can be used for sending of request bodies
 from airbyte_cdk.sources.utils.schema_helpers import InternalConfig, ResourceSchemaLoader
-from airbyte_cdk.sources.utils.slice_logger import SliceLogger
+from airbyte_cdk.sources.utils.slice_logger import DebugSliceLogger, SliceLogger
 from airbyte_cdk.sources.utils.transform import TransformConfig, TypeTransformer
 from deprecated import deprecated
 

@@ -156,6 +156,7 @@ def read( # type: ignore # ignoring typing for ConnectorStateManager because o
         except AttributeError:
             pass
 
+        should_checkpoint = bool(state_manager)
         checkpoint_reader = self._get_checkpoint_reader(
             logger=logger, cursor_field=cursor_field, sync_mode=sync_mode, stream_state=stream_state
         )

@@ -193,25 +194,51 @@ def read( # type: ignore # ignoring typing for ConnectorStateManager because o
 
                     checkpoint_interval = self.state_checkpoint_interval
                     checkpoint = checkpoint_reader.get_checkpoint()
-                    if checkpoint_interval and record_counter % checkpoint_interval == 0 and checkpoint is not None:
+                    if should_checkpoint and checkpoint_interval and record_counter % checkpoint_interval == 0 and checkpoint is not None:
                         airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager)
                         yield airbyte_state_message
 
                     if internal_config.is_limit_reached(record_counter):
                         break
             self._observe_state(checkpoint_reader)
             checkpoint_state = checkpoint_reader.get_checkpoint()
-            if checkpoint_state is not None:
+            if should_checkpoint and checkpoint_state is not None:
                 airbyte_state_message = self._checkpoint_state(checkpoint_state, state_manager=state_manager)
                 yield airbyte_state_message
 
             next_slice = checkpoint_reader.next()
 
         checkpoint = checkpoint_reader.get_checkpoint()
-        if checkpoint is not None:
+        if should_checkpoint and checkpoint is not None:
             airbyte_state_message = self._checkpoint_state(checkpoint, state_manager=state_manager)
             yield airbyte_state_message
 
+    def read_only_records(self, state: Optional[Mapping[str, Any]] = None) -> Iterable[StreamData]:
+        """
+        Helper method that performs a read on a stream with an optional state and emits records. If the parent stream supports
+        incremental, this operation does not update the stream's internal state (if it uses the modern state setter/getter)
+        or emit state messages.
+        """
+
+        configured_stream = ConfiguredAirbyteStream(
+            stream=AirbyteStream(
+                name=self.name,
+                json_schema={},
+                supported_sync_modes=[SyncMode.full_refresh, SyncMode.incremental],
+            ),
+            sync_mode=SyncMode.incremental if state else SyncMode.full_refresh,
+            destination_sync_mode=DestinationSyncMode.append,
+        )
+
+        yield from self.read(
+            configured_stream=configured_stream,
+            logger=self.logger,
+            slice_logger=DebugSliceLogger(),
+            stream_state=dict(state) if state else {},  # read() expects MutableMapping instead of Mapping which is used more often
+            state_manager=None,
+            internal_config=InternalConfig(),
+        )
+
     @abstractmethod
     def read_records(
         self,
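
A hedged usage sketch of the new helper: any concrete Stream can be read without checkpointing via read_only_records(), which is what SubstreamPartitionRouter relies on above. The Colors stream below is a made-up example assuming a CDK version that includes this commit; because read_only_records() passes state_manager=None, the new should_checkpoint guard keeps read() from attempting to emit state messages.

from typing import Any, Iterable, Mapping, Optional

from airbyte_cdk.models import SyncMode
from airbyte_cdk.sources.streams import Stream


class Colors(Stream):
    primary_key = None

    def read_records(
        self,
        sync_mode: SyncMode,
        cursor_field: Optional[Any] = None,
        stream_slice: Optional[Mapping[str, Any]] = None,
        stream_state: Optional[Mapping[str, Any]] = None,
    ) -> Iterable[Mapping[str, Any]]:
        # A trivial full-refresh stream with three in-memory records.
        yield from ({"color": c} for c in ("red", "green", "blue"))


for record in Colors().read_only_records():
    print(record)  # {'color': 'red'}, {'color': 'green'}, {'color': 'blue'}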
