Skip to content

Commit 5378cef

Browse files
maxi297octavia-squidington-iiibrianjlai
authored
chore: remove declarative stream (#707)
Co-authored-by: octavia-squidington-iii <[email protected]> Co-authored-by: brianjlai <[email protected]>
1 parent e5a1fc2 commit 5378cef

35 files changed

+1792
-3938
lines changed

airbyte_cdk/legacy/sources/declarative/incremental/per_partition_cursor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -146,8 +146,10 @@ def set_initial_state(self, stream_state: StreamState) -> None:
146146
if "state" in stream_state:
147147
self._state_to_migrate_from = stream_state["state"]
148148

149-
# Set parent state for partition routers based on parent streams
150-
self._partition_router.set_initial_state(stream_state)
149+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
150+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
151+
# We are still keeping this line as a comment to be explicit about the past behavior.
152+
# self._partition_router.set_initial_state(stream_state)
151153

152154
def observe(self, stream_slice: StreamSlice, record: Record) -> None:
153155
self._cursor_per_partition[self._to_partition_key(stream_slice.partition)].observe(

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -704,7 +704,7 @@ def _group_streams(
704704
stream_slicer=declarative_stream.retriever.stream_slicer,
705705
slice_limit=self._limits.max_slices
706706
if self._limits
707-
else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
707+
else None,  # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending on how we build create_default_stream, this may be needed later
708708
)
709709
else:
710710
if (
@@ -773,7 +773,7 @@ def _group_streams(
773773
declarative_stream.retriever.stream_slicer,
774774
slice_limit=self._limits.max_slices
775775
if self._limits
776-
else None, # technically not needed because create_declarative_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending how we build create_default_stream, this may be needed later
776+
else None,  # technically not needed because create_default_stream() -> create_simple_retriever() will apply the decorator. But for consistency and depending on how we build create_default_stream, this may be needed later
777777
)
778778

779779
final_state_cursor = FinalStateCursor(

airbyte_cdk/sources/declarative/incremental/concurrent_partition_cursor.py

Lines changed: 57 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@
1111
from datetime import timedelta
1212
from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional
1313

14+
from airbyte_cdk.models import (
15+
AirbyteStateBlob,
16+
AirbyteStateMessage,
17+
AirbyteStateType,
18+
AirbyteStreamState,
19+
StreamDescriptor,
20+
)
1421
from airbyte_cdk.sources.connector_state_manager import ConnectorStateManager
1522
from airbyte_cdk.sources.declarative.incremental.global_substream_cursor import (
1623
Timer,
@@ -48,7 +55,7 @@ class ConcurrentPerPartitionCursor(Cursor):
4855
Manages state per partition when a stream has many partitions, preventing data loss or duplication.
4956
5057
Attributes:
51-
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000).
58+
DEFAULT_MAX_PARTITIONS_NUMBER (int): Maximum number of partitions to retain in memory (default is 10,000). This limit needs to be higher than the number of threads we might enqueue (which is represented by ThreadPoolManager.DEFAULT_MAX_QUEUE_SIZE). Otherwise, partitions that have been generated and submitted to the ThreadPool could be deleted from the ConcurrentPerPartitionCursor before they are closed, and closing them would raise a KeyError.
5259
5360
- **Partition Limitation Logic**
5461
Ensures the number of tracked partitions does not exceed the specified limit to prevent memory overuse. Oldest partitions are removed when the limit is reached.
@@ -128,6 +135,7 @@ def __init__(
128135

129136
# FIXME this is a temporary field for the duration of the migration from declarative cursors to concurrent ones
130137
self._attempt_to_create_cursor_if_not_provided = attempt_to_create_cursor_if_not_provided
138+
self._synced_some_data = False
131139

132140
@property
133141
def cursor_field(self) -> CursorField:
@@ -168,8 +176,8 @@ def close_partition(self, partition: Partition) -> None:
168176
with self._lock:
169177
self._semaphore_per_partition[partition_key].acquire()
170178
if not self._use_global_cursor:
171-
self._cursor_per_partition[partition_key].close_partition(partition=partition)
172179
cursor = self._cursor_per_partition[partition_key]
180+
cursor.close_partition(partition=partition)
173181
if (
174182
partition_key in self._partitions_done_generating_stream_slices
175183
and self._semaphore_per_partition[partition_key]._value == 0
@@ -213,8 +221,10 @@ def ensure_at_least_one_state_emitted(self) -> None:
213221
if not any(
214222
semaphore_item[1]._value for semaphore_item in self._semaphore_per_partition.items()
215223
):
216-
self._global_cursor = self._new_global_cursor
217-
self._lookback_window = self._timer.finish()
224+
if self._synced_some_data:
225+
# we only update those if we actually synced some data
226+
self._global_cursor = self._new_global_cursor
227+
self._lookback_window = self._timer.finish()
218228
self._parent_state = self._partition_router.get_stream_state()
219229
self._emit_state_message(throttle=False)
220230

@@ -422,9 +432,6 @@ def _set_initial_state(self, stream_state: StreamState) -> None:
422432
if stream_state.get("parent_state"):
423433
self._parent_state = stream_state["parent_state"]
424434

425-
# Set parent state for partition routers based on parent streams
426-
self._partition_router.set_initial_state(stream_state)
427-
428435
def _set_global_state(self, stream_state: Mapping[str, Any]) -> None:
429436
"""
430437
Initializes the global cursor state from the provided stream state.
@@ -458,6 +465,7 @@ def observe(self, record: Record) -> None:
458465
except ValueError:
459466
return
460467

468+
self._synced_some_data = True
461469
record_cursor = self._connector_state_converter.output_format(
462470
self._connector_state_converter.parse_value(record_cursor_value)
463471
)
@@ -541,3 +549,45 @@ def _get_cursor(self, record: Record) -> ConcurrentCursor:
541549

542550
def limit_reached(self) -> bool:
543551
return self._number_of_partitions > self.SWITCH_TO_GLOBAL_LIMIT
552+
553+
@staticmethod
554+
def get_parent_state(
555+
stream_state: Optional[StreamState], parent_stream_name: str
556+
) -> Optional[AirbyteStateMessage]:
557+
if not stream_state:
558+
return None
559+
560+
if "parent_state" not in stream_state:
561+
logger.warning(
562+
f"Trying to get_parent_state for stream `{parent_stream_name}` when there are not parent state in the state"
563+
)
564+
return None
565+
elif parent_stream_name not in stream_state["parent_state"]:
566+
logger.info(
567+
f"Could not find parent state for stream `{parent_stream_name}`. On parents available are {list(stream_state['parent_state'].keys())}"
568+
)
569+
return None
570+
571+
return AirbyteStateMessage(
572+
type=AirbyteStateType.STREAM,
573+
stream=AirbyteStreamState(
574+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
575+
stream_state=AirbyteStateBlob(stream_state["parent_state"][parent_stream_name]),
576+
),
577+
)
578+
579+
@staticmethod
580+
def get_global_state(
581+
stream_state: Optional[StreamState], parent_stream_name: str
582+
) -> Optional[AirbyteStateMessage]:
583+
return (
584+
AirbyteStateMessage(
585+
type=AirbyteStateType.STREAM,
586+
stream=AirbyteStreamState(
587+
stream_descriptor=StreamDescriptor(parent_stream_name, None),
588+
stream_state=AirbyteStateBlob(stream_state["state"]),
589+
),
590+
)
591+
if stream_state and "state" in stream_state
592+
else None
593+
)

airbyte_cdk/sources/declarative/incremental/global_substream_cursor.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -192,8 +192,10 @@ def set_initial_state(self, stream_state: StreamState) -> None:
192192
# Example: {"global_state_format_key": "global_state_format_value"}
193193
self._stream_cursor.set_initial_state(stream_state)
194194

195-
# Set parent state for partition routers based on parent streams
196-
self._partition_router.set_initial_state(stream_state)
195+
# We used to set the parent state through this method but since moving the SubstreamPartitionRouter to the
196+
# Concurrent CDK/AbstractStream, the state is passed at the __init__ stage and this does not need to be called.
197+
# We are still keeping this line as a comment to be explicit about the past behavior.
198+
# self._partition_router.set_initial_state(stream_state)
197199

198200
def _inject_lookback_into_stream_cursor(self, lookback_window: int) -> None:
199201
"""

0 commit comments

Comments
 (0)