-
Notifications
You must be signed in to change notification settings - Fork 148
Add Iceberg tag time travel support #4211
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1955,6 +1955,7 @@ class TimeTravelConfig(NamedTuple): | |
| timestamp: Optional[str] = None | ||
| timestamp_type: Optional[str] = None | ||
| stream: Optional[str] = None | ||
| iceberg_tag: Optional[str] = None | ||
|
|
||
| @staticmethod | ||
| def validate_and_normalize_params( | ||
|
|
@@ -1964,6 +1965,7 @@ def validate_and_normalize_params( | |
| timestamp: Optional[Union[str, datetime.datetime]] = None, | ||
| timestamp_type: Optional[Union[str, "TimestampTimeZone"]] = None, | ||
| stream: Optional[str] = None, | ||
| iceberg_tag: Optional[str] = None, | ||
| ) -> Optional["TimeTravelConfig"]: | ||
| """ | ||
| Validates and normalizes time travel parameters. | ||
|
|
@@ -1986,7 +1988,8 @@ def validate_and_normalize_params( | |
| ValueError: If parameters are invalid. | ||
| """ | ||
| time_travel_arg_count = sum( | ||
| arg is not None for arg in (statement, offset, timestamp, stream) | ||
| arg is not None | ||
| for arg in (statement, offset, timestamp, stream, iceberg_tag) | ||
| ) | ||
|
|
||
| # Validate mode | ||
|
|
@@ -2003,10 +2006,16 @@ def validate_and_normalize_params( | |
| f"Invalid time travel mode: {time_travel_mode}. Must be 'at' or 'before'." | ||
| ) | ||
|
|
||
| # Validate iceberg_tag can only be used with 'at' mode | ||
| if iceberg_tag is not None and time_travel_mode.lower() != "at": | ||
| raise ValueError( | ||
| "Iceberg tag time travel can only be used with time_travel_mode='at', not 'before'." | ||
| ) | ||
|
|
||
| # Validate exactly one parameter is provided | ||
| if time_travel_arg_count != 1: | ||
| raise ValueError( | ||
| "Exactly one of 'statement', 'offset', 'timestamp', or 'stream' must be provided." | ||
| "Exactly one of 'statement', 'offset', 'timestamp', 'stream', or 'iceberg_tag' must be provided." | ||
| ) | ||
|
|
||
| # Normalize timestamp | ||
|
|
@@ -2040,6 +2049,7 @@ def validate_and_normalize_params( | |
| timestamp=normalized_timestamp, | ||
| timestamp_type=timestamp_type, | ||
| stream=stream, | ||
| iceberg_tag=iceberg_tag, | ||
| ) | ||
|
|
||
| def generate_sql_clause(self) -> str: | ||
|
|
@@ -2048,7 +2058,8 @@ def generate_sql_clause(self) -> str: | |
| Args: | ||
| config: Time travel configuration. | ||
| Returns: | ||
| SQL clause like " AT (TIMESTAMP => TO_TIMESTAMP_NTZ('...'))" | ||
| SQL clause like " AT (TIMESTAMP => TO_TIMESTAMP_NTZ('...'))" or | ||
| " AT (ICEBERG_TAG => 'tag_name')" for Iceberg tables. | ||
| """ | ||
| clause = f" {self.time_travel_mode.upper()} " | ||
|
|
||
|
|
@@ -2058,6 +2069,8 @@ def generate_sql_clause(self) -> str: | |
| clause += f"(OFFSET => {self.offset})" | ||
| elif self.stream is not None: | ||
| clause += f"(STREAM => '{self.stream}')" | ||
| elif self.iceberg_tag is not None: | ||
| clause += f"(ICEBERG_TAG => '{self.iceberg_tag}')" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm unable to see this feature in public doc. |
||
| elif self.timestamp is not None: | ||
| if self.timestamp_type is not None: | ||
| timestamp_type = self.timestamp_type.upper() | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -125,6 +125,8 @@ | |
| "TIMESTAMP": "timestamp", | ||
| "TIMESTAMP_TYPE": "timestamp_type", | ||
| "STREAM": "stream", | ||
| "ICEBERG_TAG": "iceberg_tag", | ||
| "TAG": "iceberg_tag", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. When using snowflake has its own "tag" concept, will there be future conflict under the context of dataframe reader? |
||
| } | ||
|
|
||
| READER_OPTIONS_ALIAS_MAP = { | ||
|
|
@@ -162,6 +164,10 @@ def _extract_time_travel_from_options(options: dict) -> dict: | |
| - Automatically sets time_travel_mode to 'at' | ||
| - Cannot be used with time_travel_mode='before' (raises error) | ||
| - Cannot be mixed with regular 'timestamp' option (raises error) | ||
|
|
||
| Special handling for 'TAG' or 'ICEBERG_TAG' (Spark Iceberg compatibility): | ||
| - Automatically sets time_travel_mode to 'at' (Iceberg tags only work with AT) | ||
| - Cannot be used with time_travel_mode='before' (raises error) | ||
| """ | ||
| result = {} | ||
| excluded_keys = set() | ||
|
|
@@ -183,6 +189,22 @@ def _extract_time_travel_from_options(options: dict) -> dict: | |
| result["timestamp"] = options["AS-OF-TIMESTAMP"] | ||
| excluded_keys.add("TIMESTAMP") | ||
|
|
||
| # Handle Iceberg tag option (Spark Iceberg compatibility) | ||
| tag_value = options.get("TAG") or options.get("ICEBERG_TAG") | ||
| if tag_value is not None: | ||
| if ( | ||
| "TIME_TRAVEL_MODE" in options | ||
| and options["TIME_TRAVEL_MODE"].lower() == "before" | ||
| ): | ||
| raise ValueError( | ||
| "Cannot use 'tag' option with time_travel_mode='before'. " | ||
| "Iceberg tags only work with time_travel_mode='at'." | ||
| ) | ||
| result["time_travel_mode"] = "at" | ||
| result["iceberg_tag"] = tag_value | ||
| excluded_keys.add("ICEBERG_TAG") | ||
| excluded_keys.add("TAG") | ||
|
|
||
| for option_key, param_name in _TIME_TRAVEL_OPTIONS_PARAMS_MAP.items(): | ||
| if option_key in options and option_key not in excluded_keys: | ||
| result[param_name] = options[option_key] | ||
|
|
@@ -549,6 +571,7 @@ def table( | |
| timestamp: Optional[Union[str, datetime]] = None, | ||
| timestamp_type: Optional[Union[str, TimestampTimeZone]] = None, | ||
| stream: Optional[str] = None, | ||
| iceberg_tag: Optional[str] = None, | ||
| ) -> Table: | ||
| """Returns a Table that points to the specified table. | ||
|
|
||
|
|
@@ -568,6 +591,9 @@ def table( | |
| timestamp_type: Type of timestamp interpretation ('NTZ', 'LTZ', or 'TZ'). | ||
| Can also be set via ``option("timestamp_type", "LTZ")``. | ||
| stream: Stream name for time travel. Can also be set via ``option("stream", "stream_name")``. | ||
| iceberg_tag: Iceberg snapshot tag name for time travel on Iceberg tables. | ||
| Can also be set via ``option("tag", "tag_name")`` or | ||
| ``option("iceberg_tag", "tag_name")``. Automatically sets time_travel_mode='at'. | ||
|
|
||
| Note: | ||
| Time travel options can be set either as direct parameters or via the | ||
|
|
@@ -577,6 +603,9 @@ def table( | |
| PySpark Compatibility: The ``as-of-timestamp`` option automatically sets | ||
| ``time_travel_mode="at"`` and cannot be used with ``time_travel_mode="before"``. | ||
|
|
||
| Spark Iceberg Compatibility: The ``tag`` option automatically sets | ||
| ``time_travel_mode="at"`` for Iceberg tag-based time travel. | ||
|
|
||
| Examples:: | ||
|
|
||
| # Using direct parameters | ||
|
|
@@ -591,6 +620,9 @@ def table( | |
| # PySpark-style as-of-timestamp (automatically sets mode to "at") | ||
| >>> table = session.read.option("as-of-timestamp", "2023-01-01 12:00:00").table("my_table") # doctest: +SKIP | ||
|
|
||
| # Iceberg tag-based time travel (automatically sets mode to "at") | ||
| >>> table = session.read.option("tag", "release_v1").table("my_iceberg_table") # doctest: +SKIP | ||
|
|
||
| # timestamp_type automatically set to "TZ" due to timezone info | ||
| >>> import datetime, pytz # doctest: +SKIP | ||
| >>> tz_aware = datetime.datetime(2023, 1, 1, 12, 0, 0, tzinfo=pytz.UTC) # doctest: +SKIP | ||
|
|
@@ -625,15 +657,24 @@ def table( | |
| ast.timestamp_type.value = str(timestamp_type) | ||
| if stream is not None: | ||
| ast.stream.value = stream | ||
|
|
||
| if time_travel_mode is not None: | ||
| if iceberg_tag is not None and hasattr(ast, "iceberg_tag"): | ||
| ast.iceberg_tag.value = iceberg_tag | ||
|
|
||
| if time_travel_mode is not None or iceberg_tag is not None: | ||
| # If iceberg_tag is provided but time_travel_mode is not, default to 'at' | ||
| effective_mode = ( | ||
| time_travel_mode | ||
| if time_travel_mode | ||
| else ("at" if iceberg_tag else None) | ||
| ) | ||
| time_travel_params = { | ||
| "time_travel_mode": time_travel_mode, | ||
| "time_travel_mode": effective_mode, | ||
| "statement": statement, | ||
| "offset": offset, | ||
| "timestamp": timestamp, | ||
| "timestamp_type": timestamp_type, | ||
| "stream": stream, | ||
| "iceberg_tag": iceberg_tag, | ||
| } | ||
| else: | ||
| # if time_travel_mode is not provided, extract time travel config from options | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the client change generated from the monorepo AST definition?
I remember a client AST change isn't strictly required right now; if it's needed, it should be derived from the monorepo updates -- there is a step by step doc for the AST mono repo updates.
@sfc-gh-heshah what's your recommendation here? can we just not add the ast in the PR and do that later?