Commit a618115

Merge branch 'sea-hybrid' into sea-decouple-link-fetch
2 parents: d038d84 + 2701e5d

10 files changed: +253 additions, -81 deletions


src/databricks/sql/backend/databricks_client.py

Lines changed: 1 addition & 9 deletions
@@ -3,8 +3,6 @@
 from abc import ABC, abstractmethod
 from typing import Dict, List, Optional, Any, Union, TYPE_CHECKING
 
-from databricks.sql.types import SSLOptions
-
 if TYPE_CHECKING:
     from databricks.sql.client import Cursor
     from databricks.sql.result_set import ResultSet
@@ -24,13 +22,6 @@ class DatabricksClient(ABC):
     - Fetching metadata about catalogs, schemas, tables, and columns
     """
 
-    def __init__(self, ssl_options: SSLOptions, **kwargs):
-        self._use_arrow_native_complex_types = kwargs.get(
-            "_use_arrow_native_complex_types", True
-        )
-        self._max_download_threads = kwargs.get("max_download_threads", 10)
-        self._ssl_options = ssl_options
-
     # == Connection and Session Management ==
     @abstractmethod
     def open_session(
@@ -110,6 +101,7 @@ def execute_command(
             parameters: List of parameters to bind to the query
             async_op: Whether to execute the command asynchronously
             enforce_embedded_schema_correctness: Whether to enforce schema correctness
+            row_limit: Maximum number of rows in the response.
 
         Returns:
             If async_op is False, returns a ResultSet object containing the
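
Taken together, these hunks turn DatabricksClient into a pure interface: the SSL options, download-thread count, and Arrow complex-type flag are no longer set in the abstract base, so each concrete backend initializes them itself (see the sea/backend.py and thrift_backend.py hunks below). A minimal sketch of the resulting shape, using illustrative stand-in classes rather than the connector's real ones:

from abc import ABC, abstractmethod


class Client(ABC):
    """Pure interface: no __init__, no transport state (stand-in for DatabricksClient)."""

    @abstractmethod
    def open_session(self):
        ...


class ConcreteBackend(Client):
    """Stand-in for a concrete backend such as the SEA or Thrift client."""

    def __init__(self, ssl_options, **kwargs):
        # Each backend now owns its transport knobs instead of inheriting
        # them from the abstract base via super().__init__().
        self._ssl_options = ssl_options
        self._max_download_threads = kwargs.get("max_download_threads", 10)
        self._use_arrow_native_complex_types = kwargs.get(
            "_use_arrow_native_complex_types", True
        )

    def open_session(self):
        return "session-id"


backend = ConcreteBackend(ssl_options=None, max_download_threads=4)
print(backend._max_download_threads)  # 4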

src/databricks/sql/backend/sea/backend.py

Lines changed: 18 additions & 12 deletions
@@ -124,15 +124,19 @@ def __init__(
             http_path,
         )
 
-        super().__init__(ssl_options=ssl_options, **kwargs)
+        self._max_download_threads = kwargs.get("max_download_threads", 10)
+        self._ssl_options = ssl_options
+        self._use_arrow_native_complex_types = kwargs.get(
+            "_use_arrow_native_complex_types", True
+        )
 
         self.use_hybrid_disposition = kwargs.get("use_hybrid_disposition", True)
 
         # Extract warehouse ID from http_path
         self.warehouse_id = self._extract_warehouse_id(http_path)
 
         # Initialize HTTP client
-        self.http_client = SeaHttpClient(
+        self._http_client = SeaHttpClient(
            server_hostname=server_hostname,
            port=port,
            http_path=http_path,
@@ -224,7 +228,7 @@ def open_session(
             schema=schema,
         )
 
-        response = self.http_client._make_request(
+        response = self._http_client._make_request(
            method="POST", path=self.SESSION_PATH, data=request_data.to_dict()
        )
 
@@ -264,7 +268,7 @@ def close_session(self, session_id: SessionId) -> None:
             session_id=sea_session_id,
         )
 
-        self.http_client._make_request(
+        self._http_client._make_request(
            method="DELETE",
            path=self.SESSION_PATH_WITH_ID.format(sea_session_id),
            data=request_data.to_dict(),
@@ -443,7 +447,9 @@ def execute_command(
             sea_parameters.append(
                 StatementParameter(
                     name=param.name,
-                    value=param.value,
+                    value=(
+                        param.value.stringValue if param.value is not None else None
+                    ),
                     type=param.type,
                 )
             )
@@ -477,7 +483,7 @@ def execute_command(
             result_compression=result_compression,
         )
 
-        response_data = self.http_client._make_request(
+        response_data = self._http_client._make_request(
            method="POST", path=self.STATEMENT_PATH, data=request.to_dict()
        )
        response = ExecuteStatementResponse.from_dict(response_data)
@@ -522,7 +528,7 @@ def cancel_command(self, command_id: CommandId) -> None:
             raise ValueError("Not a valid SEA command ID")
 
         request = CancelStatementRequest(statement_id=sea_statement_id)
-        self.http_client._make_request(
+        self._http_client._make_request(
            method="POST",
            path=self.CANCEL_STATEMENT_PATH_WITH_ID.format(sea_statement_id),
            data=request.to_dict(),
@@ -547,7 +553,7 @@ def close_command(self, command_id: CommandId) -> None:
             raise ValueError("Not a valid SEA command ID")
 
         request = CloseStatementRequest(statement_id=sea_statement_id)
-        self.http_client._make_request(
+        self._http_client._make_request(
            method="DELETE",
            path=self.STATEMENT_PATH_WITH_ID.format(sea_statement_id),
            data=request.to_dict(),
@@ -575,7 +581,7 @@ def get_query_state(self, command_id: CommandId) -> CommandState:
             raise ValueError("Not a valid SEA command ID")
 
         request = GetStatementRequest(statement_id=sea_statement_id)
-        response_data = self.http_client._make_request(
+        response_data = self._http_client._make_request(
            method="GET",
            path=self.STATEMENT_PATH_WITH_ID.format(sea_statement_id),
            data=request.to_dict(),
@@ -615,7 +621,7 @@ def get_execution_result(
         request = GetStatementRequest(statement_id=sea_statement_id)
 
         # Get the statement result
-        response_data = self.http_client._make_request(
+        response_data = self._http_client._make_request(
            method="GET",
            path=self.STATEMENT_PATH_WITH_ID.format(sea_statement_id),
            data=request.to_dict(),
@@ -649,13 +655,13 @@ def get_chunk_links(
             ExternalLink: External link for the chunk
         """
 
-        response_data = self.http_client._make_request(
+        response_data = self._http_client._make_request(
            method="GET",
            path=self.CHUNK_PATH_WITH_ID_AND_INDEX.format(statement_id, chunk_index),
        )
        response = GetChunksResponse.from_dict(response_data)
 
-        links = response.external_links
+        links = response.external_links or []
        return links
 
    # == Metadata Operations ==
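
The last hunk stops get_chunk_links from handing a None value back to callers when the response carries no external_links. A small self-contained sketch of that normalization, using stand-in types rather than the SEA client's real classes:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ChunkResponse:
    """Stand-in for GetChunksResponse; external_links may be absent."""

    external_links: Optional[List[str]] = None


def chunk_links(response: ChunkResponse) -> List[str]:
    # Normalize a missing field to an empty list so callers can iterate
    # without a None check (mirrors `response.external_links or []` above).
    return response.external_links or []


print(chunk_links(ChunkResponse()))                       # []
print(chunk_links(ChunkResponse(external_links=["u1"])))  # ['u1']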

src/databricks/sql/backend/sea/models/responses.py

Lines changed: 25 additions & 26 deletions
@@ -5,7 +5,7 @@
 """
 
 import base64
-from typing import Dict, Any, List
+from typing import Dict, Any, List, Optional
 from dataclasses import dataclass
 
 from databricks.sql.backend.types import CommandState
@@ -165,34 +165,33 @@ def from_dict(cls, data: Dict[str, Any]) -> "CreateSessionResponse":
 
 @dataclass
 class GetChunksResponse:
-    """Response from getting chunks for a statement."""
-
-    statement_id: str
-    external_links: List[ExternalLink]
+    """
+    Response from getting chunks for a statement.
+
+    The response model can be found in the docs, here:
+    https://docs.databricks.com/api/workspace/statementexecution/getstatementresultchunkn
+    """
+
+    data: Optional[List[List[Any]]] = None
+    external_links: Optional[List[ExternalLink]] = None
+    byte_count: Optional[int] = None
+    chunk_index: Optional[int] = None
+    next_chunk_index: Optional[int] = None
+    next_chunk_internal_link: Optional[str] = None
+    row_count: Optional[int] = None
+    row_offset: Optional[int] = None
 
     @classmethod
     def from_dict(cls, data: Dict[str, Any]) -> "GetChunksResponse":
         """Create a GetChunksResponse from a dictionary."""
-        external_links = []
-        if "external_links" in data:
-            for link_data in data["external_links"]:
-                external_links.append(
-                    ExternalLink(
-                        external_link=link_data.get("external_link", ""),
-                        expiration=link_data.get("expiration", ""),
-                        chunk_index=link_data.get("chunk_index", 0),
-                        byte_count=link_data.get("byte_count", 0),
-                        row_count=link_data.get("row_count", 0),
-                        row_offset=link_data.get("row_offset", 0),
-                        next_chunk_index=link_data.get("next_chunk_index"),
-                        next_chunk_internal_link=link_data.get(
-                            "next_chunk_internal_link"
-                        ),
-                        http_headers=link_data.get("http_headers"),
-                    )
-                )
-
+        result = _parse_result({"result": data})
         return cls(
-            statement_id=data.get("statement_id", ""),
-            external_links=external_links,
+            data=result.data,
+            external_links=result.external_links,
+            byte_count=result.byte_count,
+            chunk_index=result.chunk_index,
+            next_chunk_index=result.next_chunk_index,
+            next_chunk_internal_link=result.next_chunk_internal_link,
+            row_count=result.row_count,
+            row_offset=result.row_offset,
        )
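
GetChunksResponse is now a collection of optional fields that mirrors the chunk payload of the Statement Execution API, with parsing delegated to the shared _parse_result helper. A self-contained sketch of the same "all-optional dataclass plus from_dict" pattern, with a hypothetical ChunkInfo stand-in instead of the real model:

from dataclasses import dataclass, fields
from typing import Any, Dict, Optional


@dataclass
class ChunkInfo:
    """Hypothetical stand-in for the all-optional response model."""

    byte_count: Optional[int] = None
    chunk_index: Optional[int] = None
    row_count: Optional[int] = None
    row_offset: Optional[int] = None

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "ChunkInfo":
        # Keys missing from the payload simply stay None, so partial
        # API responses parse without special-casing.
        names = {f.name for f in fields(cls)}
        return cls(**{k: v for k, v in data.items() if k in names})


print(ChunkInfo.from_dict({"row_count": 100, "chunk_index": 0}))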

src/databricks/sql/backend/sea/queue.py

Lines changed: 15 additions & 4 deletions
@@ -22,7 +22,7 @@
     ResultManifest,
 )
 from databricks.sql.backend.sea.utils.constants import ResultFormat
-from databricks.sql.exc import ProgrammingError
+from databricks.sql.exc import ProgrammingError, ServerOperationError
 from databricks.sql.thrift_api.TCLIService.ttypes import TSparkArrowResultLink
 from databricks.sql.types import SSLOptions
 from databricks.sql.utils import (
@@ -83,7 +83,7 @@ def build_queue(
 
         # EXTERNAL_LINKS disposition
         return SeaCloudFetchQueue(
-            initial_links=result_data.external_links or [],
+            result_data=result_data,
            max_download_threads=max_download_threads,
            ssl_options=ssl_options,
            sea_client=sea_client,
@@ -117,6 +117,9 @@ def remaining_rows(self) -> List[List[str]]:
         self.cur_row_index += len(slice)
         return slice
 
+    def close(self):
+        return
+
 
 class LinkFetcher:
     def __init__(
@@ -218,7 +221,7 @@ class SeaCloudFetchQueue(CloudFetchQueue):
 
     def __init__(
         self,
-        initial_links: List["ExternalLink"],
+        result_data: ResultData,
        max_download_threads: int,
        ssl_options: SSLOptions,
        sea_client: "SeaDatabricksClient",
@@ -252,14 +255,22 @@ def __init__(
 
         self._sea_client = sea_client
         self._statement_id = statement_id
+        self._total_chunk_count = total_chunk_count
 
         logger.debug(
             "SeaCloudFetchQueue: Initialize CloudFetch loader for statement {}, total chunks: {}".format(
                 statement_id, total_chunk_count
             )
         )
 
-        if total_chunk_count < 1:
+        initial_links = result_data.external_links or []
+        self._chunk_index_to_link = {link.chunk_index: link for link in initial_links}
+
+        # Track the current chunk we're processing
+        self._current_chunk_index = 0
+        first_link = self._chunk_index_to_link.get(self._current_chunk_index, None)
+        if not first_link:
+            # possibly an empty response
             return
 
         self.current_chunk_index = 0
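
SeaCloudFetchQueue now receives the whole ResultData, builds a chunk_index to link map from whatever links came back with the statement, and treats a missing link for chunk 0 as an empty result. A rough sketch of that bookkeeping with stand-in types (the real queue would fall back to the backend's get_chunk_links for indexes it does not hold locally):

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class Link:
    """Stand-in for ExternalLink."""

    chunk_index: int
    url: str


class ChunkTracker:
    """Illustrative version of the chunk_index -> link bookkeeping."""

    def __init__(self, initial_links: Optional[List[Link]]):
        links = initial_links or []  # an empty result carries no links
        self._chunk_index_to_link: Dict[int, Link] = {
            link.chunk_index: link for link in links
        }
        self._current_chunk_index = 0

    def current_link(self) -> Optional[Link]:
        # None means the chunk is not known locally yet; the real queue
        # would ask the backend for it via get_chunk_links.
        return self._chunk_index_to_link.get(self._current_chunk_index)


print(ChunkTracker([Link(0, "https://example.com/chunk-0")]).current_link())
print(ChunkTracker([]).current_link())  # None: treat as an empty response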

src/databricks/sql/backend/thrift_backend.py

Lines changed: 7 additions & 2 deletions
@@ -149,8 +149,6 @@ def __init__(
             http_path,
         )
 
-        super().__init__(ssl_options, **kwargs)
-
         port = port or 443
         if kwargs.get("_connection_uri"):
             uri = kwargs.get("_connection_uri")
@@ -164,13 +162,20 @@
             raise ValueError("No valid connection settings.")
 
         self._initialize_retry_args(kwargs)
+        self._use_arrow_native_complex_types = kwargs.get(
+            "_use_arrow_native_complex_types", True
+        )
 
         self._use_arrow_native_decimals = kwargs.get("_use_arrow_native_decimals", True)
         self._use_arrow_native_timestamps = kwargs.get(
             "_use_arrow_native_timestamps", True
         )
 
         # Cloud fetch
+        self._max_download_threads = kwargs.get("max_download_threads", 10)
+
+        self._ssl_options = ssl_options
+
         self._auth_provider = auth_provider
 
         # Connector version 3 retry approach

src/databricks/sql/result_set.py

Lines changed: 1 addition & 0 deletions
@@ -169,6 +169,7 @@ def close(self) -> None:
         been closed on the server for some other reason, issue a request to the server to close it.
         """
         try:
+            self.results.close()
             if (
                 self.status != CommandState.CLOSED
                 and not self.has_been_closed_server_side
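
With every queue implementation now exposing close(), ResultSet.close() can release client-side resources first and only then perform the server-side cleanup it already did. A simplified order-of-operations sketch with stand-in classes (the real method also checks CommandState and whether the connection is still open):

class FakeQueue:
    """Stand-in for a ResultSetQueue with the new close() hook."""

    def close(self):
        print("queue: release local buffers and download threads")


class FakeResultSet:
    """Illustrative close() ordering only; not databricks.sql.result_set."""

    def __init__(self):
        self.results = FakeQueue()
        self.has_been_closed_server_side = False

    def close(self):
        try:
            # 1) release client-side resources first (the new line above)
            self.results.close()
            # 2) then close the command on the server, as before
            if not self.has_been_closed_server_side:
                print("backend: close_command(...)")
        finally:
            self.has_been_closed_server_side = True


FakeResultSet().close()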

src/databricks/sql/utils.py

Lines changed: 21 additions & 13 deletions
@@ -47,6 +47,10 @@ def next_n_rows(self, num_rows: int):
     def remaining_rows(self):
         pass
 
+    @abstractmethod
+    def close(self):
+        pass
+
 
 class ThriftResultSetQueueFactory(ABC):
     @staticmethod
@@ -159,6 +163,9 @@ def remaining_rows(self):
         self.cur_row_index += slice.num_rows
         return slice
 
+    def close(self):
+        return
+
 
 class ArrowQueue(ResultSetQueue):
     def __init__(
@@ -196,6 +203,9 @@ def remaining_rows(self) -> "pyarrow.Table":
         self.cur_row_index += slice.num_rows
         return slice
 
+    def close(self):
+        return
+
 
 class CloudFetchQueue(ResultSetQueue, ABC):
     """Base class for cloud fetch queues that handle EXTERNAL_LINKS disposition with ARROW format."""
@@ -230,7 +240,12 @@ def __init__(
         self.table_row_index = 0
 
         # Initialize download manager
-        self.download_manager: Optional["ResultFileDownloadManager"] = None
+        self.download_manager = ResultFileDownloadManager(
+            links=[],
+            max_download_threads=max_download_threads,
+            lz4_compressed=lz4_compressed,
+            ssl_options=ssl_options,
+        )
 
     def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
         """
@@ -287,11 +302,8 @@ def remaining_rows(self) -> "pyarrow.Table":
 
     def _create_table_at_offset(self, offset: int) -> Union["pyarrow.Table", None]:
         """Create next table at the given row offset"""
-        # Create next table by retrieving the logical next downloaded file, or return None to signal end of queue
-        if not self.download_manager:
-            logger.debug("CloudFetchQueue: No download manager available")
-            return None
 
+        # Create next table by retrieving the logical next downloaded file, or return None to signal end of queue
         downloaded_file = self.download_manager.get_next_downloaded_file(offset)
         if not downloaded_file:
             logger.debug(
@@ -324,6 +336,9 @@ def _create_empty_table(self) -> "pyarrow.Table":
             return pyarrow.Table.from_pydict({})
         return create_arrow_table_from_arrow_file(self.schema_bytes, self.description)
 
+    def close(self):
+        self.download_manager._shutdown_manager()
+
 
 class ThriftCloudFetchQueue(CloudFetchQueue):
     """Queue implementation for EXTERNAL_LINKS disposition with ARROW format for Thrift backend."""
@@ -373,14 +388,7 @@ def __init__(
                     result_link.startRowOffset, result_link.rowCount
                 )
             )
-
-        # Initialize download manager
-        self.download_manager = ResultFileDownloadManager(
-            links=self.result_links,
-            max_download_threads=self.max_download_threads,
-            lz4_compressed=self.lz4_compressed,
-            ssl_options=self._ssl_options,
-        )
+            self.download_manager.add_link(result_link)
 
         # Initialize table and position
         self.table = self._create_next_table()
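
The net effect in utils.py is that a CloudFetchQueue always owns a download manager: it is created eagerly with an empty link list, ThriftCloudFetchQueue feeds links in through add_link, and close() shuts it down, which is what makes the unconditional results.close() call above safe. A compact lifecycle sketch with stand-in classes (DownloadManager here is illustrative, not ResultFileDownloadManager):

from typing import List


class DownloadManager:
    """Illustrative stand-in for ResultFileDownloadManager."""

    def __init__(self, links: List[str]):
        self.links = list(links)

    def add_link(self, link: str) -> None:
        self.links.append(link)

    def shutdown(self) -> None:
        print(f"shutting down, {len(self.links)} link(s) tracked")


class Queue:
    """Sketch of the new queue lifecycle."""

    def __init__(self):
        # Created eagerly with no links, so later code never has to ask
        # whether a download manager exists.
        self.download_manager = DownloadManager(links=[])

    def add_result_link(self, link: str) -> None:
        self.download_manager.add_link(link)

    def close(self) -> None:
        self.download_manager.shutdown()


q = Queue()
q.add_result_link("https://example.com/part-0")
q.close()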
