Commit 7257168

Merge branch 'sea-hybrid' into sea-decouple-link-fetch
2 parents 4abb3ad + 671dbca commit 7257168

File tree: 3 files changed (+293 −30 lines)

src/databricks/sql/backend/sea/queue.py

Lines changed: 55 additions & 23 deletions
@@ -6,7 +6,7 @@
 
 from databricks.sql.cloudfetch.download_manager import ResultFileDownloadManager
 
-import lz4.frame
+from databricks.sql.cloudfetch.downloader import ResultSetDownloadHandler
 
 try:
     import pyarrow
@@ -37,25 +37,6 @@
 logger = logging.getLogger(__name__)
 
 
-def decompress_multi_frame_lz4(attachment: bytes) -> bytes:
-    try:
-        decompressor = lz4.frame.LZ4FrameDecompressor()
-        arrow_file = decompressor.decompress(attachment)
-
-        # the attachment may be a concatenation of multiple LZ4 frames
-        while decompressor.unused_data:
-            remaining_data = decompressor.unused_data
-            arrow_file += decompressor.decompress(remaining_data)
-
-        logger.debug(f"LZ4 decompressed {len(arrow_file)} bytes from attachment")
-
-    except Exception as e:
-        logger.error(f"LZ4 decompression failed: {e}")
-        raise e
-
-    return arrow_file
-
-
 class SeaResultSetQueueFactory(ABC):
     @staticmethod
     def build_queue(
@@ -90,7 +71,7 @@ def build_queue(
         elif manifest.format == ResultFormat.ARROW_STREAM.value:
            if result_data.attachment is not None:
                arrow_file = (
-                    decompress_multi_frame_lz4(result_data.attachment)
+                    ResultSetDownloadHandler._decompress_data(result_data.attachment)
                    if lz4_compressed
                    else result_data.attachment
                )
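These two hunks replace the module-local decompress_multi_frame_lz4 with the downloader's shared ResultSetDownloadHandler._decompress_data, so inline attachments and cloud-fetch downloads now decompress through one code path. For reference, a standalone equivalent of the multi-frame loop the removed helper performed; this sketch uses a fresh LZ4FrameDecompressor per frame (a slight variation on the original) and assumes the lz4 package is installed:

import lz4.frame


def decompress_multi_frame_lz4(attachment: bytes) -> bytes:
    """Decompress a payload that may concatenate several LZ4 frames."""
    out = b""
    remaining = attachment
    while remaining:
        # A fresh decompressor per frame; unused_data then holds any
        # bytes belonging to the frames still to be decoded.
        decompressor = lz4.frame.LZ4FrameDecompressor()
        out += decompressor.decompress(remaining)
        remaining = decompressor.unused_data
    return out


# Round-trip two independently compressed frames:
payload = lz4.frame.compress(b"hello ") + lz4.frame.compress(b"world")
assert decompress_multi_frame_lz4(payload) == b"hello world"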
@@ -300,10 +281,57 @@ def __init__(
         self.link_fetcher.start()
 
         # Initialize table and position
-        self.table = self._create_next_table()
+        self.table = self._create_table_from_link(self._current_chunk_link)
+
+    def _convert_to_thrift_link(self, link: "ExternalLink") -> TSparkArrowResultLink:
+        """Convert SEA external links to Thrift format for compatibility with existing download manager."""
+        # Parse the ISO format expiration time
+        expiry_time = int(dateutil.parser.parse(link.expiration).timestamp())
+        return TSparkArrowResultLink(
+            fileLink=link.external_link,
+            expiryTime=expiry_time,
+            rowCount=link.row_count,
+            bytesNum=link.byte_count,
+            startRowOffset=link.row_offset,
+            httpHeaders=link.http_headers or {},
+        )
+
+    def _get_chunk_link(self, chunk_index: int) -> Optional["ExternalLink"]:
+        if chunk_index not in self._chunk_index_to_link:
+            links = self._sea_client.get_chunk_links(self._statement_id, chunk_index)
+            self._chunk_index_to_link.update({link.chunk_index: link for link in links})
+        return self._chunk_index_to_link.get(chunk_index, None)
+
+    def _progress_chunk_link(self):
+        """Progress to the next chunk link."""
+        if not self._current_chunk_link:
+            return None
+
+        next_chunk_index = self._current_chunk_link.next_chunk_index
+
+        if next_chunk_index is None:
+            self._current_chunk_link = None
+            return None
+
+        self._current_chunk_link = self._get_chunk_link(next_chunk_index)
+        if not self._current_chunk_link:
+            logger.error(
+                "SeaCloudFetchQueue: unable to retrieve link for chunk {}".format(
+                    next_chunk_index
+                )
+            )
+            return None
+
+        logger.debug(
+            f"SeaCloudFetchQueue: Progressed to link for chunk {next_chunk_index}: {self._current_chunk_link}"
+        )
 
     def _create_next_table(self) -> Union["pyarrow.Table", None]:
         """Create next table by retrieving the logical next downloaded file."""
+        if not self._current_chunk_link:
+            logger.debug("SeaCloudFetchQueue: No current chunk link, returning")
+            return None
+
         if not self.download_manager:
             logger.debug("SeaCloudFetchQueue: No download manager, returning")
             return None
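_convert_to_thrift_link bridges one representation detail worth spelling out: SEA reports a link's expiration as an ISO-8601 string, while TSparkArrowResultLink.expiryTime expects epoch seconds. A quick illustration of the dateutil conversion used above (the timestamp is an example value):

import dateutil.parser

expiration = "2025-01-01T12:00:00Z"  # example SEA-style ISO-8601 value
expiry_time = int(dateutil.parser.parse(expiration).timestamp())
print(expiry_time)  # 1735732800, seconds since the Unix epoch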
@@ -317,4 +345,8 @@ def _create_next_table(self) -> Union["pyarrow.Table", None]:
 
         self.current_chunk_index += 1
 
-        return arrow_table
+        if not self._current_chunk_link:
+            logger.debug("SeaCloudFetchQueue: No current chunk link, returning")
+            return None
+
+        return self._create_table_from_link(self._current_chunk_link)
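Taken together, the new methods walk SEA's result chunks as a lazily fetched linked list: _get_chunk_link calls the server only on a cache miss (a single get_chunk_links response may fill several cache entries), and _progress_chunk_link follows each link's next_chunk_index until it is None. A minimal standalone sketch of that traversal pattern, with ChunkLink and the fetch callback as hypothetical stand-ins for the SEA client types:

from dataclasses import dataclass
from typing import Callable, Dict, Iterator, List, Optional


@dataclass
class ChunkLink:
    chunk_index: int
    external_link: str
    next_chunk_index: Optional[int]  # None marks the final chunk


class ChunkWalker:
    """Lazy, cached traversal of a linked list of result-chunk links."""

    def __init__(self, fetch_links: Callable[[int], List[ChunkLink]]):
        self._fetch_links = fetch_links  # stand-in for the SEA client call
        self._cache: Dict[int, ChunkLink] = {}

    def get_link(self, chunk_index: int) -> Optional[ChunkLink]:
        # Hit the server only on a cache miss; one response may carry
        # several links, so later lookups are satisfied locally.
        if chunk_index not in self._cache:
            for link in self._fetch_links(chunk_index):
                self._cache[link.chunk_index] = link
        return self._cache.get(chunk_index)

    def walk(self) -> Iterator[ChunkLink]:
        index: Optional[int] = 0
        while index is not None:
            link = self.get_link(index)
            if link is None:
                break  # the link could not be retrieved
            yield link
            index = link.next_chunk_index


# Example with a fake two-chunk server response:
links = [
    ChunkLink(0, "https://example.com/chunk0", 1),
    ChunkLink(1, "https://example.com/chunk1", None),
]
walker = ChunkWalker(lambda i: [l for l in links if l.chunk_index >= i])
assert [l.chunk_index for l in walker.walk()] == [0, 1]

The cache matters because the server may return links for several chunks per request; keeping them avoids one round trip per chunk as the queue advances.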

tests/e2e/common/large_queries_mixin.py

Lines changed: 29 additions & 6 deletions
@@ -2,6 +2,8 @@
 import math
 import time
 
+import pytest
+
 log = logging.getLogger(__name__)
 
 
@@ -42,7 +44,14 @@ def fetch_rows(self, cursor, row_count, fetchmany_size):
             + "assuming 10K fetch size."
         )
 
-    def test_query_with_large_wide_result_set(self):
+    @pytest.mark.parametrize(
+        "extra_params",
+        [
+            {},
+            {"use_sea": True},
+        ],
+    )
+    def test_query_with_large_wide_result_set(self, extra_params):
         resultSize = 300 * 1000 * 1000  # 300 MB
         width = 8192  # B
         rows = resultSize // width
@@ -52,7 +61,7 @@ def test_query_with_large_wide_result_set(self):
         fetchmany_size = 10 * 1024 * 1024 // width
         # This is used by PyHive tests to determine the buffer size
         self.arraysize = 1000
-        with self.cursor() as cursor:
+        with self.cursor(extra_params) as cursor:
             for lz4_compression in [False, True]:
                 cursor.connection.lz4_compression = lz4_compression
                 uuids = ", ".join(["uuid() uuid{}".format(i) for i in range(cols)])
@@ -68,7 +77,14 @@
                 assert row[0] == row_id  # Verify no rows are dropped in the middle.
                 assert len(row[1]) == 36
 
-    def test_query_with_large_narrow_result_set(self):
+    @pytest.mark.parametrize(
+        "extra_params",
+        [
+            {},
+            {"use_sea": True},
+        ],
+    )
+    def test_query_with_large_narrow_result_set(self, extra_params):
         resultSize = 300 * 1000 * 1000  # 300 MB
         width = 8  # sizeof(long)
         rows = resultSize / width
@@ -77,12 +93,19 @@ def test_query_with_large_narrow_result_set(self):
         fetchmany_size = 10 * 1024 * 1024 // width
         # This is used by PyHive tests to determine the buffer size
         self.arraysize = 10000000
-        with self.cursor() as cursor:
+        with self.cursor(extra_params) as cursor:
             cursor.execute("SELECT * FROM RANGE({rows})".format(rows=rows))
             for row_id, row in enumerate(self.fetch_rows(cursor, rows, fetchmany_size)):
                 assert row[0] == row_id
 
-    def test_long_running_query(self):
+    @pytest.mark.parametrize(
+        "extra_params",
+        [
+            {},
+            {"use_sea": True},
+        ],
+    )
+    def test_long_running_query(self, extra_params):
         """Incrementally increase query size until it takes at least 3 minutes,
         and asserts that the query completes successfully.
         """
@@ -92,7 +115,7 @@ def test_long_running_query(self):
         duration = -1
         scale0 = 10000
         scale_factor = 1
-        with self.cursor() as cursor:
+        with self.cursor(extra_params) as cursor:
             while duration < min_duration:
                 assert scale_factor < 1024, "Detected infinite loop"
                 start = time.time()
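Each of the three tests is now parametrized over extra_params, so it runs once against the default Thrift path ({}) and once with use_sea enabled; this assumes the mixin's cursor() helper forwards the dict into the connection settings. A tiny self-contained illustration of the same pattern (open_connection is a made-up stand-in, not the suite's helper):

import pytest


def open_connection(extra_params):
    # Hypothetical stand-in: merge per-test overrides such as
    # {"use_sea": True} into the default connection settings.
    defaults = {"use_sea": False}
    return {**defaults, **extra_params}


@pytest.mark.parametrize("extra_params", [{}, {"use_sea": True}])
def test_connection_mode(extra_params):
    conn = open_connection(extra_params)
    # First case exercises the default path, second the SEA path.
    assert conn["use_sea"] == bool(extra_params)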
