Skip to content

Commit f374f5f

Browse files
Initialize SEA cloud-fetch link retry functionality
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent d038d84 commit f374f5f

File tree

5 files changed

+75
-12
lines changed

5 files changed

+75
-12
lines changed

examples/experimental/tests/test_sea_sync_query.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44
import os
55
import sys
66
import logging
7+
import time
78
from databricks.sql.client import Connection
89

9-
logging.basicConfig(level=logging.INFO)
10+
logging.basicConfig(level=logging.DEBUG)
1011
logger = logging.getLogger(__name__)
1112

1213

@@ -51,20 +52,19 @@ def test_sea_sync_query_with_cloud_fetch():
5152
)
5253

5354
# Execute a query that generates large rows to force multiple chunks
54-
requested_row_count = 10000
55+
requested_row_count = 100000000
5556
cursor = connection.cursor()
5657
query = f"""
57-
SELECT
58-
id,
59-
concat('value_', repeat('a', 10000)) as test_value
60-
FROM range(1, {requested_row_count} + 1) AS t(id)
58+
SELECT * FROM samples.tpch.lineitem LIMIT {requested_row_count}
6159
"""
6260

6361
logger.info(
6462
f"Executing synchronous query with cloud fetch to generate {requested_row_count} rows"
6563
)
6664
cursor.execute(query)
6765
results = [cursor.fetchone()]
66+
logger.info("SLEEPING FOR 1000 SECONDS TO EXPIRE LINKS")
67+
time.sleep(1000)
6868
results.extend(cursor.fetchmany(10))
6969
results.extend(cursor.fetchall())
7070
actual_row_count = len(results)

src/databricks/sql/backend/sea/queue.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,24 @@ def _worker_loop(self):
204204
if not links_downloaded:
205205
break
206206

207+
def _restart_from_expired_link(self, link: TSparkArrowResultLink):
208+
self.stop()
209+
210+
with self._link_data_update:
211+
self.download_manager.cancel_tasks_from_offset(link.startRowOffset)
212+
213+
chunks_to_restart = []
214+
for chunk_index, l in self.chunk_index_to_link.items():
215+
if l.row_offset < link.startRowOffset:
216+
continue
217+
chunks_to_restart.append(chunk_index)
218+
for chunk_index in chunks_to_restart:
219+
self.chunk_index_to_link.pop(chunk_index)
220+
221+
self.start()
222+
207223
def start(self):
224+
self._shutdown_event.clear()
208225
self._worker_thread = threading.Thread(target=self._worker_loop)
209226
self._worker_thread.start()
210227

@@ -269,6 +286,7 @@ def __init__(
269286
max_download_threads=max_download_threads,
270287
lz4_compressed=lz4_compressed,
271288
ssl_options=ssl_options,
289+
expiry_callback=self._expiry_callback,
272290
)
273291

274292
self.link_fetcher = LinkFetcher(
@@ -283,6 +301,12 @@ def __init__(
283301
# Initialize table and position
284302
self.table = self._create_next_table()
285303

304+
def _expiry_callback(self, link: TSparkArrowResultLink):
305+
logger.info(
306+
f"SeaCloudFetchQueue: Link expired, restarting from offset {link.startRowOffset}"
307+
)
308+
self.link_fetcher._restart_from_expired_link(link)
309+
286310
def _create_next_table(self) -> Union["pyarrow.Table", None]:
287311
"""Create next table by retrieving the logical next downloaded file."""
288312
if not self.download_manager:

src/databricks/sql/cloudfetch/download_manager.py

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22

33
from concurrent.futures import ThreadPoolExecutor, Future
4-
from typing import List, Union
4+
from typing import Callable, List, Optional, Union
55

66
from databricks.sql.cloudfetch.downloader import (
77
ResultSetDownloadHandler,
@@ -22,6 +22,7 @@ def __init__(
2222
max_download_threads: int,
2323
lz4_compressed: bool,
2424
ssl_options: SSLOptions,
25+
expiry_callback: Callable[[TSparkArrowResultLink], None],
2526
):
2627
self._pending_links: List[TSparkArrowResultLink] = []
2728
for link in links:
@@ -40,6 +41,7 @@ def __init__(
4041

4142
self._downloadable_result_settings = DownloadableResultSettings(lz4_compressed)
4243
self._ssl_options = ssl_options
44+
self._expiry_callback = expiry_callback
4345

4446
def get_next_downloaded_file(
4547
self, next_row_offset: int
@@ -62,7 +64,6 @@ def get_next_downloaded_file(
6264

6365
# No more files to download from this batch of links
6466
if len(self._download_tasks) == 0:
65-
self._shutdown_manager()
6667
return None
6768

6869
task = self._download_tasks.pop(0)
@@ -81,6 +82,34 @@ def get_next_downloaded_file(
8182

8283
return file
8384

85+
def cancel_tasks_from_offset(self, start_row_offset: int):
86+
"""
87+
Cancel all download tasks starting from a specific row offset.
88+
This is used when links expire and we need to restart from a certain point.
89+
"""
90+
91+
def to_cancel(link: TSparkArrowResultLink) -> bool:
92+
return link.startRowOffset < start_row_offset
93+
94+
tasks_to_cancel = [task for task in self._download_tasks if to_cancel(task.link)]
95+
for task in tasks_to_cancel:
96+
task.cancel()
97+
logger.info(
98+
f"ResultFileDownloadManager: cancelled {len(tasks_to_cancel)} tasks from offset {start_row_offset}"
99+
)
100+
101+
# Remove cancelled tasks from the download queue
102+
tasks_to_keep = [task for task in self._download_tasks if not to_cancel(task.link)]
103+
self._download_tasks = tasks_to_keep
104+
105+
pending_links_to_keep = [
106+
link for link in self._pending_links if not to_cancel(link)
107+
]
108+
self._pending_links = pending_links_to_keep
109+
logger.info(
110+
f"ResultFileDownloadManager: removed {len(self._pending_links) - len(pending_links_to_keep)} links from pending links"
111+
)
112+
84113
def _schedule_downloads(self):
85114
"""
86115
While download queue has a capacity, peek pending links and submit them to thread pool.
@@ -97,8 +126,10 @@ def _schedule_downloads(self):
97126
settings=self._downloadable_result_settings,
98127
link=link,
99128
ssl_options=self._ssl_options,
129+
expiry_callback=self._expiry_callback,
100130
)
101131
task = self._thread_pool.submit(handler.run)
132+
task.link = link
102133
self._download_tasks.append(task)
103134

104135
def add_link(self, link: TSparkArrowResultLink):

src/databricks/sql/cloudfetch/downloader.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import logging
22
from dataclasses import dataclass
3+
from typing import Callable
34

45
import requests
56
from requests.adapters import HTTPAdapter, Retry
@@ -66,10 +67,12 @@ def __init__(
6667
settings: DownloadableResultSettings,
6768
link: TSparkArrowResultLink,
6869
ssl_options: SSLOptions,
70+
expiry_callback: Callable[[TSparkArrowResultLink], None],
6971
):
7072
self.settings = settings
7173
self.link = link
7274
self._ssl_options = ssl_options
75+
self._expiry_callback = expiry_callback
7376

7477
def run(self) -> DownloadedFile:
7578
"""
@@ -86,7 +89,7 @@ def run(self) -> DownloadedFile:
8689
)
8790

8891
# Check if link is already expired or is expiring
89-
ResultSetDownloadHandler._validate_link(
92+
self._validate_link(
9093
self.link, self.settings.link_expiry_buffer_secs
9194
)
9295

@@ -136,8 +139,7 @@ def run(self) -> DownloadedFile:
136139
if session:
137140
session.close()
138141

139-
@staticmethod
140-
def _validate_link(link: TSparkArrowResultLink, expiry_buffer_secs: int):
142+
def _validate_link(self, link: TSparkArrowResultLink, expiry_buffer_secs: int):
141143
"""
142144
Check if a link has expired or will expire.
143145
@@ -149,7 +151,7 @@ def _validate_link(link: TSparkArrowResultLink, expiry_buffer_secs: int):
149151
link.expiryTime <= current_time
150152
or link.expiryTime - current_time <= expiry_buffer_secs
151153
):
152-
raise Error("CloudFetch link has expired")
154+
self._expiry_callback(link)
153155

154156
@staticmethod
155157
def _decompress_data(compressed_data: bytes) -> bytes:

src/databricks/sql/utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
import lz4.frame
1616

17+
from databricks.sql.exc import Error
18+
1719
try:
1820
import pyarrow
1921
except ImportError:
@@ -374,12 +376,16 @@ def __init__(
374376
)
375377
)
376378

379+
def expiry_callback(link: TSparkArrowResultLink):
380+
raise Error("Cloudfetch link has expired")
381+
377382
# Initialize download manager
378383
self.download_manager = ResultFileDownloadManager(
379384
links=self.result_links,
380385
max_download_threads=self.max_download_threads,
381386
lz4_compressed=self.lz4_compressed,
382387
ssl_options=self._ssl_options,
388+
expiry_callback=expiry_callback,
383389
)
384390

385391
# Initialize table and position

0 commit comments

Comments (0)