@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Any, Dict, List, Optional, Union, TYPE_CHECKING
+from typing import Dict, List, Optional, Union
 
 from dateutil import parser
 import datetime
@@ -9,34 +9,25 @@
 from collections.abc import Mapping
 from decimal import Decimal
 from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple, Union, Sequence
+from typing import Dict, List, Optional, Tuple, Union, Sequence
 import re
 
-import dateutil
 import lz4.frame
 
-from databricks.sql.backend.sea.backend import SeaDatabricksClient
-from databricks.sql.backend.sea.models.base import ResultData, ResultManifest
-
 try:
     import pyarrow
 except ImportError:
     pyarrow = None
 
 from databricks.sql import OperationalError
-from databricks.sql.exc import ProgrammingError
 from databricks.sql.cloudfetch.download_manager import ResultFileDownloadManager
 from databricks.sql.thrift_api.TCLIService.ttypes import (
     TRowSet,
     TSparkArrowResultLink,
     TSparkRowSetType,
 )
 from databricks.sql.types import SSLOptions
-from databricks.sql.backend.sea.models.base import (
-    ResultData,
-    ExternalLink,
-    ResultManifest,
-)
+
 from databricks.sql.parameters.native import ParameterStructure, TDbsqlParameter
 
 import logging
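
Note on the guarded pyarrow import above: when the optional dependency is missing, pyarrow is bound to None, so every Arrow code path must check it before use. A minimal sketch of that guard, reusing the OperationalError imported above (the helper name and message are illustrative, not part of this diff):

def _require_pyarrow():
    # Fail fast with a clear message instead of an AttributeError later.
    if pyarrow is None:
        raise OperationalError(
            "pyarrow is required to process Arrow result sets; "
            "install it in this environment to use cloud fetch"
        )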
@@ -227,11 +218,12 @@ def __init__(
             lz4_compressed: Whether the data is LZ4 compressed
             description: Column descriptions
         """
+
+        self.schema_bytes = schema_bytes
+        self.max_download_threads = max_download_threads
         self.lz4_compressed = lz4_compressed
         self.description = description
-        self.schema_bytes = schema_bytes
         self._ssl_options = ssl_options
-        self.max_download_threads = max_download_threads
 
         # Table state
         self.table = None
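
For orientation, a hypothetical instantiation showing the arguments these assignments consume (the concrete call and exact constructor signature sit outside this hunk; the keyword names simply mirror the attributes above):

queue = ThriftCloudFetchQueue(        # hypothetical call; real signature may differ
    schema_bytes=schema_bytes,        # Arrow IPC schema used for empty results
    max_download_threads=10,          # parallelism for the download manager
    ssl_options=ssl_options,          # SSLOptions for result-file downloads
    lz4_compressed=True,              # whether result files are LZ4 frames
    description=description,          # DB-API column descriptions
)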
@@ -240,104 +232,73 @@ def __init__(
         # Initialize download manager
         self.download_manager: Optional["ResultFileDownloadManager"] = None
 
-    def remaining_rows(self) -> "pyarrow.Table":
+    def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
         """
-        Get all remaining rows of the cloud fetch Arrow dataframes.
+        Get up to the next n rows of the cloud fetch Arrow dataframes.
 
+        Args:
+            num_rows (int): Number of rows to retrieve.
         Returns:
             pyarrow.Table
         """
         if not self.table:
+            logger.debug("CloudFetchQueue: no more rows available")
             # Return empty pyarrow table to cause retry of fetch
             return self._create_empty_table()
-
-        results = pyarrow.Table.from_pydict({})  # Empty table
-        while self.table:
-            table_slice = self.table.slice(
-                self.table_row_index, self.table.num_rows - self.table_row_index
-            )
-            if results.num_rows > 0:
-                results = pyarrow.concat_tables([results, table_slice])
-            else:
-                results = table_slice
-
-            self.table_row_index += table_slice.num_rows
-            self.table = self._create_next_table()
-            self.table_row_index = 0
-
-        return results
-
-    def next_n_rows(self, num_rows: int) -> "pyarrow.Table":
-        """Get up to the next n rows of the cloud fetch Arrow dataframes."""
-        if not self.table:
-            # Return empty pyarrow table to cause retry of fetch
-            return self._create_empty_table()
-
-        logger.info("SeaCloudFetchQueue: Retrieving up to {} rows".format(num_rows))
-        results = pyarrow.Table.from_pydict({})  # Empty table
-        rows_fetched = 0
-
+        logger.debug("CloudFetchQueue: trying to get {} next rows".format(num_rows))
+        results = self.table.slice(0, 0)
         while num_rows > 0 and self.table:
             # Get remaining of num_rows or the rest of the current table, whichever is smaller
            length = min(num_rows, self.table.num_rows - self.table_row_index)
-            logger.info(
-                "CloudFetchQueue: Slicing table from index {} for {} rows (table has {} rows total)".format(
-                    self.table_row_index, length, self.table.num_rows
-                )
-            )
             table_slice = self.table.slice(self.table_row_index, length)
-
-            # Concatenate results if we have any
-            if results.num_rows > 0:
-                logger.info(
-                    "CloudFetchQueue: Concatenating {} rows to existing {} rows".format(
-                        table_slice.num_rows, results.num_rows
-                    )
-                )
-                results = pyarrow.concat_tables([results, table_slice])
-            else:
-                results = table_slice
-
+            results = pyarrow.concat_tables([results, table_slice])
             self.table_row_index += table_slice.num_rows
-            rows_fetched += table_slice.num_rows
-
-            logger.info(
-                "CloudFetchQueue: After slice, table_row_index={}, rows_fetched={}".format(
-                    self.table_row_index, rows_fetched
-                )
-            )
 
             # Replace current table with the next table if we are at the end of the current table
             if self.table_row_index == self.table.num_rows:
-                logger.info(
-                    "CloudFetchQueue: Reached end of current table, fetching next"
-                )
                 self.table = self._create_next_table()
                 self.table_row_index = 0
-
             num_rows -= table_slice.num_rows
 
-        logger.info("CloudFetchQueue: Retrieved {} rows".format(results.num_rows))
+        logger.debug("CloudFetchQueue: collected {} next rows".format(results.num_rows))
         return results
 
-    def _create_empty_table(self) -> "pyarrow.Table":
-        """Create a 0-row table with just the schema bytes."""
-        if not self.schema_bytes:
-            return pyarrow.Table.from_pydict({})
-        return create_arrow_table_from_arrow_file(self.schema_bytes, self.description)
+    def remaining_rows(self) -> "pyarrow.Table":
+        """
+        Get all remaining rows of the cloud fetch Arrow dataframes.
+
+        Returns:
+            pyarrow.Table
+        """
+
+        if not self.table:
+            # Return empty pyarrow table to cause retry of fetch
+            return self._create_empty_table()
+        results = self.table.slice(0, 0)
+        while self.table:
+            table_slice = self.table.slice(
+                self.table_row_index, self.table.num_rows - self.table_row_index
+            )
+            results = pyarrow.concat_tables([results, table_slice])
+            self.table_row_index += table_slice.num_rows
+            self.table = self._create_next_table()
+            self.table_row_index = 0
+        return results
 
     def _create_table_at_offset(self, offset: int) -> Union["pyarrow.Table", None]:
-        """Create next table by retrieving the logical next downloaded file."""
+        """Create next table at the given row offset."""
         # Create next table by retrieving the logical next downloaded file, or return None to signal end of queue
         if not self.download_manager:
             logger.debug("CloudFetchQueue: No download manager available")
             return None
 
         downloaded_file = self.download_manager.get_next_downloaded_file(offset)
         if not downloaded_file:
+            logger.debug(
+                "CloudFetchQueue: Cannot find downloaded file for row {}".format(offset)
+            )
             # None signals no more Arrow tables can be built from the remaining handlers if any remain
             return None
-
         arrow_table = create_arrow_table_from_arrow_file(
             downloaded_file.file_bytes, self.description
         )
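
The rewritten next_n_rows and remaining_rows share one slice-and-concatenate pattern, seeded with self.table.slice(0, 0) so the result keeps the correct schema even when zero rows come back (the old pyarrow.Table.from_pydict({}) seed had no columns at all). A standalone sketch of that pattern, using an in-memory list of pyarrow tables in place of the download manager (fetch_n and tables are hypothetical names; assumes at least one table):

import pyarrow

def fetch_n(tables, num_rows):
    # Seed with a zero-row slice so the schema survives empty results.
    results = tables[0].slice(0, 0)
    index, row_offset = 0, 0
    while num_rows > 0 and index < len(tables):
        table = tables[index]
        # Take the rest of num_rows or the rest of the current table.
        length = min(num_rows, table.num_rows - row_offset)
        results = pyarrow.concat_tables([results, table.slice(row_offset, length)])
        row_offset += length
        if row_offset == table.num_rows:
            # Current table exhausted; advance to the next one.
            index, row_offset = index + 1, 0
        num_rows -= length
    return results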
@@ -357,6 +318,12 @@ def _create_next_table(self) -> Union["pyarrow.Table", None]:
         """Create next table by retrieving the logical next downloaded file."""
         pass
 
+    def _create_empty_table(self) -> "pyarrow.Table":
+        """Create a 0-row table with just the schema bytes."""
+        if not self.schema_bytes:
+            return pyarrow.Table.from_pydict({})
+        return create_arrow_table_from_arrow_file(self.schema_bytes, self.description)
+
 
 class ThriftCloudFetchQueue(CloudFetchQueue):
     """Queue implementation for EXTERNAL_LINKS disposition with ARROW format for Thrift backend."""