
Commit f0d9c65

init fetch phase JSON + INLINE
Signed-off-by: varun-edachali-dbx <varun.edachali@databricks.com>
1 parent 15a8efc commit f0d9c65

4 files changed: +425 -52 lines changed


examples/experimental/sea_connector_test.py

Lines changed: 94 additions & 1 deletion
@@ -7,6 +7,96 @@
 logger = logging.getLogger(__name__)
 
 
+def test_sea_result_set_json_array_inline():
+    """
+    Test the SEA result set implementation with JSON_ARRAY format and INLINE disposition.
+
+    This function connects to a Databricks SQL endpoint using the SEA backend,
+    executes a query that returns a small result set (which will use INLINE disposition),
+    and tests the various fetch methods to verify the result set implementation works correctly.
+    """
+    server_hostname = os.environ.get("DATABRICKS_SERVER_HOSTNAME")
+    http_path = os.environ.get("DATABRICKS_HTTP_PATH")
+    access_token = os.environ.get("DATABRICKS_TOKEN")
+    catalog = os.environ.get("DATABRICKS_CATALOG")
+
+    if not all([server_hostname, http_path, access_token]):
+        logger.error("Missing required environment variables.")
+        logger.error(
+            "Please set DATABRICKS_SERVER_HOSTNAME, DATABRICKS_HTTP_PATH, and DATABRICKS_TOKEN."
+        )
+        sys.exit(1)
+
+    try:
+        # Create connection with SEA backend
+        logger.info("Creating connection with SEA backend...")
+        connection = Connection(
+            server_hostname=server_hostname,
+            http_path=http_path,
+            access_token=access_token,
+            catalog=catalog,
+            schema="default",
+            use_sea=True,
+            user_agent_entry="SEA-Test-Client",
+        )
+
+        logger.info(
+            f"Successfully opened SEA session with ID: {connection.get_session_id_hex()}"
+        )
+
+        # Create cursor
+        cursor = connection.cursor()
+
+        # Execute a query that returns a small result set (will use INLINE disposition)
+        logger.info("Executing query: SELECT * FROM range(1, 10) AS id")
+        cursor.execute("SELECT * FROM range(1, 10) AS id")
+
+        # Test fetchone
+        logger.info("Testing fetchone...")
+        row = cursor.fetchone()
+        logger.info(f"First row: {row}")
+
+        # Test fetchmany
+        logger.info("Testing fetchmany(3)...")
+        rows = cursor.fetchmany(3)
+        logger.info(f"Next 3 rows: {rows}")
+
+        # Test fetchall
+        logger.info("Testing fetchall...")
+        remaining_rows = cursor.fetchall()
+        logger.info(f"Remaining rows: {remaining_rows}")
+
+        # Execute another query to test arrow fetch methods
+        logger.info("Executing query for Arrow testing: SELECT * FROM range(1, 5) AS id, range(101, 105) AS value")
+        cursor.execute("SELECT * FROM range(1, 5) AS id, range(101, 105) AS value")
+
+        # Test fetchmany_arrow
+        logger.info("Testing fetchmany_arrow(2)...")
+        arrow_batch = cursor.fetchmany_arrow(2)
+        logger.info(f"Arrow batch num rows: {arrow_batch.num_rows}")
+        logger.info(f"Arrow batch columns: {arrow_batch.column_names}")
+        logger.info(f"Arrow batch data: {arrow_batch.to_pydict()}")
+
+        # Test fetchall_arrow
+        logger.info("Testing fetchall_arrow...")
+        remaining_arrow_batch = cursor.fetchall_arrow()
+        logger.info(f"Remaining arrow batch num rows: {remaining_arrow_batch.num_rows}")
+        logger.info(f"Remaining arrow batch data: {remaining_arrow_batch.to_pydict()}")
+
+        # Close cursor and connection
+        cursor.close()
+        connection.close()
+        logger.info("Successfully closed SEA session")
+
+    except Exception as e:
+        logger.error(f"Error during SEA result set test: {str(e)}")
+        import traceback
+        logger.error(traceback.format_exc())
+        sys.exit(1)
+
+    logger.info("SEA result set test with JSON_ARRAY format and INLINE disposition completed successfully")
+
+
 def test_sea_query_execution_with_compression():
     """
     Test executing a query using the SEA backend with result compression.
@@ -159,4 +249,7 @@ def test_sea_session():
     test_sea_session()
 
     # Test query execution with compression
-    test_sea_query_execution_with_compression()
+    test_sea_query_execution_with_compression()
+
+    # Test result set implementation
+    test_sea_result_set_json_array_inline()
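A minimal setup sketch for the new test: the hostname, HTTP path, and token values below are placeholders (not values from this commit), and calling the test directly assumes the example module above has been imported.

    # Hypothetical environment setup for test_sea_result_set_json_array_inline();
    # replace the placeholder values with real workspace credentials.
    import os

    os.environ["DATABRICKS_SERVER_HOSTNAME"] = "your-workspace.cloud.databricks.com"
    os.environ["DATABRICKS_HTTP_PATH"] = "/sql/1.0/warehouses/<warehouse-id>"
    os.environ["DATABRICKS_TOKEN"] = "<personal-access-token>"
    os.environ["DATABRICKS_CATALOG"] = "main"  # optional; the test passes it through

    # With range(1, 10) being end-exclusive (9 rows), the fetch sequence in the test
    # is expected to return 1 row (fetchone), 3 rows (fetchmany(3)), then 5 rows (fetchall).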

src/databricks/sql/backend/sea_result_set.py

Lines changed: 119 additions & 16 deletions
@@ -7,12 +7,18 @@
 
 import json
 import logging
-from typing import Optional, List, Any, Dict, Tuple
+from typing import Optional, List, Any, Dict, Tuple, cast
+
+try:
+    import pyarrow
+except ImportError:
+    pyarrow = None
 
 from databricks.sql.result_set import ResultSet
 from databricks.sql.types import Row
 from databricks.sql.backend.types import CommandId, CommandState
 from databricks.sql.exc import Error
+from databricks.sql.utils import ResultSetQueueFactory, JsonQueue
 
 from databricks.sql.backend.models import (
     StatementStatus,
@@ -106,14 +112,19 @@ def __init__(
 
         # Initialize other properties
        self._is_staging_operation = False  # SEA doesn't have staging operations
-        self._rows_buffer = []
-        self._current_row_index = 0
-        self._has_more_rows = True
+        self._has_more_rows = False
         self._current_chunk_index = 0
 
-        # If we have inline data, fill the buffer
-        if self.result and self.result.data:
-            self._rows_buffer = self.result.data
+        # Initialize queue for result data
+        if self.result:
+            self.results = ResultSetQueueFactory.build_queue(
+                sea_result_data=self.result,
+                description=cast(Optional[List[List[Any]]], self.description)
+            )
+            self._has_more_rows = True if self.result.data else False
+        else:
+            self.results = JsonQueue([])
+            self._has_more_rows = False
 
     @property
     def is_staging_operation(self) -> bool:
@@ -153,28 +164,120 @@ def _extract_description_from_manifest(
         return description
 
     def _fill_results_buffer(self) -> None:
-        """Fill the results buffer from the backend."""
-        raise NotImplementedError("Not implemented yet")
+        """Fill the results buffer from the backend for INLINE disposition."""
+        if not self.result or not self.result.data:
+            self._has_more_rows = False
+            return
+
+        # For INLINE disposition, we already have all the data
+        # No need to fetch more data from the backend
+        self._has_more_rows = False  # No more rows to fetch for INLINE
+
+    def _convert_rows_to_arrow_table(self, rows):
+        """Convert rows to Arrow table."""
+        if not self.description:
+            return pyarrow.Table.from_pylist([])
+
+        # Create dict of column data
+        column_data = {}
+        column_names = [col[0] for col in self.description]
+
+        for i, name in enumerate(column_names):
+            column_data[name] = [row[i] for row in rows]
+
+        return pyarrow.Table.from_pydict(column_data)
+
+    def _create_empty_arrow_table(self):
+        """Create an empty Arrow table with the correct schema."""
+        if not self.description:
+            return pyarrow.Table.from_pylist([])
+
+        column_names = [col[0] for col in self.description]
+        return pyarrow.Table.from_pydict({name: [] for name in column_names})
 
     def fetchone(self) -> Optional[Row]:
         """Fetch the next row of a query result set."""
-        raise NotImplementedError("Not implemented yet")
+        if isinstance(self.results, JsonQueue):
+            rows = self.results.next_n_rows(1)
+            if not rows:
+                return None
+
+            row = rows[0]
+
+            # Convert to Row object
+            if self.description:
+                column_names = [col[0] for col in self.description]
+                ResultRow = Row(*column_names)
+                return ResultRow(*row)
+            return row
+        else:
+            # This should not happen with current implementation
+            # but added for future compatibility
+            raise NotImplementedError("Unsupported queue type")
 
-    def fetchmany(self, size: int) -> List[Row]:
+    def fetchmany(self, size: Optional[int] = None) -> List[Row]:
         """Fetch the next set of rows of a query result."""
-        raise NotImplementedError("Not implemented yet")
+        if size is None:
+            size = self.arraysize
+
+        if size < 0:
+            raise ValueError(f"size argument for fetchmany is {size} but must be >= 0")
+
+        if isinstance(self.results, JsonQueue):
+            rows = self.results.next_n_rows(size)
+
+            # Convert to Row objects
+            if self.description:
+                column_names = [col[0] for col in self.description]
+                ResultRow = Row(*column_names)
+                return [ResultRow(*row) for row in rows]
+            return rows
+        else:
+            # This should not happen with current implementation
+            # but added for future compatibility
+            raise NotImplementedError("Unsupported queue type")
 
     def fetchall(self) -> List[Row]:
         """Fetch all remaining rows of a query result."""
-        raise NotImplementedError("Not implemented yet")
+        if isinstance(self.results, JsonQueue):
+            rows = self.results.remaining_rows()
+
+            # Convert to Row objects
+            if self.description:
+                column_names = [col[0] for col in self.description]
+                ResultRow = Row(*column_names)
+                return [ResultRow(*row) for row in rows]
+            return rows
+        else:
+            # This should not happen with current implementation
+            # but added for future compatibility
+            raise NotImplementedError("Unsupported queue type")
 
     def fetchmany_arrow(self, size: int) -> Any:
         """Fetch the next set of rows as an Arrow table."""
-        raise NotImplementedError("Not implemented yet")
+        if not pyarrow:
+            raise ImportError("PyArrow is required for Arrow support")
+
+        rows = self.fetchmany(size)
+        if not rows:
+            # Return empty Arrow table with schema
+            return self._create_empty_arrow_table()
+
+        # Convert rows to Arrow table
+        return self._convert_rows_to_arrow_table(rows)
 
     def fetchall_arrow(self) -> Any:
         """Fetch all remaining rows as an Arrow table."""
-        raise NotImplementedError("Not implemented yet")
+        if not pyarrow:
+            raise ImportError("PyArrow is required for Arrow support")
+
+        rows = self.fetchall()
+        if not rows:
+            # Return empty Arrow table with schema
+            return self._create_empty_arrow_table()
+
+        # Convert rows to Arrow table
+        return self._convert_rows_to_arrow_table(rows)
 
     def close(self) -> None:
         """Close the result set and release any resources."""
@@ -185,4 +288,4 @@ def close(self) -> None:
                 CommandId.from_sea_statement_id(self.statement_id)
             )
         except Exception as e:
-            logger.warning(f"Error closing SEA statement: {e}")
+            logger.warning(f"Error closing SEA statement: {e}")
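For reference, a standalone sketch of the row-to-column pivot that the new _convert_rows_to_arrow_table helper performs; the description tuples and row values here are made-up illustrations, not data from the commit.

    # Illustrative input: a two-column description and three JSON_ARRAY rows.
    import pyarrow

    description = [
        ("id", "int", None, None, None, None, None),
        ("value", "int", None, None, None, None, None),
    ]
    rows = [[1, 101], [2, 102], [3, 103]]

    # Pivot row-oriented data into a dict of columns keyed by column name,
    # mirroring the loop in _convert_rows_to_arrow_table.
    column_names = [col[0] for col in description]
    column_data = {name: [row[i] for row in rows] for i, name in enumerate(column_names)}

    table = pyarrow.Table.from_pydict(column_data)
    print(table.num_rows)      # 3
    print(table.column_names)  # ['id', 'value']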

src/databricks/sql/utils.py

Lines changed: 59 additions & 10 deletions
@@ -48,41 +48,90 @@ def remaining_rows(self):
         pass
 
 
+class JsonQueue(ResultSetQueue):
+    """Queue implementation for JSON_ARRAY format data."""
+
+    def __init__(self, data_array):
+        """Initialize with JSON array data."""
+        self.data_array = data_array
+        self.cur_row_index = 0
+        self.n_valid_rows = len(data_array)
+
+    def next_n_rows(self, num_rows):
+        """Get the next n rows from the data array."""
+        length = min(num_rows, self.n_valid_rows - self.cur_row_index)
+        slice = self.data_array[self.cur_row_index:self.cur_row_index + length]
+        self.cur_row_index += length
+        return slice
+
+    def remaining_rows(self):
+        """Get all remaining rows from the data array."""
+        slice = self.data_array[self.cur_row_index:]
+        self.cur_row_index += len(slice)
+        return slice
+
+
 class ResultSetQueueFactory(ABC):
     @staticmethod
     def build_queue(
-        row_set_type: TSparkRowSetType,
-        t_row_set: TRowSet,
-        arrow_schema_bytes: bytes,
-        max_download_threads: int,
-        ssl_options: SSLOptions,
+        row_set_type: Optional[TSparkRowSetType] = None,
+        t_row_set: Optional[TRowSet] = None,
+        arrow_schema_bytes: Optional[bytes] = None,
+        max_download_threads: Optional[int] = None,
+        ssl_options: Optional[SSLOptions] = None,
         lz4_compressed: bool = True,
         description: Optional[List[List[Any]]] = None,
+        # SEA specific parameters
+        sea_result_data: Optional[Any] = None,
     ) -> ResultSetQueue:
         """
         Factory method to build a result set queue.
-
+
+        This method is extended to handle both Thrift and SEA result formats.
+        For SEA, the sea_result_data parameter is used instead of row_set_type and t_row_set.
+
         Args:
+            # Thrift parameters
             row_set_type (enum): Row set type (Arrow, Column, or URL).
             t_row_set (TRowSet): Result containing arrow batches, columns, or cloud fetch links.
+
+            # Common parameters
             arrow_schema_bytes (bytes): Bytes representing the arrow schema.
             lz4_compressed (bool): Whether result data has been lz4 compressed.
             description (List[List[Any]]): Hive table schema description.
             max_download_threads (int): Maximum number of downloader thread pool threads.
             ssl_options (SSLOptions): SSLOptions object for CloudFetchQueue
-
+
+            # SEA parameters
+            sea_result_data (ResultData): Result data from SEA response
+
         Returns:
             ResultSetQueue
         """
-        if row_set_type == TSparkRowSetType.ARROW_BASED_SET:
+        # Handle SEA result data
+        if sea_result_data is not None:
+            if sea_result_data.data:
+                # INLINE disposition with JSON_ARRAY format
+                return JsonQueue(sea_result_data.data)
+            elif sea_result_data.external_links:
+                # EXTERNAL_LINKS disposition (not implemented yet)
+                raise NotImplementedError(
+                    "EXTERNAL_LINKS disposition is not supported yet"
+                )
+            else:
+                # Empty result set
+                return JsonQueue([])
+
+        # Handle Thrift result data (existing implementation)
+        if row_set_type == TSparkRowSetType.ARROW_BASED_SET and t_row_set is not None and arrow_schema_bytes is not None:
             arrow_table, n_valid_rows = convert_arrow_based_set_to_arrow_table(
                 t_row_set.arrowBatches, lz4_compressed, arrow_schema_bytes
             )
             converted_arrow_table = convert_decimals_in_arrow_table(
                 arrow_table, description
             )
             return ArrowQueue(converted_arrow_table, n_valid_rows)
-        elif row_set_type == TSparkRowSetType.COLUMN_BASED_SET:
+        elif row_set_type == TSparkRowSetType.COLUMN_BASED_SET and t_row_set is not None:
             column_table, column_names = convert_column_based_set_to_column_table(
                 t_row_set.columns, description
             )
@@ -92,7 +141,7 @@ def build_queue(
             )
 
             return ColumnQueue(ColumnTable(converted_column_table, column_names))
-        elif row_set_type == TSparkRowSetType.URL_BASED_SET:
+        elif row_set_type == TSparkRowSetType.URL_BASED_SET and t_row_set is not None and arrow_schema_bytes is not None and max_download_threads is not None and ssl_options is not None:
             return CloudFetchQueue(
                 schema_bytes=arrow_schema_bytes,
                 start_row_offset=t_row_set.startRowOffset,
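A quick sketch of the JsonQueue slicing behaviour added above, assuming a connector build that includes this commit so the class is importable from databricks.sql.utils:

    from databricks.sql.utils import JsonQueue

    # Five single-column rows, as SEA would return them inline as a JSON array.
    queue = JsonQueue([[1], [2], [3], [4], [5]])

    print(queue.next_n_rows(2))    # [[1], [2]]
    print(queue.next_n_rows(10))   # [[3], [4], [5]] -- request is clamped to what is left
    print(queue.remaining_rows())  # [] -- the row index is already at the end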
