diff --git a/google/cloud/aiplatform/preview/datasets.py b/google/cloud/aiplatform/preview/datasets.py index d65811b611..1616abb991 100644 --- a/google/cloud/aiplatform/preview/datasets.py +++ b/google/cloud/aiplatform/preview/datasets.py @@ -16,6 +16,8 @@ # import dataclasses +import io +import json from typing import Dict, List, Optional, Tuple import uuid @@ -32,7 +34,6 @@ from vertexai import generative_models from vertexai.generative_models import _generative_models from vertexai.preview import prompts -import pandas from google.protobuf import field_mask_pb2 from google.protobuf import struct_pb2 @@ -758,7 +759,7 @@ def from_bigquery( def from_pandas( cls, *, - dataframe: pandas.DataFrame, + dataframe: "pandas.DataFrame", # type: ignore # noqa: F821 target_table_id: Optional[str] = None, display_name: Optional[str] = None, project: Optional[str] = None, @@ -1077,7 +1078,7 @@ def from_gemini_request_jsonl( jsonl_string = blob.download_as_text() lines = [line.strip() for line in jsonl_string.splitlines() if line.strip()] - df = pandas.DataFrame(lines, columns=[request_column_name]) + json_string = json.dumps({request_column_name: lines}) session_options = bigframes.BigQueryOptions( credentials=credentials, @@ -1085,7 +1086,7 @@ def from_gemini_request_jsonl( location=location, ) with bigframes.connect(session_options) as session: - temp_bigframes_df = session.read_pandas(df) + temp_bigframes_df = session.read_json(io.StringIO(json_string)) temp_bigframes_df[request_column_name] = bigframes.bigquery.parse_json( temp_bigframes_df[request_column_name] ) diff --git a/tests/unit/aiplatform/test_multimodal_datasets.py b/tests/unit/aiplatform/test_multimodal_datasets.py index 1b8446507a..c286feb487 100644 --- a/tests/unit/aiplatform/test_multimodal_datasets.py +++ b/tests/unit/aiplatform/test_multimodal_datasets.py @@ -584,9 +584,10 @@ def test_create_dataset_from_gemini_request_jsonl( mock_bucket.blob.assert_called_once_with("test-file.jsonl") mock_blob.download_as_text.assert_called_once() - pandas.testing.assert_frame_equal( - session_mock.read_pandas.call_args[0][0], - pandas.DataFrame({"requests": ["json_line_1", "json_line_2"]}), + session_mock.read_json.assert_called_once() + assert ( + session_mock.read_json.call_args[0][0].getvalue() + == '{"requests": ["json_line_1", "json_line_2"]}' ) bq_client_mock.return_value.copy_table.assert_called_once_with( sources=mock.ANY, @@ -636,9 +637,10 @@ def test_create_dataset_from_gemini_request_jsonl_without_target_table_id( mock_bucket.blob.assert_called_once_with("test-file.jsonl") mock_blob.download_as_text.assert_called_once() - pandas.testing.assert_frame_equal( - session_mock.read_pandas.call_args[0][0], - pandas.DataFrame({"requests": ["json_line_1", "json_line_2"]}), + session_mock.read_json.assert_called_once() + assert ( + session_mock.read_json.call_args[0][0].getvalue() + == '{"requests": ["json_line_1", "json_line_2"]}' ) # Assert that the default BQ dataset is created