diff --git a/requirements.txt b/requirements.txt index 260025b..08130f9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ google-auth-httplib2 google-auth-oauthlib ratelimit backoff -kdbai-client +kdbai-client>=1.4.0 sentry-sdk[opentelemetry] halo sentence-transformers>=2.6.1 diff --git a/src/vdf_io/export_vdf/kdbai_export.py b/src/vdf_io/export_vdf/kdbai_export.py index ecaa84c..27503f8 100644 --- a/src/vdf_io/export_vdf/kdbai_export.py +++ b/src/vdf_io/export_vdf/kdbai_export.py @@ -17,7 +17,6 @@ standardize_metric, ) - load_dotenv() @@ -27,16 +26,28 @@ class ExportKDBAI(ExportVDB): @classmethod def make_parser(cls, subparsers): parser_kdbai = subparsers.add_parser( - cls.DB_NAME_SLUG, help="Export data from KDB.AI" + cls.DB_NAME_SLUG, + help="Export data from KDB.AI", + ) + parser_kdbai.add_argument( + "--kdbai_endpoint", + type=str, + help="KDB.AI cloud endpoint to connect to.", ) parser_kdbai.add_argument( - "-u", - "--url", + "--kdbai_api_key", type=str, - help="KDB.AI cloud endpoint to connect", + help="KDB.AI API key to authenticate with.", ) parser_kdbai.add_argument( - "-t", "--tables", type=str, help="KDB.AI tables to export (comma-separated)" + "--database_name", + type=str, + help="Name of the KDB.AI database to export from.", + ) + parser_kdbai.add_argument( + "--tables_names", + type=str, + help="Names of the KDB.AI tables to export (comma-separated).", ) @classmethod @@ -44,49 +55,56 @@ def export_vdb(cls, args): """ Export data from KDBAI """ - set_arg_from_input( - args, - "url", - "Enter the KDB.AI endpoint instance: ", - str, - env_var="KDBAI_ENDPOINT", - ) - set_arg_from_password( - args, "kdbai_api_key", "Enter your KDB.AI API key: ", "KDBAI_API_KEY" - ) + if args.get("kdbai_endpoint") is None: + set_arg_from_input( + args, + "kdbai_endpoint", + "Enter the KDB.AI endpoint instance: ", + str, + env_var="KDBAI_ENDPOINT", + ) + + if args.get("kdbai_api_key") is None: + set_arg_from_password( + args, "kdbai_api_key", "Enter your KDB.AI API key: ", "KDBAI_API_KEY" + ) + kdbai_export = ExportKDBAI(args) - set_arg_from_input( - args, - "tables", - "Enter the name of table to export:", - str, - choices=kdbai_export.get_all_index_names(), - ) - if args.get("tables", None) == "": - args["tables"] = ",".join(kdbai_export.get_all_index_names()) + + if args.get("tables_names") is None: + set_arg_from_input( + args, + "tables_names", + "Enter the name of the table to export: ", + str, + choices=kdbai_export.get_all_index_names(), + ) + + if args.get("tables_names", None) == "": + args["tables_names"] = ",".join(kdbai_export.get_all_index_names()) kdbai_export.get_data() return kdbai_export def __init__(self, args): super().__init__(args) api_key = args.get("kdbai_api_key") - endpoint = args.get("url") - self.session = kdbai.Session(api_key=api_key, endpoint=endpoint) - self.model = args.get("model_name") + endpoint = args.get("kdbai_endpoint") + session = kdbai.Session(api_key=api_key, endpoint=endpoint) + self.db = session.database(args.get("database_name") or "default") def get_index_names(self): - if "tables" not in self.args or self.args["tables"] is None: + if "tables_names" not in self.args or self.args["tables_names"] is None: return self.get_all_index_names() - return self.args["tables"].split(",") + return self.args["tables_names"].split(",") def get_all_index_names(self): - return self.session.list() + return [table.name for table in self.db.tables] def get_data(self): - if "tables" not in self.args or self.args["tables"] is None: + if "tables_names" not in self.args or
self.args["tables_names"] is None: table_names = self.get_all_index_names() else: - table_names = self.args["tables"].split(",") + table_names = self.args["tables_names"].split(",") index_metas: Dict[str, List[NamespaceMeta]] = {} for table_name in tqdm(table_names, desc="Fetching indexes"): index_metas[table_name] = self.export_table(table_name) @@ -106,10 +124,9 @@ def get_data(self): json_file.write(meta_json_text) def export_table(self, table_name): - model = self.model vectors_directory = self.create_vec_dir(table_name) - table = self.session.table(table_name) + table = self.db.table(table_name) table_res = table.query() save_path = f"{vectors_directory}/{table_name}.parquet" table_res.to_parquet(save_path, index=False) @@ -118,18 +135,13 @@ # vectors = table_res["vector"].apply(pd.Series) # metadata = table_res.drop(columns=["vector"]).to_dict(orient="records") # self.save_vectors_to_parquet(vectors, metadata, vectors_directory) - embedding_name = None - embedding_dims = None - embedding_dist = None - tab_schema = table.schema() - - for i in range(len(tab_schema["columns"])): - if "vectorIndex" in tab_schema["columns"][i].keys(): - embedding_name = tab_schema["columns"][i]["name"] - embedding_dims = tab_schema["columns"][i]["vectorIndex"]["dims"] - embedding_dist = standardize_metric( - tab_schema["columns"][i]["vectorIndex"]["metric"], self.DB_NAME_SLUG - ) + + model = table.indexes[0]["type"]  # the first index is assumed to be the vector index; its type is recorded as the model name + embedding_name = table.indexes[0]["column"] + embedding_dims = table.indexes[0]["params"]["dims"] + embedding_dist = standardize_metric( + table.indexes[0]["params"]["metric"], self.DB_NAME_SLUG + ) namespace_meta = NamespaceMeta( namespace="", diff --git a/src/vdf_io/import_vdf/kdbai_import.py b/src/vdf_io/import_vdf/kdbai_import.py index 9e9a523..e80ee97 100644 --- a/src/vdf_io/import_vdf/kdbai_import.py +++ b/src/vdf_io/import_vdf/kdbai_import.py @@ -5,7 +5,6 @@ import kdbai_client as kdbai -from vdf_io.constants import INT_MAX from vdf_io.names import DBNames from vdf_io.import_vdf.vdf_import_cls import ImportVDB from vdf_io.meta_types import NamespaceMeta @@ -18,6 +17,33 @@ load_dotenv() +_parquettype_to_pytype = {  # maps Parquet/Arrow column type names to KDB.AI Python type names + "BOOLEAN": "bool", + "int16": "int16", + "int32": "int32", + "int64": "int64", + "FLOAT": "float32", + "list": "float32s",  # list columns are assumed to hold float32 vector embeddings + "DOUBLE": "float64", + "BYTE_ARRAY": "bytes", + "FIXED_LEN_BYTE_ARRAY": "bytes", + "string": "str", + "BINARY": "bytes", + "timestamp[ns]": "datetime64[ns]", + "TIMESTAMP_MILLIS": "datetime64[ms]", + "TIMESTAMP_MICROS": "datetime64[us]", + "DATE": "datetime64[D]", + "TIME_MILLIS": "timedelta64[ms]", + "TIME_MICROS": "timedelta64[us]", + "DECIMAL": "float64", + "UINT8": "uint8", + "UINT16": "uint16", + "UINT32": "uint32", + "UINT64": "uint64", + "INTERVAL": "timedelta64", +} + + class ImportKDBAI(ImportVDB): DB_NAME_SLUG = DBNames.KDBAI @@ -26,25 +52,28 @@ def import_vdb(cls, args): """ Import data to KDB.AI """ - set_arg_from_input( - args, - "url", - "Enter the endpoint for KDB.AI Cloud instance: ", - str, - env_var="KDBAI_ENDPOINT", - ) - set_arg_from_password( - args, - "kdbai_api_key", - "Enter your KDB.AI API key: ", - env_var_name="KDBAI_API_KEY", - ) - set_arg_from_input( - args, - "index", - "Enter the index type used (Flat, IVF, IVFPQ, HNSW): ", - str, - ) + if args.get("kdbai_endpoint") is None: + set_arg_from_input( + args, + "kdbai_endpoint", + "Enter the KDB.AI endpoint instance: ", + str, + env_var="KDBAI_ENDPOINT", + ) + + if args.get("kdbai_api_key") is None: + set_arg_from_password( + args, "kdbai_api_key", "Enter your KDB.AI API key: ", "KDBAI_API_KEY" + ) + + if args.get("index") is None: + set_arg_from_input( + args, + "index", + "Enter the index type used (Flat, IVF, IVFPQ, HNSW, QFLAT, QHNSW): ", + str, + ) + kdbai_import = ImportKDBAI(args) kdbai_import.upsert_data() return kdbai_import @@ -64,16 +93,17 @@ def make_parser(cls, subparsers): def __init__(self, args): super().__init__(args) api_key = args.get("kdbai_api_key") - endpoint = args.get("url") + endpoint = args.get("kdbai_endpoint") self.index = args.get("index") - allowed_vector_types = ["flat", "ivf", "ivfpq", "hnsw"] + allowed_vector_types = ["flat", "ivf", "ivfpq", "hnsw", "qflat", "qhnsw"] if self.index.lower() not in allowed_vector_types: raise ValueError( f"Invalid vectorIndex type: {self.index}. " f"Allowed types are {', '.join(allowed_vector_types)}" ) - self.session = kdbai.Session(api_key=api_key, endpoint=endpoint) + session = kdbai.Session(api_key=api_key, endpoint=endpoint) + self.db = session.database("default") def compliant_name(self, name: str) -> str: new_name = name.replace("-", "_") @@ -82,15 +112,11 @@ def compliant_name(self, name: str) -> str: return new_name def upsert_data(self): - self.total_imported_count = 0 - max_hit = False indexes_content: Dict[str, List[NamespaceMeta]] = self.vdf_meta["indexes"] index_names: List[str] = list(indexes_content.keys()) if len(index_names) == 0: raise ValueError("No indexes found in VDF_META.json") - # Load Parquet file - # print(indexes_content[index_names[0]]):List[NamespaceMeta] for index_name, index_meta in tqdm( indexes_content.items(), desc="Importing indexes" ): @@ -124,7 +150,7 @@ def upsert_data(self): pandas_table.to_parquet(parquet_file_path) parquet_table = pq.read_table(parquet_file_path) - # rename columns by replacing "-" with "_" + old_column_name_to_new = { col: self.compliant_name(col) for col in parquet_table.column_names @@ -142,107 +168,77 @@ def upsert_data(self): ] # Extract information from JSON - # namespace = indexes_content[index_names[0]][""][0]["namespace"] vector_column_names = [ self.compliant_name(col) for col in vector_column_names ] vector_column_name = self.compliant_name(vector_column_name) + # Define the schema - schema = { - "columns": [ - { - "name": vector_column_name, - "vectorIndex": { - "dims": namespace_meta["dimensions"], - "metric": standardize_metric_reverse( - namespace_meta.get("metric"), - self.DB_NAME_SLUG, - ), - "type": self.index.lower(), - }, - } - ] - } + schema = [] + for c in parquet_columns: + column_name = c["name"] + column_type = c["type"] - cols_to_be_dropped = [] - # Add other columns from Parquet (excluding vector columns) - for col in parquet_columns: - if col["name"] not in vector_column_names: - schema["columns"].append( - {"name": col["name"], "pytype": col["type"]} + try: + schema.append( + { + "name": column_name, + "type": _parquettype_to_pytype[column_type], + } + ) + except KeyError: + raise ValueError( + f"Cannot create the table: column '{column_name}' with type '{column_type}' has no KDB.AI type mapping. Please extend _parquettype_to_pytype."
) - elif col["name"] != vector_column_name: - cols_to_be_dropped.append(col["name"]) - for column in schema["columns"]: - if "pytype" in column and column["pytype"] == "string": - column["pytype"] = "str" - if "pytype" in column and column["pytype"] == "double": - column["pytype"] = "float64" + index = { + "name": "flat",  # the recreated index is always registered under the name "flat" + "column": vector_column_name, + "type": namespace_meta["model_name"],  # the exporter records the original index type in model_name + "params": { + "dims": namespace_meta["dimensions"], + "metric": standardize_metric_reverse( + namespace_meta.get("metric"), + self.DB_NAME_SLUG, + ), + }, + } - # First ensure the table does not already exist try: - if new_index_name in self.session.list(): - table = self.session.table(new_index_name) + if new_index_name in [t.name for t in self.db.tables]: + table = self.db.table(new_index_name) tqdm.write( f"Table '{new_index_name}' already exists. Upserting data into it." ) - # self.session.table(index_names).drop() else: - table = self.session.create_table(new_index_name, schema) + table = self.db.create_table( + new_index_name, schema=schema, indexes=[index] + ) tqdm.write("Table created") - # time.sleep(5) + except kdbai.KDBAIException as e: tqdm.write(f"Error creating table: {e}") raise RuntimeError(f"Error creating table: {e}") - # insert data - # Set the batch size - df = parquet_table.to_pandas().drop(columns=cols_to_be_dropped) - if self.total_imported_count + len(df) >= ( - self.args.get("max_num_rows") or INT_MAX - ): - max_hit = True - # Take a subset of df - df = df.iloc[ - : (self.args.get("max_num_rows") or INT_MAX) - - self.total_imported_count - ] - i = 0 - # convert pytype double to float64 - for col in df.columns: - if df[col].dtype == "double": - df[col] = df[col].astype("float64") - tqdm.write(f"Converting column {col} to float64") + df = parquet_table.to_pandas() + batch_size = self.args.get("batch_size", 10_000) or 10_000 pbar = tqdm(total=df.shape[0], desc="Inserting data") - while i < df.shape[0]: - chunk = df[i : i + batch_size].reset_index(drop=True) - # Assuming 'table' has an 'insert' method - try: - table.insert(chunk) - pbar.update(chunk.shape[0]) - i += batch_size - except kdbai.KDBAIException as e: - if "smaller batches" in str(e): - tqdm.write( - f"Reducing batch size to {batch_size * 2 // 3}" - ) - batch_size = batch_size * 2 // 3 - else: + + i = 0 + try: + while i < df.shape[0]: + chunk = df.iloc[ + i : min(i + batch_size, df.shape[0]) + ].reset_index(drop=True) + + try: + table.insert(chunk) + pbar.update(chunk.shape[0]) + i += batch_size + except kdbai.KDBAIException as e: raise RuntimeError(f"Error inserting chunk: {e}") - continue - self.total_imported_count += len(df) - if max_hit: - break - if max_hit: - break - if max_hit: - tqdm.write( - f"Max rows to be imported {self.args['max_num_rows']} hit.
Exiting" - ) - break + finally: + pbar.close() - # table.insert(df) print("Data imported successfully") - self.args["imported_count"] = self.total_imported_count diff --git a/src/vdf_io/notebooks/kdbai_end_to_end_vectorIO.ipynb b/src/vdf_io/notebooks/kdbai_end_to_end_vectorIO.ipynb index d3e63d7..0479129 100644 --- a/src/vdf_io/notebooks/kdbai_end_to_end_vectorIO.ipynb +++ b/src/vdf_io/notebooks/kdbai_end_to_end_vectorIO.ipynb @@ -8,7 +8,10 @@ "outputs": [], "source": [ "from getpass import getpass\n", - "import kdbai_client as kdbai" + "import kdbai_client as kdbai\n", + "import os\n", + "import pandas as pd\n", + "import numpy as np" ] }, { "cell_type": "markdown", "id": "2ef38294-9cce-42f1-b193-ddffae041ecd", "metadata": {}, "source": [ - "## Connect to KDB.AI cloud instance and verify table exists and has data" + "##### Option 1. KDB.AI Cloud\n", + "\n", + "To use KDB.AI Cloud, you will need two session details: a URL endpoint and an API key.\n", + "To get these you can sign up for free [here](https://trykdb.kx.com/kdbai/signup).\n", + "\n", + "You can connect to a KDB.AI Cloud session using `kdbai.Session` and passing the session URL endpoint and API key details from your KDB.AI Cloud portal.\n", + "\n", + "If the environment variables `KDBAI_ENDPOINT` and `KDBAI_API_KEY` exist on your system containing your KDB.AI Cloud portal details, these variables will automatically be used to connect.\n", + "If they do not exist, you will be prompted to enter your KDB.AI Cloud portal session URL endpoint and API key details." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "5f9392fc-87fd-42ae-bb5c-1518c2022028", "metadata": {}, + "outputs": [], + "source": [ + "# Set up KDB.AI endpoint and API key\n", + "KDBAI_ENDPOINT = (\n", + " os.environ[\"KDBAI_ENDPOINT\"]\n", + " if \"KDBAI_ENDPOINT\" in os.environ\n", + " else input(\"KDB.AI endpoint: \")\n", + ")\n", + "KDBAI_API_KEY = (\n", + " os.environ[\"KDBAI_API_KEY\"]\n", + " if \"KDBAI_API_KEY\" in os.environ\n", + " else getpass(\"KDB.AI API key: \")\n", + ")\n", + "\n", + "session = kdbai.Session(endpoint=KDBAI_ENDPOINT, api_key=KDBAI_API_KEY)" ] }, + { + "cell_type": "markdown", + "id": "e0405867", + "metadata": {}, + "source": [ + "##### Option 2. KDB.AI Server\n", + "\n", + "To use KDB.AI Server, you will need to download and run your own container.\n", + "To do this, you will first need to sign up for free [here](https://trykdb.kx.com/kdbaiserver/signup/).\n", + "\n", + "You will receive an email with the required license file and bearer token needed to download your instance.\n", + "Follow the instructions in the signup email to get your session up and running.\n", + "\n", + "Once the [setup steps](https://code.kx.com/kdbai/gettingStarted/kdb-ai-server-setup.html) are complete, you can then connect to your KDB.AI Server session using `kdbai.Session` and passing your local endpoint."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "85b3cd03", + "metadata": {}, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "KDB.AI endpoint: ········\n", - "KDB.AI API key: ········\n" + "WARNING:root:You are running a development version of `kdbai_client`.\n", + "Compatibility with the KDB.AI server is not guaranteed.\n" ] } ], "source": [ - "KDBAI_ENDPOINT = getpass(\"KDB.AI endpoint: \")\n", - "KDBAI_API_KEY = getpass(\"KDB.AI API key: \")\n", - "session = kdbai.Session(api_key=KDBAI_API_KEY, endpoint=KDBAI_ENDPOINT)" + "# session = kdbai.Session(endpoint='http://localhost:8082')" ] }, { "cell_type": "code", "execution_count": 3, - "id": "05292533-0f14-4edc-a011-924b92075abd", + "id": "3dd1bd38", "metadata": {}, "outputs": [], "source": [ - "table = session.table(\"openai_pdf\")" + "db= session.database(\"default\")" ] }, { "cell_type": "code", "execution_count": 4, + "id": "5b7672cf", + "metadata": {}, + "outputs": [], + "source": [ + "n_rows = 100\n", + "data = pd.DataFrame({\n", + " 'id': np.arange(n_rows, dtype='int16'),\n", + " 'content': [f'document{i}' for i in range(n_rows)],\n", + " 'publication_date': pd.date_range(start='2020-01-01', periods=n_rows, freq='1MIN'),\n", + " 'embeddings': [np.random.rand(25).astype('float32') for _ in range(n_rows)]\n", + "})\n", + "\n", + "\n", + "schema = [\n", + " {'name': 'id', 'type': 'int16'},\n", + " {\"name\": \"content\", \"type\": \"str\"},\n", + " {'name': 'publication_date', 'type': 'datetime64[ns]'},\n", + " {'name': 'embeddings', 'type': 'float32s'},\n", + " ]\n", + "\n", + "\n", + "INDEX_FLAT = {'name': 'flat', 'column': 'embeddings', 'type': 'flat', 'params': {'dims': 25, 'metric': 'L2'}}\n", + "\n", + "table = db.create_table(\"vectorIO\",schema=schema, indexes=[INDEX_FLAT])\n", + "\n", + "table.insert(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, "id": "2e6d02c7-db75-497d-9dcd-c7855a56c3a0", "metadata": {}, "outputs": [ @@ -77,102 +154,126 @@ " \n", " \n", " \n", - " Embeddings\n", - " Sentences\n", + " id\n", + " content\n", + " publication_date\n", + " embeddings\n", " \n", " \n", " \n", " \n", " 0\n", - " [-0.00248929625377059, 0.006371329538524151, -...\n", - " Draft version August 14, 2023\\nTypeset using L...\n", + " 0\n", + " document0\n", + " 2020-01-01 00:00:00\n", + " [0.30686182, 0.62582034, 0.116534576, 0.334927...\n", " \n", " \n", " 1\n", - " [0.006372543051838875, 0.002492946805432439, -...\n", - " Ted Mackereth\\n ,3, 4, 5, ∗and\\nJohn C. 
Forbes...\n", + " 1\n", + " document1\n", + " 2020-01-01 00:01:00\n", + " [0.42317352, 0.7345967, 0.37989786, 0.9099123,...\n", " \n", " \n", " 2\n", - " [0.0013814108679071069, 0.0014908972661942244,...\n", - " We define a novel framework: firstly to predic...\n", + " 2\n", + " document2\n", + " 2020-01-01 00:02:00\n", + " [0.7737126, 0.004722189, 0.2140698, 0.6124037,...\n", " \n", " \n", " 3\n", - " [0.014146137051284313, -0.0005565343308262527,...\n", - " We predict the spatial and compositional distr...\n", + " 3\n", + " document3\n", + " 2020-01-01 00:03:00\n", + " [0.9648716, 0.7883268, 0.80019754, 0.29287186,...\n", " \n", " \n", " 4\n", - " [0.01576417125761509, 0.004918665625154972, 0....\n", - " Selecting ISO water mass\\nfraction as an examp...\n", + " 4\n", + " document4\n", + " 2020-01-01 00:04:00\n", + " [0.1775476, 0.44076434, 0.7793666, 0.6005743, ...\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 586\n", - " [-0.00573846697807312, -0.006318436004221439, ...\n", - " 2021, ApJ, 922, 189,\\ndoi: 10.3847/1538-4357/a...\n", + " 95\n", + " 95\n", + " document95\n", + " 2020-01-01 01:35:00\n", + " [0.3033411, 0.1814489, 0.29027653, 0.60969937,...\n", " \n", " \n", - " 587\n", - " [0.0037363762967288494, -0.0014329071855172515...\n", - " 2020,\\nNature Methods, 17, 261, doi: 10.1038/s...\n", + " 96\n", + " 96\n", + " document96\n", + " 2020-01-01 01:36:00\n", + " [0.44959554, 0.03702071, 0.87624884, 0.9032671...\n", " \n", " \n", - " 588\n", - " [0.0008434861665591598, -0.01327595580369234, ...\n", - " A., Frinchaboy, P. M., et al.\n", + " 97\n", + " 97\n", + " document97\n", + " 2020-01-01 01:37:00\n", + " [0.7518059, 0.58830845, 0.07086042, 0.59891945...\n", " \n", " \n", - " 589\n", - " [-0.01480394322425127, 0.0031033621635288, -0....\n", - " 2013, AJ, 146, 81, doi: 10.1088/0004-6256/146/...\n", + " 98\n", + " 98\n", + " document98\n", + " 2020-01-01 01:38:00\n", + " [0.7576109, 0.93266773, 0.15977149, 0.05038771...\n", " \n", " \n", - " 590\n", - " [-0.017208511009812355, 0.0005549969500862062,...\n", - " 2017,\\nAJ, 154, 198, doi: 10.3847/1538-3881/aa...\n", + " 99\n", + " 99\n", + " document99\n", + " 2020-01-01 01:39:00\n", + " [0.22993591, 0.42983195, 0.5941011, 0.50349414...\n", " \n", " \n", "\n", - "
</table>\n", - "<p>591 rows × 2 columns</p>\n", - "</div>\n", + "</table>\n", + "<p>100 rows × 4 columns</p>\n", + "</div>
\n", "" ], "text/plain": [ - " Embeddings \\\n", - "0 [-0.00248929625377059, 0.006371329538524151, -... \n", - "1 [0.006372543051838875, 0.002492946805432439, -... \n", - "2 [0.0013814108679071069, 0.0014908972661942244,... \n", - "3 [0.014146137051284313, -0.0005565343308262527,... \n", - "4 [0.01576417125761509, 0.004918665625154972, 0.... \n", - ".. ... \n", - "586 [-0.00573846697807312, -0.006318436004221439, ... \n", - "587 [0.0037363762967288494, -0.0014329071855172515... \n", - "588 [0.0008434861665591598, -0.01327595580369234, ... \n", - "589 [-0.01480394322425127, 0.0031033621635288, -0.... \n", - "590 [-0.017208511009812355, 0.0005549969500862062,... \n", + " id content publication_date \\\n", + "0 0 document0 2020-01-01 00:00:00 \n", + "1 1 document1 2020-01-01 00:01:00 \n", + "2 2 document2 2020-01-01 00:02:00 \n", + "3 3 document3 2020-01-01 00:03:00 \n", + "4 4 document4 2020-01-01 00:04:00 \n", + ".. .. ... ... \n", + "95 95 document95 2020-01-01 01:35:00 \n", + "96 96 document96 2020-01-01 01:36:00 \n", + "97 97 document97 2020-01-01 01:37:00 \n", + "98 98 document98 2020-01-01 01:38:00 \n", + "99 99 document99 2020-01-01 01:39:00 \n", "\n", - " Sentences \n", - "0 Draft version August 14, 2023\\nTypeset using L... \n", - "1 Ted Mackereth\\n ,3, 4, 5, ∗and\\nJohn C. Forbes... \n", - "2 We define a novel framework: firstly to predic... \n", - "3 We predict the spatial and compositional distr... \n", - "4 Selecting ISO water mass\\nfraction as an examp... \n", - ".. ... \n", - "586 2021, ApJ, 922, 189,\\ndoi: 10.3847/1538-4357/a... \n", - "587 2020,\\nNature Methods, 17, 261, doi: 10.1038/s... \n", - "588 A., Frinchaboy, P. M., et al. \n", - "589 2013, AJ, 146, 81, doi: 10.1088/0004-6256/146/... \n", - "590 2017,\\nAJ, 154, 198, doi: 10.3847/1538-3881/aa... \n", + " embeddings \n", + "0 [0.30686182, 0.62582034, 0.116534576, 0.334927... \n", + "1 [0.42317352, 0.7345967, 0.37989786, 0.9099123,... \n", + "2 [0.7737126, 0.004722189, 0.2140698, 0.6124037,... \n", + "3 [0.9648716, 0.7883268, 0.80019754, 0.29287186,... \n", + "4 [0.1775476, 0.44076434, 0.7793666, 0.6005743, ... \n", + ".. ... \n", + "95 [0.3033411, 0.1814489, 0.29027653, 0.60969937,... \n", + "96 [0.44959554, 0.03702071, 0.87624884, 0.9032671... \n", + "97 [0.7518059, 0.58830845, 0.07086042, 0.59891945... \n", + "98 [0.7576109, 0.93266773, 0.15977149, 0.05038771... \n", + "99 [0.22993591, 0.42983195, 0.5941011, 0.50349414... \n", "\n", - "[591 rows x 2 columns]" + "[100 rows x 4 columns]" ] }, - "execution_count": 4, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -191,31 +292,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "1178a222-462c-4600-8725-53a6d907046e", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter the KDB.AI endpoint instance: https://cloud.kdb.ai/instance/pwjhnuwjez\n", - "Enter the name of table to export: openai_pdf\n", - "Enter the embedding model used: text-embedding-ada-002\n", - "Enter your KDB.AI API key: ········\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Export to disk completed. 
Exported to: vdf_20240205_101022_82f0d/\n", - "Time taken to export data: 00:00:20\n" - ] - } - ], + "outputs": [], "source": [ - "%run src/export_vdf.py kdbai" + "%run vector-io/src/vdf_io/export_vdf_cli.py kdbai" ] }, { "cell_type": "markdown", "id": "7de087de-5fb1-4826-bb52-cce0d22a0255", "metadata": {}, "source": [ - "## Drop table from KDB.AI instance and query to confirm its deleted" + "## Drop table from the KDB.AI database" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "id": "5a411ce3-5dbc-46db-bc6c-26f552efdb1c", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "table.drop()" ] }, - { - "cell_type": "code", - "execution_count": 7, - "id": "93046e19-fb05-4aa3-8618-6e72bfb1da4a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Caught an exception: Failed to process the query {'table': 'openai_pdf'} on table named: openai_pdf, because of: Table not found, table=openai_pdf.\n" - ] - } - ], - "source": [ - "try:\n", - " table.query()\n", - " time.sleep(5)\n", - "except kdbai.KDBAIException as e:\n", - " print(f\"Caught an exception: {e}\")" - ] - }, { "cell_type": "markdown", "id": "f853bdf2-4f6a-4287-ae9a-39e5d4f4e054", "metadata": {}, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "85a2ebc8-04a0-44ce-a3ce-278526146e94", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/christos/anaconda3/envs/threedotten/lib/python3.10/site-packages/pinecone/data/index.py:1: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from tqdm.autonotebook import tqdm\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Enter the endpoint for KDB.AI Cloud instance: https://cloud.kdb.ai/instance/pwjhnuwjez\n", - "Enter your KDB.AI API key: ········\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Table created\n", - "Inserted 250 out of 591 rows.\n", - "Inserted 500 out of 591 rows.\n", - "Inserted 591 out of 591 rows.\n", - "Data fully added\n", - "Time taken: 37.84 seconds\n" - ] - } - ], + "outputs": [], "source": [ - "%run src/import_vdf.py -d \"vdf_20240205_101022_82f0d\" kdbai" + "%run vector-io/src/vdf_io/import_vdf_cli.py kdbai" ] }, { "cell_type": "markdown", "id": "...", "metadata": {}, "source": [ "## Confirm table exists and data available" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "afad1b66", + "metadata": {}, + "outputs": [], + "source": [ + "table = db.table(\"vectorIO\")" + ] + }, { "cell_type": "code", "execution_count": 9, @@ -472,7 +502,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.10.15" } }, "nbformat": 4,