263 changes: 235 additions & 28 deletions python-recipes/RAG/03_llamaindex.ipynb
@@ -33,30 +33,15 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "UQezgPCG1vml",
"outputId": "97b9bc03-da1b-439a-c37b-be6fdb58ab21"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Cloning into 'temp_repo'...\n",
"remote: Enumerating objects: 138, done.\u001b[K\n",
"remote: Counting objects: 100% (138/138), done.\u001b[K\n",
"remote: Compressing objects: 100% (98/98), done.\u001b[K\n",
"remote: Total 138 (delta 68), reused 91 (delta 35), pack-reused 0\u001b[K\n",
"Receiving objects: 100% (138/138), 7.19 MiB | 4.45 MiB/s, done.\n",
"Resolving deltas: 100% (68/68), done.\n",
"mv: rename temp_repo/resources to ./resources: Directory not empty\n"
]
}
],
"outputs": [],
"source": [
"# NBVAL_SKIP\n",
"!git clone https://github.com/redis-developer/redis-ai-resources.git temp_repo\n",
@@ -200,7 +185,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Sample doc Doc ID: 67e07154-6ea0-4822-8957-ac1d212fc9ee\n",
"Sample doc Doc ID: c013353e-dae7-4d17-befd-9e784c8acf79\n",
"Text: UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington,\n",
"D.C. 20549 FORM 10-K (Mark One) ☒ ANNUAL REPORT PURSUANT T O SECTION\n",
"13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year\n",
Expand Down Expand Up @@ -245,13 +230,13 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core import StorageContext\n",
"\n",
"vector_store = RedisVectorStore(redis_url=REDIS_URL, index_name=\"llama\", overwrite=True)\n",
"vector_store = RedisVectorStore(redis_url=REDIS_URL, overwrite=True)\n",
"\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
"\n",
@@ -267,7 +252,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -285,14 +270,14 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID: b561dd17-5545-4d3a-bc4f-18cb39c7c01e\n",
"Node ID: d2e6cd9c-0716-49d8-8563-407a00d05445\n",
"Text: Table of Contents FISCAL 2023 NIKE BRAND REVENUE HIGHLIGHTS The\n",
"following tables present NIKE Brand revenues disaggregated by\n",
"reportable operating segment, distribution channel and major product\n",
@@ -301,14 +286,14 @@
"fiscal 2022 on...\n",
"Score: 0.900\n",
"\n",
"Node ID: 0415f059-9258-426b-8b21-34b287b3c21b\n",
"Node ID: 28542d3b-b345-4e9e-b675-f62361ec85d9\n",
"Text: Table of Contents NORTH AMERICA (Dollars in millions) FISCAL\n",
"2023FISCAL 2022 % CHANGE% CHANGE EXCLUDING CURRENCY CHANGESFISCAL 2021\n",
"% CHANGE% CHANGE EXCLUDING CURRENCY CHANGES Revenues by: Footwear $\n",
"14,897 $ 12,228 22 % 22 %$ 11,644 5 % 5 % Apparel 5,947 5,492 8 % 9 %\n",
"5,028 9 % 9 % Equipment 764 633 21 % 21 % 507 25 % 25 % TOTAL REVENUES\n",
"$ 21,6...\n",
"Score: 0.886\n",
"Score: 0.885\n",
"\n"
]
}
@@ -329,7 +314,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -338,7 +323,7 @@
"\"NIKE's revenue in fiscal 23 was $51.2 billion.\""
]
},
"execution_count": 13,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@@ -348,6 +333,228 @@
"response.response"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Use a custom index schema\n",
"\n",
"In most use cases, you need the ability to customize the underling index configuration\n",
"and specification. For example, this is handy in order to define specific metadata filters you wish to enable.\n",
"\n",
"With Redis, this is as simple as defining an index schema object\n",
"(from file or dict) and passing it through to the vector store client wrapper."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from redisvl.schema import IndexSchema\n",
"\n",
"\n",
"custom_schema = IndexSchema.from_dict(\n",
" {\n",
" # customize basic index specs\n",
" \"index\": {\n",
" \"name\": \"custom_index\",\n",
" \"prefix\": \"docs\",\n",
" \"key_separator\": \":\",\n",
" },\n",
" # customize fields that are indexed\n",
" \"fields\": [\n",
" # required fields for llamaindex\n",
" {\"type\": \"tag\", \"name\": \"id\"},\n",
" {\"type\": \"tag\", \"name\": \"doc_id\"},\n",
" {\"type\": \"text\", \"name\": \"text\"},\n",
" # custom metadata fields\n",
" {\"type\": \"numeric\", \"name\": \"updated_at\"},\n",
" {\"type\": \"tag\", \"name\": \"file_name\"},\n",
" # custom vector field definition for cohere embeddings\n",
" {\n",
" \"type\": \"vector\",\n",
" \"name\": \"vector\",\n",
" \"attrs\": {\n",
" \"dims\": 1536,\n",
" \"algorithm\": \"hnsw\",\n",
" \"distance_metric\": \"cosine\",\n",
" },\n",
" },\n",
" ],\n",
" }\n",
")"
]
},
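The markdown above notes that the schema can be defined "from file or dict"; only the dict path appears in the cells here. A minimal sketch of the file-based alternative, assuming a hypothetical `custom_schema.yaml` that mirrors the dict above:

```python
from redisvl.schema import IndexSchema

# Hypothetical YAML file containing the same index and field definitions as the dict above
custom_schema = IndexSchema.from_yaml("custom_schema.yaml")
```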
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"IndexInfo(name='custom_index', prefix='docs', key_separator=':', storage_type=<StorageType.HASH: 'hash'>)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"custom_schema.index"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'id': TagField(name='id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
" 'doc_id': TagField(name='doc_id', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
" 'text': TextField(name='text', type='text', path=None, attrs=TextFieldAttributes(sortable=False, weight=1, no_stem=False, withsuffixtrie=False, phonetic_matcher=None)),\n",
" 'updated_at': NumericField(name='updated_at', type='numeric', path=None, attrs=NumericFieldAttributes(sortable=False)),\n",
" 'file_name': TagField(name='file_name', type='tag', path=None, attrs=TagFieldAttributes(sortable=False, separator=',', case_sensitive=False, withsuffixtrie=False)),\n",
" 'vector': HNSWVectorField(name='vector', type='vector', path=None, attrs=HNSWVectorFieldAttributes(dims=1536, algorithm=<VectorIndexAlgorithm.HNSW: 'HNSW'>, datatype=<VectorDataType.FLOAT32: 'FLOAT32'>, distance_metric=<VectorDistanceMetric.COSINE: 'COSINE'>, initial_cap=None, m=16, ef_construction=200, ef_runtime=10, epsilon=0.01))}"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"custom_schema.fields"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# from datetime import datetime\n",
"\n",
"\n",
"# def date_to_timestamp(date_string: str) -> int:\n",
"# date_format: str = \"%Y-%m-%d\"\n",
"# return int(datetime.strptime(date_string, date_format).timestamp())\n",
"\n",
"\n",
"# # iterate through documents and add new field\n",
"# for document in docs:\n",
"# document.metadata[\"updated_at\"] = date_to_timestamp(\n",
"# document.metadata[\"last_modified_date\"]\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"vector_store = RedisVectorStore(\n",
" schema=custom_schema, # provide customized schema\n",
" redis_url=REDIS_URL,\n",
" overwrite=True,\n",
")\n",
"\n",
"storage_context = StorageContext.from_defaults(vector_store=vector_store)\n",
"\n",
"# build and load index from documents and storage context\n",
"index = VectorStoreIndex.from_documents(\n",
" docs, storage_context=storage_context\n",
")"
]
},
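A related pattern worth noting: once the index has been populated, it can be reattached later without re-ingesting the documents. A minimal sketch, assuming the `custom_index` built above still exists in Redis and `REDIS_URL` is defined as earlier in the notebook:

```python
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.redis import RedisVectorStore

# Reattach to the existing index; overwrite=False preserves the stored vectors
vector_store = RedisVectorStore(
    schema=custom_schema,
    redis_url=REDIS_URL,
    overwrite=False,
)
index = VectorStoreIndex.from_vector_store(vector_store=vector_store)
```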
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Query the vector store and filter on metadata\n",
"Now that we have additional metadata indexed in Redis, let's try some queries which add in filters. As an example, we'll do a search for chunks with the word \"audit\" from an exact file \"amzn-10k-2023.pdf\". "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from llama_index.core.vector_stores import (\n",
" MetadataFilters,\n",
" MetadataFilter,\n",
" ExactMatchFilter,\n",
")\n",
"\n",
"retriever = index.as_retriever(\n",
" similarity_top_k=3,\n",
" filters=MetadataFilters(\n",
" filters=[\n",
" ExactMatchFilter(key=\"file_name\", value=\"amzn-10k-2023.pdf\"),\n",
" MetadataFilter(\n",
" key=\"text\",\n",
" value=\"audit\",\n",
" operator=\"text_match\",\n",
" ),\n",
" ],\n",
" condition=\"and\",\n",
" ),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Node ID: cd0c5d8f-e3b1-4cbb-aa6a-5960003cdb2d\n",
"Text: Table of Contents valuation. In the ordinary course of our\n",
"business, there are many transactions and calculations for which the\n",
"ultimate tax determination is uncertain. Significant judgment is\n",
"required in evaluating and estimating our tax expense, assets, and\n",
"liabilities. We are also subject to tax controversies in various\n",
"jurisdictions that can...\n",
"Score: 0.746\n",
"\n",
"Node ID: 6745f668-4c7a-43bf-a9c3-9b04e1a497f8\n",
"Text: Table of Contents Included in other income (expense), net in\n",
"2021 and 2022 is a marketable equity securities valuation gain (loss)\n",
"of $11.8 billion and $(12.7) billion from our equity investment in\n",
"Rivian Automotive, Inc. (“Rivian”). Our investment in Rivian’s\n",
"preferred stock was accounted for at cost, with adjustments for\n",
"observable changes in ...\n",
"Score: 0.740\n",
"\n",
"Node ID: 717666fe-fea5-488b-999c-84e6d8b9a0db\n",
"Text: Exhibit 31.1 CERTIFICATIONS I, Andrew R. Jassy, certify that: 1.\n",
"I have reviewed this Form 10-K of Amazon.com, Inc.; 2. Based on my\n",
"knowledge, this report does not contain any untrue statement of a\n",
"material fact or omit to state a material fact necessary to make the\n",
"statements made, in light of the circumstances under which such\n",
"statements were ...\n",
"Score: 0.732\n",
"\n"
]
}
],
"source": [
"result_nodes = retriever.retrieve(\"What did the author learn?\")\n",
"\n",
"for node in result_nodes:\n",
" print(node)"
]
},
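The filtered retriever can also back a full RAG query engine rather than being used on its own. A minimal sketch reusing the `retriever` defined above; the example question is illustrative, not part of the notebook:

```python
from llama_index.core.query_engine import RetrieverQueryEngine

# Wrap the metadata-filtered retriever in a query engine for end-to-end Q&A
filtered_query_engine = RetrieverQueryEngine.from_args(retriever)

response = filtered_query_engine.query("What does Amazon's 10-K say about audits?")
print(response.response)
```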
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -376,7 +583,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
"version": "3.11.9"
},
"widgets": {
"application/vnd.jupyter.widget-state+json": {