Sales Support Model (SSM) for the LangChain project.
See: https://python.langchain.com/docs/modules/model_io/llms/llm_caching
     https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf
     https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
"""

import glob
import os
import textwrap
from typing import List  # ClassVar

# pinecone integration
import pinecone
from langchain.cache import InMemoryCache
from langchain.embeddings import OpenAIEmbeddings
from langchain.globals import set_llm_cache
from langchain.llms.openai import OpenAI
from langchain.prompts import PromptTemplate
from langchain.retrievers import PineconeHybridSearchRetriever
from langchain.schema import HumanMessage, SystemMessage
from langchain.text_splitter import Document
from langchain.vectorstores.pinecone import Pinecone
from pinecone_text.sparse import BM25Encoder

# this project
from models.const import Credentials
# ...
set_llm_cache(InMemoryCache())


class TextSplitter:
    """
    Custom text splitter that adds metadata to the Document object,
    which is required by PineconeHybridSearchRetriever.
    """

    # ...

    def create_documents(self, texts):
        """Create documents."""
        documents = []
        for text in texts:
            # Create a Document object with the text and metadata
            document = Document(page_content=text, metadata={"context": text})
            documents.append(document)
        return documents

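# Illustration (a minimal sketch; the input text is hypothetical): create_documents()
# keeps the full chunk text in metadata["context"], which is what
# PineconeHybridSearchRetriever reads back when it assembles results. For example:
#
#   splitter = TextSplitter()
#   docs = splitter.create_documents(["Acme offers 24/7 premium support."])
#   # docs[0].page_content == "Acme offers 24/7 premium support."
#   # docs[0].metadata == {"context": "Acme offers 24/7 premium support."}
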
class SalesSupportModel:
    """Sales Support Model (SSM)."""

    # ...
    )

    # embeddings
    openai_embeddings = OpenAIEmbeddings(
        api_key=Credentials.OPENAI_API_KEY, organization=Credentials.OPENAI_API_ORGANIZATION
    )
    pinecone_index = pinecone.Index(index_name=Credentials.PINECONE_INDEX_NAME)
    vector_store = Pinecone(index=pinecone_index, embedding=openai_embeddings, text_key="lc_id")

    text_splitter = TextSplitter()
    bm25_encoder = BM25Encoder().default()

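    # A note on the hybrid setup above (a conceptual sketch, with alpha as a
    # hypothetical weighting constant): the dense OpenAI embeddings capture semantic
    # similarity, the BM25 sparse vectors capture exact keyword overlap, and the
    # index's "dotproduct" metric lets Pinecone score both in a single query:
    #
    #   hybrid_score = alpha * dot(dense_query, dense_doc)
    #                  + (1 - alpha) * dot(sparse_query, sparse_doc)
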
    def cached_chat_request(self, system_message: str, human_message: str) -> SystemMessage:
        """Cached chat request."""
        # ...

    def prompt_with_template(self, prompt: PromptTemplate, concept: str, model: str):
        # ...
        retval = llm(prompt.format(concept=concept))
        return retval

    def split_text(self, text: str) -> List[Document]:
        """Split text. Leaving this here for now, since it exposes the return type."""
        retval = self.text_splitter.create_documents([text])
        return retval

    def fit_tf_idf_values(self, corpus: List[str]):
        """Fit TF-IDF values.
        1. Fit the BM25 encoder on the corpus
        2. Encode the corpus
        3. Store the encoded corpus in Pinecone
        """
        # fit tf-idf values on the corpus passed in by the caller
        self.bm25_encoder.fit(corpus)

        # persist the values to a json file, then reload the fitted encoder from it
        self.bm25_encoder.dump("bm25_values.json")
        self.bm25_encoder = BM25Encoder().load("bm25_values.json")

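    # Standalone sketch (assumes only the pinecone_text package; the corpus strings
    # are hypothetical): BM25Encoder turns text into the sparse
    # {"indices": [...], "values": [...]} representation that Pinecone stores
    # alongside the dense embeddings for hybrid search.
    #
    #   encoder = BM25Encoder().default()
    #   encoder.fit(["premium support plan", "standard support plan"])
    #   sparse_doc = encoder.encode_documents(["premium support plan"])[0]
    #   sparse_query = encoder.encode_queries("premium support")
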
    def load(self, filepath: str):
        """
        Embed PDF.
        1. Load PDF document text data
        2. Split into pages
        3. Embed each page
        4. Store in Pinecone

        Note: it's important to make sure that the "context" field that holds the document
        text in the metadata is not indexed. Currently you need to explicitly specify the
        fields you do want indexed. For more information, check out
        https://docs.pinecone.io/docs/manage-indexes#selective-metadata-indexing
        """
        try:
            print("Deleting index...")
            pinecone.delete_index(Credentials.PINECONE_INDEX_NAME)
        except pinecone.exceptions.PineconeException:
            print("Index does not exist. Continuing...")

        metadata_config = {
            "indexed": ["lc_id", "lc_type"],
            "context": ["lc_text"],
        }
        print("Creating index. This may take a few minutes...")
        pinecone.create_index(
            Credentials.PINECONE_INDEX_NAME, dimension=1536, metric="dotproduct", metadata_config=metadata_config
        )

        pdf_files = glob.glob(os.path.join(filepath, "*.pdf"))
        i = 0
        # ...
            for doc in docs:
                k += 1
                print(k * "-", end="\r")
                documents = self.text_splitter.create_documents([doc.page_content])
                document_texts = [doc.page_content for doc in documents]
                embeddings = self.openai_embeddings.embed_documents(document_texts)
                self.vector_store.add_documents(documents=documents, embeddings=embeddings)

        print("Finished loading PDFs")

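    # Optional sanity check (a hypothetical follow-up using standard pinecone client
    # calls): after load() finishes you can confirm the index exists and see how many
    # vectors it holds.
    #
    #   pinecone.describe_index(Credentials.PINECONE_INDEX_NAME)
    #   pinecone.Index(Credentials.PINECONE_INDEX_NAME).describe_index_stats()
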
    def rag(self, prompt: str):
        """
        Retrieval Augmented Generation (RAG).
        1. Retrieve: Given a user input, relevant splits are retrieved
           from storage using a Retriever.
        2. Generate: A ChatModel / LLM produces an answer using a prompt that includes
           the question and the retrieved data

        To prompt OpenAI's GPT-3 model to consider the embeddings from the Pinecone
        vector database, you would typically need to convert the embeddings back
        into a format that GPT-3 can understand, such as text. However, GPT-3 does
        not natively support direct input of embeddings.

        The typical workflow is to use the embeddings to retrieve relevant documents,
        and then use the text of these documents as part of the prompt for GPT-3.
        """
        retriever = PineconeHybridSearchRetriever(
            embeddings=self.openai_embeddings, sparse_encoder=self.bm25_encoder, index=self.pinecone_index
        )
        documents = retriever.get_relevant_documents(query=prompt)
        print(f"Retrieved {len(documents)} related documents from Pinecone")

        # Extract the text from the documents
        document_texts = [doc.page_content for doc in documents]
        leader = textwrap.dedent(
            """\
            You can assume that the following is true,
            and you should attempt to incorporate these facts
            in your response:
            """
        )

        # Create a prompt that includes the document texts
        prompt_with_relevant_documents = f"{prompt + leader} {'. '.join(document_texts)}"

        print(f"Prompt contains {len(prompt_with_relevant_documents.split())} words")
        print("Prompt:", prompt_with_relevant_documents)

        # Get a response from the GPT-3.5-turbo model
        response = self.cached_chat_request(
            system_message="You are a helpful assistant.", human_message=prompt_with_relevant_documents
        )

        print("Response:")
        print("------------------------------------------------------")
        return response
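

# Minimal usage sketch (assumptions: models.const.Credentials supplies valid OpenAI
# and Pinecone keys, pinecone.init() runs in the elided module setup, and ./data is a
# hypothetical directory containing one or more PDF files).
if __name__ == "__main__":
    ssm = SalesSupportModel()
    ssm.load(filepath="./data")
    answer = ssm.rag(prompt="Which of our support plans includes 24/7 phone support?")
    print(answer)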