11# -*- coding: utf-8 -*-
2- # pylint: disable=too-few-public-methods
32"""
43Hybrid Search Retriever. A class that combines the following:
54 - OpenAI prompting and ChatModel
1615 https://python.langchain.com/docs/integrations/retrievers/pinecone_hybrid_search
1716"""
1817
19- # document loading
20- import glob
21-
2218# general purpose imports
2319import logging
24- import os
2520import textwrap
2621from typing import Union
2722
2823# pinecone integration
29- import pinecone
3024from langchain .cache import InMemoryCache
3125from langchain .chat_models import ChatOpenAI
32- from langchain .document_loaders import PyPDFLoader
3326
3427# embedding
35- from langchain .embeddings import OpenAIEmbeddings
3628from langchain .globals import set_llm_cache
3729
3830# prompting and chat
4234# hybrid search capability
4335from langchain .retrievers import PineconeHybridSearchRetriever
4436from langchain .schema import BaseMessage , HumanMessage , SystemMessage
45- from langchain .text_splitter import Document
46- from langchain .vectorstores .pinecone import Pinecone
4737from pinecone_text .sparse import BM25Encoder
4838
4939# this project
5040from models .const import Config , Credentials
41+ from models .pinecone import PineconeIndex
5142
5243
5344###############################################################################
5647logging .basicConfig (level = logging .DEBUG if Config .DEBUG_MODE else logging .INFO )
5748
5849
59- class TextSplitter :
60- """
61- Custom text splitter that adds metadata to the Document object
62- which is required by PineconeHybridSearchRetriever.
63- """
64-
65- def create_documents (self , texts ):
66- """Create documents"""
67- documents = []
68- for text in texts :
69- # Create a Document object with the text and metadata
70- document = Document (page_content = text , metadata = {"context" : text })
71- documents .append (document )
72- return documents
73-
74-
7550class HybridSearchRetriever :
7651 """Hybrid Search Retriever"""
7752
7853 _chat : ChatOpenAI = None
79- _openai_embeddings : OpenAIEmbeddings = None
80- _pinecone_index : pinecone .Index = None
81- _vector_store : Pinecone = None
82- _text_splitter : TextSplitter = None
8354 _b25_encoder : BM25Encoder = None
55+ _pinecone : PineconeIndex = None
56+ _retriever : PineconeHybridSearchRetriever = None
8457
def __init__(self):
    """
    Constructor.

    Builds an ``InMemoryCache`` and registers it through langchain's
    ``set_llm_cache``. Every other collaborator (chat model, Pinecone
    index, encoder, retriever) is created lazily by its own property.
    """
    llm_cache = InMemoryCache()
    set_llm_cache(llm_cache)
8961
@property
def pinecone(self) -> PineconeIndex:
    """
    PineconeIndex lazy read-only property.

    The first read constructs a ``PineconeIndex`` and stores it on
    ``self._pinecone``; every later read returns that same object.
    """
    if self._pinecone is not None:
        return self._pinecone
    self._pinecone = PineconeIndex()
    return self._pinecone
68+
9069 # prompting wrapper
9170 @property
9271 def chat (self ) -> ChatOpenAI :
@@ -102,48 +81,22 @@ def chat(self) -> ChatOpenAI:
10281 )
10382 return self ._chat
10483
105- # embeddings
106- @property
107- def openai_embeddings (self ) -> OpenAIEmbeddings :
108- """OpenAIEmbeddings lazy read-only property."""
109- if self ._openai_embeddings is None :
110- self ._openai_embeddings = OpenAIEmbeddings (
111- api_key = Credentials .OPENAI_API_KEY , organization = Credentials .OPENAI_API_ORGANIZATION
112- )
113- return self ._openai_embeddings
114-
115- @property
116- def pinecone_index (self ) -> pinecone .Index :
117- """pinecone.Index lazy read-only property."""
118- if self ._pinecone_index is None :
119- self ._pinecone_index = pinecone .Index (index_name = Config .PINECONE_INDEX_NAME )
120- return self ._pinecone_index
121-
122- @property
123- def vector_store (self ) -> Pinecone :
124- """Pinecone lazy read-only property."""
125- if self ._vector_store is None :
126- self ._vector_store = Pinecone (
127- index = self .pinecone_index ,
128- embedding = self .openai_embeddings ,
129- text_key = Config .PINECONE_VECTORSTORE_TEXT_KEY ,
130- )
131- return self ._vector_store
132-
133- @property
134- def text_splitter (self ) -> TextSplitter :
135- """TextSplitter lazy read-only property."""
136- if self ._text_splitter is None :
137- self ._text_splitter = TextSplitter ()
138- return self ._text_splitter
139-
@property
def bm25_encoder(self) -> BM25Encoder:
    """
    BM25Encoder lazy read-only property.

    The encoder is obtained from ``BM25Encoder().default()`` on first
    access and kept on ``self._b25_encoder`` for reuse.
    """
    encoder = self._b25_encoder
    if encoder is None:
        encoder = BM25Encoder().default()
        self._b25_encoder = encoder
    return encoder
14690
@property
def retriever(self) -> PineconeHybridSearchRetriever:
    """
    PineconeHybridSearchRetriever lazy read-only property.

    On first access, wires together the embeddings and index exposed by
    ``self.pinecone`` with ``self.bm25_encoder`` as the sparse encoder,
    then stores the result on ``self._retriever`` for later reads.
    """
    if self._retriever is None:
        pinecone_index = self.pinecone
        self._retriever = PineconeHybridSearchRetriever(
            embeddings=pinecone_index.openai_embeddings,
            sparse_encoder=self.bm25_encoder,
            index=pinecone_index.index,
        )
    return self._retriever
99+
147100 def cached_chat_request (
148101 self , system_message : Union [str , SystemMessage ], human_message : Union [str , HumanMessage ]
149102 ) -> BaseMessage :
@@ -169,54 +122,8 @@ def prompt_with_template(
169122 return retval
170123
def load(self, filepath: str):
    """
    Pdf loader.

    Hands ``filepath`` to ``pdf_loader`` on the object returned by
    ``self.pinecone``; nothing is returned to the caller.
    """
    pinecone_index = self.pinecone
    pinecone_index.pdf_loader(filepath=filepath)
220127
221128 def rag (self , human_message : Union [str , HumanMessage ]):
222129 """
@@ -241,10 +148,8 @@ def rag(self, human_message: Union[str, HumanMessage]):
241148 # ---------------------------------------------------------------------
242149 # 1.) Retrieve relevant documents from Pinecone vector database
243150 # ---------------------------------------------------------------------
244- retriever = PineconeHybridSearchRetriever (
245- embeddings = self .openai_embeddings , sparse_encoder = self .bm25_encoder , index = self .pinecone_index
246- )
247- documents = retriever .get_relevant_documents (query = human_message .content )
151+ # documents = self.retriever.get_relevant_documents(query=human_message.content)
152+ documents = self .pinecone .vector_store .similarity_search (query = human_message .content )
248153
249154 # Extract the text from the documents
250155 document_texts = [doc .page_content for doc in documents ]
@@ -261,14 +166,15 @@ def rag(self, human_message: Union[str, HumanMessage]):
261166 # finished with hybrid search setup
262167 # ---------------------------------------------------------------------
263168
264- # 2.) get a response from the chat model
265- response = self .cached_chat_request (system_message = system_message , human_message = human_message )
266-
267169 logging .debug ("------------------------------------------------------" )
268170 logging .debug ("rag() Retrieval Augmented Generation prompt" )
269171 logging .debug ("Diagnostic information:" )
270172 logging .debug (" Retrieved %i related documents from Pinecone" , len (documents ))
271173 logging .debug (" System messages contains %i words" , len (system_message .content .split ()))
272174 logging .debug (" Prompt: %s" , system_message .content )
273175 logging .debug ("------------------------------------------------------" )
176+
177+ # 2.) get a response from the chat model
178+ response = self .cached_chat_request (system_message = system_message , human_message = human_message )
179+
274180 return response .content
0 commit comments