diff --git a/controller.py b/controller.py
index 464faf1..4b4a003 100644
--- a/controller.py
+++ b/controller.py
@@ -14,17 +14,17 @@
 import gc
 import os
 import pandas as pd
-import shutil
 from openai import APIConnectionError
 
 from src.embedders import Transformer, util
 
 # Embedder imports are used by eval(Embedder) in __setup_tmp_embedder
-from src.embedders.classification.contextual import (
+from src.embedders.classification.contextual import (  # noqa: F401
     OpenAISentenceEmbedder,
     HuggingFaceSentenceEmbedder,
+    PrivatemodeAISentenceEmbedder,
 )
-from src.embedders.classification.reduce import PCASentenceReducer
+from src.embedders.classification.reduce import PCASentenceReducer  # noqa: F401
 from src.util import daemon, request_util
 from src.util.decorator import param_throttle
 from src.util.embedders import get_embedder
diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py
index b32acd9..230cdad 100644
--- a/src/embedders/classification/contextual.py
+++ b/src/embedders/classification/contextual.py
@@ -8,6 +8,11 @@
 from openai import OpenAI, AzureOpenAI
 from openai import AuthenticationError, RateLimitError
 import time
+import os
+from transformers import AutoTokenizer
+
+
+PRIVATEMODE_AI_URL = os.getenv("PRIVATEMODE_AI_URL", "http://privatemode-proxy:8080/v1")
 
 
 class TransformerSentenceEmbedder(SentenceEmbedder):
@@ -199,3 +204,77 @@ def dump(self, project_id: str, embedding_id: str) -> None:
         export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
         export_file.parent.mkdir(parents=True, exist_ok=True)
         util.write_json(self.to_json(), export_file, indent=2)
+
+
+class PrivatemodeAISentenceEmbedder(SentenceEmbedder):
+
+    def __init__(
+        self,
+        batch_size: int = 128,
+        model_name: str = "intfloat/multilingual-e5-large-instruct",
+    ):
+        """
+        Embeds documents using the Privatemode AI proxy via the OpenAI client classes.
+        Note that the API key is a placeholder ("dummy"), since authentication is handled by the proxy, and the model is currently not user-configurable.
+
+        Args:
+            batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128.
+            model_name (str, optional): Name of the embedding model from Privatemode AI (e.g. intfloat/multilingual-e5-large-instruct). Defaults to "intfloat/multilingual-e5-large-instruct".
+
+        Raises:
+            Exception: If authentication against the Privatemode AI proxy fails.
+        """
+        super().__init__(batch_size)
+        self.model_name = model_name
+        self.openai_client = OpenAI(
+            api_key="dummy",  # The real key is configured in the proxy
+            base_url=PRIVATEMODE_AI_URL,
+        )
+        # For trimming texts that exceed the model's 512-token limit
+        self._auto_tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+
+    def _encode(
+        self, documents: List[Union[str, Doc]], fit_model: bool
+    ) -> Generator[List[List[float]], None, None]:
+        for documents_batch in util.batch(documents, self.batch_size):
+            documents_batch = [
+                self._trim_length(doc.replace("\n", " ")) for doc in documents_batch
+            ]
+            try:
+                response = self.openai_client.embeddings.create(
+                    input=documents_batch, model=self.model_name
+                )
+                embeddings = [entry.embedding for entry in response.data]
+                yield embeddings
+            except AuthenticationError:
+                raise Exception(
+                    "Authentication against the Privatemode AI proxy failed. The API key is set in the proxy, not in PrivatemodeAISentenceEmbedder; check the proxy configuration."
+                )
+
+    @staticmethod
+    def load(embedder: dict) -> "PrivatemodeAISentenceEmbedder":
+        return PrivatemodeAISentenceEmbedder(
+            model_name=embedder["model_name"],
+            batch_size=embedder["batch_size"],
+        )
+
+    def to_json(self) -> dict:
+        return {
+            "cls": "PrivatemodeAISentenceEmbedder",
+            "model_name": self.model_name,
+            "batch_size": self.batch_size,
+        }
+
+    def dump(self, project_id: str, embedding_id: str) -> None:
+        export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json"
+        export_file.parent.mkdir(parents=True, exist_ok=True)
+        util.write_json(self.to_json(), export_file, indent=2)
+
+    def _trim_length(self, text: str, max_length: int = 512) -> str:
+        tokens = self._auto_tokenizer(
+            text,
+            truncation=True,
+            max_length=max_length,
+            return_tensors=None,  # No tensors needed for just truncating
+        )
+        return self._auto_tokenizer.decode(tokens["input_ids"], skip_special_tokens=True)
diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py
index c44265c..da40271 100644
--- a/src/embedders/classification/reduce.py
+++ b/src/embedders/classification/reduce.py
@@ -5,9 +5,10 @@
 from src.embedders import PCAReducer, util
 
 # Embedder imports are used by eval(Embedder) in load methods
-from src.embedders.classification.contextual import (
+from src.embedders.classification.contextual import (  # noqa: F401
     OpenAISentenceEmbedder,
     HuggingFaceSentenceEmbedder,
+    PrivatemodeAISentenceEmbedder,
 )
 
 
diff --git a/src/embedders/util.py b/src/embedders/util.py
index 2ddecb5..b0021ff 100644
--- a/src/embedders/util.py
+++ b/src/embedders/util.py
@@ -35,3 +35,4 @@ def read_json(file_path: str) -> dict[str, Any]:
 def write_json(obj: Any, file_path: str, **kwargs) -> None:
     with open(file_path, "w") as f:
         json.dump(obj, f, **kwargs)
+
diff --git a/src/util/embedders.py b/src/util/embedders.py
index 81d1c40..8ce608f 100644
--- a/src/util/embedders.py
+++ b/src/util/embedders.py
@@ -2,6 +2,7 @@
 from src.embedders.classification.contextual import (
     OpenAISentenceEmbedder,
     HuggingFaceSentenceEmbedder,
+    PrivatemodeAISentenceEmbedder,
 )
 from src.embedders.extraction.contextual import TransformerTokenEmbedder
 from src.embedders.classification.reduce import PCASentenceReducer
@@ -42,6 +43,8 @@ def get_embedder(
         embedder = HuggingFaceSentenceEmbedder(
             config_string=model, batch_size=batch_size
         )
+    elif platform == enums.EmbeddingPlatform.PRIVATEMODE_AI.value:
+        embedder = PrivatemodeAISentenceEmbedder(batch_size=batch_size)
     else:
         raise Exception(f"Unknown platform {platform}")
 
diff --git a/submodules/model b/submodules/model
index b41145a..a169a62 160000
--- a/submodules/model
+++ b/submodules/model
@@ -1 +1 @@
-Subproject commit b41145ac4d0284b68c65b88baff034123f5403a5
+Subproject commit a169a6243f5e9285044bcf7f1fbdb26a7395b257
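Reviewer note: below is a minimal smoke-test sketch for the new embedder, not part of the diff. It assumes a running privatemode-proxy reachable at PRIVATEMODE_AI_URL (the AutoTokenizer download also needs network access), and it drives the private _encode generator directly, since the public transform/fit_transform entry points live in the SentenceEmbedder base class, which this diff does not touch. The documents and batch size are illustrative.

# Hypothetical smoke test; assumes the proxy and the Hugging Face hub are reachable.
from src.embedders.classification.contextual import PrivatemodeAISentenceEmbedder

embedder = PrivatemodeAISentenceEmbedder(batch_size=2)
docs = ["first example sentence", "second example sentence", "third one"]

# _encode yields one list of embedding vectors per batch (here: 2 docs, then 1).
for batch in embedder._encode(docs, fit_model=False):
    print(len(batch), len(batch[0]))  # documents in batch, embedding dimension

# Round-trip the to_json/load pair added in this diff.
restored = PrivatemodeAISentenceEmbedder.load(embedder.to_json())
assert restored.model_name == embedder.model_name
assert restored.batch_size == embedder.batch_size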