From 574939e27e6882b63dbf5c120fdf31017a09e545 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Thu, 31 Jul 2025 11:23:26 +0200 Subject: [PATCH 1/4] Enables privatemode ai as embedding provider --- controller.py | 6 +-- src/embedders/classification/contextual.py | 62 ++++++++++++++++++++++ src/embedders/classification/reduce.py | 3 +- src/util/embedders.py | 3 ++ submodules/model | 2 +- 5 files changed, 71 insertions(+), 5 deletions(-) diff --git a/controller.py b/controller.py index 464faf1..4b4a003 100644 --- a/controller.py +++ b/controller.py @@ -14,17 +14,17 @@ import gc import os import pandas as pd -import shutil from openai import APIConnectionError from src.embedders import Transformer, util # Embedder imports are used by eval(Embedder) in __setup_tmp_embedder -from src.embedders.classification.contextual import ( +from src.embedders.classification.contextual import ( # noqa: F401 OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, + PrivatemodeAISentenceEmbedder, ) -from src.embedders.classification.reduce import PCASentenceReducer +from src.embedders.classification.reduce import PCASentenceReducer # noqa: F401 from src.util import daemon, request_util from src.util.decorator import param_throttle from src.util.embedders import get_embedder diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index b32acd9..6766c29 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -199,3 +199,65 @@ def dump(self, project_id: str, embedding_id: str) -> None: export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" export_file.parent.mkdir(parents=True, exist_ok=True) util.write_json(self.to_json(), export_file, indent=2) + + +class PrivatemodeAISentenceEmbedder(SentenceEmbedder): + def __init__( + self, + batch_size: int = 128, + model_name: str = "intfloat/multilingual-e5-large-instruct", + ): + """ + Embeds documents using privatemode ai proxy 
via OpenAI classes. + Note that the API key is handled by the privatemode proxy, so it is not configurable here. + + Args: + batch_size (int, optional): Defines the number of conversions after which the embedder yields. Defaults to 128. + model_name (str, optional): Name of the embedding model from Privatemode AI (e.g. intfloat/multilingual-e5-large-instruct). Defaults to "intfloat/multilingual-e5-large-instruct". + + Raises: + Exception: If the Privatemode AI proxy rejects the request because the API key configured in the proxy is invalid. + + + """ + super().__init__(batch_size) + self.model_name = model_name + self.openai_client = OpenAI( + api_key="dummy", # Set in proxy + base_url="http://privatemode-proxy:8080/v1", + ) + + def _encode( + self, documents: List[Union[str, Doc]], fit_model: bool + ) -> Generator[List[List[float]], None, None]: + for documents_batch in util.batch(documents, self.batch_size): + documents_batch = [doc.replace("\n", " ") for doc in documents_batch] + try: + response = self.openai_client.embeddings.create( + input=documents_batch, model=self.model_name + ) + embeddings = [entry.embedding for entry in response.data] + yield embeddings + except AuthenticationError: + raise Exception( + "API key rejected by the Privatemode AI proxy. The key is configured in the proxy itself, not in PrivatemodeAISentenceEmbedder."
+ ) + + @staticmethod + def load(embedder: dict) -> "PrivatemodeAISentenceEmbedder": + return PrivatemodeAISentenceEmbedder( + model_name=embedder["model_name"], + batch_size=embedder["batch_size"], + ) + + def to_json(self) -> dict: + return { + "cls": "PrivatemodeAISentenceEmbedder", + "model_name": self.model_name, + "batch_size": self.batch_size, + } + + def dump(self, project_id: str, embedding_id: str) -> None: + export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" + export_file.parent.mkdir(parents=True, exist_ok=True) + util.write_json(self.to_json(), export_file, indent=2) diff --git a/src/embedders/classification/reduce.py b/src/embedders/classification/reduce.py index c44265c..da40271 100644 --- a/src/embedders/classification/reduce.py +++ b/src/embedders/classification/reduce.py @@ -5,9 +5,10 @@ from src.embedders import PCAReducer, util # Embedder imports are used by eval(Embedder) in load methods -from src.embedders.classification.contextual import ( +from src.embedders.classification.contextual import ( # noqa: F401 OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, + PrivatemodeAISentenceEmbedder, ) diff --git a/src/util/embedders.py b/src/util/embedders.py index 81d1c40..8ce608f 100644 --- a/src/util/embedders.py +++ b/src/util/embedders.py @@ -2,6 +2,7 @@ from src.embedders.classification.contextual import ( OpenAISentenceEmbedder, HuggingFaceSentenceEmbedder, + PrivatemodeAISentenceEmbedder, ) from src.embedders.extraction.contextual import TransformerTokenEmbedder from src.embedders.classification.reduce import PCASentenceReducer @@ -42,6 +43,8 @@ def get_embedder( embedder = HuggingFaceSentenceEmbedder( config_string=model, batch_size=batch_size ) + elif platform == enums.EmbeddingPlatform.PRIVATEMODE_AI.value: + embedder = PrivatemodeAISentenceEmbedder(batch_size=batch_size) else: raise Exception(f"Unknown platform {platform}") diff --git a/submodules/model b/submodules/model index b41145a..775d27f 160000 --- 
a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit b41145ac4d0284b68c65b88baff034123f5403a5 +Subproject commit 775d27f52f9a8ff0665a026692fc99e959d6aeda From 92ba716729e7b3f21c5873100a4a76ded555d329 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Mon, 4 Aug 2025 14:58:04 +0200 Subject: [PATCH 2/4] PR comment --- src/embedders/classification/contextual.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index 6766c29..a0e91ad 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -8,6 +8,10 @@ from openai import OpenAI, AzureOpenAI from openai import AuthenticationError, RateLimitError import time +import os + + +PRIVATEMODE_AI_URL = os.getenv("PRIVATEMODE_AI_URL", "http://privatemode-proxy:8080/v1") class TransformerSentenceEmbedder(SentenceEmbedder): @@ -224,7 +228,7 @@ def __init__( self.model_name = model_name self.openai_client = OpenAI( api_key="dummy", # Set in proxy - base_url="http://privatemode-proxy:8080/v1", + base_url=PRIVATEMODE_AI_URL, ) def _encode( From 9b72ab919e47ba657b14dbb7faa6d93af3406778 Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Mon, 4 Aug 2025 16:19:35 +0200 Subject: [PATCH 3/4] Adds auto trim size for privatemode ai --- src/embedders/classification/contextual.py | 15 ++++++++++++++- src/embedders/util.py | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/embedders/classification/contextual.py b/src/embedders/classification/contextual.py index a0e91ad..230cdad 100644 --- a/src/embedders/classification/contextual.py +++ b/src/embedders/classification/contextual.py @@ -9,6 +9,7 @@ from openai import AuthenticationError, RateLimitError import time import os +from transformers import AutoTokenizer PRIVATEMODE_AI_URL = os.getenv("PRIVATEMODE_AI_URL", "http://privatemode-proxy:8080/v1") @@ -206,6 +207,7 @@ def dump(self, project_id: str, 
embedding_id: str) -> None: class PrivatemodeAISentenceEmbedder(SentenceEmbedder): + def __init__( self, batch_size: int = 128, @@ -230,12 +232,14 @@ def __init__( api_key="dummy", # Set in proxy base_url=PRIVATEMODE_AI_URL, ) + # for trimming the length of the text if > 512 tokens + self._auto_tokenizer = AutoTokenizer.from_pretrained(self.model_name) def _encode( self, documents: List[Union[str, Doc]], fit_model: bool ) -> Generator[List[List[float]], None, None]: for documents_batch in util.batch(documents, self.batch_size): - documents_batch = [doc.replace("\n", " ") for doc in documents_batch] + documents_batch = [self._trim_length(doc.replace("\n", " ")) for doc in documents_batch] try: response = self.openai_client.embeddings.create( input=documents_batch, model=self.model_name @@ -265,3 +269,12 @@ def dump(self, project_id: str, embedding_id: str) -> None: export_file = util.INFERENCE_DIR / project_id / f"embedder-{embedding_id}.json" export_file.parent.mkdir(parents=True, exist_ok=True) util.write_json(self.to_json(), export_file, indent=2) + + def _trim_length(self, text: str, max_length: int=512) -> str: + tokens = self._auto_tokenizer( + text, + truncation=True, + max_length=max_length, + return_tensors=None # No tensors needed for just truncating + ) + return self._auto_tokenizer.decode(tokens["input_ids"], skip_special_tokens=True) diff --git a/src/embedders/util.py b/src/embedders/util.py index 2ddecb5..b0021ff 100644 --- a/src/embedders/util.py +++ b/src/embedders/util.py @@ -35,3 +35,4 @@ def read_json(file_path: str) -> dict[str, Any]: def write_json(obj: Any, file_path: str, **kwargs) -> None: with open(file_path, "w") as f: json.dump(obj, f, **kwargs) + From f9498ff1cecc94e379ce262d2a0301e65756ed4c Mon Sep 17 00:00:00 2001 From: JWittmeyer Date: Thu, 7 Aug 2025 15:07:42 +0200 Subject: [PATCH 4/4] Submodule merge --- submodules/model | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/submodules/model b/submodules/model index 
775d27f..a169a62 160000 --- a/submodules/model +++ b/submodules/model @@ -1 +1 @@ -Subproject commit 775d27f52f9a8ff0665a026692fc99e959d6aeda +Subproject commit a169a6243f5e9285044bcf7f1fbdb26a7395b257