From b55ca8814bf0c3549f7ec59c6d978a7f7bb52ebf Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 28 May 2025 22:41:47 +0000 Subject: [PATCH 01/77] Prepare change for multimodal, rm old vision approach stuff --- .azdo/pipelines/azure-dev.yml | 7 +- .github/workflows/azure-dev.yml | 7 +- .github/workflows/evaluate.yaml | 7 +- README.md | 6 +- app/backend/app.py | 105 +-------- .../approaches/chatreadretrieveread.py | 3 + .../approaches/chatreadretrievereadvision.py | 222 ------------------ .../prompts/ask_answer_question.prompty | 15 +- .../ask_answer_question_vision.prompty | 31 --- .../prompts/chat_answer_question.prompty | 16 +- .../chat_answer_question_vision.prompty | 48 ---- .../approaches/retrievethenreadvision.py | 181 -------------- app/backend/config.py | 5 +- app/backend/prepdocs.py | 25 +- app/backend/prepdocslib/goals.json | 7 + app/backend/prepdocslib/pdfparser.py | 3 +- azure.yaml | 7 +- infra/main.bicep | 49 +--- infra/main.parameters.json | 19 +- locustfile.py | 65 ----- 20 files changed, 71 insertions(+), 757 deletions(-) delete mode 100644 app/backend/approaches/chatreadretrievereadvision.py delete mode 100644 app/backend/approaches/prompts/ask_answer_question_vision.prompty delete mode 100644 app/backend/approaches/prompts/chat_answer_question_vision.prompty delete mode 100644 app/backend/approaches/retrievethenreadvision.py create mode 100644 app/backend/prepdocslib/goals.json diff --git a/.azdo/pipelines/azure-dev.yml b/.azdo/pipelines/azure-dev.yml index 752556f709..ed3bf3a58e 100644 --- a/.azdo/pipelines/azure-dev.yml +++ b/.azdo/pipelines/azure-dev.yml @@ -77,11 +77,6 @@ steps: AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: $(AZURE_OPENAI_EMB_DEPLOYMENT_VERSION) AZURE_OPENAI_EMB_DEPLOYMENT_SKU: $(AZURE_OPENAI_EMB_DEPLOYMENT_SKU) AZURE_OPENAI_EMB_DIMENSIONS: $(AZURE_OPENAI_EMB_DIMENSIONS) - AZURE_OPENAI_GPT4V_MODEL: $(AZURE_OPENAI_GPT4V_MODEL) - AZURE_OPENAI_GPT4V_DEPLOYMENT: $(AZURE_OPENAI_GPT4V_DEPLOYMENT) - AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY: $(AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY) - AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION: $(AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION) - AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU: $(AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU) AZURE_OPENAI_DISABLE_KEYS: $(AZURE_OPENAI_DISABLE_KEYS) OPENAI_HOST: $(OPENAI_HOST) OPENAI_API_KEY: $(OPENAI_API_KEY) @@ -91,7 +86,7 @@ steps: AZURE_APPLICATION_INSIGHTS_DASHBOARD: $(AZURE_APPLICATION_INSIGHTS_DASHBOARD) AZURE_LOG_ANALYTICS: $(AZURE_LOG_ANALYTICS) USE_VECTORS: $(USE_VECTORS) - USE_GPT4V: $(USE_GPT4V) + USE_MULTIMODAL: $(USE_MULTIMODAL) AZURE_VISION_ENDPOINT: $(AZURE_VISION_ENDPOINT) VISION_SECRET_NAME: $(VISION_SECRET_NAME) AZURE_COMPUTER_VISION_SERVICE: $(AZURE_COMPUTER_VISION_SERVICE) diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml index fa99f45a9e..a1e4847d13 100644 --- a/.github/workflows/azure-dev.yml +++ b/.github/workflows/azure-dev.yml @@ -67,11 +67,6 @@ jobs: AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_VERSION }} AZURE_OPENAI_EMB_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMB_DIMENSIONS }} - AZURE_OPENAI_GPT4V_MODEL: ${{ vars.AZURE_OPENAI_GPT4V_MODEL }} - AZURE_OPENAI_GPT4V_DEPLOYMENT: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU: ${{ 
vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU }} USE_EVAL: ${{ vars.USE_EVAL }} AZURE_OPENAI_EVAL_MODEL: ${{ vars.AZURE_OPENAI_EVAL_MODEL }} AZURE_OPENAI_EVAL_MODEL_VERSION: ${{ vars.AZURE_OPENAI_EVAL_MODEL_VERSION }} @@ -87,7 +82,7 @@ jobs: AZURE_APPLICATION_INSIGHTS_DASHBOARD: ${{ vars.AZURE_APPLICATION_INSIGHTS_DASHBOARD }} AZURE_LOG_ANALYTICS: ${{ vars.AZURE_LOG_ANALYTICS }} USE_VECTORS: ${{ vars.USE_VECTORS }} - USE_GPT4V: ${{ vars.USE_GPT4V }} + USE_MULTIMODAL: ${{ vars.USE_MULTIMODAL }} AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }} VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }} ENABLE_LANGUAGE_PICKER: ${{ vars.ENABLE_LANGUAGE_PICKER }} diff --git a/.github/workflows/evaluate.yaml b/.github/workflows/evaluate.yaml index abb5f47465..f022fd1bc9 100644 --- a/.github/workflows/evaluate.yaml +++ b/.github/workflows/evaluate.yaml @@ -62,11 +62,6 @@ jobs: AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }} AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_VERSION }} AZURE_OPENAI_EMB_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMB_DIMENSIONS }} - AZURE_OPENAI_GPT4V_MODEL: ${{ vars.AZURE_OPENAI_GPT4V_MODEL }} - AZURE_OPENAI_GPT4V_DEPLOYMENT: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION }} - AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU }} USE_EVAL: ${{ vars.USE_EVAL }} AZURE_OPENAI_EVAL_MODEL: ${{ vars.AZURE_OPENAI_EVAL_MODEL }} AZURE_OPENAI_EVAL_MODEL_VERSION: ${{ vars.AZURE_OPENAI_EVAL_MODEL_VERSION }} @@ -82,7 +77,7 @@ jobs: AZURE_APPLICATION_INSIGHTS_DASHBOARD: ${{ vars.AZURE_APPLICATION_INSIGHTS_DASHBOARD }} AZURE_LOG_ANALYTICS: ${{ vars.AZURE_LOG_ANALYTICS }} USE_VECTORS: ${{ vars.USE_VECTORS }} - USE_GPT4V: ${{ vars.USE_GPT4V }} + USE_MULTIMODAL: ${{ vars.USE_MULTIMODAL }} AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }} VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }} ENABLE_LANGUAGE_PICKER: ${{ vars.ENABLE_LANGUAGE_PICKER }} diff --git a/README.md b/README.md index 08cc88fd77..afcd07d762 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ The repo includes sample data so it's ready to try end to end. In this sample ap - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options - Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [integrated vectorization](/docs/data_ingestion.md#overview-of-integrated-vectorization) -- Optional usage of [GPT-4 with vision](/docs/gpt4v.md) to reason over image-heavy documents +- Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra - Performance tracing and monitoring with Application Insights @@ -92,7 +92,7 @@ However, you can try the [Azure pricing calculator](https://azure.com/e/e3490de2 - Azure AI Search: Basic tier, 1 replica, free level of semantic search. Pricing per hour. [Pricing](https://azure.microsoft.com/pricing/details/search/) - Azure Blob Storage: Standard tier with ZRS (Zone-redundant storage). 
Pricing per storage and read operations. [Pricing](https://azure.microsoft.com/pricing/details/storage/blobs/) - Azure Cosmos DB: Only provisioned if you enabled [chat history with Cosmos DB](docs/deploy_features.md#enabling-persistent-chat-history-with-azure-cosmos-db). Serverless tier. Pricing per request unit and storage. [Pricing](https://azure.microsoft.com/pricing/details/cosmos-db/) -- Azure AI Vision: Only provisioned if you enabled [GPT-4 with vision](docs/gpt4v.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/) +- Azure AI Vision: Only provisioned if you enabled [multimodal approach](docs/multimodal.md). Pricing per 1K transactions. [Pricing](https://azure.microsoft.com/pricing/details/cognitive-services/computer-vision/) - Azure AI Content Understanding: Only provisioned if you enabled [media description](docs/deploy_features.md#enabling-media-description-with-azure-content-understanding). Pricing per 1K images. [Pricing](https://azure.microsoft.com/pricing/details/content-understanding/) - Azure Monitor: Pay-as-you-go tier. Costs based on data ingested. [Pricing](https://azure.microsoft.com/pricing/details/monitor/) @@ -255,7 +255,7 @@ You can find extensive documentation in the [docs](docs/README.md) folder: - [Enabling optional features](docs/deploy_features.md) - [All features](docs/deploy_features.md) - [Login and access control](docs/login_and_acl.md) - - [GPT-4 Turbo with Vision](docs/gpt4v.md) + - [Multimodal](docs/multimodal.md) - [Reasoning](docs/reasoning.md) - [Private endpoints](docs/deploy_private.md) - [Sharing deployment environments](docs/sharing_environments.md) diff --git a/app/backend/app.py b/app/backend/app.py index 1b4563bb98..f7e3043fbd 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -52,25 +52,20 @@ from approaches.approach import Approach from approaches.chatreadretrieveread import ChatReadRetrieveReadApproach -from approaches.chatreadretrievereadvision import ChatReadRetrieveReadVisionApproach from approaches.promptmanager import PromptyManager from approaches.retrievethenread import RetrieveThenReadApproach -from approaches.retrievethenreadvision import RetrieveThenReadVisionApproach from chat_history.cosmosdb import chat_history_cosmosdb_bp from config import ( CONFIG_AGENT_CLIENT, CONFIG_AGENTIC_RETRIEVAL_ENABLED, CONFIG_ASK_APPROACH, - CONFIG_ASK_VISION_APPROACH, CONFIG_AUTH_CLIENT, CONFIG_BLOB_CONTAINER_CLIENT, CONFIG_CHAT_APPROACH, CONFIG_CHAT_HISTORY_BROWSER_ENABLED, CONFIG_CHAT_HISTORY_COSMOS_ENABLED, - CONFIG_CHAT_VISION_APPROACH, CONFIG_CREDENTIAL, CONFIG_DEFAULT_REASONING_EFFORT, - CONFIG_GPT4V_DEPLOYED, CONFIG_INGESTER, CONFIG_LANGUAGE_PICKER_ENABLED, CONFIG_OPENAI_CLIENT, @@ -185,15 +180,8 @@ async def ask(auth_claims: dict[str, Any]): context = request_json.get("context", {}) context["auth_claims"] = auth_claims try: - use_gpt4v = context.get("overrides", {}).get("use_gpt4v", False) - approach: Approach - if use_gpt4v and CONFIG_ASK_VISION_APPROACH in current_app.config: - approach = cast(Approach, current_app.config[CONFIG_ASK_VISION_APPROACH]) - else: - approach = cast(Approach, current_app.config[CONFIG_ASK_APPROACH]) - r = await approach.run( - request_json["messages"], context=context, session_state=request_json.get("session_state") - ) + approach: Approach = cast(Approach, current_app.config[CONFIG_ASK_APPROACH]) + r = await approach.run(request_json["messages"], context=context, session_state=request_json.get("session_state")) return jsonify(r) 
except Exception as error: return error_response(error, "/ask") @@ -224,12 +212,7 @@ async def chat(auth_claims: dict[str, Any]): context = request_json.get("context", {}) context["auth_claims"] = auth_claims try: - use_gpt4v = context.get("overrides", {}).get("use_gpt4v", False) - approach: Approach - if use_gpt4v and CONFIG_CHAT_VISION_APPROACH in current_app.config: - approach = cast(Approach, current_app.config[CONFIG_CHAT_VISION_APPROACH]) - else: - approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) + approach: Approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) # If session state is provided, persists the session state, # else creates a new session_id depending on the chat history options enabled. @@ -258,12 +241,7 @@ async def chat_stream(auth_claims: dict[str, Any]): context = request_json.get("context", {}) context["auth_claims"] = auth_claims try: - use_gpt4v = context.get("overrides", {}).get("use_gpt4v", False) - approach: Approach - if use_gpt4v and CONFIG_CHAT_VISION_APPROACH in current_app.config: - approach = cast(Approach, current_app.config[CONFIG_CHAT_VISION_APPROACH]) - else: - approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) + approach: Approach = cast(Approach, current_app.config[CONFIG_CHAT_APPROACH]) # If session state is provided, persists the session state, # else creates a new session_id depending on the chat history options enabled. @@ -297,7 +275,7 @@ def auth_setup(): def config(): return jsonify( { - "showGPT4VOptions": current_app.config[CONFIG_GPT4V_DEPLOYED], + "showMultimodalOption": current_app.config[CONFIG_MULTIMODAL_ENABLED], "showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED], "showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED], "showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED], @@ -441,8 +419,6 @@ async def setup_clients(): OPENAI_REASONING_EFFORT = os.getenv("AZURE_OPENAI_REASONING_EFFORT") # Used with Azure OpenAI deployments AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE") - AZURE_OPENAI_GPT4V_DEPLOYMENT = os.environ.get("AZURE_OPENAI_GPT4V_DEPLOYMENT") - AZURE_OPENAI_GPT4V_MODEL = os.environ.get("AZURE_OPENAI_GPT4V_MODEL") AZURE_OPENAI_CHATGPT_DEPLOYMENT = ( os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if OPENAI_HOST.startswith("azure") else None ) @@ -479,7 +455,7 @@ async def setup_clients(): AZURE_SPEECH_SERVICE_LOCATION = os.getenv("AZURE_SPEECH_SERVICE_LOCATION") AZURE_SPEECH_SERVICE_VOICE = os.getenv("AZURE_SPEECH_SERVICE_VOICE") or "en-US-AndrewMultilingualNeural" - USE_GPT4V = os.getenv("USE_GPT4V", "").lower() == "true" + USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "").lower() == "true" USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true" ENABLE_LANGUAGE_PICKER = os.getenv("ENABLE_LANGUAGE_PICKER", "").lower() == "true" USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true" @@ -575,7 +551,7 @@ async def setup_clients(): document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER", "").lower() == "true", - search_images=USE_GPT4V, + search_images=USE_MULTIMODAL, ) search_info = await setup_search_info( search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential @@ -661,7 +637,6 @@ async def setup_clients(): current_app.config[CONFIG_BLOB_CONTAINER_CLIENT] = 
blob_container_client current_app.config[CONFIG_AUTH_CLIENT] = auth_helper - current_app.config[CONFIG_GPT4V_DEPLOYED] = bool(USE_GPT4V) current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled" current_app.config[CONFIG_QUERY_REWRITING_ENABLED] = ( AZURE_SEARCH_QUERY_REWRITING == "true" and AZURE_SEARCH_SEMANTIC_RANKER != "disabled" @@ -669,8 +644,7 @@ async def setup_clients(): current_app.config[CONFIG_DEFAULT_REASONING_EFFORT] = OPENAI_REASONING_EFFORT current_app.config[CONFIG_REASONING_EFFORT_ENABLED] = OPENAI_CHATGPT_MODEL in Approach.GPT_REASONING_MODELS current_app.config[CONFIG_STREAMING_ENABLED] = ( - bool(USE_GPT4V) - or OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS + OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming ) current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false" @@ -682,6 +656,7 @@ async def setup_clients(): current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED] = USE_CHAT_HISTORY_BROWSER current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED] = USE_CHAT_HISTORY_COSMOS current_app.config[CONFIG_AGENTIC_RETRIEVAL_ENABLED] = USE_AGENTIC_RETRIEVAL + current_app.config[CONFIG_MULTIMODAL_ENABLED] = USE_MULTIMODAL prompt_manager = PromptyManager() @@ -732,68 +707,6 @@ async def setup_clients(): reasoning_effort=OPENAI_REASONING_EFFORT, ) - if USE_GPT4V: - current_app.logger.info("USE_GPT4V is true, setting up GPT4V approach") - if not AZURE_OPENAI_GPT4V_MODEL: - raise ValueError("AZURE_OPENAI_GPT4V_MODEL must be set when USE_GPT4V is true") - if any( - model in Approach.GPT_REASONING_MODELS - for model in [ - OPENAI_CHATGPT_MODEL, - AZURE_OPENAI_GPT4V_MODEL, - AZURE_OPENAI_CHATGPT_DEPLOYMENT, - AZURE_OPENAI_GPT4V_DEPLOYMENT, - ] - ): - raise ValueError( - "AZURE_OPENAI_CHATGPT_MODEL and AZURE_OPENAI_GPT4V_MODEL must not be a reasoning model when USE_GPT4V is true" - ) - - token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") - - current_app.config[CONFIG_ASK_VISION_APPROACH] = RetrieveThenReadVisionApproach( - search_client=search_client, - openai_client=openai_client, - blob_container_client=blob_container_client, - auth_helper=auth_helper, - vision_endpoint=AZURE_VISION_ENDPOINT, - vision_token_provider=token_provider, - gpt4v_deployment=AZURE_OPENAI_GPT4V_DEPLOYMENT, - gpt4v_model=AZURE_OPENAI_GPT4V_MODEL, - embedding_model=OPENAI_EMB_MODEL, - embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, - embedding_dimensions=OPENAI_EMB_DIMENSIONS, - embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING, - sourcepage_field=KB_FIELDS_SOURCEPAGE, - content_field=KB_FIELDS_CONTENT, - query_language=AZURE_SEARCH_QUERY_LANGUAGE, - query_speller=AZURE_SEARCH_QUERY_SPELLER, - prompt_manager=prompt_manager, - ) - - current_app.config[CONFIG_CHAT_VISION_APPROACH] = ChatReadRetrieveReadVisionApproach( - search_client=search_client, - openai_client=openai_client, - blob_container_client=blob_container_client, - auth_helper=auth_helper, - vision_endpoint=AZURE_VISION_ENDPOINT, - vision_token_provider=token_provider, - chatgpt_model=OPENAI_CHATGPT_MODEL, - chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT, - gpt4v_deployment=AZURE_OPENAI_GPT4V_DEPLOYMENT, - gpt4v_model=AZURE_OPENAI_GPT4V_MODEL, - embedding_model=OPENAI_EMB_MODEL, - embedding_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, - embedding_dimensions=OPENAI_EMB_DIMENSIONS, - embedding_field=AZURE_SEARCH_FIELD_NAME_EMBEDDING, 
- sourcepage_field=KB_FIELDS_SOURCEPAGE, - content_field=KB_FIELDS_CONTENT, - query_language=AZURE_SEARCH_QUERY_LANGUAGE, - query_speller=AZURE_SEARCH_QUERY_SPELLER, - prompt_manager=prompt_manager, - ) - - @bp.after_app_serving async def close_clients(): await current_app.config[CONFIG_SEARCH_CLIENT].close() diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index ed87976e3b..2e5e765c68 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -92,6 +92,7 @@ async def run_until_final_call( else: extra_info = await self.run_search_approach(messages, overrides, auth_claims) + # If there are images, send the images to the model as well messages = self.prompt_manager.render_prompt( self.answer_prompt, self.get_system_prompt_variables(overrides.get("prompt_template")) @@ -174,6 +175,8 @@ async def run_search_approach( vectors: list[VectorQuery] = [] if use_vector_search: vectors.append(await self.compute_text_embedding(query_text)) + # Optionally add image embeddings if using multimodal approach + vectors.append(await self.compute_image_embedding(query_text)) results = await self.search( top, diff --git a/app/backend/approaches/chatreadretrievereadvision.py b/app/backend/approaches/chatreadretrievereadvision.py deleted file mode 100644 index f8aaf3c37d..0000000000 --- a/app/backend/approaches/chatreadretrievereadvision.py +++ /dev/null @@ -1,222 +0,0 @@ -from collections.abc import Awaitable -from typing import Any, Callable, Optional, Union, cast - -from azure.search.documents.aio import SearchClient -from azure.storage.blob.aio import ContainerClient -from openai import AsyncOpenAI, AsyncStream -from openai.types.chat import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessageParam, - ChatCompletionToolParam, -) - -from approaches.approach import DataPoints, ExtraInfo, ThoughtStep -from approaches.chatapproach import ChatApproach -from approaches.promptmanager import PromptManager -from core.authentication import AuthenticationHelper -from core.imageshelper import fetch_image - - -class ChatReadRetrieveReadVisionApproach(ChatApproach): - """ - A multi-step approach that first uses OpenAI to turn the user's question into a search query, - then uses Azure AI Search to retrieve relevant documents, and then sends the conversation history, - original user question, and search results to OpenAI to generate a response. 
- """ - - def __init__( - self, - *, - search_client: SearchClient, - blob_container_client: ContainerClient, - openai_client: AsyncOpenAI, - auth_helper: AuthenticationHelper, - chatgpt_model: str, - chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI - gpt4v_deployment: Optional[str], # Not needed for non-Azure OpenAI - gpt4v_model: str, - embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" - embedding_model: str, - embedding_dimensions: int, - embedding_field: str, - sourcepage_field: str, - content_field: str, - query_language: str, - query_speller: str, - vision_endpoint: str, - vision_token_provider: Callable[[], Awaitable[str]], - prompt_manager: PromptManager, - ): - self.search_client = search_client - self.blob_container_client = blob_container_client - self.openai_client = openai_client - self.auth_helper = auth_helper - self.chatgpt_model = chatgpt_model - self.chatgpt_deployment = chatgpt_deployment - self.gpt4v_deployment = gpt4v_deployment - self.gpt4v_model = gpt4v_model - self.embedding_deployment = embedding_deployment - self.embedding_model = embedding_model - self.embedding_dimensions = embedding_dimensions - self.embedding_field = embedding_field - self.sourcepage_field = sourcepage_field - self.content_field = content_field - self.query_language = query_language - self.query_speller = query_speller - self.vision_endpoint = vision_endpoint - self.vision_token_provider = vision_token_provider - self.prompt_manager = prompt_manager - self.query_rewrite_prompt = self.prompt_manager.load_prompt("chat_query_rewrite.prompty") - self.query_rewrite_tools = self.prompt_manager.load_tools("chat_query_rewrite_tools.json") - self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question_vision.prompty") - # Currently disabled due to issues with rendering token usage in the UI - self.include_token_usage = False - - async def run_until_final_call( - self, - messages: list[ChatCompletionMessageParam], - overrides: dict[str, Any], - auth_claims: dict[str, Any], - should_stream: bool = False, - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: - seed = overrides.get("seed", None) - use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] - use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] - use_semantic_ranker = True if overrides.get("semantic_ranker") else False - use_query_rewriting = True if overrides.get("query_rewriting") else False - use_semantic_captions = True if overrides.get("semantic_captions") else False - top = overrides.get("top", 3) - minimum_search_score = overrides.get("minimum_search_score", 0.0) - minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) - filter = self.build_filter(overrides, auth_claims) - - vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings") - send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None] - send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None] - - original_user_query = messages[-1]["content"] - if not isinstance(original_user_query, str): - raise ValueError("The most recent message content must be a string.") - - # Use prompty to prepare the query prompt - query_messages = self.prompt_manager.render_prompt( - self.query_rewrite_prompt, {"user_query": original_user_query, "past_messages": messages[:-1]} - ) - tools: list[ChatCompletionToolParam] = 
self.query_rewrite_tools - - # STEP 1: Generate an optimized keyword search query based on the chat history and the last question - chat_completion: ChatCompletion = await self.openai_client.chat.completions.create( - messages=query_messages, - # Azure OpenAI takes the deployment name as the model name - model=self.chatgpt_deployment if self.chatgpt_deployment else self.chatgpt_model, - temperature=0.0, # Minimize creativity for search query generation - max_tokens=100, - n=1, - tools=tools, - seed=seed, - ) - - query_text = self.get_search_query(chat_completion, original_user_query) - - # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query - - # If retrieval mode includes vectors, compute an embedding for the query - vectors = [] - if use_vector_search: - if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings": - vectors.append(await self.compute_text_embedding(query_text)) - if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings": - vectors.append(await self.compute_image_embedding(query_text)) - - results = await self.search( - top, - query_text, - filter, - vectors, - use_text_search, - use_vector_search, - use_semantic_ranker, - use_semantic_captions, - minimum_search_score, - minimum_reranker_score, - use_query_rewriting, - ) - - # STEP 3: Generate a contextual and content specific answer using the search results and chat history - text_sources = [] - image_sources = [] - if send_text_to_gptvision: - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=True) - if send_images_to_gptvision: - for result in results: - url = await fetch_image(self.blob_container_client, result) - if url: - image_sources.append(url) - - messages = self.prompt_manager.render_prompt( - self.answer_prompt, - self.get_system_prompt_variables(overrides.get("prompt_template")) - | { - "include_follow_up_questions": bool(overrides.get("suggest_followup_questions")), - "past_messages": messages[:-1], - "user_query": original_user_query, - "text_sources": text_sources, - "image_sources": image_sources, - }, - ) - - extra_info = ExtraInfo( - DataPoints(text=text_sources, images=image_sources), - [ - ThoughtStep( - "Prompt to generate search query", - query_messages, - ( - {"model": self.chatgpt_model, "deployment": self.chatgpt_deployment} - if self.chatgpt_deployment - else {"model": self.chatgpt_model} - ), - ), - ThoughtStep( - "Search using generated search query", - query_text, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "vector_fields": vector_fields, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - ThoughtStep( - "Prompt to generate answer", - messages, - ( - {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment} - if self.gpt4v_deployment - else {"model": self.gpt4v_model} - ), - ), - ], - ) - - chat_coroutine = cast( - Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]], - self.openai_client.chat.completions.create( - model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model, - messages=messages, - temperature=overrides.get("temperature", 0.3), - max_tokens=1024, - n=1, - stream=should_stream, - seed=seed, - ), - ) - return (extra_info, chat_coroutine) diff --git 
a/app/backend/approaches/prompts/ask_answer_question.prompty b/app/backend/approaches/prompts/ask_answer_question.prompty index 7ff73d232f..c384ad8b65 100644 --- a/app/backend/approaches/prompts/ask_answer_question.prompty +++ b/app/backend/approaches/prompts/ask_answer_question.prompty @@ -18,7 +18,15 @@ You are an intelligent assistant helping Contoso Inc employees with their health Use 'you' to refer to the individual asking the questions even if they ask with 'I'. Answer the following question using only the data provided in the sources below. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. -If you cannot answer using the sources below, say you don't know. Use below example to answer +If you cannot answer using the sources below, say you don't know. Use below example to answer. +{% if use_images %} +Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName:. +Each text source starts in a new line and has the file name followed by colon and the actual information. +Always include the source name from the image or text for each fact you use in the response in the format: [filename]. +Answer the following question using only the data provided in the sources below. +The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned. +If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. +{% endif %} {{ injected_prompt }} {% endif %} @@ -36,7 +44,12 @@ In-network deductibles are $500 for employee and $1000 for family [info1.txt] an user: {{ user_query }} +{% for image_source in image_sources %} +![Image]({{image_source}}) +{% endfor %} +{% if text_sources is defined %} Sources: {% for text_source in text_sources %} {{ text_source }} {% endfor %} +{% endif %} diff --git a/app/backend/approaches/prompts/ask_answer_question_vision.prompty b/app/backend/approaches/prompts/ask_answer_question_vision.prompty deleted file mode 100644 index 25ab9656a7..0000000000 --- a/app/backend/approaches/prompts/ask_answer_question_vision.prompty +++ /dev/null @@ -1,31 +0,0 @@ ---- -name: Ask with vision -description: Answer a single question (with no chat history) using both text and image sources. -model: - api: chat ---- -system: -{% if override_prompt %} -{{ override_prompt }} -{% else %} -You are an intelligent assistant helping analyze the Annual Financial Report of Contoso Ltd., The documents contain text, graphs, tables and images. -Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName:. -Each text source starts in a new line and has the file name followed by colon and the actual information. -Always include the source name from the image or text for each fact you use in the response in the format: [filename]. -Answer the following question using only the data provided in the sources below. -The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned. -If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. 
-{{ injected_prompt }} -{% endif %} - -user: -{{ user_query }} -{% for image_source in image_sources %} -![Image]({{image_source}}) -{% endfor %} -{% if text_sources is defined %} -Sources: -{% for text_source in text_sources %} -{{ text_source }} -{% endfor %} -{% endif %} diff --git a/app/backend/approaches/prompts/chat_answer_question.prompty b/app/backend/approaches/prompts/chat_answer_question.prompty index 3dcb05ae21..9be2ad660d 100644 --- a/app/backend/approaches/prompts/chat_answer_question.prompty +++ b/app/backend/approaches/prompts/chat_answer_question.prompty @@ -24,6 +24,16 @@ Assistant helps the company employees with their healthcare plan questions, and Answer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question. If the question is not in English, answer in the language used in the question. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf]. +{% if include_images %} +Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName: +Each text source starts in a new line and has the file name followed by colon and the actual information +Always include the source name from the image or text for each fact you use in the response in the format: [filename] +Answer the following question using only the data provided in the sources below. +If asking a clarifying question to the user would help, ask the question. +Be brief in your answers. +The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned +If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. +{% endif %} {{ injected_prompt }} {% endif %} @@ -44,8 +54,12 @@ Make sure the last question ends with ">>". user: {{ user_query }} - +{% for image_source in image_sources %} +![Image]({{image_source}}) +{% endfor %} +{% if text_sources is defined %} Sources: {% for text_source in text_sources %} {{ text_source }} {% endfor %} +{% endif %} \ No newline at end of file diff --git a/app/backend/approaches/prompts/chat_answer_question_vision.prompty b/app/backend/approaches/prompts/chat_answer_question_vision.prompty deleted file mode 100644 index 58b3624121..0000000000 --- a/app/backend/approaches/prompts/chat_answer_question_vision.prompty +++ /dev/null @@ -1,48 +0,0 @@ ---- -name: Chat with vision -description: Answer a question (with chat history) using both text and image sources. -model: - api: chat ---- -system: -{% if override_prompt %} -{{ override_prompt }} -{% else %} -You are an intelligent assistant helping analyze the Annual Financial Report of Contoso Ltd., The documents contain text, graphs, tables and images. 
-Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName: -Each text source starts in a new line and has the file name followed by colon and the actual information -Always include the source name from the image or text for each fact you use in the response in the format: [filename] -Answer the following question using only the data provided in the sources below. -If asking a clarifying question to the user would help, ask the question. -Be brief in your answers. -The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned -If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. -{{injected_prompt}} -{% endif %} - -{% if include_follow_up_questions %} -Generate 3 very brief follow-up questions that the user would likely ask next. -Enclose the follow-up questions in double angle brackets. Example: -<> -<> -<> -Do not repeat questions that have already been asked. -Make sure the last question ends with ">>". -{% endif %} - -{% for message in past_messages %} -{{ message["role"] }}: -{{ message["content"] }} -{% endfor %} - -user: -{{ user_query }} -{% for image_source in image_sources %} -![Image]({{image_source}}) -{% endfor %} -{% if text_sources is defined %} -Sources: -{% for text_source in text_sources %} -{{ text_source }} -{% endfor %} -{% endif %} diff --git a/app/backend/approaches/retrievethenreadvision.py b/app/backend/approaches/retrievethenreadvision.py deleted file mode 100644 index a021537c52..0000000000 --- a/app/backend/approaches/retrievethenreadvision.py +++ /dev/null @@ -1,181 +0,0 @@ -from collections.abc import Awaitable -from typing import Any, Callable, Optional - -from azure.search.documents.aio import SearchClient -from azure.storage.blob.aio import ContainerClient -from openai import AsyncOpenAI -from openai.types.chat import ( - ChatCompletionMessageParam, -) - -from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep -from approaches.promptmanager import PromptManager -from core.authentication import AuthenticationHelper -from core.imageshelper import fetch_image - - -class RetrieveThenReadVisionApproach(Approach): - """ - Simple retrieve-then-read implementation, using the AI Search and OpenAI APIs directly. It first retrieves - top documents including images from search, then constructs a prompt with them, and then uses OpenAI to generate an completion - (answer) with that prompt. 
- """ - - def __init__( - self, - *, - search_client: SearchClient, - blob_container_client: ContainerClient, - openai_client: AsyncOpenAI, - auth_helper: AuthenticationHelper, - gpt4v_deployment: Optional[str], - gpt4v_model: str, - embedding_deployment: Optional[str], # Not needed for non-Azure OpenAI or for retrieval_mode="text" - embedding_model: str, - embedding_dimensions: int, - embedding_field: str, - sourcepage_field: str, - content_field: str, - query_language: str, - query_speller: str, - vision_endpoint: str, - vision_token_provider: Callable[[], Awaitable[str]], - prompt_manager: PromptManager, - ): - self.search_client = search_client - self.blob_container_client = blob_container_client - self.openai_client = openai_client - self.auth_helper = auth_helper - self.embedding_model = embedding_model - self.embedding_deployment = embedding_deployment - self.embedding_dimensions = embedding_dimensions - self.embedding_field = embedding_field - self.sourcepage_field = sourcepage_field - self.content_field = content_field - self.gpt4v_deployment = gpt4v_deployment - self.gpt4v_model = gpt4v_model - self.query_language = query_language - self.query_speller = query_speller - self.vision_endpoint = vision_endpoint - self.vision_token_provider = vision_token_provider - self.prompt_manager = prompt_manager - self.answer_prompt = self.prompt_manager.load_prompt("ask_answer_question_vision.prompty") - # Currently disabled due to issues with rendering token usage in the UI - self.include_token_usage = False - - async def run( - self, - messages: list[ChatCompletionMessageParam], - session_state: Any = None, - context: dict[str, Any] = {}, - ) -> dict[str, Any]: - q = messages[-1]["content"] - if not isinstance(q, str): - raise ValueError("The most recent message content must be a string.") - - overrides = context.get("overrides", {}) - seed = overrides.get("seed", None) - auth_claims = context.get("auth_claims", {}) - use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] - use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] - use_semantic_ranker = True if overrides.get("semantic_ranker") else False - use_query_rewriting = True if overrides.get("query_rewriting") else False - use_semantic_captions = True if overrides.get("semantic_captions") else False - top = overrides.get("top", 3) - minimum_search_score = overrides.get("minimum_search_score", 0.0) - minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) - filter = self.build_filter(overrides, auth_claims) - - vector_fields = overrides.get("vector_fields", "textAndImageEmbeddings") - send_text_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "texts", None] - send_images_to_gptvision = overrides.get("gpt4v_input") in ["textAndImages", "images", None] - - # If retrieval mode includes vectors, compute an embedding for the query - vectors = [] - if use_vector_search: - if vector_fields == "textEmbeddingOnly" or vector_fields == "textAndImageEmbeddings": - vectors.append(await self.compute_text_embedding(q)) - if vector_fields == "imageEmbeddingOnly" or vector_fields == "textAndImageEmbeddings": - vectors.append(await self.compute_image_embedding(q)) - - results = await self.search( - top, - q, - filter, - vectors, - use_text_search, - use_vector_search, - use_semantic_ranker, - use_semantic_captions, - minimum_search_score, - minimum_reranker_score, - use_query_rewriting, - ) - - # Process results - text_sources = [] - image_sources = [] - if 
send_text_to_gptvision: - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=True) - if send_images_to_gptvision: - for result in results: - url = await fetch_image(self.blob_container_client, result) - if url: - image_sources.append(url) - - messages = self.prompt_manager.render_prompt( - self.answer_prompt, - self.get_system_prompt_variables(overrides.get("prompt_template")) - | {"user_query": q, "text_sources": text_sources, "image_sources": image_sources}, - ) - - chat_completion = await self.openai_client.chat.completions.create( - model=self.gpt4v_deployment if self.gpt4v_deployment else self.gpt4v_model, - messages=messages, - temperature=overrides.get("temperature", 0.3), - max_tokens=1024, - n=1, - seed=seed, - ) - - extra_info = ExtraInfo( - DataPoints(text=text_sources, images=image_sources), - [ - ThoughtStep( - "Search using user query", - q, - { - "use_semantic_captions": use_semantic_captions, - "use_semantic_ranker": use_semantic_ranker, - "use_query_rewriting": use_query_rewriting, - "top": top, - "filter": filter, - "vector_fields": vector_fields, - "use_vector_search": use_vector_search, - "use_text_search": use_text_search, - }, - ), - ThoughtStep( - "Search results", - [result.serialize_for_results() for result in results], - ), - ThoughtStep( - "Prompt to generate answer", - messages, - ( - {"model": self.gpt4v_model, "deployment": self.gpt4v_deployment} - if self.gpt4v_deployment - else {"model": self.gpt4v_model} - ), - ), - ], - ) - - return { - "message": { - "content": chat_completion.choices[0].message.content, - "role": chat_completion.choices[0].message.role, - }, - "context": extra_info, - "session_state": session_state, - } diff --git a/app/backend/config.py b/app/backend/config.py index 443c0171fa..b881ff857f 100644 --- a/app/backend/config.py +++ b/app/backend/config.py @@ -1,18 +1,14 @@ CONFIG_OPENAI_TOKEN = "openai_token" CONFIG_CREDENTIAL = "azure_credential" CONFIG_ASK_APPROACH = "ask_approach" -CONFIG_ASK_VISION_APPROACH = "ask_vision_approach" -CONFIG_CHAT_VISION_APPROACH = "chat_vision_approach" CONFIG_CHAT_APPROACH = "chat_approach" CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client" CONFIG_USER_UPLOAD_ENABLED = "user_upload_enabled" CONFIG_USER_BLOB_CONTAINER_CLIENT = "user_blob_container_client" CONFIG_AUTH_CLIENT = "auth_client" -CONFIG_GPT4V_DEPLOYED = "gpt4v_deployed" CONFIG_SEMANTIC_RANKER_DEPLOYED = "semantic_ranker_deployed" CONFIG_QUERY_REWRITING_ENABLED = "query_rewriting_enabled" CONFIG_REASONING_EFFORT_ENABLED = "reasoning_effort_enabled" -CONFIG_VISION_REASONING_EFFORT_ENABLED = "vision_reasoning_effort_enabled" CONFIG_DEFAULT_REASONING_EFFORT = "default_reasoning_effort" CONFIG_VECTOR_SEARCH_ENABLED = "vector_search_enabled" CONFIG_SEARCH_CLIENT = "search_client" @@ -34,3 +30,4 @@ CONFIG_COSMOS_HISTORY_CLIENT = "cosmos_history_client" CONFIG_COSMOS_HISTORY_CONTAINER = "cosmos_history_container" CONFIG_COSMOS_HISTORY_VERSION = "cosmos_history_version" +CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled" \ No newline at end of file diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index f03baac0dc..a41d1c1437 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -177,6 +177,7 @@ def setup_file_processors( local_html_parser: bool = False, search_images: bool = False, use_content_understanding: bool = False, + use_multimodal: bool = False, content_understanding_endpoint: Union[str, None] = None, ): sentence_text_splitter = SentenceTextSplitter() @@ -190,7 +191,7 @@ def 
setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, - use_content_understanding=use_content_understanding, + include_media_description=use_content_understanding or use_multimodal, content_understanding_endpoint=content_understanding_endpoint, ) @@ -240,20 +241,6 @@ def setup_file_processors( return file_processors -def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], search_images: bool -) -> Union[ImageEmbeddings, None]: - image_embeddings_service: Optional[ImageEmbeddings] = None - if search_images: - if vision_endpoint is None: - raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") - image_embeddings_service = ImageEmbeddings( - endpoint=vision_endpoint, - token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), - ) - return image_embeddings_service - - async def main(strategy: Strategy, setup_index: bool = True): if setup_index: await strategy.setup() @@ -328,7 +315,7 @@ async def main(strategy: Strategy, setup_index: bool = True): exit(0) use_int_vectorization = os.getenv("USE_FEATURE_INT_VECTORIZATION", "").lower() == "true" - use_gptvision = os.getenv("USE_GPT4V", "").lower() == "true" + use_multimodal = os.getenv("USE_MULTIMODAL", "").lower() == "true" use_acls = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL") is not None dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" use_agentic_retrieval = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" @@ -444,13 +431,9 @@ async def main(strategy: Strategy, setup_index: bool = True): local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", search_images=use_gptvision, use_content_understanding=use_content_understanding, + use_multimodal=use_multimodal, content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) - image_embeddings_service = setup_image_embeddings_service( - azure_credential=azd_credential, - vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), - search_images=use_gptvision, - ) ingestion_strategy = FileStrategy( search_info=search_info, diff --git a/app/backend/prepdocslib/goals.json b/app/backend/prepdocslib/goals.json new file mode 100644 index 0000000000..3f0b96d5a5 --- /dev/null +++ b/app/backend/prepdocslib/goals.json @@ -0,0 +1,7 @@ +{"content": "text verbalization text verbalization", + "embedding": [0, 1, 2], + "sourcepage": "bla.pdf#page=2", + "sourcefile": "bla.pdf", + "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields + [ {embedding, url, verbalization, boundingbox}, + {embedding, url, verbalization, boundingbox} ] \ No newline at end of file diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index c96980d21c..5e4a54d482 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -18,6 +18,7 @@ from azure.core.exceptions import HttpResponseError from PIL import Image from pypdf import PdfReader +from openai import AsyncOpenAI from .mediadescriber import ContentUnderstandingDescriber from .page import Page @@ -55,7 +56,7 @@ def __init__( endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout", - use_content_understanding=True, + include_media_description: bool = False, content_understanding_endpoint: 
Union[str, None] = None, ): self.model_id = model_id diff --git a/azure.yaml b/azure.yaml index f77bfb5828..3c7664caf7 100644 --- a/azure.yaml +++ b/azure.yaml @@ -81,11 +81,6 @@ pipeline: - AZURE_OPENAI_EVAL_DEPLOYMENT - AZURE_OPENAI_EVAL_DEPLOYMENT_SKU - AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY - - AZURE_OPENAI_GPT4V_MODEL - - AZURE_OPENAI_GPT4V_DEPLOYMENT - - AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY - - AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION - - AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU - AZURE_OPENAI_DISABLE_KEYS - OPENAI_HOST - OPENAI_API_KEY @@ -95,7 +90,7 @@ pipeline: - AZURE_APPLICATION_INSIGHTS_DASHBOARD - AZURE_LOG_ANALYTICS - USE_VECTORS - - USE_GPT4V + - USE_MULTIMODAL - AZURE_VISION_ENDPOINT - VISION_SECRET_NAME - AZURE_COMPUTER_VISION_SERVICE diff --git a/infra/main.bicep b/infra/main.bicep index 2d9340e14b..299b85e214 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -66,7 +66,7 @@ param speechServiceLocation string = '' param speechServiceName string = '' param speechServiceSkuName string // Set in main.parameters.json param speechServiceVoice string = '' -param useGPT4V bool = false +param useMultimodal bool = false param useEval bool = false @allowed(['free', 'provisioned', 'serverless']) @@ -172,19 +172,6 @@ var embedding = { dimensions: embeddingDimensions != 0 ? embeddingDimensions : 3072 } -param gpt4vModelName string = '' -param gpt4vDeploymentName string = '' -param gpt4vModelVersion string = '' -param gpt4vDeploymentSkuName string = '' -param gpt4vDeploymentCapacity int = 0 -var gpt4v = { - modelName: !empty(gpt4vModelName) ? gpt4vModelName : 'gpt-4o' - deploymentName: !empty(gpt4vDeploymentName) ? gpt4vDeploymentName : 'vision' - deploymentVersion: !empty(gpt4vModelVersion) ? gpt4vModelVersion : '2024-08-06' - deploymentSkuName: !empty(gpt4vDeploymentSkuName) ? gpt4vDeploymentSkuName : 'GlobalStandard' // Not-backward compatible - deploymentCapacity: gpt4vDeploymentCapacity != 0 ? gpt4vDeploymentCapacity : 10 -} - param evalModelName string = '' param evalDeploymentName string = '' param evalModelVersion string = '' @@ -406,7 +393,7 @@ var appEnvVariables = { AZURE_SEARCH_SERVICE: searchService.outputs.name AZURE_SEARCH_SEMANTIC_RANKER: actualSearchServiceSemanticRankerLevel AZURE_SEARCH_QUERY_REWRITING: searchServiceQueryRewriting - AZURE_VISION_ENDPOINT: useGPT4V ? computerVision.outputs.endpoint : '' + AZURE_VISION_ENDPOINT: useMultimodal ? computerVision.outputs.endpoint : '' AZURE_SEARCH_QUERY_LANGUAGE: searchQueryLanguage AZURE_SEARCH_QUERY_SPELLER: searchQuerySpeller AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding @@ -433,13 +420,11 @@ var appEnvVariables = { AZURE_OPENAI_EMB_MODEL_NAME: embedding.modelName AZURE_OPENAI_EMB_DIMENSIONS: embedding.dimensions AZURE_OPENAI_CHATGPT_MODEL: chatGpt.modelName - AZURE_OPENAI_GPT4V_MODEL: gpt4v.modelName AZURE_OPENAI_REASONING_EFFORT: defaultReasoningEffort // Specific to Azure OpenAI AZURE_OPENAI_SERVICE: isAzureOpenAiHost && deployAzureOpenAi ? openAi.outputs.name : '' AZURE_OPENAI_CHATGPT_DEPLOYMENT: chatGpt.deploymentName AZURE_OPENAI_EMB_DEPLOYMENT: embedding.deploymentName - AZURE_OPENAI_GPT4V_DEPLOYMENT: useGPT4V ? 
gpt4v.deploymentName : '' AZURE_OPENAI_SEARCHAGENT_MODEL: searchAgent.modelName AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT: searchAgent.deploymentName AZURE_OPENAI_API_VERSION: azureOpenAiApiVersion @@ -461,7 +446,7 @@ var appEnvVariables = { // CORS support, for frontends on other hosts ALLOWED_ORIGIN: join(allowedOrigins, ';') USE_VECTORS: useVectors - USE_GPT4V: useGPT4V + USE_MULTIMODAL: useMultimodal USE_USER_UPLOAD: useUserUpload AZURE_USERSTORAGE_ACCOUNT: useUserUpload ? userStorage.outputs.name : '' AZURE_USERSTORAGE_CONTAINER: useUserUpload ? userStorageContainerName : '' @@ -635,22 +620,6 @@ var openAiDeployments = concat( } } ] : [], - useGPT4V - ? [ - { - name: gpt4v.deploymentName - model: { - format: 'OpenAI' - name: gpt4v.modelName - version: gpt4v.deploymentVersion - } - sku: { - name: gpt4v.deploymentSkuName - capacity: gpt4v.deploymentCapacity - } - } - ] - : [], useAgenticRetrieval ? [ { @@ -715,7 +684,7 @@ module documentIntelligence 'br/public:avm/res/cognitive-services/account:0.7.2' } } -module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useGPT4V) { +module computerVision 'br/public:avm/res/cognitive-services/account:0.7.2' = if (useMultimodal) { name: 'computerVision' scope: computerVisionResourceGroup params: { @@ -1179,7 +1148,7 @@ var openAiPrivateEndpointConnection = (isAzureOpenAiHost && deployAzureOpenAi && dnsZoneName: 'privatelink.openai.azure.com' resourceIds: concat( [openAi.outputs.resourceId], - useGPT4V ? [computerVision.outputs.resourceId] : [], + useMultimodal ? [computerVision.outputs.resourceId] : [], useMediaDescriberAzureCU ? [contentUnderstanding.outputs.resourceId] : [], !useLocalPdfParser ? [documentIntelligence.outputs.resourceId] : [] ) @@ -1256,7 +1225,7 @@ module searchContribRoleBackend 'core/security/role.bicep' = if (useUserUpload) } // For computer vision access by the backend -module computerVisionRoleBackend 'core/security/role.bicep' = if (useGPT4V) { +module computerVisionRoleBackend 'core/security/role.bicep' = if (useMultimodal) { scope: computerVisionResourceGroup name: 'computervision-role-backend' params: { @@ -1291,7 +1260,6 @@ output OPENAI_HOST string = openAiHost output AZURE_OPENAI_EMB_MODEL_NAME string = embedding.modelName output AZURE_OPENAI_EMB_DIMENSIONS int = embedding.dimensions output AZURE_OPENAI_CHATGPT_MODEL string = chatGpt.modelName -output AZURE_OPENAI_GPT4V_MODEL string = gpt4v.modelName // Specific to Azure OpenAI output AZURE_OPENAI_SERVICE string = isAzureOpenAiHost && deployAzureOpenAi ? openAi.outputs.name : '' @@ -1304,9 +1272,6 @@ output AZURE_OPENAI_CHATGPT_DEPLOYMENT_SKU string = isAzureOpenAiHost ? chatGpt. output AZURE_OPENAI_EMB_DEPLOYMENT string = isAzureOpenAiHost ? embedding.deploymentName : '' output AZURE_OPENAI_EMB_DEPLOYMENT_VERSION string = isAzureOpenAiHost ? embedding.deploymentVersion : '' output AZURE_OPENAI_EMB_DEPLOYMENT_SKU string = isAzureOpenAiHost ? embedding.deploymentSkuName : '' -output AZURE_OPENAI_GPT4V_DEPLOYMENT string = isAzureOpenAiHost && useGPT4V ? gpt4v.deploymentName : '' -output AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION string = isAzureOpenAiHost && useGPT4V ? gpt4v.deploymentVersion : '' -output AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU string = isAzureOpenAiHost && useGPT4V ? gpt4v.deploymentSkuName : '' output AZURE_OPENAI_EVAL_DEPLOYMENT string = isAzureOpenAiHost && useEval ? eval.deploymentName : '' output AZURE_OPENAI_EVAL_DEPLOYMENT_VERSION string = isAzureOpenAiHost && useEval ? 
eval.deploymentVersion : '' output AZURE_OPENAI_EVAL_DEPLOYMENT_SKU string = isAzureOpenAiHost && useEval ? eval.deploymentSkuName : '' @@ -1317,7 +1282,7 @@ output AZURE_OPENAI_REASONING_EFFORT string = defaultReasoningEffort output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.resourceId : '' output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : '' -output AZURE_VISION_ENDPOINT string = useGPT4V ? computerVision.outputs.endpoint : '' +output AZURE_VISION_ENDPOINT string = useMultimodal ? computerVision.outputs.endpoint : '' output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 1c34063020..79b6021b97 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -140,21 +140,6 @@ "embeddingDimensions": { "value": "${AZURE_OPENAI_EMB_DIMENSIONS}" }, - "gpt4vModelName":{ - "value": "${AZURE_OPENAI_GPT4V_MODEL}" - }, - "gpt4vDeploymentName": { - "value": "${AZURE_OPENAI_GPT4V_DEPLOYMENT}" - }, - "gpt4vDeploymentVersion":{ - "value": "${AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION}" - }, - "gpt4vDeploymentSkuName":{ - "value": "${AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU}" - }, - "gpt4vDeploymentCapacity":{ - "value": "${AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY}" - }, "evalModelName":{ "value": "${AZURE_OPENAI_EVAL_MODEL}" }, @@ -221,8 +206,8 @@ "useVectors": { "value": "${USE_VECTORS=true}" }, - "useGPT4V": { - "value": "${USE_GPT4V=false}" + "useMultimodal": { + "value": "${USE_MULTIMODAL=false}" }, "useEval": { "value": "${USE_EVAL=false}" diff --git a/locustfile.py b/locustfile.py index b41b9bd372..561d342f19 100644 --- a/locustfile.py +++ b/locustfile.py @@ -72,68 +72,3 @@ def ask_question(self): }, }, ) - - -class ChatVisionUser(HttpUser): - wait_time = between(5, 20) - - @task - def ask_question(self): - self.client.get("/") - time.sleep(self.wait_time()) - self.client.post( - "/chat/stream", - json={ - "messages": [ - { - "content": "Can you identify any correlation between oil prices and stock market trends?", - "role": "user", - } - ], - "context": { - "overrides": { - "top": 3, - "temperature": 0.3, - "minimum_reranker_score": 0, - "minimum_search_score": 0, - "retrieval_mode": "hybrid", - "semantic_ranker": True, - "semantic_captions": False, - "suggest_followup_questions": False, - "use_oid_security_filter": False, - "use_groups_security_filter": False, - "vector_fields": "textAndImageEmbeddings", - "use_gpt4v": True, - "gpt4v_input": "textAndImages", - } - }, - "session_state": None, - }, - ) - time.sleep(self.wait_time()) - self.client.post( - "/chat/stream", - json={ - "messages": [ - {"content": "Compare the impact of interest rates and GDP in financial markets.", "role": "user"} - ], - "context": { - "overrides": { - "top": 3, - "temperature": 0.3, - "minimum_reranker_score": 0, - "minimum_search_score": 0, - "retrieval_mode": "hybrid", - "semantic_ranker": True, - "semantic_captions": False, - "suggest_followup_questions": False, - "use_oid_security_filter": False, - "use_groups_security_filter": False, - "vector_fields": "textAndImageEmbeddings", - "use_gpt4v": True, - "gpt4v_input": "textAndImages", - } - }, - "session_state": None, - }, - ) From 74fdf48b8f50168aa89e0ecc07700d0a3d93e96d Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Thu, 29 May 2025 18:32:39 +0000 Subject: [PATCH 02/77] 
Add LLM-based media describer --- app/backend/prepdocs.py | 11 ++++- app/backend/prepdocslib/goals.json | 2 + app/backend/prepdocslib/mediadescriber.py | 30 +++++++++++++ app/backend/prepdocslib/pdfparser.py | 54 +++++++++++++++++------ 4 files changed, 82 insertions(+), 15 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index a41d1c1437..11ce09269e 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -8,6 +8,7 @@ from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider from rich.logging import RichHandler +from openai import AsyncAzureOpenAI, AsyncOpenAI from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager @@ -30,7 +31,7 @@ LocalListFileStrategy, ) from prepdocslib.parser import Parser -from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser +from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser, MediaDescriptionStrategy from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter @@ -178,6 +179,9 @@ def setup_file_processors( search_images: bool = False, use_content_understanding: bool = False, use_multimodal: bool = False, + openai_client: Union[AsyncOpenAI, None] = None, + openai_model: Union[str, None] = None, + openai_deployment: Union[str, None] = None, content_understanding_endpoint: Union[str, None] = None, ): sentence_text_splitter = SentenceTextSplitter() @@ -191,7 +195,10 @@ def setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, - include_media_description=use_content_understanding or use_multimodal, + media_description_strategy = "openai" if use_multimodal else "contentunderstanding" if use_content_understanding else "none", + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, content_understanding_endpoint=content_understanding_endpoint, ) diff --git a/app/backend/prepdocslib/goals.json b/app/backend/prepdocslib/goals.json index 3f0b96d5a5..523b48252b 100644 --- a/app/backend/prepdocslib/goals.json +++ b/app/backend/prepdocslib/goals.json @@ -2,6 +2,8 @@ "embedding": [0, 1, 2], "sourcepage": "bla.pdf#page=2", "sourcefile": "bla.pdf", + "oids": [], + "groups": [], "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields [ {embedding, url, verbalization, boundingbox}, {embedding, url, verbalization, boundingbox} ] \ No newline at end of file diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index 5aae79232e..5a6b2639b8 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -1,11 +1,13 @@ import logging from abc import ABC +import base64 import aiohttp from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import get_bearer_token_provider from rich.progress import Progress from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_fixed +from openai import AsyncOpenAI logger = logging.getLogger("scripts") @@ -105,3 +107,31 @@ async def describe_image(self, image_bytes: bytes) -> str: fields = results["result"]["contents"][0]["fields"] return 
fields["Description"]["valueString"] + +class MultimodalModelDescriber(MediaDescriber): + def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str): + self.openai_client = openai_client + self.model = model + self.deployment = deployment + + async def describe_image(self, image_bytes: bytes) -> str: + logger.info("Describing image using LLM...") + image_base64 = base64.b64encode(image_bytes).decode("utf-8") + image_datauri = f"data:image/png;base64,{image_base64}" + + response = await self.openai_client.chat.completions.create( + model=self.model if self.deployment is None else self.deployment, + messages=[ + { + "role": "system", + "content": "You are a helpful assistant that describes images.", + }, + { + "role": "user", + "content": + [{"text": "Describe this image in detail", "type": "text"}, + {"image_url": {"url": image_datauri}, "type": "image_url"}] + } + ]) + return response.choices[0].message.content.strip() if response.choices else "" + diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 5e4a54d482..f7204c4d50 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -3,7 +3,7 @@ import logging from collections.abc import AsyncGenerator from enum import Enum -from typing import IO, Union +from typing import IO, Union, Optional import pymupdf from azure.ai.documentintelligence.aio import DocumentIntelligenceClient @@ -20,7 +20,7 @@ from pypdf import PdfReader from openai import AsyncOpenAI -from .mediadescriber import ContentUnderstandingDescriber +from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber from .page import Page from .parser import Parser @@ -45,6 +45,11 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: offset += len(page_text) +class MediaDescriptionStrategy(Enum): + NONE = "none" + OPENAI = "openai" + CONTENTUNDERSTANDING = "content_understanding" + class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages @@ -57,13 +62,27 @@ def __init__( credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout", include_media_description: bool = False, + media_description_strategy: Enum = MediaDescriptionStrategy.NONE, + # If using OpenAI, this is the client to use + openai_client: Union[AsyncOpenAI, None] = None, + openai_model: Optional[str] = None, + openai_deployment: Optional[str] = None, + # If using Content Understanding, this is the endpoint for the service content_understanding_endpoint: Union[str, None] = None, ): self.model_id = model_id self.endpoint = endpoint self.credential = credential - self.use_content_understanding = use_content_understanding - self.content_understanding_endpoint = content_understanding_endpoint + self.media_description_strategy = media_description_strategy + if media_description_strategy == MediaDescriptionStrategy.OPENAI: + logger.info("Including media description with OpenAI") + self.use_content_understanding = False + self.openai_client = openai_client + self.openai_model = openai_model + self.openai_deployment = openai_deployment + if media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: + logger.info("Including media description with Azure Content Understanding") + self.content_understanding_endpoint = content_understanding_endpoint async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' 
using Azure Document Intelligence", content.name) @@ -72,14 +91,23 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: file_analyzed = False - if self.use_content_understanding: + + media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None + if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: if self.content_understanding_endpoint is None: - raise ValueError("Content Understanding is enabled but no endpoint was provided") + raise ValueError("Content Understanding endpoint must be provided when using Content Understanding strategy") if isinstance(self.credential, AzureKeyCredential): raise ValueError( "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" ) - cu_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) + media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) + + if self.media_description_strategy == MediaDescriptionStrategy.OPENAI: + if self.openai_client is None or self.openai_model is None: + raise ValueError("OpenAI client must be provided when using OpenAI media description strategy") + media_describer = MultimodalModelDescriber(self.openai_client, self.openai_model, self.openai_deployment) + + if media_describer is not None: content_bytes = content.read() try: poller = await document_intelligence_client.begin_analyze_document( @@ -117,7 +145,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] figures_on_page = [] - if self.use_content_understanding: + if self.media_description_strategy != MediaDescriptionStrategy.NONE: figures_on_page = [ figure for figure in (analyze_result.figures or []) @@ -163,13 +191,13 @@ class ObjectType(Enum): page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) added_objects.add(mask_char) elif object_type == ObjectType.FIGURE: - if cu_describer is None: - raise ValueError("cu_describer should not be None, unable to describe figure") + if media_describer is None: + raise ValueError("media_describer should not be None, unable to describe figure") if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: figure_html = await DocumentAnalysisParser.figure_to_html( - doc_for_pymupdf, figures_on_page[object_idx], cu_describer + doc_for_pymupdf, figures_on_page[object_idx], media_describer ) page_text += figure_html added_objects.add(mask_char) @@ -182,7 +210,7 @@ class ObjectType(Enum): @staticmethod async def figure_to_html( - doc: pymupdf.Document, figure: DocumentFigure, cu_describer: ContentUnderstandingDescriber + doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber ) -> str: figure_title = (figure.caption and figure.caption.content) or "" logger.info("Describing figure %s with title '%s'", figure.id, figure_title) @@ -200,7 +228,7 @@ async def figure_to_html( ) page_number = first_region["pageNumber"] # 1-indexed cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) - figure_description = await cu_describer.describe_image(cropped_img) + figure_description = await media_describer.describe_image(cropped_img) return f"
<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>
" @staticmethod From 001c86f65f409c97b2f9086cb16c4d33b9c340f8 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Sun, 1 Jun 2025 05:20:05 +0000 Subject: [PATCH 03/77] Prepdocs progress --- app/backend/prepdocs.py | 28 +++++++++++++++++++----- app/backend/prepdocslib/searchmanager.py | 2 +- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 11ce09269e..64af8300d6 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -83,7 +83,7 @@ def setup_blob_manager( storage_container: str, storage_resource_group: str, subscription_id: str, - search_images: bool, + store_page_images: bool, storage_key: Union[str, None] = None, ): storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key @@ -94,7 +94,7 @@ def setup_blob_manager( credential=storage_creds, resourceGroup=storage_resource_group, subscriptionId=subscription_id, - store_page_images=search_images, + store_page_images=store_page_images, ) @@ -176,7 +176,6 @@ def setup_file_processors( document_intelligence_key: Union[str, None] = None, local_pdf_parser: bool = False, local_html_parser: bool = False, - search_images: bool = False, use_content_understanding: bool = False, use_multimodal: bool = False, openai_client: Union[AsyncOpenAI, None] = None, @@ -248,6 +247,20 @@ def setup_file_processors( return file_processors +def setup_image_embeddings_service( + azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], use_multimodal: bool +) -> Union[ImageEmbeddings, None]: + image_embeddings_service: Optional[ImageEmbeddings] = None + if use_multimodal: + if vision_endpoint is None: + raise ValueError("A computer vision endpoint is required when GPT-4-vision is enabled.") + image_embeddings_service = ImageEmbeddings( + endpoint=vision_endpoint, + token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), + ) + return image_embeddings_service + + async def main(strategy: Strategy, setup_index: bool = True): if setup_index: await strategy.setup() @@ -372,7 +385,7 @@ async def main(strategy: Strategy, setup_index: bool = True): storage_container=os.environ["AZURE_STORAGE_CONTAINER"], storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], - search_images=use_gptvision, + store_page_images=use_multimodal, storage_key=clean_key_if_exists(args.storagekey), ) list_file_strategy = setup_list_file_strategy( @@ -436,12 +449,17 @@ async def main(strategy: Strategy, setup_index: bool = True): document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER") == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER") == "true", - search_images=use_gptvision, use_content_understanding=use_content_understanding, use_multimodal=use_multimodal, content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), ) + image_embeddings_service = setup_image_embeddings_service( + azure_credential=azd_credential, + vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), + use_multimodal=use_multimodal, + ) + ingestion_strategy = FileStrategy( search_info=search_info, list_file_strategy=list_file_strategy, diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index e6ca925e24..b5b401c177 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -316,7 +316,7 
@@ async def create_index(self): existing_index.vector_search.profiles.append(text_vector_search_profile) if existing_index.vector_search.algorithms is None: existing_index.vector_search.algorithms = [] - existing_index.vector_search.algorithms.append(text_vector_algorithm) + #existing_index.vector_search.algorithms.append(text_vector_algorithm) if existing_index.vector_search.compressions is None: existing_index.vector_search.compressions = [] existing_index.vector_search.compressions.append(text_vector_compression) From 7c8f8255fa4f288846e6b764f33a1515e936e320 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 2 Jun 2025 20:32:37 +0000 Subject: [PATCH 04/77] Fix media description with OpenAI --- app/backend/app.py | 5 +- .../approaches/chatreadretrieveread.py | 6 +- app/backend/prepdocs.py | 143 +++++++++++++----- app/backend/prepdocslib/blobmanager.py | 10 +- app/backend/prepdocslib/mediadescriber.py | 11 +- app/backend/prepdocslib/pdfparser.py | 5 +- app/backend/prepdocslib/searchmanager.py | 1 + app/frontend/package-lock.json | 2 +- 8 files changed, 134 insertions(+), 49 deletions(-) diff --git a/app/backend/app.py b/app/backend/app.py index f7e3043fbd..a8f423510c 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -84,6 +84,7 @@ CONFIG_USER_BLOB_CONTAINER_CLIENT, CONFIG_USER_UPLOAD_ENABLED, CONFIG_VECTOR_SEARCH_ENABLED, + CONFIG_MULTIMODAL_ENABLED ) from core.authentication import AuthenticationHelper from core.sessionhelper import create_session_id @@ -705,6 +706,8 @@ async def setup_clients(): query_speller=AZURE_SEARCH_QUERY_SPELLER, prompt_manager=prompt_manager, reasoning_effort=OPENAI_REASONING_EFFORT, + vision_endpoint=AZURE_VISION_ENDPOINT, + vision_token_provider=token_provider, ) @bp.after_app_serving @@ -734,7 +737,7 @@ def create_app(): # Log levels should be one of https://docs.python.org/3/library/logging.html#logging-levels # Set root level to WARNING to avoid seeing overly verbose logs from SDKS - logging.basicConfig(level=logging.WARNING) + logging.basicConfig(level=logging.DEBUG) # Set our own logger levels to INFO by default app_level = os.getenv("APP_LOG_LEVEL", "INFO") app.logger.setLevel(os.getenv("APP_LOG_LEVEL", app_level)) diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 2e5e765c68..26925c5e7f 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,5 +1,5 @@ from collections.abc import Awaitable -from typing import Any, Optional, Union, cast +from typing import Any, Optional, Union, cast, Callable from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.aio import SearchClient @@ -47,6 +47,8 @@ def __init__( query_speller: str, prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, + vision_endpoint: Optional[str] = None, + vision_token_provider: Callable[[], Awaitable[str]], ): self.search_client = search_client self.search_index_name = search_index_name @@ -71,6 +73,8 @@ def __init__( self.answer_prompt = self.prompt_manager.load_prompt("chat_answer_question.prompty") self.reasoning_effort = reasoning_effort self.include_token_usage = True + self.vision_endpoint = vision_endpoint + self.vision_token_provider = vision_token_provider async def run_until_final_call( self, diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 64af8300d6..0f618f8dcd 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -35,6 +35,7 @@ from 
prepdocslib.strategy import DocumentAction, SearchInfo, Strategy from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter +from enum import Enum logger = logging.getLogger("scripts") @@ -126,15 +127,23 @@ def setup_list_file_strategy( return list_file_strategy +class OpenAIHost(str, Enum): + OPENAI = "openai" + AZURE = "azure" + AZURE_CUSTOM = "azure_custom" + LOCAL = "local" + + def setup_embeddings_service( azure_credential: AsyncTokenCredential, - openai_host: str, - openai_model_name: str, - openai_service: Union[str, None], - openai_custom_url: Union[str, None], - openai_deployment: Union[str, None], - openai_dimensions: int, - openai_api_version: str, + openai_host: OpenAIHost, + emb_model_name: str, + emb_model_dimensions: int, + azure_openai_service: Union[str, None], + azure_openai_custom_url: Union[str, None], + azure_openai_deployment: Union[str, None], + azure_openai_key: Union[str, None], + azure_openai_api_version: str, openai_key: Union[str, None], openai_org: Union[str, None], disable_vectors: bool = False, @@ -144,17 +153,17 @@ def setup_embeddings_service( logger.info("Not setting up embeddings service") return None - if openai_host != "openai": + if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: azure_open_ai_credential: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential if openai_key is None else AzureKeyCredential(openai_key) + azure_credential if azure_openai_key is None else AzureKeyCredential(azure_openai_key) ) return AzureOpenAIEmbeddingService( - open_ai_service=openai_service, - open_ai_custom_url=openai_custom_url, - open_ai_deployment=openai_deployment, - open_ai_model_name=openai_model_name, - open_ai_dimensions=openai_dimensions, - open_ai_api_version=openai_api_version, + open_ai_service=azure_openai_service, + open_ai_custom_url=azure_openai_custom_url, + open_ai_deployment=azure_openai_deployment, + open_ai_model_name=emb_model_name, + open_ai_dimensions=emb_model_dimensions, + open_ai_api_version=azure_openai_api_version, credential=azure_open_ai_credential, disable_batch=disable_batch_vectors, ) @@ -162,13 +171,65 @@ def setup_embeddings_service( if openai_key is None: raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") return OpenAIEmbeddingService( - open_ai_model_name=openai_model_name, - open_ai_dimensions=openai_dimensions, + open_ai_model_name=emb_model_name, + open_ai_dimensions=emb_model_dimensions, credential=openai_key, organization=openai_org, disable_batch=disable_batch_vectors, ) +def setup_openai_client( + openai_host: OpenAIHost, + azure_openai_api_key: Union[str, None] = None, + azure_openai_api_version: Union[str, None] = None, + azure_openai_service: Union[str, None] = None, + azure_openai_custom_url: Union[str, None] = None, + azure_credential: AsyncTokenCredential = None, + openai_api_key: Union[str, None] = None, + openai_organization: Union[str, None] = None, +): + if openai_host not in OpenAIHost: + raise ValueError(f"Invalid OPENAI_HOST value: {openai_host}. 
Must be one of {[h.value for h in OpenAIHost]}.") + + if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: + if openai_host == OpenAIHost.AZURE_CUSTOM: + logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client") + if not azure_openai_custom_url: + raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom") + endpoint = azure_openai_custom_url + else: + logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client") + if not azure_openai_service: + raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure") + endpoint = f"https://{azure_openai_service}.openai.azure.com" + if azure_openai_api_key: + logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client") + openai_client = AsyncAzureOpenAI( + api_version=azure_openai_api_version, azure_endpoint=endpoint, api_key=azure_openai_api_key + ) + else: + logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client") + token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") + openai_client = AsyncAzureOpenAI( + api_version=azure_openai_api_version, + azure_endpoint=endpoint, + azure_ad_token_provider=token_provider, + ) + elif openai_host == OpenAIHost.LOCAL: + logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key") + openai_client = AsyncOpenAI( + base_url=os.environ["OPENAI_BASE_URL"], + api_key="no-key-required", + ) + else: + logger.info( + "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables" + ) + openai_client = AsyncOpenAI( + api_key=openai_api_key, + organization=openai_organization, + ) + return openai_client def setup_file_processors( azure_credential: AsyncTokenCredential, @@ -194,7 +255,7 @@ def setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, - media_description_strategy = "openai" if use_multimodal else "contentunderstanding" if use_content_understanding else "none", + media_description_strategy = MediaDescriptionStrategy.OPENAI if use_multimodal else MediaDescriptionStrategy.CONTENTUNDERSTANDING if use_content_understanding else MediaDescriptionStrategy.NONE, openai_client=openai_client, openai_model=openai_model, openai_deployment=openai_deployment, @@ -323,7 +384,7 @@ async def main(strategy: Strategy, setup_index: bool = True): args = parser.parse_args() if args.verbose: - logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) + logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)], level=logging.WARNING) # We only set the level to INFO for our logger, # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.DEBUG) @@ -397,31 +458,38 @@ async def main(strategy: Strategy, setup_index: bool = True): datalake_key=clean_key_if_exists(args.datalakekey), ) - openai_host = os.environ["OPENAI_HOST"] - openai_key = None - if os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): - openai_key = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") - elif not openai_host.startswith("azure") and os.getenv("OPENAI_API_KEY"): - openai_key = os.getenv("OPENAI_API_KEY") - - openai_dimensions = 1536 + openai_host = OpenAIHost(os.environ["OPENAI_HOST"]) + # 
https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release + azure_openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01" + emb_model_dimensions = 1536 if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): - openai_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) + emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) openai_embeddings_service = setup_embeddings_service( azure_credential=azd_credential, openai_host=openai_host, - openai_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], - openai_service=os.getenv("AZURE_OPENAI_SERVICE"), - openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), - openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), - # https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release - openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01", - openai_dimensions=openai_dimensions, - openai_key=clean_key_if_exists(openai_key), + emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + emb_model_dimensions=emb_model_dimensions, + azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + azure_openai_api_version=azure_openai_api_version, + azure_openai_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), + openai_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), openai_org=os.getenv("OPENAI_ORGANIZATION"), disable_vectors=dont_use_vectors, disable_batch_vectors=args.disablebatchvectors, ) + openai_client = setup_openai_client( + openai_host=openai_host, + azure_openai_api_version=azure_openai_api_version, + azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), + azure_credential=azd_credential, + openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), + openai_organization=os.getenv("OPENAI_ORGANIZATION"), + ) + ingestion_strategy: Strategy if use_int_vectorization: @@ -452,6 +520,9 @@ async def main(strategy: Strategy, setup_index: bool = True): use_content_understanding=use_content_understanding, use_multimodal=use_multimodal, content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), + openai_client=openai_client, + openai_model=os.getenv("AZURE_OPENAI_CHATGPT_MODEL"), + openai_deployment=os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") if openai_host == OpenAIHost.AZURE else None, ) image_embeddings_service = setup_image_embeddings_service( diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index d5c21e0d41..30b47f136e 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -60,11 +60,11 @@ async def upload_blob(self, file: File) -> Optional[list[str]]: blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True) file.url = blob_client.url - if self.store_page_images: - if os.path.splitext(file.content.name)[1].lower() == ".pdf": - return await self.upload_pdf_blob_images(service_client, container_client, file) - else: - logger.info("File %s is not a PDF, skipping image upload", file.content.name) + #if self.store_page_images: + # if os.path.splitext(file.content.name)[1].lower() == ".pdf": + # return await self.upload_pdf_blob_images(service_client, container_client, file) + # else: + # logger.info("File %s is not a 
PDF, skipping image upload", file.content.name) return None diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index 5a6b2639b8..3d284f29c0 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -121,17 +121,20 @@ async def describe_image(self, image_bytes: bytes) -> str: response = await self.openai_client.chat.completions.create( model=self.model if self.deployment is None else self.deployment, + max_tokens=500, messages=[ { "role": "system", - "content": "You are a helpful assistant that describes images.", + "content": "You are a helpful assistant that describes images from organizational documents.", }, { "role": "user", "content": - [{"text": "Describe this image in detail", "type": "text"}, - {"image_url": {"url": image_datauri}, "type": "image_url"}] + [{"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.", "type": "text"}, + {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "low"}] } ]) - return response.choices[0].message.content.strip() if response.choices else "" + description = response.choices[0].message.content.strip() if response.choices else "" + print(description) + return description diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index f7204c4d50..94243ef0e8 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -61,7 +61,6 @@ def __init__( endpoint: str, credential: Union[AsyncTokenCredential, AzureKeyCredential], model_id="prebuilt-layout", - include_media_description: bool = False, media_description_strategy: Enum = MediaDescriptionStrategy.NONE, # If using OpenAI, this is the client to use openai_client: Union[AsyncOpenAI, None] = None, @@ -275,6 +274,10 @@ def crop_image_from_pdf_page( pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect) img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples) + # print out the number of pixels + print(f"Cropped image size: {img.size} pixels") bytes_io = io.BytesIO() img.save(bytes_io, format="PNG") + with open(f"cropped_page_{page_number + 1}.png", "wb") as f: + f.write(bytes_io.getvalue()) return bytes_io.getvalue() diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index b5b401c177..6e25b34d57 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -298,6 +298,7 @@ async def create_index(self): field.name == self.field_name_embedding for field in existing_index.fields ): logger.info("Adding %s field for text embeddings", self.field_name_embedding) + embedding_field.stored = True existing_index.fields.append(embedding_field) if existing_index.vector_search is None: raise ValueError("Vector search is not enabled for the existing index") diff --git a/app/frontend/package-lock.json b/app/frontend/package-lock.json index b4ec8fb7a0..6da48b3591 100644 --- a/app/frontend/package-lock.json +++ b/app/frontend/package-lock.json @@ -44,7 +44,7 @@ "vite": "^5.4.18" }, "engines": { - "node": ">=14.0.0" + "node": ">=20.0.0" } }, "node_modules/@ampproject/remapping": { From ea3ee284f54934904c824572a16ca838926046e5 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 3 Jun 2025 06:55:48 +0000 Subject: [PATCH 05/77] More prepdocs improvements for image handling --- app/backend/prepdocslib/blobmanager.py | 21 +++--- app/backend/prepdocslib/embeddings.py 
| 24 +++---- app/backend/prepdocslib/filestrategy.py | 27 +++++--- app/backend/prepdocslib/goals.json | 11 +++- app/backend/prepdocslib/mediadescriber.py | 5 +- app/backend/prepdocslib/page.py | 32 ++++++--- app/backend/prepdocslib/pdfparser.py | 29 +++++---- app/backend/prepdocslib/searchmanager.py | 79 ++++++++++++----------- infra/main.bicep | 2 +- 9 files changed, 133 insertions(+), 97 deletions(-) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index 30b47f136e..628d3e0fc7 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -56,16 +56,21 @@ async def upload_blob(self, file: File) -> Optional[list[str]]: if file.url is None: with open(file.content.name, "rb") as reopened_file: blob_name = BlobManager.blob_name_from_file_name(file.content.name) - logger.info("Uploading blob for whole file -> %s", blob_name) + logger.info("Uploading blob for document %s", blob_name) blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True) file.url = blob_client.url - - #if self.store_page_images: - # if os.path.splitext(file.content.name)[1].lower() == ".pdf": - # return await self.upload_pdf_blob_images(service_client, container_client, file) - # else: - # logger.info("File %s is not a PDF, skipping image upload", file.content.name) - + return None + + async def upload_document_image(self, document_file: File, image_bytes: bytes, image_filename: str) -> Optional[str]: + async with BlobServiceClient( + account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024 + ) as service_client, service_client.get_container_client(self.container) as container_client: + if not await container_client.exists(): + await container_client.create_container() + blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename + logger.info("Uploading blob for document image %s", blob_name) + blob_client = await container_client.upload_blob(blob_name, io.BytesIO(image_bytes), overwrite=True) + return blob_client.url return None def get_managedidentity_connectionstring(self): diff --git a/app/backend/prepdocslib/embeddings.py b/app/backend/prepdocslib/embeddings.py index df56f39c08..64c11fe14d 100644 --- a/app/backend/prepdocslib/embeddings.py +++ b/app/backend/prepdocslib/embeddings.py @@ -236,28 +236,24 @@ def __init__(self, endpoint: str, token_provider: Callable[[], Awaitable[str]]): self.token_provider = token_provider self.endpoint = endpoint - async def create_embeddings(self, blob_urls: list[str]) -> list[list[float]]: + async def create_embedding(self, image_bytes: bytes) -> list[float]: endpoint = urljoin(self.endpoint, "computervision/retrieval:vectorizeImage") - headers = {"Content-Type": "application/json"} params = {"api-version": "2024-02-01", "model-version": "2023-04-15"} - headers["Authorization"] = "Bearer " + await self.token_provider() + headers = {"Authorization": "Bearer " + await self.token_provider()} - embeddings: list[list[float]] = [] async with aiohttp.ClientSession(headers=headers) as session: - for blob_url in blob_urls: - async for attempt in AsyncRetrying( - retry=retry_if_exception_type(Exception), + async for attempt in AsyncRetrying( + retry=retry_if_exception_type(Exception), wait=wait_random_exponential(min=15, max=60), stop=stop_after_attempt(15), before_sleep=self.before_retry_sleep, ): - with attempt: - body = {"url": blob_url} - async with session.post(url=endpoint, params=params, json=body) as 
resp: - resp_json = await resp.json() - embeddings.append(resp_json["vector"]) - - return embeddings + with attempt: + async with session.post(url=endpoint, params=params, data=image_bytes) as resp: + resp_json = await resp.json() + return resp_json["vector"] + + return [] def before_retry_sleep(self, retry_state): logger.info("Rate limited on the Vision embeddings API, sleeping before retrying...") diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index 37f399cf4b..b44ea46ae7 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -18,7 +18,8 @@ async def parse_file( file: File, file_processors: dict[str, FileProcessor], category: Optional[str] = None, - image_embeddings: Optional[ImageEmbeddings] = None, + blob_manager: Optional[BlobManager] = None, + image_embeddings_client: Optional[ImageEmbeddings] = None, ) -> list[Section]: key = file.file_extension().lower() processor = file_processors.get(key) @@ -27,12 +28,24 @@ async def parse_file( return [] logger.info("Ingesting '%s'", file.filename()) pages = [page async for page in processor.parser.parse(content=file.content)] + for page in pages: + for image in page.images: + if image.url is None: + image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename) + if image_embeddings_client: + image.embedding = await image_embeddings_client.create_embedding(image.bytes) logger.info("Splitting '%s' into sections", file.filename()) - if image_embeddings: - logger.warning("Each page will be split into smaller chunks of text, but images will be of the entire page.") sections = [ Section(split_page, content=file, category=category) for split_page in processor.splitter.split_pages(pages) ] + # For now, add the images back to each split page based off split_page.page_num + for section in sections: + section.split_page.images = [ + image for page in pages if page.page_num == section.split_page.page_num for image in page.images + ] + logger.info( + "Section for page %d has %d images", section.split_page.page_num, len(section.split_page.images) + ) return sections @@ -102,13 +115,9 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: - sections = await parse_file(file, self.file_processors, self.category, self.image_embeddings) + sections = await parse_file(file, self.file_processors, self.category, self.blob_manager, self.image_embeddings) if sections: - blob_sas_uris = await self.blob_manager.upload_blob(file) - blob_image_embeddings: Optional[list[list[float]]] = None - if self.image_embeddings and blob_sas_uris: - blob_image_embeddings = await self.image_embeddings.create_embeddings(blob_sas_uris) - await self.search_manager.update_content(sections, blob_image_embeddings, url=file.url) + await self.search_manager.update_content(sections, url=file.url) finally: if file: file.close() diff --git a/app/backend/prepdocslib/goals.json b/app/backend/prepdocslib/goals.json index 523b48252b..61e0577fde 100644 --- a/app/backend/prepdocslib/goals.json +++ b/app/backend/prepdocslib/goals.json @@ -5,5 +5,12 @@ "oids": [], "groups": [], "images": # collection of objects with fields https://learn.microsoft.com/en-us/azure/search/vector-search-multi-vector-fields - [ {embedding, url, verbalization, boundingbox}, - {embedding, url, verbalization, boundingbox} ] \ No newline at end of file + [ {embedding, url, description, boundingbox}, + {embedding, url, description, boundingbox} ] + +# Consider gpt-4.1-mini 
as default: pricier? but relatively not pricey compared to o3 and gpt-4o. run our evals. its better at instruction following. + +# Parse each page, get back text with descritpions, associate each page with images on that page +# Each image needs the citation file.pdf#figure=1 via Pillow +# Each image needs to be stored in Blob storage +# Update the search index with all the info \ No newline at end of file diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index 3d284f29c0..165c856ab4 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -86,7 +86,6 @@ async def create_analyzer(self): await self.poll_api(session, poll_url, headers) async def describe_image(self, image_bytes: bytes) -> str: - logger.info("Sending image to Azure Content Understanding service...") async with aiohttp.ClientSession() as session: token = await self.credential.get_token("https://cognitiveservices.azure.com/.default") headers = {"Authorization": "Bearer " + token.token} @@ -115,7 +114,6 @@ def __init__(self, openai_client: AsyncOpenAI, model: str, deployment: str): self.deployment = deployment async def describe_image(self, image_bytes: bytes) -> str: - logger.info("Describing image using LLM...") image_base64 = base64.b64encode(image_bytes).decode("utf-8") image_datauri = f"data:image/png;base64,{image_base64}" @@ -131,10 +129,9 @@ async def describe_image(self, image_bytes: bytes) -> str: "role": "user", "content": [{"text": "Describe image with no more than 5 sentences. Do not speculate about anything you don't know.", "type": "text"}, - {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "low"}] + {"image_url": {"url": image_datauri}, "type": "image_url", "detail": "auto"}] } ]) description = response.choices[0].message.content.strip() if response.choices else "" - print(description) return description diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index 857235c571..d203b5e159 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -1,3 +1,17 @@ +from typing import Sequence +from dataclasses import dataclass, field + + +@dataclass +class ImageOnPage: + bytes: bytes + bbox: tuple[float, float, float, float] + filename: str + description: str + url: str | None = None + embedding: list[float] | None = None + +@dataclass class Page: """ A single page from a document @@ -7,13 +21,12 @@ class Page: offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") text (str): The text of the page """ + page_num: int + offset: int + text: str + images: list[ImageOnPage] = field(default_factory=list) - def __init__(self, page_num: int, offset: int, text: str): - self.page_num = page_num - self.offset = offset - self.text = text - - +@dataclass class SplitPage: """ A section of a page that has been split into a smaller chunk. 
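As a rough illustration of the index shape sketched in goals.json above, the following minimal Python sketch flattens one split page into a search document, assuming the ImageOnPage and SplitPage fields land as defined in this patch; the to_index_document helper, the stand-in dataclasses, and the sample values are illustrative only, not repository code.

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ImageOnPage:
    # Simplified stand-in for the ImageOnPage dataclass added in prepdocslib/page.py
    bytes: bytes
    bbox: list[float]  # pixel coordinates [x0, y0, x1, y1]
    filename: str
    description: str
    url: Optional[str] = None
    embedding: Optional[list[float]] = None


@dataclass
class SplitPage:
    # Simplified stand-in for the SplitPage dataclass added in prepdocslib/page.py
    page_num: int
    text: str
    images: list[ImageOnPage] = field(default_factory=list)


def to_index_document(doc_id: str, split_page: SplitPage, sourcefile: str) -> dict:
    # Flatten one chunk into the document shape sketched in goals.json:
    # top-level text fields plus an "images" collection holding
    # url/description/boundingbox/embedding per extracted figure.
    return {
        "id": doc_id,
        "content": split_page.text,
        "sourcepage": f"{sourcefile}#page={split_page.page_num + 1}",
        "sourcefile": sourcefile,
        "oids": [],
        "groups": [],
        "images": [
            {
                "url": image.url,
                "description": image.description,
                "boundingbox": image.bbox,
                "embedding": image.embedding,
            }
            for image in split_page.images
        ],
    }


if __name__ == "__main__":
    page = SplitPage(
        page_num=1,
        text="Figure 1 shows quarterly revenue by region.",
        images=[
            ImageOnPage(
                bytes=b"",
                bbox=[72.0, 144.0, 420.5, 388.0],
                filename="page_2_figure_1.png",
                description="Bar chart of quarterly revenue by region.",
                url="https://account.blob.core.windows.net/images/bla.pdf/page_2_figure_1.png",
            )
        ],
    )
    print(to_index_document("bla-pdf-page-1", page, "bla.pdf"))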
@@ -22,7 +35,6 @@ class SplitPage: page_num (int): Page number (0-indexed) text (str): The text of the section """ - - def __init__(self, page_num: int, text: str): - self.page_num = page_num - self.text = text + page_num: int + text: str + images: list[ImageOnPage] = field(default_factory=list) \ No newline at end of file diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 94243ef0e8..4b8daa6dd5 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -21,7 +21,7 @@ from openai import AsyncOpenAI from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber -from .page import Page +from .page import Page, ImageOnPage from .parser import Parser logger = logging.getLogger("scripts") @@ -50,6 +50,8 @@ class MediaDescriptionStrategy(Enum): OPENAI = "openai" CONTENTUNDERSTANDING = "content_understanding" + + class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages @@ -68,6 +70,7 @@ def __init__( openai_deployment: Optional[str] = None, # If using Content Understanding, this is the endpoint for the service content_understanding_endpoint: Union[str, None] = None, + # should this take the blob storage info too? ): self.model_id = model_id self.endpoint = endpoint @@ -137,6 +140,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: analyze_result: AnalyzeResult = await poller.result() offset = 0 + for page in analyze_result.pages: tables_on_page = [ table @@ -150,6 +154,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: for figure in (analyze_result.figures or []) if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number ] + page_images: list[ImageOnPage] = [] class ObjectType(Enum): NONE = -1 @@ -195,24 +200,25 @@ class ObjectType(Enum): if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: - figure_html = await DocumentAnalysisParser.figure_to_html( + image_on_page = await DocumentAnalysisParser.process_figure( doc_for_pymupdf, figures_on_page[object_idx], media_describer ) - page_text += figure_html + page_images.append(image_on_page) + page_text += image_on_page.description added_objects.add(mask_char) # We remove these comments since they are not needed and skew the page numbers page_text = page_text.replace("", "") # We remove excess newlines at the beginning and end of the page page_text = page_text.strip() - yield Page(page_num=page.page_number - 1, offset=offset, text=page_text) + yield Page(page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images) offset += len(page_text) @staticmethod - async def figure_to_html( + async def process_figure( doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber ) -> str: figure_title = (figure.caption and figure.caption.content) or "" - logger.info("Describing figure %s with title '%s'", figure.id, figure_title) + logger.info("Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__) if not figure.bounding_regions: return f"
<figure><figcaption>{figure_title}</figcaption></figure>
" if len(figure.bounding_regions) > 1: @@ -228,7 +234,12 @@ async def figure_to_html( page_number = first_region["pageNumber"] # 1-indexed cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) figure_description = await media_describer.describe_image(cropped_img) - return f"
<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>
" + return ImageOnPage( + bytes=cropped_img, + filename=f"page_{page_number}_figure_{figure.id}.png", + bbox=bounding_box, + description=f"
<figure><figcaption>{figure_title}<br>{figure_description}</figcaption></figure>
" + ) @staticmethod def table_to_html(table: DocumentTable): @@ -274,10 +285,6 @@ def crop_image_from_pdf_page( pix = page.get_pixmap(matrix=pymupdf.Matrix(page_dpi / bbox_dpi, page_dpi / bbox_dpi), clip=rect) img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples) - # print out the number of pixels - print(f"Cropped image size: {img.size} pixels") bytes_io = io.BytesIO() img.save(bytes_io, format="PNG") - with open(f"cropped_page_{page_number + 1}.png", "wb") as f: - f.write(bytes_io.getvalue()) return bytes_io.getvalue() diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index 6e25b34d57..fc0575899f 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -46,9 +46,10 @@ class Section: """ def __init__(self, split_page: SplitPage, content: File, category: Optional[str] = None): - self.split_page = split_page - self.content = content - self.category = category + self.split_page = split_page # content comes from here + self.content = content # sourcepage and sourcefile come from here + self.category = category + # this also needs images which will become the images field class SearchManager: @@ -82,7 +83,7 @@ async def create_index(self): async with self.search_info.create_search_index_client() as search_index_client: embedding_field = None - image_embedding_field = None + images_field = None text_vector_search_profile = None text_vector_algorithm = None text_vector_compression = None @@ -148,23 +149,29 @@ async def create_index(self): if self.search_images: image_vector_algorithm = HnswAlgorithmConfiguration( - name="image_hnsw_config", + name="images_hnsw_config", parameters=HnswParameters(metric="cosine"), ) image_vector_search_profile = VectorSearchProfile( - name="imageEmbedding-profile", + name="images_embedding_profile", algorithm_configuration_name=image_vector_algorithm.name, ) - image_embedding_field = SearchField( - name="imageEmbedding", - type=SearchFieldDataType.Collection(SearchFieldDataType.Single), - hidden=False, - searchable=True, - filterable=False, - sortable=False, - facetable=False, - vector_search_dimensions=1024, - vector_search_profile_name=image_vector_search_profile.name, + images_field = SearchField( + name="images", + type=SearchFieldDataType.Collection(SearchFieldDataType.ComplexType), + fields=[ + SearchField(name="embedding", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + searchable=True, + stored=False, + vector_search_dimensions=1024, + vector_search_profile_name=image_vector_search_profile.name), + SearchField(name="url", type=SearchFieldDataType.String, searchable=False, filterable=True, sortable=False, facetable=True), + SearchField(name="description", type=SearchFieldDataType.String, searchable=True, filterable=False, sortable=False, facetable=False), + SearchField(name="boundingbox", + type=SearchFieldDataType.Collection(SearchFieldDataType.Int32), + searchable=False, filterable=False, sortable=False, facetable=False), + ] ) if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: @@ -247,9 +254,9 @@ async def create_index(self): vector_algorithms.append(text_vector_algorithm) vector_compressions.append(text_vector_compression) - if image_embedding_field: - logger.info("Including %s field for image vectors in new index", image_embedding_field.name) - fields.append(image_embedding_field) + if images_field: + logger.info("Including %s field for image descriptions and vectors in new 
index", images_field.name) + fields.append(images_field) if image_vector_search_profile is None or image_vector_algorithm is None: raise ValueError("Image search profile and algorithm must be set") vector_search_profiles.append(image_vector_search_profile) @@ -323,9 +330,10 @@ async def create_index(self): existing_index.vector_search.compressions.append(text_vector_compression) await search_index_client.create_or_update_index(existing_index) - if image_embedding_field and not any(field.name == "imageEmbedding" for field in existing_index.fields): - logger.info("Adding %s field for image embeddings", image_embedding_field.name) - existing_index.fields.append(image_embedding_field) + if images_field and not any(field.name == "images" for field in existing_index.fields): + logger.info("Adding %s field for image embeddings", images_field.name) + images_field.fields[0].stored = True + existing_index.fields.append(images_field) if image_vector_search_profile is None or image_vector_algorithm is None: raise ValueError("Image vector search profile and algorithm must be set") if existing_index.vector_search is None: @@ -412,7 +420,7 @@ async def create_agent(self): logger.info("Agent %s created successfully", self.search_info.agent_name) async def update_content( - self, sections: list[Section], image_embeddings: Optional[list[list[float]]] = None, url: Optional[str] = None + self, sections: list[Section], url: Optional[str] = None ): MAX_BATCH_SIZE = 1000 section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)] @@ -424,18 +432,16 @@ async def update_content( "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}", "content": section.split_page.text, "category": section.category, - "sourcepage": ( - BlobManager.blob_image_name_from_file_page( - filename=section.content.filename(), - page=section.split_page.page_num, - ) - if image_embeddings - else BlobManager.sourcepage_from_file_page( - filename=section.content.filename(), - page=section.split_page.page_num, - ) - ), + "sourcepage": BlobManager.sourcepage_from_file_page(filename=section.content.filename(), page=section.split_page.page_num), "sourcefile": section.content.filename(), + "images": [ + { + "url": image.url, + "description": image.description, + #"boundingbox": list(image.bbox), # TODO: decide if it should be a float, ask mattg + "embedding": image.embedding, + } + for image in section.split_page.images], **section.content.acls, } for section_index, section in enumerate(batch) @@ -451,10 +457,7 @@ async def update_content( ) for i, document in enumerate(documents): document[self.field_name_embedding] = embeddings[i] - if image_embeddings: - for i, (document, section) in enumerate(zip(documents, batch)): - document["imageEmbedding"] = image_embeddings[section.split_page.page_num] - + logger.info("Uploading batch %d with %d sections to search index '%s'", batch_index + 1, len(documents), self.search_info.index_name) await search_client.upload_documents(documents) async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None): diff --git a/infra/main.bicep b/infra/main.bicep index 299b85e214..ae18d38aa1 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -154,7 +154,7 @@ var chatGpt = { deploymentName: !empty(chatGptDeploymentName) ? chatGptDeploymentName : 'chat' deploymentVersion: !empty(chatGptDeploymentVersion) ? chatGptDeploymentVersion : '2024-07-18' deploymentSkuName: !empty(chatGptDeploymentSkuName) ? 
chatGptDeploymentSkuName : 'GlobalStandard' // Not backward-compatible - deploymentCapacity: chatGptDeploymentCapacity != 0 ? chatGptDeploymentCapacity : 30 + deploymentCapacity: chatGptDeploymentCapacity != 0 ? chatGptDeploymentCapacity : 50 } param embeddingModelName string = '' From e85f8c570925fc89310254e938068c076d247872 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 3 Jun 2025 12:31:19 -0700 Subject: [PATCH 06/77] Store bbox as list of pixel floats, add storage container just for extracted images --- app/backend/app.py | 22 +++++++- app/backend/config.py | 3 +- app/backend/prepdocs.py | 33 +++++++++--- app/backend/prepdocslib/blobmanager.py | 14 +++-- app/backend/prepdocslib/page.py | 9 ++-- app/backend/prepdocslib/pdfparser.py | 45 +++++++++------- app/backend/prepdocslib/searchmanager.py | 65 +++++++++++++++++------- infra/main.bicep | 9 ++++ 8 files changed, 146 insertions(+), 54 deletions(-) diff --git a/app/backend/app.py b/app/backend/app.py index a8f423510c..f9215a9682 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -66,8 +66,10 @@ CONFIG_CHAT_HISTORY_COSMOS_ENABLED, CONFIG_CREDENTIAL, CONFIG_DEFAULT_REASONING_EFFORT, + CONFIG_IMAGE_BLOB_CONTAINER_CLIENT, # Added this line CONFIG_INGESTER, CONFIG_LANGUAGE_PICKER_ENABLED, + CONFIG_MULTIMODAL_ENABLED, CONFIG_OPENAI_CLIENT, CONFIG_QUERY_REWRITING_ENABLED, CONFIG_REASONING_EFFORT_ENABLED, @@ -84,7 +86,6 @@ CONFIG_USER_BLOB_CONTAINER_CLIENT, CONFIG_USER_UPLOAD_ENABLED, CONFIG_VECTOR_SEARCH_ENABLED, - CONFIG_MULTIMODAL_ENABLED ) from core.authentication import AuthenticationHelper from core.sessionhelper import create_session_id @@ -182,7 +183,9 @@ async def ask(auth_claims: dict[str, Any]): context["auth_claims"] = auth_claims try: approach: Approach = cast(Approach, current_app.config[CONFIG_ASK_APPROACH]) - r = await approach.run(request_json["messages"], context=context, session_state=request_json.get("session_state")) + r = await approach.run( + request_json["messages"], context=context, session_state=request_json.get("session_state") + ) return jsonify(r) except Exception as error: return error_response(error, "/ask") @@ -404,6 +407,7 @@ async def setup_clients(): # Replace these with your own values, either in environment variables or directly here AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"] AZURE_STORAGE_CONTAINER = os.environ["AZURE_STORAGE_CONTAINER"] + AZURE_IMAGESTORAGE_CONTAINER = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER") AZURE_USERSTORAGE_ACCOUNT = os.environ.get("AZURE_USERSTORAGE_ACCOUNT") AZURE_USERSTORAGE_CONTAINER = os.environ.get("AZURE_USERSTORAGE_CONTAINER") AZURE_SEARCH_SERVICE = os.environ["AZURE_SEARCH_SERVICE"] @@ -511,6 +515,15 @@ async def setup_clients(): f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", AZURE_STORAGE_CONTAINER, credential=azure_credential ) + # Set up the image storage container client if configured + image_blob_container_client = None + if AZURE_IMAGESTORAGE_CONTAINER: + image_blob_container_client = ContainerClient( + f"https://{AZURE_STORAGE_ACCOUNT}.blob.core.windows.net", + AZURE_IMAGESTORAGE_CONTAINER, + credential=azure_credential, + ) + # Set up authentication helper search_index = None if AZURE_USE_AUTHENTICATION: @@ -636,6 +649,8 @@ async def setup_clients(): current_app.config[CONFIG_SEARCH_CLIENT] = search_client current_app.config[CONFIG_AGENT_CLIENT] = agent_client current_app.config[CONFIG_BLOB_CONTAINER_CLIENT] = blob_container_client + if image_blob_container_client: + current_app.config[CONFIG_IMAGE_BLOB_CONTAINER_CLIENT] 
= image_blob_container_client current_app.config[CONFIG_AUTH_CLIENT] = auth_helper current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED] = AZURE_SEARCH_SEMANTIC_RANKER != "disabled" @@ -710,12 +725,15 @@ async def setup_clients(): vision_token_provider=token_provider, ) + @bp.after_app_serving async def close_clients(): await current_app.config[CONFIG_SEARCH_CLIENT].close() await current_app.config[CONFIG_BLOB_CONTAINER_CLIENT].close() if current_app.config.get(CONFIG_USER_BLOB_CONTAINER_CLIENT): await current_app.config[CONFIG_USER_BLOB_CONTAINER_CLIENT].close() + if current_app.config.get(CONFIG_IMAGE_BLOB_CONTAINER_CLIENT): + await current_app.config[CONFIG_IMAGE_BLOB_CONTAINER_CLIENT].close() def create_app(): diff --git a/app/backend/config.py b/app/backend/config.py index b881ff857f..70fd591a56 100644 --- a/app/backend/config.py +++ b/app/backend/config.py @@ -3,6 +3,7 @@ CONFIG_ASK_APPROACH = "ask_approach" CONFIG_CHAT_APPROACH = "chat_approach" CONFIG_BLOB_CONTAINER_CLIENT = "blob_container_client" +CONFIG_IMAGE_BLOB_CONTAINER_CLIENT = "image_blob_container_client" CONFIG_USER_UPLOAD_ENABLED = "user_upload_enabled" CONFIG_USER_BLOB_CONTAINER_CLIENT = "user_blob_container_client" CONFIG_AUTH_CLIENT = "auth_client" @@ -30,4 +31,4 @@ CONFIG_COSMOS_HISTORY_CLIENT = "cosmos_history_client" CONFIG_COSMOS_HISTORY_CONTAINER = "cosmos_history_container" CONFIG_COSMOS_HISTORY_VERSION = "cosmos_history_version" -CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled" \ No newline at end of file +CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled" diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 0f618f8dcd..1eb918a3ca 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -2,13 +2,14 @@ import asyncio import logging import os +from enum import Enum from typing import Optional, Union from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider -from rich.logging import RichHandler from openai import AsyncAzureOpenAI, AsyncOpenAI +from rich.logging import RichHandler from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager @@ -31,11 +32,14 @@ LocalListFileStrategy, ) from prepdocslib.parser import Parser -from prepdocslib.pdfparser import DocumentAnalysisParser, LocalPdfParser, MediaDescriptionStrategy +from prepdocslib.pdfparser import ( + DocumentAnalysisParser, + LocalPdfParser, + MediaDescriptionStrategy, +) from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter -from enum import Enum logger = logging.getLogger("scripts") @@ -86,11 +90,14 @@ def setup_blob_manager( subscription_id: str, store_page_images: bool, storage_key: Union[str, None] = None, + image_storage_container: Union[str, None] = None, # Added this parameter ): storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key + return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, + image_container=image_storage_container, account=storage_account, credential=storage_creds, resourceGroup=storage_resource_group, @@ -178,6 +185,7 @@ def setup_embeddings_service( disable_batch=disable_batch_vectors, ) + def setup_openai_client( openai_host: OpenAIHost, azure_openai_api_key: Union[str, None] = None, @@ -231,6 
+239,7 @@ def setup_openai_client( ) return openai_client + def setup_file_processors( azure_credential: AsyncTokenCredential, document_intelligence_service: Union[str, None], @@ -255,7 +264,15 @@ def setup_file_processors( doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=documentintelligence_creds, - media_description_strategy = MediaDescriptionStrategy.OPENAI if use_multimodal else MediaDescriptionStrategy.CONTENTUNDERSTANDING if use_content_understanding else MediaDescriptionStrategy.NONE, + media_description_strategy=( + MediaDescriptionStrategy.OPENAI + if use_multimodal + else ( + MediaDescriptionStrategy.CONTENTUNDERSTANDING + if use_content_understanding + else MediaDescriptionStrategy.NONE + ) + ), openai_client=openai_client, openai_model=openai_model, openai_deployment=openai_deployment, @@ -384,7 +401,9 @@ async def main(strategy: Strategy, setup_index: bool = True): args = parser.parse_args() if args.verbose: - logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)], level=logging.WARNING) + logging.basicConfig( + format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)], level=logging.WARNING + ) # We only set the level to INFO for our logger, # to avoid seeing the noisy INFO level logs from the Azure SDKs logger.setLevel(logging.DEBUG) @@ -448,6 +467,7 @@ async def main(strategy: Strategy, setup_index: bool = True): subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], store_page_images=use_multimodal, storage_key=clean_key_if_exists(args.storagekey), + image_storage_container=os.environ.get("AZURE_IMAGESTORAGE_CONTAINER"), # Pass the image container ) list_file_strategy = setup_list_file_strategy( azure_credential=azd_credential, @@ -460,7 +480,7 @@ async def main(strategy: Strategy, setup_index: bool = True): openai_host = OpenAIHost(os.environ["OPENAI_HOST"]) # https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release - azure_openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01" + azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01" emb_model_dimensions = 1536 if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) @@ -490,7 +510,6 @@ async def main(strategy: Strategy, setup_index: bool = True): openai_organization=os.getenv("OPENAI_ORGANIZATION"), ) - ingestion_strategy: Strategy if use_int_vectorization: diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index 628d3e0fc7..f1d6764394 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -35,11 +35,13 @@ def __init__( resourceGroup: str, subscriptionId: str, store_page_images: bool = False, + image_container: Optional[str] = None, # Added this parameter ): self.endpoint = endpoint self.credential = credential self.account = account self.container = container + self.image_container = image_container self.store_page_images = store_page_images self.resourceGroup = resourceGroup self.subscriptionId = subscriptionId @@ -60,11 +62,17 @@ async def upload_blob(self, file: File) -> Optional[list[str]]: blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True) file.url = blob_client.url return None - - async def upload_document_image(self, document_file: File, image_bytes: bytes, image_filename: str) -> 
Optional[str]: + + async def upload_document_image( + self, document_file: File, image_bytes: bytes, image_filename: str + ) -> Optional[str]: + if self.image_container is None: + raise ValueError( + "Image container name is not set. Re-run `azd provision` to automatically set up the images container." + ) async with BlobServiceClient( account_url=self.endpoint, credential=self.credential, max_single_put_size=4 * 1024 * 1024 - ) as service_client, service_client.get_container_client(self.container) as container_client: + ) as service_client, service_client.get_container_client(self.image_container) as container_client: if not await container_client.exists(): await container_client.create_container() blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index d203b5e159..8f444b142e 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -1,16 +1,16 @@ -from typing import Sequence from dataclasses import dataclass, field @dataclass class ImageOnPage: bytes: bytes - bbox: tuple[float, float, float, float] + bbox: list[float, float, float, float] # Pixels filename: str description: str url: str | None = None embedding: list[float] | None = None + @dataclass class Page: """ @@ -21,11 +21,13 @@ class Page: offset (int): If the text of the entire Document was concatenated into a single string, the index of the first character on the page. For example, if page 1 had the text "hello" and page 2 had the text "world", the offset of page 2 is 5 ("hellow") text (str): The text of the page """ + page_num: int offset: int text: str images: list[ImageOnPage] = field(default_factory=list) + @dataclass class SplitPage: """ @@ -35,6 +37,7 @@ class SplitPage: page_num (int): Page number (0-indexed) text (str): The text of the section """ + page_num: int text: str - images: list[ImageOnPage] = field(default_factory=list) \ No newline at end of file + images: list[ImageOnPage] = field(default_factory=list) diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index 4b8daa6dd5..b5d936f225 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -3,7 +3,7 @@ import logging from collections.abc import AsyncGenerator from enum import Enum -from typing import IO, Union, Optional +from typing import IO, Optional, Union import pymupdf from azure.ai.documentintelligence.aio import DocumentIntelligenceClient @@ -16,12 +16,16 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.core.exceptions import HttpResponseError +from openai import AsyncOpenAI from PIL import Image from pypdf import PdfReader -from openai import AsyncOpenAI -from .mediadescriber import MediaDescriber, ContentUnderstandingDescriber, MultimodalModelDescriber -from .page import Page, ImageOnPage +from .mediadescriber import ( + ContentUnderstandingDescriber, + MediaDescriber, + MultimodalModelDescriber, +) +from .page import ImageOnPage, Page from .parser import Parser logger = logging.getLogger("scripts") @@ -51,7 +55,6 @@ class MediaDescriptionStrategy(Enum): CONTENTUNDERSTANDING = "content_understanding" - class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages @@ -97,18 +100,22 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: 
media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: if self.content_understanding_endpoint is None: - raise ValueError("Content Understanding endpoint must be provided when using Content Understanding strategy") + raise ValueError( + "Content Understanding endpoint must be provided when using Content Understanding strategy" + ) if isinstance(self.credential, AzureKeyCredential): raise ValueError( "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" ) media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) - + if self.media_description_strategy == MediaDescriptionStrategy.OPENAI: if self.openai_client is None or self.openai_model is None: raise ValueError("OpenAI client must be provided when using OpenAI media description strategy") - media_describer = MultimodalModelDescriber(self.openai_client, self.openai_model, self.openai_deployment) - + media_describer = MultimodalModelDescriber( + self.openai_client, self.openai_model, self.openai_deployment + ) + if media_describer is not None: content_bytes = content.read() try: @@ -140,7 +147,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: analyze_result: AnalyzeResult = await poller.result() offset = 0 - + for page in analyze_result.pages: tables_on_page = [ table @@ -214,11 +221,11 @@ class ObjectType(Enum): offset += len(page_text) @staticmethod - async def process_figure( - doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber - ) -> str: + async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber) -> str: figure_title = (figure.caption and figure.caption.content) or "" - logger.info("Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__) + logger.info( + "Describing figure %s with title '%s' using %s", figure.id, figure_title, type(media_describer).__name__ + ) if not figure.bounding_regions: return f"
{figure_title}
" if len(figure.bounding_regions) > 1: @@ -232,13 +239,13 @@ async def process_figure( first_region.polygon[5], # y1 (bottom) ) page_number = first_region["pageNumber"] # 1-indexed - cropped_img = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) + cropped_img, bbox_pixels = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) figure_description = await media_describer.describe_image(cropped_img) return ImageOnPage( bytes=cropped_img, filename=f"page_{page_number}_figure_{figure.id}.png", - bbox=bounding_box, - description=f"
{figure_title}
{figure_description}
" + bbox=bbox_pixels, + description=f"
{figure_title}
{figure_description}
", ) @staticmethod @@ -265,7 +272,7 @@ def table_to_html(table: DocumentTable): @staticmethod def crop_image_from_pdf_page( doc: pymupdf.Document, page_number: int, bbox_inches: tuple[float, float, float, float] - ) -> bytes: + ) -> tuple[bytes, list[float]]: """ Crops a region from a given page in a PDF and returns it as an image. @@ -287,4 +294,4 @@ def crop_image_from_pdf_page( img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples) bytes_io = io.BytesIO() img.save(bytes_io, format="PNG") - return bytes_io.getvalue() + return bytes_io.getvalue(), bbox_pixels diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index fc0575899f..95f07a32f3 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -46,9 +46,9 @@ class Section: """ def __init__(self, split_page: SplitPage, content: File, category: Optional[str] = None): - self.split_page = split_page # content comes from here - self.content = content # sourcepage and sourcefile come from here - self.category = category + self.split_page = split_page # content comes from here + self.content = content # sourcepage and sourcefile come from here + self.category = category # this also needs images which will become the images field @@ -160,18 +160,39 @@ async def create_index(self): name="images", type=SearchFieldDataType.Collection(SearchFieldDataType.ComplexType), fields=[ - SearchField(name="embedding", + SearchField( + name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), searchable=True, stored=False, vector_search_dimensions=1024, - vector_search_profile_name=image_vector_search_profile.name), - SearchField(name="url", type=SearchFieldDataType.String, searchable=False, filterable=True, sortable=False, facetable=True), - SearchField(name="description", type=SearchFieldDataType.String, searchable=True, filterable=False, sortable=False, facetable=False), - SearchField(name="boundingbox", - type=SearchFieldDataType.Collection(SearchFieldDataType.Int32), - searchable=False, filterable=False, sortable=False, facetable=False), - ] + vector_search_profile_name=image_vector_search_profile.name, + ), + SearchField( + name="url", + type=SearchFieldDataType.String, + searchable=False, + filterable=True, + sortable=False, + facetable=True, + ), + SearchField( + name="description", + type=SearchFieldDataType.String, + searchable=True, + filterable=False, + sortable=False, + facetable=False, + ), + SearchField( + name="boundingbox", + type=SearchFieldDataType.Collection(SearchFieldDataType.Double), + searchable=False, + filterable=False, + sortable=False, + facetable=False, + ), + ], ) if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: @@ -324,7 +345,7 @@ async def create_index(self): existing_index.vector_search.profiles.append(text_vector_search_profile) if existing_index.vector_search.algorithms is None: existing_index.vector_search.algorithms = [] - #existing_index.vector_search.algorithms.append(text_vector_algorithm) + # existing_index.vector_search.algorithms.append(text_vector_algorithm) if existing_index.vector_search.compressions is None: existing_index.vector_search.compressions = [] existing_index.vector_search.compressions.append(text_vector_compression) @@ -419,9 +440,7 @@ async def create_agent(self): logger.info("Agent %s created successfully", self.search_info.agent_name) - async def update_content( - self, sections: list[Section], url: Optional[str] = None 
- ): + async def update_content(self, sections: list[Section], url: Optional[str] = None): MAX_BATCH_SIZE = 1000 section_batches = [sections[i : i + MAX_BATCH_SIZE] for i in range(0, len(sections), MAX_BATCH_SIZE)] @@ -432,16 +451,19 @@ async def update_content( "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}", "content": section.split_page.text, "category": section.category, - "sourcepage": BlobManager.sourcepage_from_file_page(filename=section.content.filename(), page=section.split_page.page_num), + "sourcepage": BlobManager.sourcepage_from_file_page( + filename=section.content.filename(), page=section.split_page.page_num + ), "sourcefile": section.content.filename(), "images": [ { "url": image.url, "description": image.description, - #"boundingbox": list(image.bbox), # TODO: decide if it should be a float, ask mattg + "boundingbox": image.bbox, "embedding": image.embedding, } - for image in section.split_page.images], + for image in section.split_page.images + ], **section.content.acls, } for section_index, section in enumerate(batch) @@ -457,7 +479,12 @@ async def update_content( ) for i, document in enumerate(documents): document[self.field_name_embedding] = embeddings[i] - logger.info("Uploading batch %d with %d sections to search index '%s'", batch_index + 1, len(documents), self.search_info.index_name) + logger.info( + "Uploading batch %d with %d sections to search index '%s'", + batch_index + 1, + len(documents), + self.search_info.index_name, + ) await search_client.upload_documents(documents) async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None): diff --git a/infra/main.bicep b/infra/main.bicep index d0b0d4e903..d962b1acad 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -47,6 +47,8 @@ param userStorageContainerName string = 'user-content' param tokenStorageContainerName string = 'tokens' +param imageStorageContainerName string = 'images' + param appServiceSkuName string // Set in main.parameters.json @allowed(['azure', 'openai', 'azure_custom']) @@ -450,6 +452,7 @@ var appEnvVariables = { USE_USER_UPLOAD: useUserUpload AZURE_USERSTORAGE_ACCOUNT: useUserUpload ? userStorage.outputs.name : '' AZURE_USERSTORAGE_CONTAINER: useUserUpload ? userStorageContainerName : '' + AZURE_IMAGESTORAGE_CONTAINER: useMultimodal ? imageStorageContainerName : '' AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligence.outputs.name USE_LOCAL_PDF_PARSER: useLocalPdfParser USE_LOCAL_HTML_PARSER: useLocalHtmlParser @@ -794,6 +797,10 @@ module storage 'core/storage/storage-account.bicep' = { name: storageContainerName publicAccess: 'None' } + { + name: imageStorageContainerName + publicAccess: 'None' + } { name: tokenStorageContainerName publicAccess: 'None' @@ -1309,6 +1316,8 @@ output AZURE_USERSTORAGE_ACCOUNT string = useUserUpload ? userStorage.outputs.na output AZURE_USERSTORAGE_CONTAINER string = userStorageContainerName output AZURE_USERSTORAGE_RESOURCE_GROUP string = storageResourceGroup.name +output AZURE_IMAGESTORAGE_CONTAINER string = useMultimodal ? imageStorageContainerName : '' + output AZURE_AI_PROJECT string = useAiProject ? 
ai.outputs.projectName : '' output AZURE_USE_AUTHENTICATION bool = useAuthentication From 2a73065f275d2d858b108616b737ef338890dd44 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 3 Jun 2025 17:11:57 -0700 Subject: [PATCH 07/77] Getting image citations almost working --- app/backend/app.py | 1 + app/backend/approaches/approach.py | 3 + .../prompts/ask_answer_question.prompty | 9 +- app/backend/approaches/retrievethenread.py | 37 +++++- app/backend/core/imageshelper.py | 36 +++--- app/backend/prepdocslib/blobmanager.py | 115 +++++++----------- app/backend/prepdocslib/filestrategy.py | 9 +- app/backend/prepdocslib/page.py | 2 + app/backend/prepdocslib/pdfparser.py | 4 +- 9 files changed, 114 insertions(+), 102 deletions(-) diff --git a/app/backend/app.py b/app/backend/app.py index f9215a9682..3c6e91068b 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -686,6 +686,7 @@ async def setup_clients(): agent_client=agent_client, openai_client=openai_client, auth_helper=auth_helper, + images_blob_container_client=image_blob_container_client, chatgpt_model=OPENAI_CHATGPT_MODEL, chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT, embedding_model=OPENAI_EMB_MODEL, diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index 7dc81814bd..c9185932cf 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -50,6 +50,7 @@ class Document: score: Optional[float] = None reranker_score: Optional[float] = None search_agent_query: Optional[str] = None + images: Optional[list[dict[str, Any]]] = None def serialize_for_results(self) -> dict[str, Any]: result_dict = { @@ -75,6 +76,7 @@ def serialize_for_results(self) -> dict[str, Any]: "score": self.score, "reranker_score": self.reranker_score, "search_agent_query": self.search_agent_query, + "images": self.images, } return result_dict @@ -238,6 +240,7 @@ async def search( captions=cast(list[QueryCaptionResult], document.get("@search.captions")), score=document.get("@search.score"), reranker_score=document.get("@search.reranker_score"), + images=document.get("images"), ) ) diff --git a/app/backend/approaches/prompts/ask_answer_question.prompty b/app/backend/approaches/prompts/ask_answer_question.prompty index c384ad8b65..464634bb26 100644 --- a/app/backend/approaches/prompts/ask_answer_question.prompty +++ b/app/backend/approaches/prompts/ask_answer_question.prompty @@ -19,10 +19,12 @@ Use 'you' to refer to the individual asking the questions even if they ask with Answer the following question using only the data provided in the sources below. Each source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. If you cannot answer using the sources below, say you don't know. Use below example to answer. -{% if use_images %} -Each image source has the file name in the top left corner of the image with coordinates (10,10) pixels and is in the format SourceFileName:. +{% if image_sources %} +Each image source has the original document file name in the top left corner of the image with coordinates (10,10) pixels and is in the format Document:. +The filename of the actual image is in the top right corner of the image and is in the format Figure:. Each text source starts in a new line and has the file name followed by colon and the actual information. -Always include the source name from the image or text for each fact you use in the response in the format: [filename]. 
+Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N]. +If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)]. Answer the following question using only the data provided in the sources below. The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned. If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. @@ -50,6 +52,5 @@ user: {% if text_sources is defined %} Sources: {% for text_source in text_sources %} -{{ text_source }} {% endfor %} {% endif %} diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py index d59f903b0e..e49065da59 100644 --- a/app/backend/approaches/retrievethenread.py +++ b/app/backend/approaches/retrievethenread.py @@ -3,12 +3,14 @@ from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery +from azure.storage.blob.aio import ContainerClient from openai import AsyncOpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageParam from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper +from core.imageshelper import download_blob_as_base64 class RetrieveThenReadApproach(Approach): @@ -27,6 +29,7 @@ def __init__( agent_deployment: Optional[str], agent_client: KnowledgeAgentRetrievalClient, auth_helper: AuthenticationHelper, + images_blob_container_client: ContainerClient, openai_client: AsyncOpenAI, chatgpt_model: str, chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI @@ -49,6 +52,7 @@ def __init__( self.chatgpt_deployment = chatgpt_deployment self.openai_client = openai_client self.auth_helper = auth_helper + self.images_blob_container_client = images_blob_container_client self.chatgpt_model = chatgpt_model self.embedding_model = embedding_model self.embedding_dimensions = embedding_dimensions @@ -86,7 +90,11 @@ async def run( messages = self.prompt_manager.render_prompt( self.answer_prompt, self.get_system_prompt_variables(overrides.get("prompt_template")) - | {"user_query": q, "text_sources": extra_info.data_points.text}, + | { + "user_query": q, + "text_sources": extra_info.data_points.text, + "image_sources": extra_info.data_points.images, + }, ) chat_completion = cast( @@ -126,6 +134,7 @@ async def run_search_approach( use_semantic_ranker = True if overrides.get("semantic_ranker") else False use_query_rewriting = True if overrides.get("query_rewriting") else False use_semantic_captions = True if overrides.get("semantic_captions") else False + use_multimodal = True # TODO: if overrides.get("use_multimodal") else False top = overrides.get("top", 3) minimum_search_score = overrides.get("minimum_search_score", 0.0) minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) @@ -137,6 +146,11 @@ async def run_search_approach( if use_vector_search: vectors.append(await self.compute_text_embedding(q)) + # If multimodal is enabled, also compute image embeddings + # TODO: will this work with agentic? is this doing multivector search correctly? 
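[Editor's note on the multivector TODO above: sending one `VectorizedQuery` per vector field in a single search request is the usual shape of a multivector query. The sketch below is illustrative only, not the app's final wiring; the text field name `embedding` is an assumption (the index's configured text-vector field may differ), while `images/embedding` matches the nested field used by `compute_image_embedding` later in this change set, and `text_vector`/`image_vector` stand in for embeddings computed elsewhere.]

```python
from azure.search.documents.models import VectorizedQuery


def build_vector_queries(
    text_vector: list[float],
    image_vector: list[float] | None,
    text_field: str = "embedding",  # assumed name of the text vector field in the index
) -> list[VectorizedQuery]:
    """Build one VectorizedQuery per vector field for a single multivector search request."""
    queries = [VectorizedQuery(vector=text_vector, k_nearest_neighbors=50, fields=text_field)]
    if image_vector is not None:
        # Image embeddings live under the nested "images" complex collection in the index
        queries.append(
            VectorizedQuery(vector=image_vector, k_nearest_neighbors=50, fields="images/embedding")
        )
    return queries
```

Passing both queries in the `vector_queries` argument of a search call is what the commented-out `compute_image_embedding` line below would enable once uncommented.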
+ # if use_multimodal: + # vectors.append(await self.compute_image_embedding(q)) + results = await self.search( top, q, @@ -151,10 +165,26 @@ async def run_search_approach( use_query_rewriting, ) - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) + text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=use_multimodal) + + # Extract unique image URLs from results if multimodal is enabled + + seen_urls = set() + image_sources = [] + if use_multimodal: + for doc in results: + if hasattr(doc, "images") and doc.images: + for img in doc.images: + # Skip if we've already processed this URL + if img["url"] in seen_urls: + continue + seen_urls.add(img["url"]) + url = await download_blob_as_base64(self.images_blob_container_client, img["url"]) + if url: + image_sources.append(url) return ExtraInfo( - DataPoints(text=text_sources), + DataPoints(text=text_sources, images=image_sources), thoughts=[ ThoughtStep( "Search using user query", @@ -167,6 +197,7 @@ async def run_search_approach( "filter": filter, "use_vector_search": use_vector_search, "use_text_search": use_text_search, + "use_multimodal": use_multimodal, }, ), ThoughtStep( diff --git a/app/backend/core/imageshelper.py b/app/backend/core/imageshelper.py index 87e8b8970f..2ba1f11dcc 100644 --- a/app/backend/core/imageshelper.py +++ b/app/backend/core/imageshelper.py @@ -1,14 +1,11 @@ import base64 import logging -import os from typing import Optional from azure.core.exceptions import ResourceNotFoundError from azure.storage.blob.aio import ContainerClient from typing_extensions import Literal, Required, TypedDict -from approaches.approach import Document - class ImageURL(TypedDict, total=False): url: Required[str] @@ -18,23 +15,30 @@ class ImageURL(TypedDict, total=False): """Specifies the detail level of the image.""" -async def download_blob_as_base64(blob_container_client: ContainerClient, file_path: str) -> Optional[str]: - base_name, _ = os.path.splitext(file_path) - image_filename = base_name + ".png" +async def download_blob_as_base64(blob_container_client: ContainerClient, blob_url: str) -> Optional[str]: try: - blob = await blob_container_client.get_blob_client(image_filename).download_blob() + # Handle full URLs + if blob_url.startswith("http"): + # Extract blob path from full URL + # URL format: https://{account}.blob.core.windows.net/{container}/{blob_path} + url_parts = blob_url.split("/") + # Skip the domain parts and container name to get the blob path + blob_path = "/".join(url_parts[4:]) + else: + # Treat as a direct blob path + blob_path = blob_url + + # Download the blob + blob = await blob_container_client.get_blob_client(blob_path).download_blob() if not blob.properties: - logging.warning(f"No blob exists for {image_filename}") + logging.warning(f"No blob exists for {blob_path}") return None + img = base64.b64encode(await blob.readall()).decode("utf-8") return f"data:image/png;base64,{img}" except ResourceNotFoundError: - logging.warning(f"No blob exists for {image_filename}") + logging.warning(f"No blob exists for {blob_path}") + return None + except Exception as e: + logging.error(f"Error downloading blob {blob_url}: {str(e)}") return None - - -async def fetch_image(blob_container_client: ContainerClient, result: Document) -> Optional[str]: - if result.sourcepage: - img = await download_blob_as_base64(blob_container_client, result.sourcepage) - return img - return None diff --git a/app/backend/prepdocslib/blobmanager.py 
b/app/backend/prepdocslib/blobmanager.py index f1d6764394..3dc2c1e5b8 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -1,20 +1,15 @@ -import datetime import io import logging import os import re from typing import Optional, Union -import pymupdf from azure.core.credentials_async import AsyncTokenCredential from azure.storage.blob import ( - BlobSasPermissions, UserDelegationKey, - generate_blob_sas, ) -from azure.storage.blob.aio import BlobServiceClient, ContainerClient +from azure.storage.blob.aio import BlobServiceClient from PIL import Image, ImageDraw, ImageFont -from pypdf import PdfReader from .listfilestrategy import File @@ -64,7 +59,7 @@ async def upload_blob(self, file: File) -> Optional[list[str]]: return None async def upload_document_image( - self, document_file: File, image_bytes: bytes, image_filename: str + self, document_file: File, image_bytes: bytes, image_filename: str, image_page_num: int ) -> Optional[str]: if self.image_container is None: raise ValueError( @@ -75,81 +70,55 @@ async def upload_document_image( ) as service_client, service_client.get_container_client(self.image_container) as container_client: if not await container_client.exists(): await container_client.create_container() - blob_name = BlobManager.blob_name_from_file_name(document_file.content.name) + "/" + image_filename - logger.info("Uploading blob for document image %s", blob_name) - blob_client = await container_client.upload_blob(blob_name, io.BytesIO(image_bytes), overwrite=True) - return blob_client.url - return None - - def get_managedidentity_connectionstring(self): - return f"ResourceId=/subscriptions/{self.subscriptionId}/resourceGroups/{self.resourceGroup}/providers/Microsoft.Storage/storageAccounts/{self.account};" - - async def upload_pdf_blob_images( - self, service_client: BlobServiceClient, container_client: ContainerClient, file: File - ) -> list[str]: - with open(file.content.name, "rb") as reopened_file: - reader = PdfReader(reopened_file) - page_count = len(reader.pages) - doc = pymupdf.open(file.content.name) - sas_uris = [] - start_time = datetime.datetime.now(datetime.timezone.utc) - expiry_time = start_time + datetime.timedelta(days=1) - - font = None - try: - font = ImageFont.truetype("arial.ttf", 20) - except OSError: - try: - font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 20) - except OSError: - logger.info("Unable to find arial.ttf or FreeMono.ttf, using default font") - - for i in range(page_count): - blob_name = BlobManager.blob_image_name_from_file_page(file.content.name, i) - logger.info("Converting page %s to image and uploading -> %s", i, blob_name) - - doc = pymupdf.open(file.content.name) - page = doc.load_page(i) - pix = page.get_pixmap() - original_img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) # type: ignore - # Create a new image with additional space for text - text_height = 40 # Height of the text area - new_img = Image.new("RGB", (original_img.width, original_img.height + text_height), "white") + # Load and modify the image to add text + image = Image.open(io.BytesIO(image_bytes)) + text_height = 40 + new_img = Image.new("RGB", (image.width, image.height + text_height), "white") + new_img.paste(image, (0, text_height)) - # Paste the original image onto the new image - new_img.paste(original_img, (0, text_height)) - - # Draw the text on the white area + # Add text draw = ImageDraw.Draw(new_img) - text = f"SourceFileName:{blob_name}" + sourcepage = 
BlobManager.sourcepage_from_file_page(document_file.content.name, page=image_page_num) + text = f"Document: {sourcepage}" - # 10 pixels from the top and left of the image - x = 10 - y = 10 - draw.text((x, y), text, font=font, fill="black") + font = None + try: + font = ImageFont.truetype("arial.ttf", 24) + except OSError: + try: + font = ImageFont.truetype("/usr/share/fonts/truetype/freefont/FreeMono.ttf", 24) + except OSError: + logger.info("Unable to find arial.ttf or FreeMono.ttf, using default font") + + # Draw document text on left + draw.text((10, 10), text, font=font, fill="black") + + # Draw figure text on right + figure_text = f"Figure: {image_filename}" + if font: + # Get the width of the text to position it on the right + text_width = draw.textlength(figure_text, font=font) + draw.text((new_img.width - text_width - 10, 10), figure_text, font=font, fill="black") + else: + # If no font available, make a best effort to position on right + draw.text((new_img.width - 200, 10), figure_text, font=font, fill="black") + # Convert back to bytes output = io.BytesIO() - new_img.save(output, format="PNG") + new_img.save(output, format=image.format or "PNG") output.seek(0) + blob_name = ( + f"{self.blob_name_from_file_name(document_file.content.name)}/page{image_page_num}/{image_filename}" + ) + logger.info("Uploading blob for document image %s", blob_name) blob_client = await container_client.upload_blob(blob_name, output, overwrite=True) - if not self.user_delegation_key: - self.user_delegation_key = await service_client.get_user_delegation_key(start_time, expiry_time) - - if blob_client.account_name is not None: - sas_token = generate_blob_sas( - account_name=blob_client.account_name, - container_name=blob_client.container_name, - blob_name=blob_client.blob_name, - user_delegation_key=self.user_delegation_key, - permission=BlobSasPermissions(read=True), - expiry=expiry_time, - start=start_time, - ) - sas_uris.append(f"{blob_client.url}?{sas_token}") - - return sas_uris + return blob_client.url + return None + + def get_managedidentity_connectionstring(self): + return f"ResourceId=/subscriptions/{self.subscriptionId}/resourceGroups/{self.resourceGroup}/providers/Microsoft.Storage/storageAccounts/{self.account};" async def remove_blob(self, path: Optional[str] = None): async with BlobServiceClient( diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index b44ea46ae7..4739a83c6a 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -31,7 +31,7 @@ async def parse_file( for page in pages: for image in page.images: if image.url is None: - image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename) + image.url = await blob_manager.upload_document_image(file, image.bytes, image.filename, image.page_num) if image_embeddings_client: image.embedding = await image_embeddings_client.create_embedding(image.bytes) logger.info("Splitting '%s' into sections", file.filename()) @@ -43,9 +43,6 @@ async def parse_file( section.split_page.images = [ image for page in pages if page.page_num == section.split_page.page_num for image in page.images ] - logger.info( - "Section for page %d has %d images", section.split_page.page_num, len(section.split_page.images) - ) return sections @@ -115,7 +112,9 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: - sections = await parse_file(file, self.file_processors, self.category, self.blob_manager, 
self.image_embeddings) + sections = await parse_file( + file, self.file_processors, self.category, self.blob_manager, self.image_embeddings + ) if sections: await self.search_manager.update_content(sections, url=file.url) finally: diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index 8f444b142e..d2eeee3c7d 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -7,6 +7,8 @@ class ImageOnPage: bbox: list[float, float, float, float] # Pixels filename: str description: str + figure_id: str + page_num: int # 1-indexed url: str | None = None embedding: list[float] | None = None diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index b5d936f225..d402e66c8b 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -243,8 +243,10 @@ async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_de figure_description = await media_describer.describe_image(cropped_img) return ImageOnPage( bytes=cropped_img, - filename=f"page_{page_number}_figure_{figure.id}.png", + page_num=page_number, + figure_id=figure.id, bbox=bbox_pixels, + filename=f"figure{figure.id.replace(".", "_")}.png", description=f"
{figure_title}
{figure_description}
", ) From 751abd10a6cdb237d917a00c4a93d268cb167b49 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Fri, 27 Jun 2025 13:17:41 -0700 Subject: [PATCH 08/77] More progress on multimodal approach --- .azdo/pipelines/azure-dev.yml | 2 + .github/copilot-instructions.md | 62 ++++++ .github/workflows/azure-dev.yml | 2 + CONTRIBUTING.md | 17 +- app/backend/app.py | 48 +++-- app/backend/approaches/approach.py | 115 +++++++--- app/backend/approaches/chatapproach.py | 151 -------------- .../approaches/chatreadretrieveread.py | 197 ++++++++++++++++-- .../prompts/ask_answer_question.prompty | 20 +- app/backend/approaches/retrievethenread.py | 104 +++++---- app/backend/config.py | 2 + app/backend/core/imageshelper.py | 2 + app/backend/prepdocs.py | 9 +- app/backend/prepdocslib/blobmanager.py | 16 +- app/backend/prepdocslib/filestrategy.py | 2 + app/backend/prepdocslib/page.py | 2 +- app/backend/prepdocslib/pdfparser.py | 2 +- app/backend/prepdocslib/searchmanager.py | 1 + app/frontend/src/api/api.ts | 4 +- app/frontend/src/api/models.ts | 17 +- app/frontend/src/components/Answer/Answer.tsx | 4 +- .../src/components/Answer/AnswerParser.tsx | 32 +-- .../src/components/Example/ExampleList.tsx | 8 +- .../GPT4VSettings/GPT4VSettings.module.css | 3 - .../GPT4VSettings/GPT4VSettings.tsx | 79 ------- .../src/components/GPT4VSettings/index.ts | 1 - .../src/components/Settings/Settings.tsx | 184 ++++++++-------- app/frontend/src/locales/da/translation.json | 23 +- app/frontend/src/locales/en/translation.json | 24 +-- app/frontend/src/locales/es/translation.json | 24 +-- app/frontend/src/locales/fr/translation.json | 24 +-- app/frontend/src/locales/it/translation.json | 24 +-- app/frontend/src/locales/ja/translation.json | 24 +-- app/frontend/src/locales/nl/translation.json | 24 +-- .../src/locales/ptBR/translation.json | 24 +-- app/frontend/src/locales/tr/translation.json | 24 +-- app/frontend/src/pages/ask/Ask.tsx | 39 ++-- app/frontend/src/pages/chat/Chat.tsx | 38 ++-- azure.yaml | 2 + docs/README.md | 2 +- docs/customization.md | 5 + docs/deploy_features.md | 45 +++- docs/gpt4v.md | 2 + docs/http_protocol.md | 10 +- docs/productionizing.md | 2 +- infra/main.bicep | 13 +- infra/main.parameters.json | 6 + infra/main.test.bicep | 2 +- tests/conftest.py | 41 +++- tests/e2e.py | 27 ++- .../client0/result.json | 11 +- .../client1/result.json | 11 +- .../client0/result.json | 13 +- .../client1/result.json | 13 +- .../test_ask_rtr_hybrid/client0/result.json | 13 +- .../test_ask_rtr_hybrid/client1/result.json | 13 +- .../test_ask_rtr_text/client0/result.json | 13 +- .../test_ask_rtr_text/client1/result.json | 13 +- .../agent_client0/result.json | 11 +- .../agent_auth_client0/result.json | 11 +- .../auth_client0/result.json | 13 +- .../auth_public_documents_client0/result.json | 13 +- .../client0/result.json | 13 +- .../client1/result.json | 13 +- .../client0/result.json | 13 +- .../client1/result.json | 13 +- .../test_ask_vision/client0/result.json | 13 +- .../test_ask_vision/client1/result.json | 72 ++++--- .../test_chat_followup/client0/result.json | 12 +- .../test_chat_followup/client1/result.json | 12 +- .../test_chat_hybrid/client0/result.json | 10 +- .../test_chat_hybrid/client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.json | 12 +- .../client1/result.json | 12 +- .../test_chat_seed/client0/result.json | 10 +- 
.../test_chat_seed/client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.jsonlines | 6 +- .../client1/result.jsonlines | 6 +- .../client0/result.jsonlines | 4 +- .../client1/result.jsonlines | 4 +- .../client0/result.jsonlines | 4 +- .../client1/result.jsonlines | 4 +- .../auth_client0/result.jsonlines | 4 +- .../reasoning_client0/result.jsonlines | 4 +- .../reasoning_client1/result.jsonlines | 4 +- .../client0/result.jsonlines | 4 +- .../client1/result.jsonlines | 5 +- .../test_chat_text/client0/result.json | 10 +- .../test_chat_text/client1/result.json | 10 +- .../agent_client0/result.json | 8 +- .../auth_client0/result.json | 10 +- .../agent_auth_client0/result.json | 8 +- .../auth_public_documents_client0/result.json | 10 +- .../reasoning_client0/result.json | 10 +- .../reasoning_client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../test_chat_vector/client0/result.json | 10 +- .../test_chat_vector/client1/result.json | 10 +- .../client0/result.json | 10 +- .../client1/result.json | 10 +- .../test_chat_vision/client0/result.json | 10 +- .../test_chat_vision/client1/result.json | 50 ++--- .../client0/result.json | 10 +- .../client1/result.json | 70 ++++--- .../client0/result.json | 10 +- .../client1/result.json | 10 +- tests/test_app.py | 12 +- tests/test_app_config.py | 16 +- tests/test_blob_manager.py | 41 +--- tests/test_chatvisionapproach.py | 153 -------------- tests/test_fetch_image.py | 22 +- tests/test_prepdocslib_filestrategy.py | 5 +- todo.txt | 8 + 126 files changed, 1484 insertions(+), 1146 deletions(-) create mode 100644 .github/copilot-instructions.md delete mode 100644 app/backend/approaches/chatapproach.py delete mode 100644 app/frontend/src/components/GPT4VSettings/GPT4VSettings.module.css delete mode 100644 app/frontend/src/components/GPT4VSettings/GPT4VSettings.tsx delete mode 100644 app/frontend/src/components/GPT4VSettings/index.ts delete mode 100644 tests/test_chatvisionapproach.py create mode 100644 todo.txt diff --git a/.azdo/pipelines/azure-dev.yml b/.azdo/pipelines/azure-dev.yml index ed3bf3a58e..8d3fc687a1 100644 --- a/.azdo/pipelines/azure-dev.yml +++ b/.azdo/pipelines/azure-dev.yml @@ -121,6 +121,8 @@ steps: AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: $(AZURE_CONTAINER_APPS_WORKLOAD_PROFILE) USE_CHAT_HISTORY_BROWSER: $(USE_CHAT_HISTORY_BROWSER) USE_MEDIA_DESCRIBER_AZURE_CU: $(USE_MEDIA_DESCRIBER_AZURE_CU) + RAG_LLM_INPUTS_OVERRIDE: $(RAG_LLM_INPUTS_OVERRIDE) + RAG_VECTOR_FIELDS_DEFAULT: $(RAG_VECTOR_FIELDS_DEFAULT) - task: AzureCLI@2 displayName: Deploy Application inputs: diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md new file mode 100644 index 0000000000..4b6735ba93 --- /dev/null +++ b/.github/copilot-instructions.md @@ -0,0 +1,62 @@ +# Adding new data + +New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or script/prepdocs.ps1 to ingest the data. + +# Overall code layout + +* app: Contains the main application code, including frontend and backend. + * app/backend: Contains the Python backend code, written with Quart framework. 
+ * app/backend/approaches: Contains the different approaches + * app/backend/approaches/approach.py: Base class for all approaches + * app/backend/approaches/retrievethenread.py: Ask approach, just searches and answers + * app/backend/approaches/chatreadretrieveread.py: Chat approach, includes query rewriting step first + * app/backend/app.py: The main entry point for the backend application. + * app/frontend: Contains the React frontend code, built with TypeScript, built with vite. + * app/frontend/src/api: Contains the API client code for communicating with the backend. + * app/frontend/src/components: Contains the React components for the frontend. + * app/frontend/src/locales: Contains the translation files for internationalization. + * app/frontend/src/pages: Contains the main pages of the application +* infra: Contains the Bicep templates for provisioning Azure resources. +* tests: Contains the test code, including e2e tests, app integration tests, and unit tests. + +# Adding a new azd environment variable + +An azd environment variable is stored by the azd CLI for each environment. It is passed to the "azd up" command and can configure both provisioning options and application settings. +When adding new azd environment variables, update: + +1. infra/main.parameters.json : Add the new parameter with a Bicep-friendly variable name and map to the new environment variable +1. infra/main.bicep: Add the new Bicep parameter at the top, and add it to the `appEnvVariables` object +1. azure.yaml: Add the new environment variable under pipeline config section +1. .azdo/pipelines/azure-dev.yml: Add the new environment variable under `env` section +1. .github/workflows/azure-dev.yml: Add the new environment variable under `env` section + +# Adding a new setting to "Developer Settings" in RAG app + +When adding a new developer setting, update: + +* frontend: + * app/frontend/src/api/models.ts : Add to ChatAppRequestOverrides + * app/frontend/src/components/Settings.tsx : Add a UI element for the setting + * app/frontend/src/locales/*/translations.json: Add a translation for the setting label/tooltip for all languages + * app/frontend/src/pages/chat/Chat.tsx: Add the setting to the component, pass it to Settings + * app/frontend/src/pages/ask/Ask.tsx: Add the setting to the component, pass it to Settings + +* backend: + * app/backend/approaches/chatreadretrieveread.py : Retrieve from overrides parameter + * app/backend/approaches/retrievethenread.py : Retrieve from overrides parameter + * app/backend/app.py: Some settings may need to sent down in the /config route. + +# When adding tests for a new feature: + +All tests are in the `tests` folder and use the pytest framework. +There are three styles of tests: + +* e2e tests: These use playwright to run the app in a browser and test the UI end-to-end. They are in e2e.py and they mock the backend using the snapshots from the app tests. +* app integration tests: Mostly in test_app.py, these test the app's API endpoints and use mocks for services like Azure OpenAI and Azure Search. +* unit tests: The rest of the tests are unit tests that test individual functions and methods. They are in test_*.py files. + +When adding a new feature, add tests for it in the appropriate file. +If the feature is a UI element, add an e2e test for it. +If it is an API endpoint, add an app integration test for it. +If it is a function or method, add a unit test for it. +Use mocks from conftest.py to mock external services. 
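[Editor's note: as a concrete illustration of the app-integration style described above, a test for the new `/config` flag might look roughly like the sketch below. It assumes the `client` fixture from `tests/conftest.py` provides a Quart test client with mocked Azure services, which is how the existing tests in `tests/test_app.py` are structured.]

```python
import pytest


@pytest.mark.asyncio
async def test_config_shows_multimodal_options(client):
    # /config is a plain GET route, so no auth header or request body is needed
    response = await client.get("/config")
    assert response.status_code == 200

    config = await response.get_json()
    # Key name matches the payload returned by the /config route in app.py
    assert "showMultimodalOptions" in config
```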
diff --git a/.github/workflows/azure-dev.yml b/.github/workflows/azure-dev.yml index a1e4847d13..d3ab4d12e4 100644 --- a/.github/workflows/azure-dev.yml +++ b/.github/workflows/azure-dev.yml @@ -111,6 +111,8 @@ jobs: USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }} USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }} USE_AI_PROJECT: ${{ vars.USE_AI_PROJECT }} + RAG_LLM_INPUTS_OVERRIDE: ${{ vars.RAG_LLM_INPUTS_OVERRIDE }} + RAG_VECTOR_FIELDS_DEFAULT: ${{ vars.RAG_VECTOR_FIELDS_DEFAULT }} steps: - name: Checkout uses: actions/checkout@v4 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f3d1d104d6..ab24a91c00 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -17,8 +17,9 @@ contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additio - [Running unit tests](#running-unit-tests) - [Running E2E tests](#running-e2e-tests) - [Code style](#code-style) -- [Adding new azd environment variables](#adding-new-azd-environment-variables) -- [Adding new UI strings](#adding-new-ui-strings) +- [Adding new features](#adding-new-features) + - [Adding new azd environment variables](#adding-new-azd-environment-variables) + - [Adding new UI strings](#adding-new-ui-strings) ## Submitting a Pull Request (PR) @@ -118,7 +119,15 @@ python -m black If you followed the steps above to install the pre-commit hooks, then you can just wait for those hooks to run `ruff` and `black` for you. -## Adding new azd environment variables +## Adding new features + +We recommend using GitHub Copilot Agent mode when adding new features, +as this project includes [.github/copilot-instructions.md](.github/copilot-instructions.md) file +that instructs Copilot on how to generate code for common code changes. + +If you are not using Copilot Agent mode, consult both that file and suggestions below. + +### Adding new azd environment variables When adding new azd environment variables, please remember to update: @@ -128,7 +137,7 @@ When adding new azd environment variables, please remember to update: 1. [ADO pipeline](.azdo/pipelines/azure-dev.yml). 1. [Github workflows](.github/workflows/azure-dev.yml) -## Adding new UI strings +### Adding new UI strings When adding new UI strings, please remember to update all translations. 
For any translations that you generate with an AI tool, diff --git a/app/backend/app.py b/app/backend/app.py index 3c6e91068b..d884445f41 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -5,9 +5,9 @@ import mimetypes import os import time -from collections.abc import AsyncGenerator +from collections.abc import AsyncGenerator, Awaitable from pathlib import Path -from typing import Any, Union, cast +from typing import Any, Callable, Union, cast from azure.cognitiveservices.speech import ( ResultReason, @@ -72,6 +72,8 @@ CONFIG_MULTIMODAL_ENABLED, CONFIG_OPENAI_CLIENT, CONFIG_QUERY_REWRITING_ENABLED, + CONFIG_RAG_LLM_INPUTS_OVERRIDE, + CONFIG_RAG_VECTOR_FIELDS_DEFAULT, CONFIG_REASONING_EFFORT_ENABLED, CONFIG_SEARCH_CLIENT, CONFIG_SEMANTIC_RANKER_DEPLOYED, @@ -279,7 +281,7 @@ def auth_setup(): def config(): return jsonify( { - "showMultimodalOption": current_app.config[CONFIG_MULTIMODAL_ENABLED], + "showMultimodalOptions": current_app.config[CONFIG_MULTIMODAL_ENABLED], "showSemanticRankerOption": current_app.config[CONFIG_SEMANTIC_RANKER_DEPLOYED], "showQueryRewritingOption": current_app.config[CONFIG_QUERY_REWRITING_ENABLED], "showReasoningEffortOption": current_app.config[CONFIG_REASONING_EFFORT_ENABLED], @@ -294,6 +296,8 @@ def config(): "showChatHistoryBrowser": current_app.config[CONFIG_CHAT_HISTORY_BROWSER_ENABLED], "showChatHistoryCosmos": current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED], "showAgenticRetrievalOption": current_app.config[CONFIG_AGENTIC_RETRIEVAL_ENABLED], + "ragLlmInputsOverride": current_app.config[CONFIG_RAG_LLM_INPUTS_OVERRIDE], + "ragVectorFieldsDefault": current_app.config[CONFIG_RAG_VECTOR_FIELDS_DEFAULT], } ) @@ -432,6 +436,7 @@ async def setup_clients(): # https://learn.microsoft.com/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION") or "2024-10-21" AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "") + AZURE_OPENAI_API_KEY_OVERRIDE = os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE") # Used only with non-Azure OpenAI deployments OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") OPENAI_ORGANIZATION = os.getenv("OPENAI_ORGANIZATION") @@ -461,6 +466,8 @@ async def setup_clients(): AZURE_SPEECH_SERVICE_VOICE = os.getenv("AZURE_SPEECH_SERVICE_VOICE") or "en-US-AndrewMultilingualNeural" USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "").lower() == "true" + RAG_LLM_INPUTS_OVERRIDE = os.getenv("RAG_LLM_INPUTS_OVERRIDE", "") + RAG_VECTOR_FIELDS_DEFAULT = os.getenv("RAG_VECTOR_FIELDS_DEFAULT", "") USE_USER_UPLOAD = os.getenv("USE_USER_UPLOAD", "").lower() == "true" ENABLE_LANGUAGE_PICKER = os.getenv("ENABLE_LANGUAGE_PICKER", "").lower() == "true" USE_SPEECH_INPUT_BROWSER = os.getenv("USE_SPEECH_INPUT_BROWSER", "").lower() == "true" @@ -477,6 +484,7 @@ async def setup_clients(): # This assumes you use 'azd auth login' locally, and managed identity when deployed on Azure. # The managed identity is setup in the infra/ folder. 
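[Editor's note: the `azure_ai_token_provider` added just below wraps the chosen credential in an async callable that returns a fresh bearer token for the requested scope each time it is awaited. A minimal, self-contained sketch of that behavior follows; it is not the app's exact wiring, and the printed prefix is only there to show that a token string comes back.]

```python
import asyncio

from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider


async def main() -> None:
    credential = AzureDeveloperCliCredential(process_timeout=60)
    # Async callable; each await returns a valid access token string for the scope
    token_provider = get_bearer_token_provider(
        credential, "https://cognitiveservices.azure.com/.default"
    )
    token = await token_provider()
    headers = {"Authorization": f"Bearer {token}"}
    print(headers["Authorization"][:20], "...")
    await credential.close()


asyncio.run(main())
```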
azure_credential: Union[AzureDeveloperCliCredential, ManagedIdentityCredential] + azure_ai_token_provider: Callable[[], Awaitable[str]] if RUNNING_ON_AZURE: current_app.logger.info("Setting up Azure credential using ManagedIdentityCredential") if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): @@ -497,6 +505,9 @@ async def setup_clients(): else: current_app.logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant") azure_credential = AzureDeveloperCliCredential(process_timeout=60) + azure_ai_token_provider = get_bearer_token_provider( + azure_credential, "https://cognitiveservices.azure.com/.default" + ) # Set the Azure credential in the app config for use in other parts of the app current_app.config[CONFIG_CREDENTIAL] = azure_credential @@ -565,7 +576,7 @@ async def setup_clients(): document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true", local_html_parser=os.getenv("USE_LOCAL_HTML_PARSER", "").lower() == "true", - search_images=USE_MULTIMODAL, + use_multimodal=USE_MULTIMODAL, ) search_info = await setup_search_info( search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential @@ -573,12 +584,13 @@ async def setup_clients(): text_embeddings_service = setup_embeddings_service( azure_credential=azure_credential, openai_host=OPENAI_HOST, - openai_model_name=OPENAI_EMB_MODEL, - openai_service=AZURE_OPENAI_SERVICE, - openai_custom_url=AZURE_OPENAI_CUSTOM_URL, - openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, - openai_dimensions=OPENAI_EMB_DIMENSIONS, - openai_api_version=AZURE_OPENAI_API_VERSION, + emb_model_name=OPENAI_EMB_MODEL, + emb_model_dimensions=OPENAI_EMB_DIMENSIONS, + azure_openai_service=AZURE_OPENAI_SERVICE, + azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL, + azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, + azure_openai_api_version=AZURE_OPENAI_API_VERSION, + azure_openai_key=clean_key_if_exists(AZURE_OPENAI_API_KEY_OVERRIDE), openai_key=clean_key_if_exists(OPENAI_API_KEY), openai_org=OPENAI_ORGANIZATION, disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false", @@ -617,18 +629,17 @@ async def setup_clients(): if not AZURE_OPENAI_SERVICE: raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure") endpoint = f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com" - if api_key := os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"): + if AZURE_OPENAI_API_KEY_OVERRIDE: current_app.logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client") openai_client = AsyncAzureOpenAI( - api_version=AZURE_OPENAI_API_VERSION, azure_endpoint=endpoint, api_key=api_key + api_version=AZURE_OPENAI_API_VERSION, azure_endpoint=endpoint, api_key=AZURE_OPENAI_API_KEY_OVERRIDE ) else: current_app.logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client") - token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") openai_client = AsyncAzureOpenAI( api_version=AZURE_OPENAI_API_VERSION, azure_endpoint=endpoint, - azure_ad_token_provider=token_provider, + azure_ad_token_provider=azure_ai_token_provider, ) elif OPENAI_HOST == "local": current_app.logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key") @@ -673,6 +684,8 @@ async def setup_clients(): current_app.config[CONFIG_CHAT_HISTORY_COSMOS_ENABLED] = USE_CHAT_HISTORY_COSMOS 
current_app.config[CONFIG_AGENTIC_RETRIEVAL_ENABLED] = USE_AGENTIC_RETRIEVAL current_app.config[CONFIG_MULTIMODAL_ENABLED] = USE_MULTIMODAL + current_app.config[CONFIG_RAG_LLM_INPUTS_OVERRIDE] = RAG_LLM_INPUTS_OVERRIDE + current_app.config[CONFIG_RAG_VECTOR_FIELDS_DEFAULT] = RAG_VECTOR_FIELDS_DEFAULT prompt_manager = PromptyManager() @@ -699,6 +712,9 @@ async def setup_clients(): query_speller=AZURE_SEARCH_QUERY_SPELLER, prompt_manager=prompt_manager, reasoning_effort=OPENAI_REASONING_EFFORT, + vision_endpoint=AZURE_VISION_ENDPOINT, + vision_token_provider=azure_ai_token_provider, + multimodal_enabled=USE_MULTIMODAL, ) # ChatReadRetrieveReadApproach is used by /chat for multi-turn conversation @@ -710,6 +726,7 @@ async def setup_clients(): agent_client=agent_client, openai_client=openai_client, auth_helper=auth_helper, + images_blob_container_client=image_blob_container_client, chatgpt_model=OPENAI_CHATGPT_MODEL, chatgpt_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT, embedding_model=OPENAI_EMB_MODEL, @@ -723,7 +740,8 @@ async def setup_clients(): prompt_manager=prompt_manager, reasoning_effort=OPENAI_REASONING_EFFORT, vision_endpoint=AZURE_VISION_ENDPOINT, - vision_token_provider=token_provider, + vision_token_provider=azure_ai_token_provider, + multimodal_enabled=USE_MULTIMODAL, ) diff --git a/app/backend/approaches/approach.py b/app/backend/approaches/approach.py index c9185932cf..55c5e97c1b 100644 --- a/app/backend/approaches/approach.py +++ b/app/backend/approaches/approach.py @@ -2,6 +2,7 @@ from abc import ABC from collections.abc import AsyncGenerator, Awaitable from dataclasses import dataclass +from enum import Enum from typing import Any, Callable, Optional, TypedDict, Union, cast from urllib.parse import urljoin @@ -35,6 +36,19 @@ from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper +from core.imageshelper import download_blob_as_base64 + + +class LLMInputType(str, Enum): + TEXT_AND_IMAGES = "textAndImages" + IMAGES = "images" + TEXTS = "texts" + + +class VectorFieldType(str, Enum): + EMBEDDING = "textEmbeddingOnly" + IMAGE_EMBEDDING = "imageEmbeddingOnly" + TEXT_AND_IMAGE_EMBEDDINGS = "textAndImageEmbeddings" @dataclass @@ -96,6 +110,7 @@ def update_token_usage(self, usage: CompletionUsage) -> None: class DataPoints: text: Optional[list[str]] = None images: Optional[list] = None + citations: Optional[list[str]] = None @dataclass @@ -157,6 +172,7 @@ def __init__( vision_token_provider: Callable[[], Awaitable[str]], prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, + multimodal_enabled: bool = False, ): self.search_client = search_client self.openai_client = openai_client @@ -173,6 +189,25 @@ def __init__( self.prompt_manager = prompt_manager self.reasoning_effort = reasoning_effort self.include_token_usage = True + self.multimodal_enabled = multimodal_enabled + + def get_default_llm_inputs(self) -> str: + """ + Returns the default LLM inputs based on whether multimodal is enabled + """ + if self.multimodal_enabled: + return LLMInputType.TEXT_AND_IMAGES + else: + return LLMInputType.TEXTS + + def get_default_vector_fields(self) -> str: + """ + Returns the default vector fields based on whether multimodal is enabled + """ + if self.multimodal_enabled: + return VectorFieldType.TEXT_AND_IMAGE_EMBEDDINGS + else: + return VectorFieldType.EMBEDDING def build_filter(self, overrides: dict[str, Any], auth_claims: dict[str, Any]) -> Optional[str]: include_category = overrides.get("include_category") @@ -323,37 +358,65 @@ 
async def run_agentic_retrieval( return response, results - def get_sources_content( - self, results: list[Document], use_semantic_captions: bool, use_image_citation: bool - ) -> list[str]: + async def get_sources_content( + self, results: list[Document], use_semantic_captions: bool, use_image_sources: bool + ) -> tuple[list[str], list[str], list[str]]: + """ + Extracts text and image sources from the search results. + If use_semantic_captions is True, it will use the captions from the results. + If use_image_sources is True, it will extract image URLs from the results. + Returns: + - A list of text sources (captions or content). + - A list of image sources (base64 encoded). + - A list of allowed citations for those sources. + """ def nonewlines(s: str) -> str: return s.replace("\n", " ").replace("\r", " ") - if use_semantic_captions: - return [ - (self.get_citation((doc.sourcepage or ""), use_image_citation)) - + ": " - + nonewlines(" . ".join([cast(str, c.text) for c in (doc.captions or [])])) - for doc in results - ] - else: - return [ - (self.get_citation((doc.sourcepage or ""), use_image_citation)) + ": " + nonewlines(doc.content or "") - for doc in results - ] + citations = [] + text_sources = [] + image_sources = [] + seen_urls = set() - def get_citation(self, sourcepage: str, use_image_citation: bool) -> str: - if use_image_citation: - return sourcepage - else: - path, ext = os.path.splitext(sourcepage) - if ext.lower() == ".png": - page_idx = path.rfind("-") - page_number = int(path[page_idx + 1 :]) - return f"{path[:page_idx]}.pdf#page={page_number}" + for doc in results: + # Get the citation for the source page + citation = self.get_citation(doc.sourcepage or "") + citations.append(citation) - return sourcepage + # If semantic captions are used, extract captions; otherwise, use content + if use_semantic_captions and doc.captions: + text_sources.append(f"{citation}: {nonewlines(' . 
'.join([cast(str, c.text) for c in doc.captions]))}") + else: + text_sources.append(f"{citation}: {nonewlines(doc.content or '')}") + + if use_image_sources and hasattr(doc, "images") and doc.images: + for img in doc.images: + # Skip if we've already processed this URL + if img["url"] in seen_urls: + continue + seen_urls.add(img["url"]) + url = await download_blob_as_base64(self.images_blob_container_client, img["url"]) + if url: + image_sources.append(url) + citations.append(self.get_image_citation(doc.sourcepage or "", img["url"])) + + return text_sources, image_sources, citations + + def get_citation(self, sourcepage: str) -> str: + path, ext = os.path.splitext(sourcepage) + if ext.lower() == ".png": + page_idx = path.rfind("-") + page_number = int(path[page_idx + 1 :]) + return f"{path[:page_idx]}.pdf#page={page_number}" + return sourcepage + + def get_image_citation(self, sourcepage: str, image_url: str): + source_page_citation = self.get_citation(sourcepage) + # extract the image filename from image_url, last part after the last slash + image_filename = image_url.split("/")[-1] + if source_page_citation: + return f"{source_page_citation}({image_filename})" async def compute_text_embedding(self, q: str): SUPPORTED_DIMENSIONS_MODEL = { @@ -393,7 +456,7 @@ async def compute_image_embedding(self, q: str): ) as response: json = await response.json() image_query_vector = json["vector"] - return VectorizedQuery(vector=image_query_vector, k_nearest_neighbors=50, fields="imageEmbedding") + return VectorizedQuery(vector=image_query_vector, k_nearest_neighbors=50, fields="images/embedding") def get_system_prompt_variables(self, override_prompt: Optional[str]) -> dict[str, str]: # Allows client to replace the entire prompt, or to inject into the existing prompt using >>> diff --git a/app/backend/approaches/chatapproach.py b/app/backend/approaches/chatapproach.py deleted file mode 100644 index 346c9f3b0a..0000000000 --- a/app/backend/approaches/chatapproach.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -import re -from abc import ABC, abstractmethod -from collections.abc import AsyncGenerator, Awaitable -from typing import Any, Optional, Union, cast - -from openai import AsyncStream -from openai.types.chat import ( - ChatCompletion, - ChatCompletionChunk, - ChatCompletionMessageParam, -) - -from approaches.approach import ( - Approach, - ExtraInfo, -) - - -class ChatApproach(Approach, ABC): - - NO_RESPONSE = "0" - - @abstractmethod - async def run_until_final_call( - self, messages, overrides, auth_claims, should_stream - ) -> tuple[ExtraInfo, Union[Awaitable[ChatCompletion], Awaitable[AsyncStream[ChatCompletionChunk]]]]: - pass - - def get_search_query(self, chat_completion: ChatCompletion, user_query: str): - response_message = chat_completion.choices[0].message - - if response_message.tool_calls: - for tool in response_message.tool_calls: - if tool.type != "function": - continue - function = tool.function - if function.name == "search_sources": - arg = json.loads(function.arguments) - search_query = arg.get("search_query", self.NO_RESPONSE) - if search_query != self.NO_RESPONSE: - return search_query - elif query_text := response_message.content: - if query_text.strip() != self.NO_RESPONSE: - return query_text - return user_query - - def extract_followup_questions(self, content: Optional[str]): - if content is None: - return content, [] - return content.split("<<")[0], re.findall(r"<<([^>>]+)>>", content) - - async def run_without_streaming( - self, - messages: 
list[ChatCompletionMessageParam], - overrides: dict[str, Any], - auth_claims: dict[str, Any], - session_state: Any = None, - ) -> dict[str, Any]: - extra_info, chat_coroutine = await self.run_until_final_call( - messages, overrides, auth_claims, should_stream=False - ) - chat_completion_response: ChatCompletion = await cast(Awaitable[ChatCompletion], chat_coroutine) - content = chat_completion_response.choices[0].message.content - role = chat_completion_response.choices[0].message.role - if overrides.get("suggest_followup_questions"): - content, followup_questions = self.extract_followup_questions(content) - extra_info.followup_questions = followup_questions - # Assume last thought is for generating answer - if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage: - extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage) - chat_app_response = { - "message": {"content": content, "role": role}, - "context": extra_info, - "session_state": session_state, - } - return chat_app_response - - async def run_with_streaming( - self, - messages: list[ChatCompletionMessageParam], - overrides: dict[str, Any], - auth_claims: dict[str, Any], - session_state: Any = None, - ) -> AsyncGenerator[dict, None]: - extra_info, chat_coroutine = await self.run_until_final_call( - messages, overrides, auth_claims, should_stream=True - ) - chat_coroutine = cast(Awaitable[AsyncStream[ChatCompletionChunk]], chat_coroutine) - yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} - - followup_questions_started = False - followup_content = "" - async for event_chunk in await chat_coroutine: - # "2023-07-01-preview" API version has a bug where first response has empty choices - event = event_chunk.model_dump() # Convert pydantic model to dict - if event["choices"]: - # No usage during streaming - completion = { - "delta": { - "content": event["choices"][0]["delta"].get("content"), - "role": event["choices"][0]["delta"]["role"], - } - } - # if event contains << and not >>, it is start of follow-up question, truncate - content = completion["delta"].get("content") - content = content or "" # content may either not exist in delta, or explicitly be None - if overrides.get("suggest_followup_questions") and "<<" in content: - followup_questions_started = True - earlier_content = content[: content.index("<<")] - if earlier_content: - completion["delta"]["content"] = earlier_content - yield completion - followup_content += content[content.index("<<") :] - elif followup_questions_started: - followup_content += content - else: - yield completion - else: - # Final chunk at end of streaming should contain usage - # https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response - if event_chunk.usage and extra_info.thoughts and self.include_token_usage: - extra_info.thoughts[-1].update_token_usage(event_chunk.usage) - yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} - - if followup_content: - _, followup_questions = self.extract_followup_questions(followup_content) - yield { - "delta": {"role": "assistant"}, - "context": {"context": extra_info, "followup_questions": followup_questions}, - } - - async def run( - self, - messages: list[ChatCompletionMessageParam], - session_state: Any = None, - context: dict[str, Any] = {}, - ) -> dict[str, Any]: - overrides = context.get("overrides", {}) - auth_claims = context.get("auth_claims", {}) - 
return await self.run_without_streaming(messages, overrides, auth_claims, session_state) - - async def run_stream( - self, - messages: list[ChatCompletionMessageParam], - session_state: Any = None, - context: dict[str, Any] = {}, - ) -> AsyncGenerator[dict[str, Any], None]: - overrides = context.get("overrides", {}) - auth_claims = context.get("auth_claims", {}) - return self.run_with_streaming(messages, overrides, auth_claims, session_state) diff --git a/app/backend/approaches/chatreadretrieveread.py b/app/backend/approaches/chatreadretrieveread.py index 26925c5e7f..04c2883fdd 100644 --- a/app/backend/approaches/chatreadretrieveread.py +++ b/app/backend/approaches/chatreadretrieveread.py @@ -1,9 +1,12 @@ -from collections.abc import Awaitable -from typing import Any, Optional, Union, cast, Callable +import json +import re +from collections.abc import AsyncGenerator, Awaitable +from typing import Any, Callable, Optional, Union, cast from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.aio import SearchClient from azure.search.documents.models import VectorQuery +from azure.storage.blob.aio import ContainerClient from openai import AsyncOpenAI, AsyncStream from openai.types.chat import ( ChatCompletion, @@ -12,19 +15,27 @@ ChatCompletionToolParam, ) -from approaches.approach import DataPoints, ExtraInfo, ThoughtStep -from approaches.chatapproach import ChatApproach +from approaches.approach import ( + Approach, + DataPoints, + ExtraInfo, + LLMInputType, + ThoughtStep, + VectorFieldType, +) from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper -class ChatReadRetrieveReadApproach(ChatApproach): +class ChatReadRetrieveReadApproach(Approach): """ A multi-step approach that first uses OpenAI to turn the user's question into a search query, then uses Azure AI Search to retrieve relevant documents, and then sends the conversation history, original user question, and search results to OpenAI to generate a response. 
""" + NO_RESPONSE = "0" + def __init__( self, *, @@ -47,8 +58,10 @@ def __init__( query_speller: str, prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, + multimodal_enabled: bool = False, vision_endpoint: Optional[str] = None, - vision_token_provider: Callable[[], Awaitable[str]], + vision_token_provider: Optional[Callable[[], Awaitable[str]]] = None, + images_blob_container_client: Optional[ContainerClient] = None, ): self.search_client = search_client self.search_index_name = search_index_name @@ -57,6 +70,7 @@ def __init__( self.agent_client = agent_client self.openai_client = openai_client self.auth_helper = auth_helper + self.images_blob_container_client = images_blob_container_client self.chatgpt_model = chatgpt_model self.chatgpt_deployment = chatgpt_deployment self.embedding_deployment = embedding_deployment @@ -75,6 +89,130 @@ def __init__( self.include_token_usage = True self.vision_endpoint = vision_endpoint self.vision_token_provider = vision_token_provider + self.multimodal_enabled = multimodal_enabled + + def get_search_query(self, chat_completion: ChatCompletion, user_query: str): + response_message = chat_completion.choices[0].message + + if response_message.tool_calls: + for tool in response_message.tool_calls: + if tool.type != "function": + continue + function = tool.function + if function.name == "search_sources": + arg = json.loads(function.arguments) + search_query = arg.get("search_query", self.NO_RESPONSE) + if search_query != self.NO_RESPONSE: + return search_query + elif query_text := response_message.content: + if query_text.strip() != self.NO_RESPONSE: + return query_text + return user_query + + def extract_followup_questions(self, content: Optional[str]): + if content is None: + return content, [] + return content.split("<<")[0], re.findall(r"<<([^>>]+)>>", content) + + async def run_without_streaming( + self, + messages: list[ChatCompletionMessageParam], + overrides: dict[str, Any], + auth_claims: dict[str, Any], + session_state: Any = None, + ) -> dict[str, Any]: + extra_info, chat_coroutine = await self.run_until_final_call( + messages, overrides, auth_claims, should_stream=False + ) + chat_completion_response: ChatCompletion = await cast(Awaitable[ChatCompletion], chat_coroutine) + content = chat_completion_response.choices[0].message.content + role = chat_completion_response.choices[0].message.role + if overrides.get("suggest_followup_questions"): + content, followup_questions = self.extract_followup_questions(content) + extra_info.followup_questions = followup_questions + # Assume last thought is for generating answer + if self.include_token_usage and extra_info.thoughts and chat_completion_response.usage: + extra_info.thoughts[-1].update_token_usage(chat_completion_response.usage) + chat_app_response = { + "message": {"content": content, "role": role}, + "context": extra_info, + "session_state": session_state, + } + return chat_app_response + + async def run_with_streaming( + self, + messages: list[ChatCompletionMessageParam], + overrides: dict[str, Any], + auth_claims: dict[str, Any], + session_state: Any = None, + ) -> AsyncGenerator[dict, None]: + extra_info, chat_coroutine = await self.run_until_final_call( + messages, overrides, auth_claims, should_stream=True + ) + chat_coroutine = cast(Awaitable[AsyncStream[ChatCompletionChunk]], chat_coroutine) + yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} + + followup_questions_started = False + followup_content = "" + async for event_chunk 
in await chat_coroutine: + # "2023-07-01-preview" API version has a bug where first response has empty choices + event = event_chunk.model_dump() # Convert pydantic model to dict + if event["choices"]: + # No usage during streaming + completion = { + "delta": { + "content": event["choices"][0]["delta"].get("content"), + "role": event["choices"][0]["delta"]["role"], + } + } + # if event contains << and not >>, it is start of follow-up question, truncate + content = completion["delta"].get("content") + content = content or "" # content may either not exist in delta, or explicitly be None + if overrides.get("suggest_followup_questions") and "<<" in content: + followup_questions_started = True + earlier_content = content[: content.index("<<")] + if earlier_content: + completion["delta"]["content"] = earlier_content + yield completion + followup_content += content[content.index("<<") :] + elif followup_questions_started: + followup_content += content + else: + yield completion + else: + # Final chunk at end of streaming should contain usage + # https://cookbook.openai.com/examples/how_to_stream_completions#4-how-to-get-token-usage-data-for-streamed-chat-completion-response + if event_chunk.usage and extra_info.thoughts and self.include_token_usage: + extra_info.thoughts[-1].update_token_usage(event_chunk.usage) + yield {"delta": {"role": "assistant"}, "context": extra_info, "session_state": session_state} + + if followup_content: + _, followup_questions = self.extract_followup_questions(followup_content) + yield { + "delta": {"role": "assistant"}, + "context": {"context": extra_info, "followup_questions": followup_questions}, + } + + async def run( + self, + messages: list[ChatCompletionMessageParam], + session_state: Any = None, + context: dict[str, Any] = {}, + ) -> dict[str, Any]: + overrides = context.get("overrides", {}) + auth_claims = context.get("auth_claims", {}) + return await self.run_without_streaming(messages, overrides, auth_claims, session_state) + + async def run_stream( + self, + messages: list[ChatCompletionMessageParam], + session_state: Any = None, + context: dict[str, Any] = {}, + ) -> AsyncGenerator[dict[str, Any], None]: + overrides = context.get("overrides", {}) + auth_claims = context.get("auth_claims", {}) + return self.run_with_streaming(messages, overrides, auth_claims, session_state) async def run_until_final_call( self, @@ -96,7 +234,6 @@ async def run_until_final_call( else: extra_info = await self.run_search_approach(messages, overrides, auth_claims) - # If there are images, send the images to the model as well messages = self.prompt_manager.render_prompt( self.answer_prompt, self.get_system_prompt_variables(overrides.get("prompt_template")) @@ -105,6 +242,8 @@ async def run_until_final_call( "past_messages": messages[:-1], "user_query": original_user_query, "text_sources": extra_info.data_points.text, + "image_sources": extra_info.data_points.images, + "citations": extra_info.data_points.citations, }, ) @@ -144,6 +283,24 @@ async def run_search_approach( minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) search_index_filter = self.build_filter(overrides, auth_claims) + llm_inputs = overrides.get("llm_inputs") + vector_fields = overrides.get("vector_fields") + + # Use default values based on multimodal_enabled if not provided in overrides + if llm_inputs is None: + llm_inputs = self.get_default_llm_inputs() + if vector_fields is None: + vector_fields = self.get_default_vector_fields() + + llm_inputs_enum = LLMInputType(llm_inputs) if llm_inputs 
is not None else None + vector_fields_enum = VectorFieldType(vector_fields) if vector_fields is not None else None + # Use multimodal/image logic based on enums + use_image_embeddings = vector_fields_enum in [ + VectorFieldType.IMAGE_EMBEDDING, + VectorFieldType.TEXT_AND_IMAGE_EMBEDDINGS, + ] + use_image_sources = llm_inputs_enum in [LLMInputType.TEXT_AND_IMAGES, LLMInputType.IMAGES] + original_user_query = messages[-1]["content"] if not isinstance(original_user_query, str): raise ValueError("The most recent message content must be a string.") @@ -175,12 +332,11 @@ async def run_search_approach( # STEP 2: Retrieve relevant documents from the search index with the GPT optimized query - # If retrieval mode includes vectors, compute an embedding for the query vectors: list[VectorQuery] = [] if use_vector_search: vectors.append(await self.compute_text_embedding(query_text)) - # Optionally add image embeddings if using multimodal approach - vectors.append(await self.compute_image_embedding(query_text)) + if use_image_embeddings: + vectors.append(await self.compute_image_embedding(query_text)) results = await self.search( top, @@ -197,10 +353,12 @@ async def run_search_approach( ) # STEP 3: Generate a contextual and content specific answer using the search results and chat history - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=False) + text_sources, image_sources, citations = await self.get_sources_content( + results, use_semantic_captions, use_image_sources=use_image_sources + ) extra_info = ExtraInfo( - DataPoints(text=text_sources), + DataPoints(text=text_sources, images=image_sources, citations=citations), thoughts=[ self.format_thought_step_for_chatcompletion( title="Prompt to generate search query", @@ -222,6 +380,8 @@ async def run_search_approach( "filter": search_index_filter, "use_vector_search": use_vector_search, "use_text_search": use_text_search, + "use_image_embeddings": use_image_embeddings, + "use_image_sources": use_image_sources, }, ), ThoughtStep( @@ -257,10 +417,19 @@ async def run_agentic_retrieval_approach( results_merge_strategy=results_merge_strategy, ) - text_sources = self.get_sources_content(results, use_semantic_captions=False, use_image_citation=False) + # Determine if we should use image sources based on overrides or defaults + llm_inputs = overrides.get("llm_inputs") + if llm_inputs is None: + llm_inputs = self.get_default_llm_inputs() + llm_inputs_enum = LLMInputType(llm_inputs) if llm_inputs is not None else None + use_image_sources = llm_inputs_enum in [LLMInputType.TEXT_AND_IMAGES, LLMInputType.IMAGES] + + text_sources, image_sources, citations = await self.get_sources_content( + results, use_semantic_captions=False, use_image_sources=use_image_sources + ) extra_info = ExtraInfo( - DataPoints(text=text_sources), + DataPoints(text=text_sources, images=image_sources, citations=citations), thoughts=[ ThoughtStep( "Use agentic retrieval", diff --git a/app/backend/approaches/prompts/ask_answer_question.prompty b/app/backend/approaches/prompts/ask_answer_question.prompty index 464634bb26..997740536f 100644 --- a/app/backend/approaches/prompts/ask_answer_question.prompty +++ b/app/backend/approaches/prompts/ask_answer_question.prompty @@ -26,9 +26,13 @@ Each text source starts in a new line and has the file name followed by colon an Always include the source document filename for each fact you use in the response in the format: [document_name.ext#page=N]. 
If you are referencing an image, add the image filename in the format: [document_name.ext#page=N(image_name.png)]. Answer the following question using only the data provided in the sources below. -The text and image source can be the same file name, don't use the image title when citing the image source, only use the file name as mentioned. -If you cannot answer using the sources below, say you don't know. Return just the answer without any input texts. +If you cannot answer using the sources below, say you don't know. +Return just the answer without any input texts. {% endif %} +Possible citations for current question: +{% for citation in citations %} +[{{ citation }}] +{% endfor %} {{ injected_prompt }} {% endif %} @@ -46,11 +50,9 @@ In-network deductibles are $500 for employee and $1000 for family [info1.txt] an user: {{ user_query }} -{% for image_source in image_sources %} +{% if image_sources is defined %}{% for image_source in image_sources %} ![Image]({{image_source}}) -{% endfor %} -{% if text_sources is defined %} -Sources: -{% for text_source in text_sources %} -{% endfor %} -{% endif %} +{% endfor %}{% endif %} +{% if text_sources is defined %}Sources:{% for text_source in text_sources %} +{{ text_source }} +{% endfor %}{% endif %} diff --git a/app/backend/approaches/retrievethenread.py b/app/backend/approaches/retrievethenread.py index e49065da59..544fd6e9aa 100644 --- a/app/backend/approaches/retrievethenread.py +++ b/app/backend/approaches/retrievethenread.py @@ -1,4 +1,5 @@ -from typing import Any, Optional, cast +from collections.abc import Awaitable +from typing import Any, Callable, Optional, cast from azure.search.documents.agent.aio import KnowledgeAgentRetrievalClient from azure.search.documents.aio import SearchClient @@ -7,10 +8,16 @@ from openai import AsyncOpenAI from openai.types.chat import ChatCompletion, ChatCompletionMessageParam -from approaches.approach import Approach, DataPoints, ExtraInfo, ThoughtStep +from approaches.approach import ( + Approach, + DataPoints, + ExtraInfo, + LLMInputType, + ThoughtStep, + VectorFieldType, +) from approaches.promptmanager import PromptManager from core.authentication import AuthenticationHelper -from core.imageshelper import download_blob_as_base64 class RetrieveThenReadApproach(Approach): @@ -29,7 +36,6 @@ def __init__( agent_deployment: Optional[str], agent_client: KnowledgeAgentRetrievalClient, auth_helper: AuthenticationHelper, - images_blob_container_client: ContainerClient, openai_client: AsyncOpenAI, chatgpt_model: str, chatgpt_deployment: Optional[str], # Not needed for non-Azure OpenAI @@ -43,6 +49,10 @@ def __init__( query_speller: str, prompt_manager: PromptManager, reasoning_effort: Optional[str] = None, + multimodal_enabled: bool = False, + vision_endpoint: Optional[str] = None, + vision_token_provider: Optional[Callable[[], Awaitable[str]]] = None, + images_blob_container_client: Optional[ContainerClient] = None, ): self.search_client = search_client self.search_index_name = search_index_name @@ -67,6 +77,9 @@ def __init__( self.answer_prompt = self.prompt_manager.load_prompt("ask_answer_question.prompty") self.reasoning_effort = reasoning_effort self.include_token_usage = True + self.vision_endpoint = vision_endpoint + self.vision_token_provider = vision_token_provider + self.multimodal_enabled = multimodal_enabled async def run( self, @@ -93,7 +106,8 @@ async def run( | { "user_query": q, "text_sources": extra_info.data_points.text, - "image_sources": extra_info.data_points.images, + "image_sources": 
extra_info.data_points.images or [], + "citations": extra_info.data_points.citations, }, ) @@ -122,34 +136,54 @@ async def run( "content": chat_completion.choices[0].message.content, "role": chat_completion.choices[0].message.role, }, - "context": extra_info, + "context": { + "thoughts": extra_info.thoughts, + "data_points": { + "text": extra_info.data_points.text or [], + "images": extra_info.data_points.images or [], + "citations": extra_info.data_points.citations or [], + }, + }, "session_state": session_state, } async def run_search_approach( self, messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any] - ): + ) -> ExtraInfo: use_text_search = overrides.get("retrieval_mode") in ["text", "hybrid", None] use_vector_search = overrides.get("retrieval_mode") in ["vectors", "hybrid", None] use_semantic_ranker = True if overrides.get("semantic_ranker") else False use_query_rewriting = True if overrides.get("query_rewriting") else False use_semantic_captions = True if overrides.get("semantic_captions") else False - use_multimodal = True # TODO: if overrides.get("use_multimodal") else False top = overrides.get("top", 3) minimum_search_score = overrides.get("minimum_search_score", 0.0) minimum_reranker_score = overrides.get("minimum_reranker_score", 0.0) filter = self.build_filter(overrides, auth_claims) q = str(messages[-1]["content"]) - # If retrieval mode includes vectors, compute an embedding for the query + llm_inputs = overrides.get("llm_inputs") + vector_fields = overrides.get("vector_fields") + + # Use default values based on multimodal_enabled if not provided in overrides + if llm_inputs is None: + llm_inputs = self.get_default_llm_inputs() + if vector_fields is None: + vector_fields = self.get_default_vector_fields() + + llm_inputs_enum = LLMInputType(llm_inputs) if llm_inputs is not None else None + vector_fields_enum = VectorFieldType(vector_fields) if vector_fields is not None else None + use_image_embeddings = vector_fields_enum in [ + VectorFieldType.IMAGE_EMBEDDING, + VectorFieldType.TEXT_AND_IMAGE_EMBEDDINGS, + ] + use_image_sources = llm_inputs_enum in [LLMInputType.TEXT_AND_IMAGES, LLMInputType.IMAGES] + vectors: list[VectorQuery] = [] if use_vector_search: - vectors.append(await self.compute_text_embedding(q)) - - # If multimodal is enabled, also compute image embeddings - # TODO: will this work with agentic? is this doing multivector search correctly? 
- # if use_multimodal: - # vectors.append(await self.compute_image_embedding(q)) + if vector_fields_enum != VectorFieldType.IMAGE_EMBEDDING: + vectors.append(await self.compute_text_embedding(q)) + if use_image_embeddings: + vectors.append(await self.compute_image_embedding(q)) results = await self.search( top, @@ -165,26 +199,12 @@ async def run_search_approach( use_query_rewriting, ) - text_sources = self.get_sources_content(results, use_semantic_captions, use_image_citation=use_multimodal) - - # Extract unique image URLs from results if multimodal is enabled - - seen_urls = set() - image_sources = [] - if use_multimodal: - for doc in results: - if hasattr(doc, "images") and doc.images: - for img in doc.images: - # Skip if we've already processed this URL - if img["url"] in seen_urls: - continue - seen_urls.add(img["url"]) - url = await download_blob_as_base64(self.images_blob_container_client, img["url"]) - if url: - image_sources.append(url) + text_sources, image_sources, citations = await self.get_sources_content( + results, use_semantic_captions, use_image_sources=use_image_sources + ) return ExtraInfo( - DataPoints(text=text_sources, images=image_sources), + DataPoints(text=text_sources, images=image_sources, citations=citations), thoughts=[ ThoughtStep( "Search using user query", @@ -197,7 +217,8 @@ async def run_search_approach( "filter": filter, "use_vector_search": use_vector_search, "use_text_search": use_text_search, - "use_multimodal": use_multimodal, + "use_image_embeddings": use_image_embeddings, + "use_image_sources": use_image_sources, }, ), ThoughtStep( @@ -212,7 +233,7 @@ async def run_agentic_retrieval_approach( messages: list[ChatCompletionMessageParam], overrides: dict[str, Any], auth_claims: dict[str, Any], - ): + ) -> ExtraInfo: minimum_reranker_score = overrides.get("minimum_reranker_score", 0) search_index_filter = self.build_filter(overrides, auth_claims) top = overrides.get("top", 3) @@ -232,10 +253,19 @@ async def run_agentic_retrieval_approach( results_merge_strategy=results_merge_strategy, ) - text_sources = self.get_sources_content(results, use_semantic_captions=False, use_image_citation=False) + # Determine if we should use image sources based on overrides or defaults + llm_inputs = overrides.get("llm_inputs") + if llm_inputs is None: + llm_inputs = self.get_default_llm_inputs() + llm_inputs_enum = LLMInputType(llm_inputs) if llm_inputs is not None else None + use_image_sources = llm_inputs_enum in [LLMInputType.TEXT_AND_IMAGES, LLMInputType.IMAGES] + + text_sources, image_sources, citations = await self.get_sources_content( + results, use_semantic_captions=False, use_image_sources=use_image_sources + ) extra_info = ExtraInfo( - DataPoints(text=text_sources), + DataPoints(text=text_sources, images=image_sources, citations=citations), thoughts=[ ThoughtStep( "Use agentic retrieval", diff --git a/app/backend/config.py b/app/backend/config.py index 70fd591a56..fed61f3a7e 100644 --- a/app/backend/config.py +++ b/app/backend/config.py @@ -32,3 +32,5 @@ CONFIG_COSMOS_HISTORY_CONTAINER = "cosmos_history_container" CONFIG_COSMOS_HISTORY_VERSION = "cosmos_history_version" CONFIG_MULTIMODAL_ENABLED = "multimodal_enabled" +CONFIG_RAG_LLM_INPUTS_OVERRIDE = "rag_llm_inputs_override" +CONFIG_RAG_VECTOR_FIELDS_DEFAULT = "rag_vector_fields_default" diff --git a/app/backend/core/imageshelper.py b/app/backend/core/imageshelper.py index 2ba1f11dcc..e6e19bc46f 100644 --- a/app/backend/core/imageshelper.py +++ b/app/backend/core/imageshelper.py @@ -24,6 +24,8 @@ async def 
download_blob_as_base64(blob_container_client: ContainerClient, blob_u url_parts = blob_url.split("/") # Skip the domain parts and container name to get the blob path blob_path = "/".join(url_parts[4:]) + # If %20 in URL, replace it with a space + blob_path = blob_path.replace("%20", " ") else: # Treat as a direct blob path blob_path = blob_url diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 1eb918a3ca..c3b508a76c 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -88,7 +88,6 @@ def setup_blob_manager( storage_container: str, storage_resource_group: str, subscription_id: str, - store_page_images: bool, storage_key: Union[str, None] = None, image_storage_container: Union[str, None] = None, # Added this parameter ): @@ -97,12 +96,11 @@ def setup_blob_manager( return BlobManager( endpoint=f"https://{storage_account}.blob.core.windows.net", container=storage_container, - image_container=image_storage_container, account=storage_account, credential=storage_creds, - resourceGroup=storage_resource_group, - subscriptionId=subscription_id, - store_page_images=store_page_images, + resource_group=storage_resource_group, + subscription_id=subscription_id, + image_container=image_storage_container, ) @@ -465,7 +463,6 @@ async def main(strategy: Strategy, setup_index: bool = True): storage_container=os.environ["AZURE_STORAGE_CONTAINER"], storage_resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"], subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], - store_page_images=use_multimodal, storage_key=clean_key_if_exists(args.storagekey), image_storage_container=os.environ.get("AZURE_IMAGESTORAGE_CONTAINER"), # Pass the image container ) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index 3dc2c1e5b8..c02f8c21cc 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -5,9 +5,6 @@ from typing import Optional, Union from azure.core.credentials_async import AsyncTokenCredential -from azure.storage.blob import ( - UserDelegationKey, -) from azure.storage.blob.aio import BlobServiceClient from PIL import Image, ImageDraw, ImageFont @@ -27,20 +24,17 @@ def __init__( container: str, account: str, credential: Union[AsyncTokenCredential, str], - resourceGroup: str, - subscriptionId: str, - store_page_images: bool = False, + resource_group: str, + subscription_id: str, image_container: Optional[str] = None, # Added this parameter ): self.endpoint = endpoint self.credential = credential self.account = account self.container = container + self.resource_group = resource_group + self.subscription_id = subscription_id self.image_container = image_container - self.store_page_images = store_page_images - self.resourceGroup = resourceGroup - self.subscriptionId = subscriptionId - self.user_delegation_key: Optional[UserDelegationKey] = None async def upload_blob(self, file: File) -> Optional[list[str]]: async with BlobServiceClient( @@ -118,7 +112,7 @@ async def upload_document_image( return None def get_managedidentity_connectionstring(self): - return f"ResourceId=/subscriptions/{self.subscriptionId}/resourceGroups/{self.resourceGroup}/providers/Microsoft.Storage/storageAccounts/{self.account};" + return f"ResourceId=/subscriptions/{self.subscription_id}/resourceGroups/{self.resource_group}/providers/Microsoft.Storage/storageAccounts/{self.account};" async def remove_blob(self, path: Optional[str] = None): async with BlobServiceClient( diff --git a/app/backend/prepdocslib/filestrategy.py 
b/app/backend/prepdocslib/filestrategy.py index 4739a83c6a..3a3c9fa420 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -21,6 +21,7 @@ async def parse_file( blob_manager: Optional[BlobManager] = None, image_embeddings_client: Optional[ImageEmbeddings] = None, ) -> list[Section]: + await blob_manager.upload_blob(file) key = file.file_extension().lower() processor = file_processors.get(key) if processor is None: @@ -112,6 +113,7 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: + await self.blob_manager.upload_blob(file) sections = await parse_file( file, self.file_processors, self.category, self.blob_manager, self.image_embeddings ) diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index d2eeee3c7d..c76c260b5e 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -8,7 +8,7 @@ class ImageOnPage: filename: str description: str figure_id: str - page_num: int # 1-indexed + page_num: int # 0-indexed url: str | None = None embedding: list[float] | None = None diff --git a/app/backend/prepdocslib/pdfparser.py b/app/backend/prepdocslib/pdfparser.py index d402e66c8b..c33ec1bcdd 100644 --- a/app/backend/prepdocslib/pdfparser.py +++ b/app/backend/prepdocslib/pdfparser.py @@ -243,7 +243,7 @@ async def process_figure(doc: pymupdf.Document, figure: DocumentFigure, media_de figure_description = await media_describer.describe_image(cropped_img) return ImageOnPage( bytes=cropped_img, - page_num=page_number, + page_num=page_number - 1, # Convert to 0-indexed figure_id=figure.id, bbox=bbox_pixels, filename=f"figure{figure.id.replace(".", "_")}.png", diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index 95f07a32f3..e2ef38d62e 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -282,6 +282,7 @@ async def create_index(self): raise ValueError("Image search profile and algorithm must be set") vector_search_profiles.append(image_vector_search_profile) vector_algorithms.append(image_vector_algorithm) + # TODO: Add image vectorizer if needed index = SearchIndex( name=self.search_info.index_name, diff --git a/app/frontend/src/api/api.ts b/app/frontend/src/api/api.ts index df95f801b5..d7512326f2 100644 --- a/app/frontend/src/api/api.ts +++ b/app/frontend/src/api/api.ts @@ -79,7 +79,9 @@ export async function getSpeechApi(text: string): Promise { } export function getCitationFilePath(citation: string): string { - return `${BACKEND_URI}/content/${citation}`; + // If there are parentheses at end of citation, remove part in parentheses + const cleanedCitation = citation.replace(/\s*\(.*?\)\s*$/, "").trim(); + return `${BACKEND_URI}/content/${cleanedCitation}`; } export async function uploadFileApi(request: FormData, idToken: string): Promise { diff --git a/app/frontend/src/api/models.ts b/app/frontend/src/api/models.ts index 63ff5c31f4..9d3124d716 100644 --- a/app/frontend/src/api/models.ts +++ b/app/frontend/src/api/models.ts @@ -4,7 +4,7 @@ export const enum RetrievalMode { Text = "text" } -export const enum GPT4VInput { +export const enum LLMInputs { TextAndImages = "textAndImages", Images = "images", Texts = "texts" @@ -37,8 +37,7 @@ export type ChatAppRequestOverrides = { suggest_followup_questions?: boolean; use_oid_security_filter?: boolean; use_groups_security_filter?: boolean; - use_gpt4v?: boolean; - gpt4v_input?: GPT4VInput; + llm_inputs: LLMInputs; 
vector_fields: VectorFields; language: string; use_agentic_retrieval: boolean; @@ -55,8 +54,14 @@ export type Thoughts = { props?: { [key: string]: any }; }; +export type DataPoints = { + text: string[]; + images: string[]; + citations: string[]; +}; + export type ResponseContext = { - data_points: string[]; + data_points: DataPoints; followup_questions: string[] | null; thoughts: Thoughts[]; }; @@ -88,7 +93,7 @@ export type ChatAppRequest = { export type Config = { defaultReasoningEffort: string; - showGPT4VOptions: boolean; + showMultimodalOptions: boolean; showSemanticRankerOption: boolean; showQueryRewritingOption: boolean; showReasoningEffortOption: boolean; @@ -102,6 +107,8 @@ export type Config = { showChatHistoryBrowser: boolean; showChatHistoryCosmos: boolean; showAgenticRetrievalOption: boolean; + ragLlmInputsOverride: string; + ragVectorFieldsDefault: string; }; export type SimpleAPIResponse = { diff --git a/app/frontend/src/components/Answer/Answer.tsx b/app/frontend/src/components/Answer/Answer.tsx index c9d73a8a76..e3688c5783 100644 --- a/app/frontend/src/components/Answer/Answer.tsx +++ b/app/frontend/src/components/Answer/Answer.tsx @@ -110,8 +110,10 @@ export const Answer = ({ {t("citationWithColon")} {parsedAnswer.citations.map((x, i) => { const path = getCitationFilePath(x); + // Strip out the image filename in parentheses if it exists + const strippedPath = path.replace(/\([^)]*\)$/, ""); return ( - onCitationClicked(path)}> + onCitationClicked(strippedPath)}> {`${++i}. ${x}`} ); diff --git a/app/frontend/src/components/Answer/AnswerParser.tsx b/app/frontend/src/components/Answer/AnswerParser.tsx index 3807592f6d..12b215b10c 100644 --- a/app/frontend/src/components/Answer/AnswerParser.tsx +++ b/app/frontend/src/components/Answer/AnswerParser.tsx @@ -6,32 +6,8 @@ type HtmlParsedAnswer = { citations: string[]; }; -// Function to validate citation format and check if dataPoint starts with possible citation -function isCitationValid(contextDataPoints: any, citationCandidate: string): boolean { - const regex = /.+\.\w{1,}(?:#\S*)?$/; - if (!regex.test(citationCandidate)) { - return false; - } - - // Check if contextDataPoints is an object with a text property that is an array - let dataPointsArray: string[]; - if (Array.isArray(contextDataPoints)) { - dataPointsArray = contextDataPoints; - } else if (contextDataPoints && Array.isArray(contextDataPoints.text)) { - dataPointsArray = contextDataPoints.text; - } else { - return false; - } - - const isValidCitation = dataPointsArray.some(dataPoint => { - return dataPoint.startsWith(citationCandidate); - }); - - return isValidCitation; -} - export function parseAnswerToHtml(answer: ChatAppResponse, isStreaming: boolean, onCitationClicked: (citationFilePath: string) => void): HtmlParsedAnswer { - const contextDataPoints = answer.context.data_points; + const possibleCitations = answer.context.data_points.citations || []; const citations: string[] = []; // Trim any whitespace from the end of the answer after removing follow-up questions @@ -60,7 +36,11 @@ export function parseAnswerToHtml(answer: ChatAppResponse, isStreaming: boolean, } else { let citationIndex: number; - if (!isCitationValid(contextDataPoints, part)) { + const isValidCitation = possibleCitations.some(citation => { + return citation.startsWith(part); + }); + + if (!isValidCitation) { return `[${part}]`; } diff --git a/app/frontend/src/components/Example/ExampleList.tsx b/app/frontend/src/components/Example/ExampleList.tsx index dab4ec97ec..2c89384c7f 100644 --- 
a/app/frontend/src/components/Example/ExampleList.tsx +++ b/app/frontend/src/components/Example/ExampleList.tsx @@ -5,18 +5,18 @@ import styles from "./Example.module.css"; interface Props { onExampleClicked: (value: string) => void; - useGPT4V?: boolean; + useMultimodalAnswering?: boolean; } -export const ExampleList = ({ onExampleClicked, useGPT4V }: Props) => { +export const ExampleList = ({ onExampleClicked, useMultimodalAnswering }: Props) => { const { t } = useTranslation(); const DEFAULT_EXAMPLES: string[] = [t("defaultExamples.1"), t("defaultExamples.2"), t("defaultExamples.3")]; - const GPT4V_EXAMPLES: string[] = [t("gpt4vExamples.1"), t("gpt4vExamples.2"), t("gpt4vExamples.3")]; + const MULTIMODAL_EXAMPLES: string[] = [t("multimodalExamples.1"), t("multimodalExamples.2"), t("multimodalExamples.3")]; return (
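The LLMInputType and VectorFieldType enums and the per-request overrides are resolved the same way in both RetrieveThenReadApproach and ChatReadRetrieveReadApproach: fall back to multimodal-based defaults, then derive the two booleans the search step branches on. A minimal standalone sketch of that resolution logic, with the function name resolve_multimodal_settings being illustrative rather than part of the patch:

```python
from enum import Enum
from typing import Any


class LLMInputType(str, Enum):
    TEXT_AND_IMAGES = "textAndImages"
    IMAGES = "images"
    TEXTS = "texts"


class VectorFieldType(str, Enum):
    EMBEDDING = "textEmbeddingOnly"
    IMAGE_EMBEDDING = "imageEmbeddingOnly"
    TEXT_AND_IMAGE_EMBEDDINGS = "textAndImageEmbeddings"


def resolve_multimodal_settings(overrides: dict[str, Any], multimodal_enabled: bool) -> tuple[bool, bool]:
    # Fall back to multimodal-based defaults when the client sends no override,
    # mirroring get_default_llm_inputs / get_default_vector_fields.
    llm_inputs = overrides.get("llm_inputs")
    if llm_inputs is None:
        llm_inputs = LLMInputType.TEXT_AND_IMAGES if multimodal_enabled else LLMInputType.TEXTS
    vector_fields = overrides.get("vector_fields")
    if vector_fields is None:
        vector_fields = VectorFieldType.TEXT_AND_IMAGE_EMBEDDINGS if multimodal_enabled else VectorFieldType.EMBEDDING

    # Derive the two flags used by run_search_approach.
    use_image_embeddings = VectorFieldType(vector_fields) in (
        VectorFieldType.IMAGE_EMBEDDING,
        VectorFieldType.TEXT_AND_IMAGE_EMBEDDINGS,
    )
    use_image_sources = LLMInputType(llm_inputs) in (
        LLMInputType.TEXT_AND_IMAGES,
        LLMInputType.IMAGES,
    )
    return use_image_embeddings, use_image_sources


# With multimodal enabled and no client overrides, both image paths are active:
assert resolve_multimodal_settings({}, multimodal_enabled=True) == (True, True)
# An explicit text-only override disables them:
assert resolve_multimodal_settings(
    {"llm_inputs": "texts", "vector_fields": "textEmbeddingOnly"}, multimodal_enabled=False
) == (False, False)
```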
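The citation format ties the backend and frontend changes together: page-image blobs cite back to the source PDF page, image sources get the image filename appended in parentheses, and getCitationFilePath strips that suffix before requesting /content/. A sketch of that round trip, assuming made-up blob and file names; the real get_image_citation also guards against an empty source page:

```python
import os
import re


def get_citation(sourcepage: str) -> str:
    # Same mapping as Approach.get_citation: a per-page image name such as
    # "Benefit_Options-2.png" points back to page 2 of the source PDF.
    path, ext = os.path.splitext(sourcepage)
    if ext.lower() == ".png":
        page_idx = path.rfind("-")
        page_number = int(path[page_idx + 1:])
        return f"{path[:page_idx]}.pdf#page={page_number}"
    return sourcepage


def get_image_citation(sourcepage: str, image_url: str) -> str:
    # Append the image filename in parentheses, as Approach.get_image_citation does.
    image_filename = image_url.split("/")[-1]
    return f"{get_citation(sourcepage)}({image_filename})"


def strip_image_suffix(citation: str) -> str:
    # Same regex the frontend now applies in getCitationFilePath before
    # building the /content/ URL: drop a trailing "(image.png)" suffix.
    return re.sub(r"\s*\(.*?\)\s*$", "", citation).strip()


citation = get_image_citation(
    "Benefit_Options-2.png",
    "https://account.blob.core.windows.net/images/figure1_2.png",
)
assert citation == "Benefit_Options.pdf#page=2(figure1_2.png)"
assert strip_image_suffix(citation) == "Benefit_Options.pdf#page=2"
```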
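For image sources, download_blob_as_base64 now also decodes %20 in the blob path extracted from a full URL. The sketch below mirrors only that path-parsing step, without the blob I/O; the startswith("http") check and the account/container/file names are assumptions for illustration, since the patch shows only the branch bodies:

```python
def blob_path_from_url(blob_url: str) -> str:
    # Skip "https:", "", the host, and the container name, then decode %20
    # the same way the updated helper does.
    if blob_url.startswith("http"):
        url_parts = blob_url.split("/")
        blob_path = "/".join(url_parts[4:])
        blob_path = blob_path.replace("%20", " ")
    else:
        # Treat as a container-relative blob path.
        blob_path = blob_url
    return blob_path


assert blob_path_from_url(
    "https://myaccount.blob.core.windows.net/images/Benefit%20Options/figure1_1.png"
) == "Benefit Options/figure1_1.png"
```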