diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
index 989f73871..dc4db97c5 100644
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@@ -18,5 +18,11 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -r ClientAdvisor/App/requirements.txt
- - name: Run flake8
- run: flake8 --config=ClientAdvisor/App/.flake8 ClientAdvisor/App
+ pip install -r ResearchAssistant/App/requirements.txt
+ pip install pylint flake8 azure-identity pymssql
+ - name: Run flake8 and pylint
+ run: |
+ flake8 --config=ClientAdvisor/App/.flake8 ClientAdvisor/App
+ pylint --rcfile=ClientAdvisor/App/.pylintrc ClientAdvisor/App
+ flake8 --config=ResearchAssistant/App/.flake8 ResearchAssistant/App
+ pylint --rcfile=ResearchAssistant/App/.pylintrc ResearchAssistant/App
\ No newline at end of file
diff --git a/ClientAdvisor/App/.flake8 b/ClientAdvisor/App/.flake8
index 74ee71d52..bc2f0943d 100644
--- a/ClientAdvisor/App/.flake8
+++ b/ClientAdvisor/App/.flake8
@@ -1,4 +1,4 @@
[flake8]
-max-line-length = 88
-extend-ignore = E501, E203
-exclude = .venv, frontend,
\ No newline at end of file
+max-line-length = 160
+extend-ignore = E203, W503, E501
+exclude = .venv, frontend
\ No newline at end of file
diff --git a/ClientAdvisor/App/.pylintrc b/ClientAdvisor/App/.pylintrc
new file mode 100644
index 000000000..dc035c349
--- /dev/null
+++ b/ClientAdvisor/App/.pylintrc
@@ -0,0 +1,41 @@
+[MASTER]
+ignore=tests ; Ignore the tests folder globally.
+
+[MESSAGES CONTROL]
+disable=
+ invalid-name, # C0103: Ignore naming style errors
+ line-too-long, # C0301: Ignore long lines
+ missing-function-docstring, # C0116: Ignore missing function docstrings
+ missing-class-docstring, # C0115: Ignore missing class docstrings
+ missing-module-docstring, # C0114: Ignore missing module docstrings
+ redefined-outer-name, # W0621: Ignore redefined variables warnings
+ broad-exception-raised, # W0719: Ignore broad exception raised warnings
+ broad-exception-caught, # W0718: Ignore broad exception caught warnings
+ too-many-arguments, # R0913: Ignore too many arguments
+ too-many-locals, # R0914: Ignore too many local variables
+ too-many-return-statements, # R0911: Ignore too many return statements
+ too-many-branches, # R0912: Ignore too many branches
+ unused-argument, # W0613: Ignore unused arguments
+ unspecified-encoding, # W1514: Ignore unspecified encoding in open()
+    logging-fstring-interpolation, # W1203: Ignore f-string interpolation in logging calls
+ missing-timeout, # W3101: Ignore missing timeout in requests.get
+ no-else-return, # R1705: Ignore unnecessary 'else' after return
+ redefined-builtin, # W0622: Ignore redefining built-ins
+ global-statement, # W0603: Ignore global statement usage
+ no-name-in-module, # E0611: Ignore unresolved module names
+ no-member, # E1101: Ignore module has no 'member'
+ pointless-string-statement, # W0105: Ignore pointless string statements
+ unnecessary-comprehension, # R1721: Ignore unnecessary comprehensions
+ fixme, # W0511: Ignore TODO comments
+ too-many-instance-attributes, # R0902: Ignore too many attributes in class
+ too-many-positional-arguments, # R0917: Ignore too many positional arguments
+ raise-missing-from, # W0707: Ignore re-raising without 'raise from'
+ import-outside-toplevel, # C0415: Ignore imports outside top-level
+ no-value-for-parameter # E1120: Ignore missing arguments in function
+
+[TYPECHECK]
+generated-members=get_bearer_token_provider
+
+[FORMAT]
+max-module-lines=1700 # Allow large modules up to 1700 lines
+max-line-length=160 # Allow lines up to 160 characters
\ No newline at end of file
diff --git a/ClientAdvisor/App/app.py b/ClientAdvisor/App/app.py
index e82212438..2a25c2d14 100644
--- a/ClientAdvisor/App/app.py
+++ b/ClientAdvisor/App/app.py
@@ -8,22 +8,31 @@
import httpx
import requests
-from azure.identity.aio import (DefaultAzureCredential,
- get_bearer_token_provider)
+from azure.identity.aio import DefaultAzureCredential, get_bearer_token_provider
+from backend.auth.auth_utils import get_authenticated_user_details, get_tenantid
+from backend.history.cosmosdbservice import CosmosConversationClient
+from backend.utils import (
+ convert_to_pf_format,
+ format_as_ndjson,
+ format_pf_non_streaming_response,
+ format_stream_response,
+ generateFilterString,
+ parse_multi_columns,
+)
+from db import get_connection
from dotenv import load_dotenv
+
# from quart.sessions import SecureCookieSessionInterface
from openai import AsyncAzureOpenAI
-from quart import (Blueprint, Quart, jsonify, make_response, render_template,
- request, send_from_directory)
-
-from backend.auth.auth_utils import (get_authenticated_user_details,
- get_tenantid)
-from backend.history.cosmosdbservice import CosmosConversationClient
-from backend.utils import (convert_to_pf_format, format_as_ndjson,
- format_pf_non_streaming_response,
- format_stream_response, generateFilterString,
- parse_multi_columns)
-from db import get_connection
+from quart import (
+ Blueprint,
+ Quart,
+ jsonify,
+ make_response,
+ render_template,
+ request,
+ send_from_directory,
+)
bp = Blueprint("routes", __name__, static_folder="static", template_folder="static")
@@ -172,7 +181,7 @@ async def assets(path):
"AZURE_COSMOSDB_MONGO_VCORE_VECTOR_COLUMNS"
)
-SHOULD_STREAM = True if AZURE_OPENAI_STREAM.lower() == "true" else False
+SHOULD_STREAM = AZURE_OPENAI_STREAM.lower() == "true"
# Chat History CosmosDB Integration Settings
AZURE_COSMOSDB_DATABASE = os.environ.get("AZURE_COSMOSDB_DATABASE")
@@ -373,7 +382,7 @@ def init_openai_client(use_data=SHOULD_USE_DATA):
return azure_openai_client
except Exception as e:
- logging.exception("Exception in Azure OpenAI initialization", e)
+ logging.exception("Exception in Azure OpenAI initialization", exc_info=True)
azure_openai_client = None
raise e
@@ -399,7 +408,7 @@ def init_cosmosdb_client():
enable_message_feedback=AZURE_COSMOSDB_ENABLE_FEEDBACK,
)
except Exception as e:
- logging.exception("Exception in CosmosDB initialization", e)
+ logging.exception("Exception in CosmosDB initialization", exc_info=True)
cosmos_conversation_client = None
raise e
else:
@@ -472,9 +481,7 @@ def get_configured_data_source():
else []
),
},
- "in_scope": (
- True if AZURE_SEARCH_ENABLE_IN_DOMAIN.lower() == "true" else False
- ),
+ "in_scope": (AZURE_SEARCH_ENABLE_IN_DOMAIN.lower() == "true"),
"top_n_documents": (
int(AZURE_SEARCH_TOP_K) if AZURE_SEARCH_TOP_K else int(SEARCH_TOP_K)
),
@@ -534,9 +541,7 @@ def get_configured_data_source():
),
},
"in_scope": (
- True
- if AZURE_COSMOSDB_MONGO_VCORE_ENABLE_IN_DOMAIN.lower() == "true"
- else False
+ AZURE_COSMOSDB_MONGO_VCORE_ENABLE_IN_DOMAIN.lower() == "true"
),
"top_n_documents": (
int(AZURE_COSMOSDB_MONGO_VCORE_TOP_K)
@@ -590,9 +595,7 @@ def get_configured_data_source():
else []
),
},
- "in_scope": (
- True if ELASTICSEARCH_ENABLE_IN_DOMAIN.lower() == "true" else False
- ),
+ "in_scope": (ELASTICSEARCH_ENABLE_IN_DOMAIN.lower() == "true"),
"top_n_documents": (
int(ELASTICSEARCH_TOP_K)
if ELASTICSEARCH_TOP_K
@@ -642,9 +645,7 @@ def get_configured_data_source():
else []
),
},
- "in_scope": (
- True if AZURE_MLINDEX_ENABLE_IN_DOMAIN.lower() == "true" else False
- ),
+ "in_scope": (AZURE_MLINDEX_ENABLE_IN_DOMAIN.lower() == "true"),
"top_n_documents": (
int(AZURE_MLINDEX_TOP_K)
if AZURE_MLINDEX_TOP_K
@@ -687,9 +688,7 @@ def get_configured_data_source():
else []
),
},
- "in_scope": (
- True if PINECONE_ENABLE_IN_DOMAIN.lower() == "true" else False
- ),
+ "in_scope": (PINECONE_ENABLE_IN_DOMAIN.lower() == "true"),
"top_n_documents": (
int(PINECONE_TOP_K) if PINECONE_TOP_K else int(SEARCH_TOP_K)
),
diff --git a/ClientAdvisor/App/backend/history/cosmosdbservice.py b/ClientAdvisor/App/backend/history/cosmosdbservice.py
index 70c2df5b1..85dc1695e 100644
--- a/ClientAdvisor/App/backend/history/cosmosdbservice.py
+++ b/ClientAdvisor/App/backend/history/cosmosdbservice.py
@@ -27,8 +27,7 @@ def __init__(
except exceptions.CosmosHttpResponseError as e:
if e.status_code == 401:
raise ValueError("Invalid credentials") from e
- else:
- raise ValueError("Invalid CosmosDB endpoint") from e
+ raise ValueError("Invalid CosmosDB endpoint") from e
try:
self.database_client = self.cosmosdb_client.get_database_client(
diff --git a/ClientAdvisor/App/tests/backend/auth/test_auth.py b/ClientAdvisor/App/tests/backend/auth/test_auth.py
index 1adf323d5..7854d9b07 100644
--- a/ClientAdvisor/App/tests/backend/auth/test_auth.py
+++ b/ClientAdvisor/App/tests/backend/auth/test_auth.py
@@ -2,8 +2,7 @@
import json
from unittest.mock import patch
-from backend.auth.auth_utils import (get_authenticated_user_details,
- get_tenantid)
+from backend.auth.auth_utils import get_authenticated_user_details, get_tenantid
def test_get_authenticated_user_details_no_principal_id():
diff --git a/ClientAdvisor/App/tests/backend/history/test_cosmosdb_service.py b/ClientAdvisor/App/tests/backend/history/test_cosmosdb_service.py
index ff0a51e5b..b28096d9e 100644
--- a/ClientAdvisor/App/tests/backend/history/test_cosmosdb_service.py
+++ b/ClientAdvisor/App/tests/backend/history/test_cosmosdb_service.py
@@ -2,7 +2,6 @@
import pytest
from azure.cosmos import exceptions
-
from backend.history.cosmosdbservice import CosmosConversationClient
diff --git a/ClientAdvisor/App/tests/backend/test_utils.py b/ClientAdvisor/App/tests/backend/test_utils.py
index 1585cd7fb..4880c98c9 100644
--- a/ClientAdvisor/App/tests/backend/test_utils.py
+++ b/ClientAdvisor/App/tests/backend/test_utils.py
@@ -3,12 +3,17 @@
from unittest.mock import MagicMock, patch
import pytest
-
-from backend.utils import (JSONEncoder, convert_to_pf_format, fetchUserGroups,
- format_as_ndjson, format_non_streaming_response,
- format_pf_non_streaming_response,
- format_stream_response, generateFilterString,
- parse_multi_columns)
+from backend.utils import (
+ JSONEncoder,
+ convert_to_pf_format,
+ fetchUserGroups,
+ format_as_ndjson,
+ format_non_streaming_response,
+ format_pf_non_streaming_response,
+ format_stream_response,
+ generateFilterString,
+ parse_multi_columns,
+)
@dataclasses.dataclass
diff --git a/ClientAdvisor/App/tests/test_app.py b/ClientAdvisor/App/tests/test_app.py
index d456ac702..3cfd1269f 100644
--- a/ClientAdvisor/App/tests/test_app.py
+++ b/ClientAdvisor/App/tests/test_app.py
@@ -2,9 +2,14 @@
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
-
-from app import (create_app, delete_all_conversations, generate_title,
- init_cosmosdb_client, init_openai_client, stream_chat_request)
+from app import (
+ create_app,
+ delete_all_conversations,
+ generate_title,
+ init_cosmosdb_client,
+ init_openai_client,
+ stream_chat_request,
+)
# Constants for testing
INVALID_API_VERSION = "2022-01-01"
diff --git a/ClientAdvisor/App/tools/data_collection.py b/ClientAdvisor/App/tools/data_collection.py
index c0bb184bc..60644c092 100644
--- a/ClientAdvisor/App/tools/data_collection.py
+++ b/ClientAdvisor/App/tools/data_collection.py
@@ -3,9 +3,8 @@
import os
import sys
-from dotenv import load_dotenv
-
import app
+from dotenv import load_dotenv
# import the app.py module to gain access to the methods to construct payloads and
# call the API through the sdk
diff --git a/ClientAdvisor/AzureFunction/function_app.py b/ClientAdvisor/AzureFunction/function_app.py
index f9bfd8dc8..fea37ad50 100644
--- a/ClientAdvisor/AzureFunction/function_app.py
+++ b/ClientAdvisor/AzureFunction/function_app.py
@@ -1,13 +1,16 @@
-import azure.functions as func
-import openai
-from azurefunctions.extensions.http.fastapi import Request, StreamingResponse
import asyncio
import os
-
from typing import Annotated
+import azure.functions as func
+import openai
+import pymssql
+from azurefunctions.extensions.http.fastapi import Request, StreamingResponse
from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
-from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion, OpenAIChatCompletion
+from semantic_kernel.connectors.ai.open_ai import (
+ AzureChatCompletion,
+ OpenAIChatCompletion,
+)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
)
@@ -17,7 +20,6 @@
from semantic_kernel.functions.kernel_arguments import KernelArguments
from semantic_kernel.functions.kernel_function_decorator import kernel_function
from semantic_kernel.kernel import Kernel
-import pymssql
# Azure Function App
app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS)
@@ -28,43 +30,53 @@
deployment = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT_MODEL")
temperature = 0
-search_endpoint = os.environ.get("AZURE_AI_SEARCH_ENDPOINT")
+search_endpoint = os.environ.get("AZURE_AI_SEARCH_ENDPOINT")
search_key = os.environ.get("AZURE_AI_SEARCH_API_KEY")
+
class ChatWithDataPlugin:
- @kernel_function(name="Greeting", description="Respond to any greeting or general questions")
- def greeting(self, input: Annotated[str, "the question"]) -> Annotated[str, "The output is a string"]:
- query = input.split(':::')[0]
+ @kernel_function(
+ name="Greeting", description="Respond to any greeting or general questions"
+ )
+ def greeting(
+ self, input: Annotated[str, "the question"]
+ ) -> Annotated[str, "The output is a string"]:
+ query = input.split(":::")[0]
endpoint = os.environ.get("AZURE_OPEN_AI_ENDPOINT")
api_key = os.environ.get("AZURE_OPEN_AI_API_KEY")
client = openai.AzureOpenAI(
- azure_endpoint=endpoint,
- api_key=api_key,
- api_version="2023-09-01-preview"
+ azure_endpoint=endpoint, api_key=api_key, api_version="2023-09-01-preview"
)
deployment = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT_MODEL")
try:
completion = client.chat.completions.create(
model=deployment,
messages=[
- {"role": "system", "content": "You are a helpful assistant to repond to any greeting or general questions."},
+ {
+ "role": "system",
+                        "content": "You are a helpful assistant to respond to any greeting or general questions.",
+ },
{"role": "user", "content": query},
],
temperature=0,
)
answer = completion.choices[0].message.content
except Exception as e:
- answer = str(e) # 'Information from database could not be retrieved. Please try again later.'
+ answer = str(
+ e
+ ) # 'Information from database could not be retrieved. Please try again later.'
return answer
-
- @kernel_function(name="ChatWithSQLDatabase", description="Given a query about client assets, investements and meeting dates or times, get details from the database")
+ @kernel_function(
+ name="ChatWithSQLDatabase",
+        description="Given a query about client assets, investments and meeting dates or times, get details from the database",
+ )
def get_SQL_Response(
self,
input: Annotated[str, "the question"],
- ClientId: Annotated[str, "the ClientId"]
- ):
-
+ ClientId: Annotated[str, "the ClientId"],
+ ):
+
# clientid = input.split(':::')[-1]
# query = input.split(':::')[0] + ' . ClientId = ' + input.split(':::')[-1]
clientid = ClientId
@@ -73,13 +85,11 @@ def get_SQL_Response(
api_key = os.environ.get("AZURE_OPEN_AI_API_KEY")
client = openai.AzureOpenAI(
- azure_endpoint=endpoint,
- api_key=api_key,
- api_version="2023-09-01-preview"
+ azure_endpoint=endpoint, api_key=api_key, api_version="2023-09-01-preview"
)
deployment = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT_MODEL")
- sql_prompt = f'''A valid T-SQL query to find {query} for tables and columns provided below:
+ sql_prompt = f"""A valid T-SQL query to find {query} for tables and columns provided below:
1. Table: Clients
Columns: ClientId,Client,Email,Occupation,MaritalStatus,Dependents
2. Table: InvestmentGoals
@@ -100,7 +110,7 @@ def get_SQL_Response(
Do not include assets values unless asked for.
Always use ClientId = {clientid} in the query filter.
Always return client name in the query.
- Only return the generated sql query. do not return anything else'''
+ Only return the generated sql query. do not return anything else"""
try:
completion = client.chat.completions.create(
@@ -112,9 +122,9 @@ def get_SQL_Response(
temperature=0,
)
sql_query = completion.choices[0].message.content
- sql_query = sql_query.replace("```sql",'').replace("```",'')
- #print(sql_query)
-
+ sql_query = sql_query.replace("```sql", "").replace("```", "")
+ # print(sql_query)
+
connectionString = os.environ.get("SQLDB_CONNECTION_STRING")
server = os.environ.get("SQLDB_SERVER")
database = os.environ.get("SQLDB_DATABASE")
@@ -125,57 +135,55 @@ def get_SQL_Response(
# conn = pyodbc.connect(connectionString)
cursor = conn.cursor()
cursor.execute(sql_query)
- answer = ''
+ answer = ""
for row in cursor.fetchall():
answer += str(row)
except Exception as e:
- answer = str(e) # 'Information from database could not be retrieved. Please try again later.'
+ answer = str(
+ e
+ ) # 'Information from database could not be retrieved. Please try again later.'
return answer
-        #return sql_query
-
-    @kernel_function(name="ChatWithCallTranscripts", description="given a query about meetings summary or actions or notes, get answer from search index for a given ClientId")
+        # return sql_query
+ @kernel_function(
+ name="ChatWithCallTranscripts",
+        description="Given a query about meeting summaries, actions, or notes, get the answer from the search index for a given ClientId",
+ )
def get_answers_from_calltranscripts(
self,
question: Annotated[str, "the question"],
- ClientId: Annotated[str, "the ClientId"]
+ ClientId: Annotated[str, "the ClientId"],
):
- endpoint=os.environ.get("AZURE_OPEN_AI_ENDPOINT")
- deployment=os.environ.get("AZURE_OPEN_AI_DEPLOYMENT_MODEL")
- apikey=os.environ.get("AZURE_OPEN_AI_API_KEY")
+ endpoint = os.environ.get("AZURE_OPEN_AI_ENDPOINT")
+ deployment = os.environ.get("AZURE_OPEN_AI_DEPLOYMENT_MODEL")
+ apikey = os.environ.get("AZURE_OPEN_AI_API_KEY")
- search_endpoint = os.environ.get("AZURE_AI_SEARCH_ENDPOINT")
+ search_endpoint = os.environ.get("AZURE_AI_SEARCH_ENDPOINT")
search_key = os.environ.get("AZURE_AI_SEARCH_API_KEY")
index_name = os.environ.get("AZURE_SEARCH_INDEX")
client = openai.AzureOpenAI(
- azure_endpoint= endpoint, #f"{endpoint}/openai/deployments/{deployment}/extensions",
- api_key=apikey,
- api_version="2024-02-01"
+ azure_endpoint=endpoint, # f"{endpoint}/openai/deployments/{deployment}/extensions",
+ api_key=apikey,
+ api_version="2024-02-01",
)
query = question
- system_message = '''You are an assistant who provides wealth advisors with helpful information to prepare for client meetings.
+ system_message = """You are an assistant who provides wealth advisors with helpful information to prepare for client meetings.
You have access to the client’s meeting call transcripts.
- You can use this information to answer questions about the clients'''
+ You can use this information to answer questions about the clients"""
completion = client.chat.completions.create(
- model = deployment,
- messages = [
- {
- "role": "system",
- "content": system_message
- },
- {
- "role": "user",
- "content": query
- }
+ model=deployment,
+ messages=[
+ {"role": "system", "content": system_message},
+ {"role": "user", "content": query},
],
- seed = 42,
- temperature = 0,
- max_tokens = 800,
- extra_body = {
+ seed=42,
+ temperature=0,
+ max_tokens=800,
+ extra_body={
"data_sources": [
{
"type": "azure_search",
@@ -183,47 +191,45 @@ def get_answers_from_calltranscripts(
"endpoint": search_endpoint,
"index_name": index_name,
"semantic_configuration": "default",
- "query_type": "vector_simple_hybrid", #"vector_semantic_hybrid"
+ "query_type": "vector_simple_hybrid", # "vector_semantic_hybrid"
"fields_mapping": {
"content_fields_separator": "\n",
"content_fields": ["content"],
"filepath_field": "chunk_id",
- "title_field": "", #null,
+ "title_field": "", # null,
"url_field": "sourceurl",
- "vector_fields": ["contentVector"]
+ "vector_fields": ["contentVector"],
},
- "semantic_configuration": 'my-semantic-config',
+ "semantic_configuration": "my-semantic-config",
"in_scope": "true",
"role_information": system_message,
# "vector_filter_mode": "preFilter", #VectorFilterMode.PRE_FILTER,
- "filter": f"client_id eq '{ClientId}'", #"", #null,
+ "filter": f"client_id eq '{ClientId}'", # "", #null,
"strictness": 3,
"top_n_documents": 5,
- "authentication": {
- "type": "api_key",
- "key": search_key
- },
+ "authentication": {"type": "api_key", "key": search_key},
"embedding_dependency": {
"type": "deployment_name",
- "deployment_name": "text-embedding-ada-002"
+ "deployment_name": "text-embedding-ada-002",
},
-
- }
+ },
}
]
- }
+ },
)
answer = completion.choices[0].message.content
return answer
+
# Get data from Azure Open AI
async def stream_processor(response):
async for message in response:
- if str(message[0]): # Get remaining generated response if applicable
+ if str(message[0]): # Get remaining generated response if applicable
await asyncio.sleep(0.1)
yield str(message[0])
+
@app.route(route="stream_openai_text", methods=[func.HttpMethod.GET])
async def stream_openai_text(req: Request) -> StreamingResponse:
@@ -242,15 +248,15 @@ async def stream_openai_text(req: Request) -> StreamingResponse:
endpoint=endpoint,
api_key=api_key,
api_version=api_version,
- deployment_name=deployment
+ deployment_name=deployment,
)
kernel.add_service(ai_service)
kernel.add_plugin(ChatWithDataPlugin(), plugin_name="ChatWithData")
- settings: OpenAIChatPromptExecutionSettings = kernel.get_prompt_execution_settings_from_service_id(
- service_id=service_id
+ settings: OpenAIChatPromptExecutionSettings = (
+ kernel.get_prompt_execution_settings_from_service_id(service_id=service_id)
)
settings.function_call_behavior = FunctionCallBehavior.EnableFunctions(
auto_invoke=True, filters={"included_plugins": ["ChatWithData"]}
@@ -259,25 +265,28 @@ async def stream_openai_text(req: Request) -> StreamingResponse:
settings.max_tokens = 800
settings.temperature = 0
- system_message = '''you are a helpful assistant to a wealth advisor.
+ system_message = """you are a helpful assistant to a wealth advisor.
Do not answer any questions not related to wealth advisors queries.
If the client name and client id do not match, only return - Please only ask questions about the selected client or select another client to inquire about their details. do not return any other information.
Only use the client name returned from database in the response.
If you cannot answer the question, always return - I cannot answer this question from the data available. Please rephrase or add more details.
** Remove any client identifiers or ids or numbers or ClientId in the final response.
- '''
-
- user_query = query.replace('?',' ')
+ """
- user_query_prompt = f'''{user_query}. Always send clientId as {user_query.split(':::')[-1]} '''
- query_prompt = f'''{system_message}{user_query_prompt}'''
+ user_query = query.replace("?", " ")
+ user_query_prompt = (
+ f"""{user_query}. Always send clientId as {user_query.split(':::')[-1]} """
+ )
+ query_prompt = f"""{system_message}{user_query_prompt}"""
sk_response = kernel.invoke_prompt_stream(
function_name="prompt_test",
plugin_name="weather_test",
prompt=query_prompt,
- settings=settings
- )
+ settings=settings,
+ )
- return StreamingResponse(stream_processor(sk_response), media_type="text/event-stream")
\ No newline at end of file
+ return StreamingResponse(
+ stream_processor(sk_response), media_type="text/event-stream"
+ )
diff --git a/ClientAdvisor/Deployment/scripts/fabric_scripts/create_fabric_items.py b/ClientAdvisor/Deployment/scripts/fabric_scripts/create_fabric_items.py
index 9a718a425..e5e159d76 100644
--- a/ClientAdvisor/Deployment/scripts/fabric_scripts/create_fabric_items.py
+++ b/ClientAdvisor/Deployment/scripts/fabric_scripts/create_fabric_items.py
@@ -1,88 +1,85 @@
-from azure.identity import DefaultAzureCredential
import base64
import json
-import requests
-import pandas as pd
import os
-from glob import iglob
import time
+from glob import iglob
+import pandas as pd
+import requests
# credential = DefaultAzureCredential()
-from azure.identity import AzureCliCredential
+from azure.identity import AzureCliCredential, DefaultAzureCredential
+
credential = AzureCliCredential()
-cred = credential.get_token('https://api.fabric.microsoft.com/.default')
+cred = credential.get_token("https://api.fabric.microsoft.com/.default")
token = cred.token
fabric_headers = {"Authorization": "Bearer " + token.strip()}
-key_vault_name = 'kv_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
workspaceId = "workspaceId_to-be-replaced"
solutionname = "solutionName_to-be-replaced"
create_workspace = False
-pipeline_notebook_name = 'pipeline_notebook'
-pipeline_name = 'data_pipeline'
-lakehouse_name = 'lakehouse_' + solutionname
+pipeline_notebook_name = "pipeline_notebook"
+pipeline_name = "data_pipeline"
+lakehouse_name = "lakehouse_" + solutionname
-print("workspace id: " ,workspaceId)
+print("workspace id: ", workspaceId)
if create_workspace == True:
- workspace_name = 'workspace_' + solutionname
+ workspace_name = "workspace_" + solutionname
- # create workspace
- ws_url = 'https://api.fabric.microsoft.com/v1/workspaces'
+ # create workspace
+ ws_url = "https://api.fabric.microsoft.com/v1/workspaces"
- ws_data = {
- "displayName": workspace_name
- }
- ws_res = requests.post(ws_url, headers=fabric_headers, json=ws_data)
- ws_details = ws_res.json()
- # print(ws_details['id'])
- workspaceId = ws_details['id']
+ ws_data = {"displayName": workspace_name}
+ ws_res = requests.post(ws_url, headers=fabric_headers, json=ws_data)
+ ws_details = ws_res.json()
+ # print(ws_details['id'])
+ workspaceId = ws_details["id"]
fabric_base_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/"
-fabric_items_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/items/"
+fabric_items_url = (
+ f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/items/"
+)
fabric_create_workspace_url = f"https://api.fabric.microsoft.com/v1/workspaces"
-#get workspace name
+# get workspace name
ws_res = requests.get(fabric_base_url, headers=fabric_headers)
-workspace_name = ws_res.json()['displayName']
+workspace_name = ws_res.json()["displayName"]
-#create lakehouse
-lakehouse_data = {
- "displayName": lakehouse_name,
- "type": "Lakehouse"
-}
-lakehouse_res = requests.post(fabric_items_url, headers=fabric_headers, json=lakehouse_data)
+# create lakehouse
+lakehouse_data = {"displayName": lakehouse_name, "type": "Lakehouse"}
+lakehouse_res = requests.post(
+ fabric_items_url, headers=fabric_headers, json=lakehouse_data
+)
# print("lakehouse name: ", lakehouse_name)
# copy local files to lakehouse
-from azure.storage.filedatalake import (
- DataLakeServiceClient
-)
+from azure.storage.filedatalake import DataLakeServiceClient
-account_name = "onelake" #always onelake
+account_name = "onelake" # always onelake
data_path = f"{lakehouse_name}.Lakehouse/Files"
folder_path = "/"
account_url = f"https://{account_name}.dfs.fabric.microsoft.com"
service_client = DataLakeServiceClient(account_url, credential=credential)
-#Create a file system client for the workspace
+# Create a file system client for the workspace
file_system_client = service_client.get_file_system_client(workspace_name)
directory_client = file_system_client.get_directory_client(f"{data_path}/{folder_path}")
-local_path = 'data/**/*'
+local_path = "data/**/*"
file_names = [f for f in iglob(local_path, recursive=True) if os.path.isfile(f)]
for file_name in file_names:
- file_client = directory_client.get_file_client(file_name)
- with open(file=file_name, mode="rb") as data:
- file_client.upload_data(data, overwrite=True)
+ file_client = directory_client.get_file_client(file_name)
+ with open(file=file_name, mode="rb") as data:
+ file_client.upload_data(data, overwrite=True)
# #get environments
# try:
@@ -93,28 +90,34 @@
# except:
# env_res_id = ''
-#create notebook items
-notebook_names =['pipeline_notebook','01_process_data','02_create_calendar_data']
+# create notebook items
+notebook_names = ["pipeline_notebook", "01_process_data", "02_create_calendar_data"]
# notebook_names =['process_data_new']
# add sleep timer
time.sleep(120) # 1 minute
for notebook_name in notebook_names:
- with open('notebooks/'+ notebook_name +'.ipynb', 'r') as f:
+ with open("notebooks/" + notebook_name + ".ipynb", "r") as f:
notebook_json = json.load(f)
print("lakehouse_res")
print(lakehouse_res)
print(lakehouse_res.json())
-
+
try:
- notebook_json['metadata']['dependencies']['lakehouse']['default_lakehouse'] = lakehouse_res.json()['id']
- notebook_json['metadata']['dependencies']['lakehouse']['default_lakehouse_name'] = lakehouse_res.json()['displayName']
- notebook_json['metadata']['dependencies']['lakehouse']['default_lakehouse_workspace_id'] = lakehouse_res.json()['workspaceId']
+ notebook_json["metadata"]["dependencies"]["lakehouse"][
+ "default_lakehouse"
+ ] = lakehouse_res.json()["id"]
+ notebook_json["metadata"]["dependencies"]["lakehouse"][
+ "default_lakehouse_name"
+ ] = lakehouse_res.json()["displayName"]
+ notebook_json["metadata"]["dependencies"]["lakehouse"][
+ "default_lakehouse_workspace_id"
+ ] = lakehouse_res.json()["workspaceId"]
except:
pass
-
+
# if env_res_id != '':
# try:
# notebook_json['metadata']['dependencies']['environment']['environmentId'] = env_res_id
@@ -122,41 +125,44 @@
# except:
# pass
-    notebook_base64 = base64.b64encode(json.dumps(notebook_json).encode('utf-8'))
-
+    notebook_base64 = base64.b64encode(json.dumps(notebook_json).encode("utf-8"))
notebook_data = {
- "displayName":notebook_name,
- "type":"Notebook",
- "definition" : {
+ "displayName": notebook_name,
+ "type": "Notebook",
+ "definition": {
"format": "ipynb",
"parts": [
{
"path": "notebook-content.ipynb",
- "payload": notebook_base64.decode('utf-8'),
- "payloadType": "InlineBase64"
+ "payload": notebook_base64.decode("utf-8"),
+ "payloadType": "InlineBase64",
}
- ]
- }
+ ],
+ },
}
-
- fabric_response = requests.post(fabric_items_url, headers=fabric_headers, json=notebook_data)
- #print(fabric_response.json())
+
+ fabric_response = requests.post(
+ fabric_items_url, headers=fabric_headers, json=notebook_data
+ )
+ # print(fabric_response.json())
time.sleep(120)
# get wrapper notebook id
-fabric_notebooks_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/notebooks"
+fabric_notebooks_url = (
+ f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/notebooks"
+)
notebooks_res = requests.get(fabric_notebooks_url, headers=fabric_headers)
notebooks_res.json()
-pipeline_notebook_id = ''
+pipeline_notebook_id = ""
print("notebook_res.json.values: ", notebooks_res.json().values())
for n in notebooks_res.json().values():
for notebook in n:
- print("notebook displayname", notebook['displayName'])
- if notebook['displayName'] == pipeline_notebook_name:
- pipeline_notebook_id = notebook['id']
+ print("notebook displayname", notebook["displayName"])
+ if notebook["displayName"] == pipeline_notebook_name:
+ pipeline_notebook_id = notebook["id"]
break
print("pipeline_notebook_id: ", pipeline_notebook_id)
@@ -175,37 +181,42 @@
"retry": 0,
"retryIntervalInSeconds": 30,
"secureOutput": "false",
- "secureInput": "false"
+ "secureInput": "false",
},
"typeProperties": {
"notebookId": pipeline_notebook_id,
- "workspaceId": workspaceId
- }
+ "workspaceId": workspaceId,
+ },
}
]
- }
+ },
}
-pipeline_base64 = base64.b64encode(json.dumps(pipeline_json).encode('utf-8'))
+pipeline_base64 = base64.b64encode(json.dumps(pipeline_json).encode("utf-8"))
pipeline_data = {
- "displayName":pipeline_name,
- "type":"DataPipeline",
- "definition" : {
- # "format": "json",
- "parts": [
- {
- "path": "pipeline-content.json",
- "payload": pipeline_base64.decode('utf-8'),
- "payloadType": "InlineBase64"
- }
- ]
- }
- }
+ "displayName": pipeline_name,
+ "type": "DataPipeline",
+ "definition": {
+ # "format": "json",
+ "parts": [
+ {
+ "path": "pipeline-content.json",
+ "payload": pipeline_base64.decode("utf-8"),
+ "payloadType": "InlineBase64",
+ }
+ ]
+ },
+}
-pipeline_response = requests.post(fabric_items_url, headers=fabric_headers, json=pipeline_data)
+pipeline_response = requests.post(
+ fabric_items_url, headers=fabric_headers, json=pipeline_data
+)
pipeline_response.json()
# run the pipeline once
-job_url = fabric_base_url + f"items/{pipeline_response.json()['id']}/jobs/instances?jobType=Pipeline"
-job_response = requests.post(job_url, headers=fabric_headers)
\ No newline at end of file
+job_url = (
+ fabric_base_url
+ + f"items/{pipeline_response.json()['id']}/jobs/instances?jobType=Pipeline"
+)
+job_response = requests.post(job_url, headers=fabric_headers)
diff --git a/ClientAdvisor/Deployment/scripts/index_scripts/create_search_index.py b/ClientAdvisor/Deployment/scripts/index_scripts/create_search_index.py
index af89d88c6..3a0cd2fac 100644
--- a/ClientAdvisor/Deployment/scripts/index_scripts/create_search_index.py
+++ b/ClientAdvisor/Deployment/scripts/index_scripts/create_search_index.py
@@ -1,147 +1,172 @@
-#Get Azure Key Vault Client
-key_vault_name = 'kv_to-be-replaced' #'nc6262-kv-2fpeafsylfd2e'
+# Get Azure Key Vault Client
+key_vault_name = "kv_to-be-replaced" #'nc6262-kv-2fpeafsylfd2e'
index_name = "transcripts_index"
file_system_client_name = "data"
-directory = 'clienttranscripts/meeting_transcripts'
-csv_file_name = 'clienttranscripts/meeting_transcripts_metadata/transcripts_metadata.csv'
+directory = "clienttranscripts/meeting_transcripts"
+csv_file_name = (
+ "clienttranscripts/meeting_transcripts_metadata/transcripts_metadata.csv"
+)
+
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
def get_secrets_from_kv(kv_name, secret_name):
-
- # Set the name of the Azure Key Vault
- key_vault_name = kv_name
- credential = DefaultAzureCredential()
- # Create a secret client object using the credential and Key Vault name
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
-
- # Retrieve the secret value
- return(secret_client.get_secret(secret_name).value)
+ # Set the name of the Azure Key Vault
+ key_vault_name = kv_name
+ credential = DefaultAzureCredential()
+
+ # Create a secret client object using the credential and Key Vault name
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ )
-search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
-search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
+ # Retrieve the secret value
+ return secret_client.get_secret(secret_name).value
+
+
+search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
+search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
# openai_api_type = get_secrets_from_kv(key_vault_name,"OPENAI-API-TYPE")
-openai_api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai_api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai_api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai_api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
# Create the search index
-from azure.core.credentials import AzureKeyCredential
+from azure.core.credentials import AzureKeyCredential
+
search_credential = AzureKeyCredential(search_key)
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
- SimpleField,
- SearchFieldDataType,
+ HnswAlgorithmConfiguration,
SearchableField,
SearchField,
- VectorSearch,
- HnswAlgorithmConfiguration,
- VectorSearchProfile,
+ SearchFieldDataType,
+ SearchIndex,
SemanticConfiguration,
- SemanticPrioritizedFields,
SemanticField,
+ SemanticPrioritizedFields,
SemanticSearch,
- SearchIndex
+ SimpleField,
+ VectorSearch,
+ VectorSearchProfile,
)
# Create a search index
index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
fields = [
- SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
+ SimpleField(
+ name="id",
+ type=SearchFieldDataType.String,
+ key=True,
+ sortable=True,
+ filterable=True,
+ facetable=True,
+ ),
SearchableField(name="chunk_id", type=SearchFieldDataType.String),
SearchableField(name="content", type=SearchFieldDataType.String),
SearchableField(name="sourceurl", type=SearchFieldDataType.String),
- SearchableField(name="client_id", type=SearchFieldDataType.String,filterable=True),
- SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
+ SearchableField(name="client_id", type=SearchFieldDataType.String, filterable=True),
+ SearchField(
+ name="contentVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
]
-# Configure the vector search configuration
+# Configure the vector search configuration
vector_search = VectorSearch(
- algorithms=[
- HnswAlgorithmConfiguration(
- name="myHnsw"
- )
- ],
+ algorithms=[HnswAlgorithmConfiguration(name="myHnsw")],
profiles=[
VectorSearchProfile(
name="myHnswProfile",
algorithm_configuration_name="myHnsw",
)
- ]
+ ],
)
semantic_config = SemanticConfiguration(
name="my-semantic-config",
prioritized_fields=SemanticPrioritizedFields(
keywords_fields=[SemanticField(field_name="client_id")],
- content_fields=[SemanticField(field_name="content")]
- )
+ content_fields=[SemanticField(field_name="content")],
+ ),
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])
# Create the search index with the semantic settings
-index = SearchIndex(name=index_name, fields=fields,
- vector_search=vector_search, semantic_search=semantic_search)
+index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=semantic_search,
+)
result = index_client.create_or_update_index(index)
-print(f' {result.name} created')
+print(f" {result.name} created")
from openai import AzureOpenAI
+
# Function: Get Embeddings
-def get_embeddings(text: str,openai_api_base,openai_api_version,openai_api_key):
+def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
model_id = "text-embedding-ada-002"
client = AzureOpenAI(
api_version=openai_api_version,
azure_endpoint=openai_api_base,
- api_key = openai_api_key
+ api_key=openai_api_key,
)
-
+
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
return embedding
+
import re
+
def clean_spaces_with_regex(text):
# Use a regular expression to replace multiple spaces with a single space
- cleaned_text = re.sub(r'\s+', ' ', text)
+ cleaned_text = re.sub(r"\s+", " ", text)
# Use a regular expression to replace consecutive dots with a single dot
- cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+ cleaned_text = re.sub(r"\.{2,}", ".", cleaned_text)
return cleaned_text
+
def chunk_data(text):
- tokens_per_chunk = 1024 #500
+ tokens_per_chunk = 1024 # 500
text = clean_spaces_with_regex(text)
SENTENCE_ENDINGS = [".", "!", "?"]
- WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
+ WORDS_BREAKS = ["\n", "\t", "}", "{", "]", "[", ")", "(", " ", ":", ";", ","]
- sentences = text.split('. ') # Split text into sentences
+ sentences = text.split(". ") # Split text into sentences
chunks = []
- current_chunk = ''
+ current_chunk = ""
current_chunk_token_count = 0
-
+
# Iterate through each sentence
for sentence in sentences:
# Split sentence into tokens
tokens = sentence.split()
-
+
# Check if adding the current sentence exceeds tokens_per_chunk
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
# Add the sentence to the current chunk
if current_chunk:
- current_chunk += '. ' + sentence
+ current_chunk += ". " + sentence
else:
current_chunk += sentence
current_chunk_token_count += len(tokens)
@@ -150,21 +175,28 @@ def chunk_data(text):
chunks.append(current_chunk)
current_chunk = sentence
current_chunk_token_count = len(tokens)
-
+
# Add the last chunk
if current_chunk:
chunks.append(current_chunk)
-
+
return chunks
-#add documents to the index
-import json
+# add documents to the index
+
import base64
+import json
+import os
import time
+
import pandas as pd
from azure.search.documents import SearchClient
-import os
+from azure.storage.filedatalake import (
+ DataLakeDirectoryClient,
+ DataLakeServiceClient,
+ FileSystemClient,
+)
# foldername = 'clienttranscripts'
# path_name = f'Data/{foldername}/meeting_transcripts'
@@ -172,20 +204,17 @@ def chunk_data(text):
# paths = os.listdir(path_name)
-from azure.storage.filedatalake import (
- DataLakeServiceClient,
- DataLakeDirectoryClient,
- FileSystemClient
-)
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
-file_system_client = service_client.get_file_system_client(file_system_client_name)
+file_system_client = service_client.get_file_system_client(file_system_client_name)
directory_name = directory
paths = file_system_client.get_paths(path=directory_name)
print(paths)
@@ -200,12 +229,13 @@ def chunk_data(text):
# # display(df_metadata)
import pandas as pd
+
# Read the CSV file into a Pandas DataFrame
file_path = csv_file_name
print(file_path)
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df_metadata = pd.read_csv(csv_file, encoding='utf-8')
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")
docs = []
counter = 0
@@ -216,49 +246,59 @@ def chunk_data(text):
file_client = file_system_client.get_file_client(path.name)
data_file = file_client.download_file()
data = json.load(data_file)
- text = data['Content']
+ text = data["Content"]
- filename = path.name.split('/')[-1]
- document_id = filename.replace('.json','').replace('convo_','')
+ filename = path.name.split("/")[-1]
+ document_id = filename.replace(".json", "").replace("convo_", "")
# print(document_id)
- df_file_metadata = df_metadata[df_metadata['ConversationId']==str(document_id)].iloc[0]
-
+ df_file_metadata = df_metadata[
+ df_metadata["ConversationId"] == str(document_id)
+ ].iloc[0]
+
chunks = chunk_data(text)
chunk_num = 0
for chunk in chunks:
chunk_num += 1
d = {
- "chunk_id" : document_id + '_' + str(chunk_num).zfill(2),
- "client_id": str(df_file_metadata['ClientId']),
- "content": 'ClientId is ' + str(df_file_metadata['ClientId']) + ' . ' + chunk,
- }
+ "chunk_id": document_id + "_" + str(chunk_num).zfill(2),
+ "client_id": str(df_file_metadata["ClientId"]),
+ "content": "ClientId is "
+ + str(df_file_metadata["ClientId"])
+ + " . "
+ + chunk,
+ }
counter += 1
try:
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
docs.append(
{
- "id": base64.urlsafe_b64encode(bytes(d["chunk_id"], encoding='utf-8')).decode('utf-8'),
- "chunk_id": d["chunk_id"],
- "client_id": d["client_id"],
- "content": d["content"],
- "sourceurl": path.name.split('/')[-1],
- "contentVector": v_contentVector
+ "id": base64.urlsafe_b64encode(
+ bytes(d["chunk_id"], encoding="utf-8")
+ ).decode("utf-8"),
+ "chunk_id": d["chunk_id"],
+ "client_id": d["client_id"],
+ "content": d["content"],
+ "sourceurl": path.name.split("/")[-1],
+ "contentVector": v_contentVector,
}
)
-
+
if counter % 10 == 0:
result = search_client.upload_documents(documents=docs)
docs = []
- print(f' {str(counter)} uploaded')
-
+ print(f" {str(counter)} uploaded")
+
time.sleep(4)
-#upload the last batch
+# upload the last batch
if docs != []:
- search_client.upload_documents(documents=docs)
\ No newline at end of file
+ search_client.upload_documents(documents=docs)
diff --git a/ClientAdvisor/Deployment/scripts/index_scripts/create_sql_tables.py b/ClientAdvisor/Deployment/scripts/index_scripts/create_sql_tables.py
index cb43e8e8b..e84e18758 100644
--- a/ClientAdvisor/Deployment/scripts/index_scripts/create_sql_tables.py
+++ b/ClientAdvisor/Deployment/scripts/index_scripts/create_sql_tables.py
@@ -1,47 +1,51 @@
-key_vault_name = 'kv_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
-import pandas as pd
-import pymssql
import os
from datetime import datetime
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
+import pandas as pd
+import pymssql
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
- key_vault_name = kv_name # Set the name of the Azure Key Vault
+ key_vault_name = kv_name # Set the name of the Azure Key Vault
credential = DefaultAzureCredential()
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential) # Create a secret client object using the credential and Key Vault name
- return(secret_client.get_secret(secret_name).value) # Retrieve the secret value
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ ) # Create a secret client object using the credential and Key Vault name
+ return secret_client.get_secret(secret_name).value # Retrieve the secret value
+
-server = get_secrets_from_kv(key_vault_name,"SQLDB-SERVER")
-database = get_secrets_from_kv(key_vault_name,"SQLDB-DATABASE")
-username = get_secrets_from_kv(key_vault_name,"SQLDB-USERNAME")
-password = get_secrets_from_kv(key_vault_name,"SQLDB-PASSWORD")
+server = get_secrets_from_kv(key_vault_name, "SQLDB-SERVER")
+database = get_secrets_from_kv(key_vault_name, "SQLDB-DATABASE")
+username = get_secrets_from_kv(key_vault_name, "SQLDB-USERNAME")
+password = get_secrets_from_kv(key_vault_name, "SQLDB-PASSWORD")
conn = pymssql.connect(server, username, password, database)
cursor = conn.cursor()
-from azure.storage.filedatalake import (
- DataLakeServiceClient
-)
+from azure.storage.filedatalake import DataLakeServiceClient
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
file_system_client_name = "data"
-directory = 'clientdata'
+directory = "clientdata"
-file_system_client = service_client.get_file_system_client(file_system_client_name)
+file_system_client = service_client.get_file_system_client(file_system_client_name)
directory_name = directory
cursor = conn.cursor()
-cursor.execute('DROP TABLE IF EXISTS Clients')
+cursor.execute("DROP TABLE IF EXISTS Clients")
conn.commit()
create_client_sql = """CREATE TABLE Clients (
@@ -56,13 +60,23 @@ def get_secrets_from_kv(kv_name, secret_name):
conn.commit()
# Read the CSV file into a Pandas DataFrame
-file_path = directory + '/Clients.csv'
+file_path = directory + "/Clients.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO Clients (ClientId,Client, Email, Occupation, MaritalStatus, Dependents) VALUES (%s,%s,%s,%s,%s,%s)", (item.ClientId, item.Client, item.Email, item.Occupation, item.MaritalStatus, item.Dependents))
+ cursor.execute(
+ f"INSERT INTO Clients (ClientId,Client, Email, Occupation, MaritalStatus, Dependents) VALUES (%s,%s,%s,%s,%s,%s)",
+ (
+ item.ClientId,
+ item.Client,
+ item.Email,
+ item.Occupation,
+ item.MaritalStatus,
+ item.Dependents,
+ ),
+ )
conn.commit()
@@ -90,15 +104,15 @@ def get_secrets_from_kv(kv_name, secret_name):
# csv_file = file_client.download_file()
# df = pd.read_csv(csv_file, encoding='utf-8')
-# for index, item in df.iterrows():
+# for index, item in df.iterrows():
# cursor.execute(f"INSERT INTO ClientInvestmentPortfolio (ClientId, AssetDate, AssetType, Investment, ROI, RevenueWithoutStrategy) VALUES (%s,%s, %s,%s, %s, %s)", (item.ClientId, item.AssetDate, item.AssetType, item.Investment, item.ROI, item.RevenueWithoutStrategy))
-
+
# conn.commit()
from decimal import Decimal
-cursor.execute('DROP TABLE IF EXISTS Assets')
+cursor.execute("DROP TABLE IF EXISTS Assets")
conn.commit()
create_assets_sql = """CREATE TABLE Assets (
@@ -113,34 +127,44 @@ def get_secrets_from_kv(kv_name, secret_name):
cursor.execute(create_assets_sql)
conn.commit()
-file_path = directory + '/Assets.csv'
+file_path = directory + "/Assets.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
# # to adjust the dates to current date
-df['AssetDate'] = pd.to_datetime(df['AssetDate'])
+df["AssetDate"] = pd.to_datetime(df["AssetDate"])
today = datetime.today()
-days_difference = (today - max(df['AssetDate'])).days - 30
-months_difference = int(days_difference/30)
+days_difference = (today - max(df["AssetDate"])).days - 30
+months_difference = int(days_difference / 30)
# print(months_difference)
# df['AssetDate'] = df['AssetDate'] + pd.Timedelta(days=days_difference)
-df['AssetDate'] = df['AssetDate'] + pd.DateOffset(months=months_difference)
+df["AssetDate"] = df["AssetDate"] + pd.DateOffset(months=months_difference)
-df['AssetDate'] = pd.to_datetime(df['AssetDate'], format='%m/%d/%Y') # %Y-%m-%d')
-df['ClientId'] = df['ClientId'].astype(int)
-df['Investment'] = df['Investment'].astype(float)
-df['ROI'] = df['ROI'].astype(float)
-df['Revenue'] = df['Revenue'].astype(float)
+df["AssetDate"] = pd.to_datetime(df["AssetDate"], format="%m/%d/%Y") # %Y-%m-%d')
+df["ClientId"] = df["ClientId"].astype(int)
+df["Investment"] = df["Investment"].astype(float)
+df["ROI"] = df["ROI"].astype(float)
+df["Revenue"] = df["Revenue"].astype(float)
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO Assets (ClientId,AssetDate, Investment, ROI, Revenue, AssetType) VALUES (%s,%s,%s,%s,%s,%s)", (item.ClientId, item.AssetDate, item.Investment, item.ROI, item.Revenue, item.AssetType))
+ cursor.execute(
+ f"INSERT INTO Assets (ClientId,AssetDate, Investment, ROI, Revenue, AssetType) VALUES (%s,%s,%s,%s,%s,%s)",
+ (
+ item.ClientId,
+ item.AssetDate,
+ item.Investment,
+ item.ROI,
+ item.Revenue,
+ item.AssetType,
+ ),
+ )
conn.commit()
-#InvestmentGoals
-cursor.execute('DROP TABLE IF EXISTS InvestmentGoals')
+# InvestmentGoals
+cursor.execute("DROP TABLE IF EXISTS InvestmentGoals")
conn.commit()
create_ig_sql = """CREATE TABLE InvestmentGoals (
@@ -151,19 +175,22 @@ def get_secrets_from_kv(kv_name, secret_name):
cursor.execute(create_ig_sql)
conn.commit()
-file_path = directory + '/InvestmentGoals.csv'
+file_path = directory + "/InvestmentGoals.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
-df['ClientId'] = df['ClientId'].astype(int)
+df["ClientId"] = df["ClientId"].astype(int)
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO InvestmentGoals (ClientId,InvestmentGoal) VALUES (%s,%s)", (item.ClientId, item.InvestmentGoal))
+ cursor.execute(
+ f"INSERT INTO InvestmentGoals (ClientId,InvestmentGoal) VALUES (%s,%s)",
+ (item.ClientId, item.InvestmentGoal),
+ )
conn.commit()
-cursor.execute('DROP TABLE IF EXISTS InvestmentGoalsDetails')
+cursor.execute("DROP TABLE IF EXISTS InvestmentGoalsDetails")
conn.commit()
create_ig_sql = """CREATE TABLE InvestmentGoalsDetails (
@@ -176,19 +203,22 @@ def get_secrets_from_kv(kv_name, secret_name):
cursor.execute(create_ig_sql)
conn.commit()
-file_path = directory + '/InvestmentGoalsDetails.csv'
+file_path = directory + "/InvestmentGoalsDetails.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
-df['ClientId'] = df['ClientId'].astype(int)
+df["ClientId"] = df["ClientId"].astype(int)
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO InvestmentGoalsDetails (ClientId,InvestmentGoal, TargetAmount, Contribution) VALUES (%s,%s,%s,%s)", (item.ClientId, item.InvestmentGoal, item.TargetAmount, item.Contribution))
+ cursor.execute(
+ f"INSERT INTO InvestmentGoalsDetails (ClientId,InvestmentGoal, TargetAmount, Contribution) VALUES (%s,%s,%s,%s)",
+ (item.ClientId, item.InvestmentGoal, item.TargetAmount, item.Contribution),
+ )
conn.commit()
-#ClientSummaries
-cursor.execute('DROP TABLE IF EXISTS ClientSummaries')
+# ClientSummaries
+cursor.execute("DROP TABLE IF EXISTS ClientSummaries")
conn.commit()
create_cs_sql = """CREATE TABLE ClientSummaries (
@@ -199,19 +229,22 @@ def get_secrets_from_kv(kv_name, secret_name):
cursor.execute(create_cs_sql)
conn.commit()
-file_path = directory + '/ClientSummaries.csv'
+file_path = directory + "/ClientSummaries.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
-df['ClientId'] = df['ClientId'].astype(int)
+df["ClientId"] = df["ClientId"].astype(int)
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO ClientSummaries (ClientId,ClientSummary) VALUES (%s,%s)", (item.ClientId, item.ClientSummary))
+ cursor.execute(
+ f"INSERT INTO ClientSummaries (ClientId,ClientSummary) VALUES (%s,%s)",
+ (item.ClientId, item.ClientSummary),
+ )
conn.commit()
# Retirement
-cursor.execute('DROP TABLE IF EXISTS Retirement')
+cursor.execute("DROP TABLE IF EXISTS Retirement")
conn.commit()
create_cs_sql = """CREATE TABLE Retirement (
@@ -225,30 +258,39 @@ def get_secrets_from_kv(kv_name, secret_name):
conn.commit()
-file_path = directory + '/Retirement.csv'
+file_path = directory + "/Retirement.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
-df['ClientId'] = df['ClientId'].astype(int)
+df["ClientId"] = df["ClientId"].astype(int)
# to adjust the dates to current date
-df['StatusDate'] = pd.to_datetime(df['StatusDate'])
+df["StatusDate"] = pd.to_datetime(df["StatusDate"])
today = datetime.today()
-days_difference = (today - max(df['StatusDate'])).days - 30
-months_difference = int(days_difference/30)
-df['StatusDate'] = df['StatusDate'] + pd.DateOffset(months=months_difference)
-df['StatusDate'] = pd.to_datetime(df['StatusDate']).dt.date
+days_difference = (today - max(df["StatusDate"])).days - 30
+months_difference = int(days_difference / 30)
+df["StatusDate"] = df["StatusDate"] + pd.DateOffset(months=months_difference)
+df["StatusDate"] = pd.to_datetime(df["StatusDate"]).dt.date
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO Retirement (ClientId,StatusDate, RetirementGoalProgress, EducationGoalProgress) VALUES (%s,%s,%s,%s)", (item.ClientId, item.StatusDate, item.RetirementGoalProgress, item.EducationGoalProgress))
+ cursor.execute(
+ f"INSERT INTO Retirement (ClientId,StatusDate, RetirementGoalProgress, EducationGoalProgress) VALUES (%s,%s,%s,%s)",
+ (
+ item.ClientId,
+ item.StatusDate,
+ item.RetirementGoalProgress,
+ item.EducationGoalProgress,
+ ),
+ )
conn.commit()
import pandas as pd
+
cursor = conn.cursor()
-cursor.execute('DROP TABLE IF EXISTS ClientMeetings')
+cursor.execute("DROP TABLE IF EXISTS ClientMeetings")
conn.commit()
create_cs_sql = """CREATE TABLE ClientMeetings (
@@ -265,43 +307,65 @@ def get_secrets_from_kv(kv_name, secret_name):
conn.commit()
-file_path = directory + '/ClientMeetingsMetadata.csv'
+file_path = directory + "/ClientMeetingsMetadata.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
# to adjust the dates to current date
-df['StartTime'] = pd.to_datetime(df['StartTime'])
-df['EndTime'] = pd.to_datetime(df['EndTime'])
+df["StartTime"] = pd.to_datetime(df["StartTime"])
+df["EndTime"] = pd.to_datetime(df["EndTime"])
today = datetime.today()
-days_difference = (today - min(df['StartTime'])).days - 30
+days_difference = (today - min(df["StartTime"])).days - 30
days_difference
-df['StartTime'] = df['StartTime'] + pd.Timedelta(days=days_difference)
-df['EndTime'] = df['EndTime'] + pd.Timedelta(days=days_difference)
+df["StartTime"] = df["StartTime"] + pd.Timedelta(days=days_difference)
+df["EndTime"] = df["EndTime"] + pd.Timedelta(days=days_difference)
for index, item in df.iterrows():
-
- cursor.execute(f"INSERT INTO ClientMeetings (ClientId,ConversationId,Title,StartTime,EndTime,Advisor,ClientEmail) VALUES (%s,%s,%s,%s,%s,%s,%s)", (item.ClientId, item.ConversationId, item.Title, item.StartTime, item.EndTime, item.Advisor, item.ClientEmail))
+
+ cursor.execute(
+ f"INSERT INTO ClientMeetings (ClientId,ConversationId,Title,StartTime,EndTime,Advisor,ClientEmail) VALUES (%s,%s,%s,%s,%s,%s,%s)",
+ (
+ item.ClientId,
+ item.ConversationId,
+ item.Title,
+ item.StartTime,
+ item.EndTime,
+ item.Advisor,
+ item.ClientEmail,
+ ),
+ )
conn.commit()
-file_path = directory + '/ClientFutureMeetings.csv'
+file_path = directory + "/ClientFutureMeetings.csv"
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df = pd.read_csv(csv_file, encoding='utf-8')
+df = pd.read_csv(csv_file, encoding="utf-8")
# to adjust the dates to current date
-df['StartTime'] = pd.to_datetime(df['StartTime'])
-df['EndTime'] = pd.to_datetime(df['EndTime'])
+df["StartTime"] = pd.to_datetime(df["StartTime"])
+df["EndTime"] = pd.to_datetime(df["EndTime"])
today = datetime.today()
-days_difference = (today - min(df['StartTime'])).days + 1
-df['StartTime'] = df['StartTime'] + pd.Timedelta(days=days_difference)
-df['EndTime'] = df['EndTime'] + pd.Timedelta(days=days_difference)
+days_difference = (today - min(df["StartTime"])).days + 1
+df["StartTime"] = df["StartTime"] + pd.Timedelta(days=days_difference)
+df["EndTime"] = df["EndTime"] + pd.Timedelta(days=days_difference)
-df['ClientId'] = df['ClientId'].astype(int)
-df['ConversationId'] = ''
+df["ClientId"] = df["ClientId"].astype(int)
+df["ConversationId"] = ""
for index, item in df.iterrows():
- cursor.execute(f"INSERT INTO ClientMeetings (ClientId,ConversationId,Title,StartTime,EndTime,Advisor,ClientEmail) VALUES (%s,%s,%s,%s,%s,%s,%s)", (item.ClientId, item.ConversationId, item.Title, item.StartTime, item.EndTime, item.Advisor, item.ClientEmail))
-conn.commit()
\ No newline at end of file
+ cursor.execute(
+ f"INSERT INTO ClientMeetings (ClientId,ConversationId,Title,StartTime,EndTime,Advisor,ClientEmail) VALUES (%s,%s,%s,%s,%s,%s,%s)",
+ (
+ item.ClientId,
+ item.ConversationId,
+ item.Title,
+ item.StartTime,
+ item.EndTime,
+ item.Advisor,
+ item.ClientEmail,
+ ),
+ )
+conn.commit()
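
A side note on the seeding loops in this script: each CSV row is inserted with its own `cursor.execute` round trip, which is fine for demo-sized files but slow for anything larger. Below is a minimal sketch (not part of the diff) of the same ClientMeetings insert done as a single DB-API `executemany` batch, assuming the `conn` and `df` objects already prepared above:

```python
# Sketch only: batch variant of the ClientMeetings insert loop above.
# Assumes `conn` is the open pymssql connection and `df` is the dataframe
# already prepared by the script (dates shifted, ConversationId filled in).
insert_sql = (
    "INSERT INTO ClientMeetings "
    "(ClientId, ConversationId, Title, StartTime, EndTime, Advisor, ClientEmail) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s)"
)
columns = ["ClientId", "ConversationId", "Title", "StartTime", "EndTime", "Advisor", "ClientEmail"]
rows = list(df[columns].itertuples(index=False, name=None))

cursor = conn.cursor()
cursor.executemany(insert_sql, rows)  # one batch instead of one round trip per row
conn.commit()
```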
diff --git a/ClientAdvisor/Deployment/scripts/index_scripts/create_update_sql_dates.py b/ClientAdvisor/Deployment/scripts/index_scripts/create_update_sql_dates.py
index d0e8c725c..a9ccdd1bc 100644
--- a/ClientAdvisor/Deployment/scripts/index_scripts/create_update_sql_dates.py
+++ b/ClientAdvisor/Deployment/scripts/index_scripts/create_update_sql_dates.py
@@ -1,48 +1,52 @@
-key_vault_name = 'kv_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
-import pandas as pd
-import pymssql
import os
from datetime import datetime
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
+import pandas as pd
+import pymssql
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
- key_vault_name = kv_name # Set the name of the Azure Key Vault
+ key_vault_name = kv_name # Set the name of the Azure Key Vault
credential = DefaultAzureCredential()
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential) # Create a secret client object using the credential and Key Vault name
- return(secret_client.get_secret(secret_name).value) # Retrieve the secret value
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ ) # Create a secret client object using the credential and Key Vault name
+ return secret_client.get_secret(secret_name).value # Retrieve the secret value
-server = get_secrets_from_kv(key_vault_name,"SQLDB-SERVER")
-database = get_secrets_from_kv(key_vault_name,"SQLDB-DATABASE")
-username = get_secrets_from_kv(key_vault_name,"SQLDB-USERNAME")
-password = get_secrets_from_kv(key_vault_name,"SQLDB-PASSWORD")
+
+server = get_secrets_from_kv(key_vault_name, "SQLDB-SERVER")
+database = get_secrets_from_kv(key_vault_name, "SQLDB-DATABASE")
+username = get_secrets_from_kv(key_vault_name, "SQLDB-USERNAME")
+password = get_secrets_from_kv(key_vault_name, "SQLDB-PASSWORD")
conn = pymssql.connect(server, username, password, database)
cursor = conn.cursor()
-from azure.storage.filedatalake import (
- DataLakeServiceClient
-)
+from azure.storage.filedatalake import DataLakeServiceClient
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
file_system_client_name = "data"
-directory = 'clientdata'
+directory = "clientdata"
-file_system_client = service_client.get_file_system_client(file_system_client_name)
+file_system_client = service_client.get_file_system_client(file_system_client_name)
directory_name = directory
cursor = conn.cursor()
-cursor.execute('DROP TABLE IF EXISTS Clients')
+cursor.execute("DROP TABLE IF EXISTS Clients")
conn.commit()
create_client_sql = """CREATE TABLE Clients (
@@ -54,4 +58,4 @@ def get_secrets_from_kv(kv_name, secret_name):
Dependents int
);"""
cursor.execute(create_client_sql)
-conn.commit()
\ No newline at end of file
+conn.commit()
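
For readers skimming the diff, the date arithmetic used by these seeding scripts is easy to misread: it measures how far the newest demo date sits from today, backs off by roughly a month, and shifts every row by that amount so the sample data always looks recent. A minimal standalone sketch of that shift, using a hypothetical two-row dataframe rather than the real CSVs:

```python
# Sketch only: reproduces the StatusDate-shifting pattern from the seeding
# scripts on a tiny hypothetical dataframe (not the actual demo data).
from datetime import datetime

import pandas as pd

df = pd.DataFrame({"StatusDate": pd.to_datetime(["2023-01-15", "2023-06-15"])})

today = datetime.today()
# Days between the newest demo date and a point ~30 days before today,
# converted to whole months.
days_difference = (today - df["StatusDate"].max()).days - 30
months_difference = int(days_difference / 30)
# Shift every row forward by that many months and keep only the date part.
df["StatusDate"] = (df["StatusDate"] + pd.DateOffset(months=months_difference)).dt.date
print(df)
```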
diff --git a/ResearchAssistant/App/.flake8 b/ResearchAssistant/App/.flake8
index c462975ac..bc2f0943d 100644
--- a/ResearchAssistant/App/.flake8
+++ b/ResearchAssistant/App/.flake8
@@ -1,4 +1,4 @@
[flake8]
-max-line-length = 88
-extend-ignore = E501, E203
-exclude = .venv, frontend,
\ No newline at end of file
+max-line-length = 160
+extend-ignore = E203, W503, E501
+exclude = .venv, frontend
\ No newline at end of file
diff --git a/ResearchAssistant/App/.pylintrc b/ResearchAssistant/App/.pylintrc
new file mode 100644
index 000000000..a35c43970
--- /dev/null
+++ b/ResearchAssistant/App/.pylintrc
@@ -0,0 +1,39 @@
+[MASTER]
+ignore=tests ; Ignore the tests folder globally.
+
+[MESSAGES CONTROL]
+disable=
+ invalid-name, # C0103: Ignore naming style errors
+ line-too-long, # C0301: Ignore long lines
+ missing-function-docstring, # C0116: Ignore missing function docstrings
+ missing-class-docstring, # C0115: Ignore missing class docstrings
+ missing-module-docstring, # C0114: Ignore missing module docstrings
+ redefined-outer-name, # W0621: Ignore redefined variables warnings
+ broad-exception-raised, # W0719: Ignore broad exception raised warnings
+ broad-exception-caught, # W0718: Ignore broad exception caught warnings
+ too-many-arguments, # R0913: Ignore too many arguments
+ too-many-locals, # R0914: Ignore too many local variables
+ too-many-return-statements, # R0911: Ignore too many return statements
+ too-many-statements, # R0915: Ignore too many statements in a function
+ too-many-branches, # R0912: Ignore too many branches
+ unused-argument, # W0613: Ignore unused arguments
+ unspecified-encoding, # W1514: Ignore unspecified encoding in open()
+ logging-fstring-interpolation, # W1203: Ignore lazy f-string interpolation
+ missing-timeout, # W3101: Ignore missing timeout in requests.get
+ no-else-return, # R1705: Ignore unnecessary 'else' after return
+ redefined-builtin, # W0622: Ignore redefining built-ins
+ global-statement, # W0603: Ignore global statement usage
+ no-name-in-module, # E0611: Ignore unresolved module names
+ no-member, # E1101: Ignore module has no 'member'
+ pointless-string-statement, # W0105: Ignore pointless string statements
+ unnecessary-comprehension, # R1721: Ignore unnecessary comprehensions
+ simplifiable-if-expression, # R1719: Ignore simplifiable if expressions
+ dangerous-default-value, # W0102: Ignore mutable default arguments
+ consider-using-with # R1732: Ignore suggestions to use 'with' for files and resources
+
+[TYPECHECK]
+generated-members=get_bearer_token_provider
+
+[FORMAT]
+max-module-lines=1700 # Allow large modules up to 1700 lines
+max-line-length=160 # Allow lines up to 160 characters
\ No newline at end of file
diff --git a/ResearchAssistant/Deployment/scripts/aihub_scripts/create_ai_hub.py b/ResearchAssistant/Deployment/scripts/aihub_scripts/create_ai_hub.py
index cf0b8c3a6..5c78fc98d 100644
--- a/ResearchAssistant/Deployment/scripts/aihub_scripts/create_ai_hub.py
+++ b/ResearchAssistant/Deployment/scripts/aihub_scripts/create_ai_hub.py
@@ -1,16 +1,17 @@
# Get Azure Key Vault Client
-key_vault_name = 'kv_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
from azure.ai.ml import MLClient
from azure.ai.ml.entities import (
- Hub,
- Project,
ApiKeyConfiguration,
AzureAISearchConnection,
AzureOpenAIConnection,
+ Hub,
+ Project,
)
-from azure.keyvault.secrets import SecretClient
from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
# Set the name of the Azure Key Vault
@@ -27,15 +28,16 @@ def get_secrets_from_kv(kv_name, secret_name):
# Retrieve the secret value
return secret_client.get_secret(secret_name).value
+
# Azure configuration
-key_vault_name = 'kv_to-be-replaced'
-subscription_id = 'subscription_to-be-replaced'
-resource_group_name = 'rg_to-be-replaced'
-aihub_name = 'ai_hub_' + 'solutionname_to-be-replaced'
-project_name = 'ai_project_' + 'solutionname_to-be-replaced'
-deployment_name = 'draftsinference-' + 'solutionname_to-be-replaced'
-solutionLocation = 'solutionlocation_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
+subscription_id = "subscription_to-be-replaced"
+resource_group_name = "rg_to-be-replaced"
+aihub_name = "ai_hub_" + "solutionname_to-be-replaced"
+project_name = "ai_project_" + "solutionname_to-be-replaced"
+deployment_name = "draftsinference-" + "solutionname_to-be-replaced"
+solutionLocation = "solutionlocation_to-be-replaced"
# Open AI Details
open_ai_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
@@ -90,7 +92,7 @@ def get_secrets_from_kv(kv_name, secret_name):
api_key=open_ai_key,
api_version=openai_api_version,
azure_endpoint=f"https://{open_ai_res_name}.openai.azure.com/",
- open_ai_resource_id=f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.CognitiveServices/accounts/{open_ai_res_name}"
+ open_ai_resource_id=f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.CognitiveServices/accounts/{open_ai_res_name}",
)
ml_client.connections.create_or_update(open_ai_connection)
@@ -104,7 +106,9 @@ def get_secrets_from_kv(kv_name, secret_name):
credentials=ApiKeyConfiguration(key=ai_search_key),
)
-aisearch_connection.tags["ResourceId"] = f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Search/searchServices/{ai_search_res_name}"
+aisearch_connection.tags["ResourceId"] = (
+ f"/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Search/searchServices/{ai_search_res_name}"
+)
aisearch_connection.tags["ApiVersion"] = "2024-05-01-preview"
ml_client.connections.create_or_update(aisearch_connection)
diff --git a/ResearchAssistant/Deployment/scripts/fabric_scripts/create_fabric_items.py b/ResearchAssistant/Deployment/scripts/fabric_scripts/create_fabric_items.py
index 510cb6699..f77b4a182 100644
--- a/ResearchAssistant/Deployment/scripts/fabric_scripts/create_fabric_items.py
+++ b/ResearchAssistant/Deployment/scripts/fabric_scripts/create_fabric_items.py
@@ -1,61 +1,70 @@
-from azure.identity import DefaultAzureCredential
import base64
import json
-import requests
+
import pandas as pd
+import requests
+from azure.identity import AzureCliCredential, DefaultAzureCredential
# credential = DefaultAzureCredential()
-from azure.identity import AzureCliCredential
credential = AzureCliCredential()
-cred = credential.get_token('https://api.fabric.microsoft.com/.default')
+cred = credential.get_token("https://api.fabric.microsoft.com/.default")
token = cred.token
-key_vault_name = 'kv_to-be-replaced'
+key_vault_name = "kv_to-be-replaced"
workspaceId = "workspaceId_to-be-replaced"
fabric_headers = {"Authorization": "Bearer " + token.strip()}
fabric_base_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/"
-fabric_items_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/items/"
+fabric_items_url = (
+ f"https://api.fabric.microsoft.com/v1/workspaces/{workspaceId}/items/"
+)
-lakehouse_name = 'Lakehouse1'
+lakehouse_name = "Lakehouse1"
-lakehouse_data = {
- "displayName": lakehouse_name,
- "type": "Lakehouse"
-}
+lakehouse_data = {"displayName": lakehouse_name, "type": "Lakehouse"}
-lakehouse_res = requests.post(fabric_items_url, headers=fabric_headers, json=lakehouse_data)
+lakehouse_res = requests.post(
+ fabric_items_url, headers=fabric_headers, json=lakehouse_data
+)
-notebook_names =['create_articles_index','create_grants_index','create_drafts_index']
+notebook_names = ["create_articles_index", "create_grants_index", "create_drafts_index"]
for notebook_name in notebook_names:
- with open('notebooks/'+ notebook_name +'.ipynb', 'r') as f:
+ with open("notebooks/" + notebook_name + ".ipynb", "r") as f:
notebook_json = json.load(f)
- notebook_json['metadata']['trident']['lakehouse']['default_lakehouse'] = lakehouse_res.json()['id']
- notebook_json['metadata']['trident']['lakehouse']['default_lakehouse_name'] = lakehouse_res.json()['displayName']
- notebook_json['metadata']['trident']['lakehouse']['workspaceId'] = lakehouse_res.json()['workspaceId']
+ notebook_json["metadata"]["trident"]["lakehouse"][
+ "default_lakehouse"
+ ] = lakehouse_res.json()["id"]
+ notebook_json["metadata"]["trident"]["lakehouse"][
+ "default_lakehouse_name"
+ ] = lakehouse_res.json()["displayName"]
+ notebook_json["metadata"]["trident"]["lakehouse"][
+ "workspaceId"
+ ] = lakehouse_res.json()["workspaceId"]
- notebook_base64 = base64.b64encode(json.dumps(notebook_json).encode('utf-8'))
+ notebook_base64 = base64.b64encode(json.dumps(notebook_json).encode("utf-8"))
notebook_data = {
- "displayName":notebook_name,
- "type":"Notebook",
- "definition" : {
+ "displayName": notebook_name,
+ "type": "Notebook",
+ "definition": {
"format": "ipynb",
"parts": [
{
"path": "notebook-content.ipynb",
- "payload": notebook_base64.decode('utf-8'),
- "payloadType": "InlineBase64"
+ "payload": notebook_base64.decode("utf-8"),
+ "payloadType": "InlineBase64",
}
- ]
- }
+ ],
+ },
}
- fabric_response = requests.post(fabric_items_url, headers=fabric_headers, json=notebook_data)
- #print(fabric_response.json())
\ No newline at end of file
+ fabric_response = requests.post(
+ fabric_items_url, headers=fabric_headers, json=notebook_data
+ )
+ # print(fabric_response.json())
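
One thing this reformatting pass does not change is that the lakehouse and notebook POSTs above never check their responses (the diagnostic print stays commented out). As a hedged sketch only, assuming the documented long-running-operation behaviour of the Fabric REST API (202 Accepted plus a Location header to poll) and reusing the script's `fabric_headers` and `fabric_items_url`, a small helper that fails loudly could look like this:

```python
# Sketch only: a defensive wrapper around the item-creation POSTs above.
# Assumes the same fabric_headers/fabric_items_url variables; the 202/Location
# polling follows the documented long-running-operation pattern and may need
# adjusting to the payloads actually returned.
import time

import requests


def create_fabric_item(item_data, timeout=60):
    res = requests.post(fabric_items_url, headers=fabric_headers, json=item_data, timeout=timeout)
    res.raise_for_status()                 # surface 4xx/5xx instead of silently continuing
    if res.status_code == 202:             # long-running operation: poll the Location header
        poll_url = res.headers.get("Location")
        while poll_url:
            time.sleep(int(res.headers.get("Retry-After", 5)))
            res = requests.get(poll_url, headers=fabric_headers, timeout=timeout)
            res.raise_for_status()
            if res.status_code != 202:
                break
            poll_url = res.headers.get("Location", poll_url)
    return res
```

A notebook upload in the loop above would then become `create_fabric_item(notebook_data)`.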
diff --git a/ResearchAssistant/Deployment/scripts/index_scripts/create_articles_index.py b/ResearchAssistant/Deployment/scripts/index_scripts/create_articles_index.py
index 21b4624c5..cd0e16678 100644
--- a/ResearchAssistant/Deployment/scripts/index_scripts/create_articles_index.py
+++ b/ResearchAssistant/Deployment/scripts/index_scripts/create_articles_index.py
@@ -1,140 +1,124 @@
-#Get Azure Key Vault Client
-key_vault_name = 'kv_to-be-replaced'
+# Get Azure Key Vault Client
+key_vault_name = "kv_to-be-replaced"
import time
-
-time.sleep(120) # to fix the issue of the script
-#hardcoded values
+time.sleep(120) # to fix the issue of the script
+
+# hardcoded values
index_name = "articlesindex"
-drafts_index_name = 'draftsindex'
+drafts_index_name = "draftsindex"
file_system_client_name = "data"
-directory = 'demodata/pubmed_articles'
-csv_file_name = '/metadata/pubmed_articles.csv'
+directory = "demodata/pubmed_articles"
+csv_file_name = "/metadata/pubmed_articles.csv"
num_pages = 10
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
-
- # Set the name of the Azure Key Vault
- key_vault_name = kv_name
-
- # Create a credential object using the default Azure credentials
- credential = DefaultAzureCredential()
-
- # Create a secret client object using the credential and Key Vault name
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
-
- # Retrieve the secret value
- return(secret_client.get_secret(secret_name).value)
-
-
-#Utils
- # Import required libraries
-import os
-import json
-import openai
-import os
-from azure.core.credentials import AzureKeyCredential
-from azure.ai.textanalytics import TextAnalyticsClient
+ # Set the name of the Azure Key Vault
+ key_vault_name = kv_name
-from azure.core.credentials import AzureKeyCredential
-from azure.search.documents import SearchClient, SearchIndexingBufferedSender
-from azure.search.documents.indexes import SearchIndexClient
-from azure.search.documents.models import (
- QueryAnswerType,
- QueryCaptionType,
- QueryCaptionResult,
- QueryAnswerResult,
- SemanticErrorMode,
- SemanticErrorReason,
- SemanticSearchResultsType,
- QueryType,
- VectorizedQuery,
- VectorQuery,
- VectorFilterMode,
-)
-from azure.search.documents.indexes.models import (
+ # Create a credential object using the default Azure credentials
+ credential = DefaultAzureCredential()
+
+ # Create a secret client object using the credential and Key Vault name
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ )
+
+ # Retrieve the secret value
+ return secret_client.get_secret(secret_name).value
+
+
+# Utils
+# Import required libraries
+import json
+import os
+
+import openai
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents import SearchClient, SearchIndexingBufferedSender
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
ExhaustiveKnnAlgorithmConfiguration,
ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticPrioritizedFields,
- SemanticField,
- SearchField,
- SemanticSearch,
- VectorSearch,
HnswAlgorithmConfiguration,
- HnswParameters,
- VectorSearch,
- VectorSearchAlgorithmConfiguration,
- VectorSearchAlgorithmKind,
- VectorSearchProfile,
- SearchIndex,
+ HnswParameters,
+ SearchableField,
SearchField,
SearchFieldDataType,
+ SearchIndex,
+ SemanticConfiguration,
+ SemanticField,
+ SemanticPrioritizedFields,
+ SemanticSearch,
SimpleField,
- SearchableField,
- VectorSearch,
- ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticField,
- SearchField,
- VectorSearch,
- HnswParameters,
VectorSearch,
+ VectorSearchAlgorithmConfiguration,
VectorSearchAlgorithmKind,
VectorSearchAlgorithmMetric,
VectorSearchProfile,
-)
-search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
-search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
+)
+from azure.search.documents.models import (
+ QueryAnswerResult,
+ QueryAnswerType,
+ QueryCaptionResult,
+ QueryCaptionType,
+ QueryType,
+ SemanticErrorMode,
+ SemanticErrorReason,
+ SemanticSearchResultsType,
+ VectorFilterMode,
+ VectorizedQuery,
+ VectorQuery,
+)
-openai.api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai.api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai.api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
+search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
-openai_api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai_api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai_api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+openai.api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai.api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai.api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
-# Set up your Azure Text Analytics service and credentials
-COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name,"COG-SERVICES-NAME")
-COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name,"COG-SERVICES-ENDPOINT")
-COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name,"COG-SERVICES-KEY")
+openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai_api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
+
+# Set up your Azure Text Analytics service and credentials
+COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name, "COG-SERVICES-NAME")
+COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name, "COG-SERVICES-ENDPOINT")
+COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name, "COG-SERVICES-KEY")
-cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
+cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
+
+# Create a TextAnalyticsClient using your endpoint and credentials
+cog_services_client = TextAnalyticsClient(
+ endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential
+)
-# Create a TextAnalyticsClient using your endpoint and credentials
-cog_services_client = TextAnalyticsClient(endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential)
-def get_named_entities(cog_services_client,input_text):
- # Call the named entity recognition API to extract named entities from your text
- result = cog_services_client.recognize_entities(documents=[input_text])
-
- # return the named entities for each document
- # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
+def get_named_entities(cog_services_client, input_text):
+ # Call the named entity recognition API to extract named entities from your text
+ result = cog_services_client.recognize_entities(documents=[input_text])
- Person = []
+ # return the named entities for each document
+ # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
+
+ Person = []
Location = []
- Organization = []
+ Organization = []
DateTime = []
- URL = []
+ URL = []
Email = []
PersonType = []
Event = []
@@ -142,7 +126,7 @@ def get_named_entities(cog_services_client,input_text):
for idx, doc in enumerate(result):
if not doc.is_error:
- for entity in doc.entities:
+ for entity in doc.entities:
if entity.category == "DateTime":
DateTime.append(entity.text)
elif entity.category == "Person":
@@ -162,39 +146,53 @@ def get_named_entities(cog_services_client,input_text):
elif entity.category == "Quantity":
Quantity.append(entity.text)
- else:
- print(" Error: {}".format(doc.error.message))
- return(list(set(DateTime)),list(set(Person)),list(set(Location)),list(set(Organization)),list(set(URL)),list(set(Email)),list(set(PersonType)),list(set(Event)),list(set(Quantity)))
-
+ else:
+ print(" Error: {}".format(doc.error.message))
+ return (
+ list(set(DateTime)),
+ list(set(Person)),
+ list(set(Location)),
+ list(set(Organization)),
+ list(set(URL)),
+ list(set(Email)),
+ list(set(PersonType)),
+ list(set(Event)),
+ list(set(Quantity)),
+ )
+
from openai import AzureOpenAI
+
# Function: Get Embeddings
-def get_embeddings(text: str,openai_api_base,openai_api_version,openai_api_key):
+def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
model_id = "text-embedding-ada-002"
client = AzureOpenAI(
api_version=openai_api_version,
azure_endpoint=openai_api_base,
- api_key = openai_api_key
+ api_key=openai_api_key,
)
-
+
# embedding = openai.Embedding.create(input=text, deployment_id=model_id)["data"][0]["embedding"]
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
return embedding
+
# from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter
# import tiktoken
import re
+
def clean_spaces_with_regex(text):
# Use a regular expression to replace multiple spaces with a single space
- cleaned_text = re.sub(r'\s+', ' ', text)
+ cleaned_text = re.sub(r"\s+", " ", text)
# Use a regular expression to replace consecutive dots with a single dot
- cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+ cleaned_text = re.sub(r"\.{2,}", ".", cleaned_text)
return cleaned_text
+
# def estimate_tokens(text):
# GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")
# return(len(GPT2_TOKENIZER.encode(text)))
@@ -211,27 +209,28 @@ def clean_spaces_with_regex(text):
# return(splitter.split_text(text))
+
def chunk_data(text):
- tokens_per_chunk = 500 #1024
+ tokens_per_chunk = 500 # 1024
text = clean_spaces_with_regex(text)
SENTENCE_ENDINGS = [".", "!", "?"]
- WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
+ WORDS_BREAKS = ["\n", "\t", "}", "{", "]", "[", ")", "(", " ", ":", ";", ","]
- sentences = text.split('. ') # Split text into sentences
+ sentences = text.split(". ") # Split text into sentences
chunks = []
- current_chunk = ''
+ current_chunk = ""
current_chunk_token_count = 0
-
+
# Iterate through each sentence
for sentence in sentences:
# Split sentence into tokens
tokens = sentence.split()
-
+
# Check if adding the current sentence exceeds tokens_per_chunk
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
# Add the sentence to the current chunk
if current_chunk:
- current_chunk += '. ' + sentence
+ current_chunk += ". " + sentence
else:
current_chunk += sentence
current_chunk_token_count += len(tokens)
@@ -240,43 +239,114 @@ def chunk_data(text):
chunks.append(current_chunk)
current_chunk = sentence
current_chunk_token_count = len(tokens)
-
+
# Add the last chunk
if current_chunk:
chunks.append(current_chunk)
-
+
return chunks
+
# Create the search index
search_credential = AzureKeyCredential(search_key)
-index_client = SearchIndexClient(
- endpoint=search_endpoint, credential=search_credential)
+index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
fields = [
- SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
+ SimpleField(
+ name="id",
+ type=SearchFieldDataType.String,
+ key=True,
+ sortable=True,
+ filterable=True,
+ facetable=True,
+ ),
SearchableField(name="chunk_id", type=SearchFieldDataType.String),
SearchableField(name="document_id", type=SearchFieldDataType.String),
SearchableField(name="title", type=SearchFieldDataType.String),
SearchableField(name="content", type=SearchFieldDataType.String),
SearchableField(name="sourceurl", type=SearchFieldDataType.String),
SearchableField(name="publicurl", type=SearchFieldDataType.String),
- SimpleField(name="dateTime", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Person", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Location", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Organization", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="URL", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Email", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="PersonType", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Event", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Quantity", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
- SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
+ SimpleField(
+ name="dateTime",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Person",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Location",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Organization",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="URL",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Email",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="PersonType",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Event",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Quantity",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SearchField(
+ name="titleVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
+ SearchField(
+ name="contentVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
]
-# Configure the vector search configuration
+# Configure the vector search configuration
vector_search = VectorSearch(
algorithms=[
HnswAlgorithmConfiguration(
@@ -286,16 +356,16 @@ def chunk_data(text):
m=4,
ef_construction=400,
ef_search=500,
- metric=VectorSearchAlgorithmMetric.COSINE
- )
+ metric=VectorSearchAlgorithmMetric.COSINE,
+ ),
),
ExhaustiveKnnAlgorithmConfiguration(
name="myExhaustiveKnn",
kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
parameters=ExhaustiveKnnParameters(
metric=VectorSearchAlgorithmMetric.COSINE
- )
- )
+ ),
+ ),
],
profiles=[
VectorSearchProfile(
@@ -305,60 +375,70 @@ def chunk_data(text):
VectorSearchProfile(
name="myExhaustiveKnnProfile",
algorithm_configuration_name="myExhaustiveKnn",
- )
- ]
+ ),
+ ],
)
semantic_config = SemanticConfiguration(
name="my-semantic-config",
prioritized_fields=SemanticPrioritizedFields(
title_field=SemanticField(field_name="title"),
- content_fields=[SemanticField(field_name="content")]
- )
+ content_fields=[SemanticField(field_name="content")],
+ ),
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])
# Create the search index with the semantic settings
-index = SearchIndex(name=index_name, fields=fields,
- vector_search=vector_search, semantic_search=semantic_search)
+index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=semantic_search,
+)
result = index_client.create_or_update_index(index)
-print(f' {result.name} created')
+print(f" {result.name} created")
# Create the drafts search index with the semantic settings
-index = SearchIndex(name=drafts_index_name, fields=fields,
- vector_search=vector_search, semantic_search=semantic_search)
+index = SearchIndex(
+ name=drafts_index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=semantic_search,
+)
result = index_client.create_or_update_index(index)
-print(f' {result.name} created')
+print(f" {result.name} created")
-#add documents to the index
+# add documents to the index
-from azure.core.credentials import AzureKeyCredential
-from azure.storage.filedatalake import (
- DataLakeServiceClient,
- DataLakeDirectoryClient,
- FileSystemClient
-)
-from azure.identity import ClientSecretCredential
-import pypdf
-from io import BytesIO
import base64
import time
-import pandas as pd
+from io import BytesIO
+import pandas as pd
+import pypdf
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import ClientSecretCredential
+from azure.storage.filedatalake import (
+ DataLakeDirectoryClient,
+ DataLakeServiceClient,
+ FileSystemClient,
+)
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
-file_system_client = service_client.get_file_system_client(file_system_client_name)
-directory_name = directory + '/pdfs'
+file_system_client = service_client.get_file_system_client(file_system_client_name)
+directory_name = directory + "/pdfs"
paths = file_system_client.get_paths(path=directory_name)
# Azure Cognitive Search Vector Index
@@ -375,7 +455,7 @@ def chunk_data(text):
print(file_path)
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df_metadata = pd.read_csv(csv_file, encoding='utf-8')
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")
docs = []
num_pdfs = 0
@@ -387,59 +467,83 @@ def chunk_data(text):
stream = BytesIO()
pdf_file.readinto(stream)
pdf_reader = pypdf.PdfReader(stream)
- filename = path.name.split('/')[-1]
- document_id = filename.replace('.pdf','')
+ filename = path.name.split("/")[-1]
+ document_id = filename.replace(".pdf", "")
+
+ df_file_metadata = df_metadata[df_metadata["pubmed_id"] == int(document_id)].iloc[0]
- df_file_metadata = df_metadata[df_metadata['pubmed_id']==int(document_id)].iloc[0]
-
- text = ""
+ text = ""
- n = num_pages #len(pdf_reader.pages)
+ n = num_pages # len(pdf_reader.pages)
if len(pdf_reader.pages) < n:
n = len(pdf_reader.pages)
- for page_num in range(n): #range(len(pdf_reader.pages)):
- public_url = df_file_metadata['publicurl'] + '#page=' + str(page_num)
+ for page_num in range(n): # range(len(pdf_reader.pages)):
+ public_url = df_file_metadata["publicurl"] + "#page=" + str(page_num)
page = pdf_reader.pages[page_num]
- text = page.extract_text()
-
+ text = page.extract_text()
+
chunks = chunk_data(text)
chunk_num = 0
for chunk in chunks:
chunk_num += 1
d = {
- "chunk_id" : path.name.split('/')[-1] + '_' + str(page_num).zfill(2) + '_' + str(chunk_num).zfill(2),
- "document_id": str(df_file_metadata['pubmed_id']),
- "content": chunk,
- "title": df_file_metadata['title'],
- "abstract": df_file_metadata['abstract'] } #path.name.split('/')[-1] + '_' + str(page_num).zfill(2) + '_' + str(chunk_num).zfill(2)}
-
- d["dateTime"],d["Person"],d["Location"],d["Organization"],d["URL"],d["Email"],d["PersonType"],d["Event"],d["Quantity"] = get_named_entities(cog_services_client,d["content"])
+ "chunk_id": path.name.split("/")[-1]
+ + "_"
+ + str(page_num).zfill(2)
+ + "_"
+ + str(chunk_num).zfill(2),
+ "document_id": str(df_file_metadata["pubmed_id"]),
+ "content": chunk,
+ "title": df_file_metadata["title"],
+ "abstract": df_file_metadata["abstract"],
+ } # path.name.split('/')[-1] + '_' + str(page_num).zfill(2) + '_' + str(chunk_num).zfill(2)}
+
+ (
+ d["dateTime"],
+ d["Person"],
+ d["Location"],
+ d["Organization"],
+ d["URL"],
+ d["Email"],
+ d["PersonType"],
+ d["Event"],
+ d["Quantity"],
+ ) = get_named_entities(cog_services_client, d["content"])
counter += 1
try:
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
+
try:
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
docs.append(
- {
- "id": base64.urlsafe_b64encode(bytes(d["chunk_id"], encoding='utf-8')).decode('utf-8'),
+ {
+ "id": base64.urlsafe_b64encode(
+ bytes(d["chunk_id"], encoding="utf-8")
+ ).decode("utf-8"),
"chunk_id": d["chunk_id"],
"document_id": d["document_id"],
"title": d["title"],
"content": d["content"],
- "sourceurl": path.name.split('/')[-1],
- "publicurl": public_url,
+ "sourceurl": path.name.split("/")[-1],
+ "publicurl": public_url,
"dateTime": d["dateTime"],
"Person": d["Person"],
"Location": d["Location"],
@@ -450,18 +554,16 @@ def chunk_data(text):
"Event": d["Event"],
"Quantity": d["Quantity"],
"titleVector": v_titleVector,
- "contentVector": v_contentVector
- }
+ "contentVector": v_contentVector,
+ }
)
-
+
if counter % 10 == 0:
result = client.upload_documents(documents=docs)
result = drafts_client.upload_documents(documents=docs)
docs = []
- print(f' {str(counter)} uploaded')
-#upload the last batch
+ print(f" {str(counter)} uploaded")
+# upload the last batch
if docs != []:
client.upload_documents(documents=docs)
drafts_client.upload_documents(documents=docs)
-
-
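
Two smaller observations on the script above survive the reformatting. First, the capitalized `Filterable`/`Sortable`/`Facetable` keywords on the `SimpleField` entries do not match the lowercase `filterable`/`sortable`/`facetable` parameters the azure-search-documents field helpers read, so those flags are most likely ignored rather than applied. Second, the embedding calls retry once behind a bare `except:` and a fixed 30-second sleep. A minimal sketch, assuming the openai 1.x client already used by `get_embeddings`, of a bounded retry that only catches rate-limit errors:

```python
# Sketch only: wraps the script's existing get_embeddings in a bounded retry.
# RateLimitError comes from the openai 1.x package (the same one that provides
# AzureOpenAI); the attempt count and delays are illustrative, not tuned.
import time

from openai import RateLimitError


def get_embeddings_with_retry(text, api_base, api_version, api_key, attempts=4):
    delay = 5
    for attempt in range(1, attempts + 1):
        try:
            return get_embeddings(text, api_base, api_version, api_key)
        except RateLimitError:
            if attempt == attempts:
                raise                  # give up after the final attempt
            time.sleep(delay)          # back off before retrying
            delay *= 2
```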
diff --git a/ResearchAssistant/Deployment/scripts/index_scripts/create_drafts_index.py b/ResearchAssistant/Deployment/scripts/index_scripts/create_drafts_index.py
index 9acb0a492..2e3c33073 100644
--- a/ResearchAssistant/Deployment/scripts/index_scripts/create_drafts_index.py
+++ b/ResearchAssistant/Deployment/scripts/index_scripts/create_drafts_index.py
@@ -1,138 +1,122 @@
-#Get Azure Key Vault Client
-key_vault_name = 'kv_to-be-replaced'
+# Get Azure Key Vault Client
+key_vault_name = "kv_to-be-replaced"
-#hardcoded values
+# hardcoded values
index_name = "draftsindex"
file_system_client_name = "data"
-directory = 'demodata/completed_grants'
-directory2 = 'demodata2/completed_grants'
-directory3 = 'demodata3/completed_grants'
-csv_file_name = '/metadata/completed_grants.csv'
+directory = "demodata/completed_grants"
+directory2 = "demodata2/completed_grants"
+directory3 = "demodata3/completed_grants"
+csv_file_name = "/metadata/completed_grants.csv"
num_pages = 10
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
-
- # Set the name of the Azure Key Vault
- key_vault_name = kv_name
-
- # Create a credential object using the default Azure credentials
- credential = DefaultAzureCredential()
-
- # Create a secret client object using the credential and Key Vault name
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
-
- # Retrieve the secret value
- return(secret_client.get_secret(secret_name).value)
-
-
-#Utils
- # Import required libraries
-import os
-import json
-import openai
-import os
-from azure.core.credentials import AzureKeyCredential
-from azure.ai.textanalytics import TextAnalyticsClient
+ # Set the name of the Azure Key Vault
+ key_vault_name = kv_name
-from azure.core.credentials import AzureKeyCredential
-from azure.search.documents import SearchClient, SearchIndexingBufferedSender
-from azure.search.documents.indexes import SearchIndexClient
-from azure.search.documents.models import (
- QueryAnswerType,
- QueryCaptionType,
- QueryCaptionResult,
- QueryAnswerResult,
- SemanticErrorMode,
- SemanticErrorReason,
- SemanticSearchResultsType,
- QueryType,
- VectorizedQuery,
- VectorQuery,
- VectorFilterMode,
-)
-from azure.search.documents.indexes.models import (
+ # Create a credential object using the default Azure credentials
+ credential = DefaultAzureCredential()
+
+ # Create a secret client object using the credential and Key Vault name
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ )
+
+ # Retrieve the secret value
+ return secret_client.get_secret(secret_name).value
+
+
+# Utils
+# Import required libraries
+import json
+import os
+
+import openai
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents import SearchClient, SearchIndexingBufferedSender
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
ExhaustiveKnnAlgorithmConfiguration,
ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticPrioritizedFields,
- SemanticField,
- SearchField,
- SemanticSearch,
- VectorSearch,
HnswAlgorithmConfiguration,
- HnswParameters,
- VectorSearch,
- VectorSearchAlgorithmConfiguration,
- VectorSearchAlgorithmKind,
- VectorSearchProfile,
- SearchIndex,
+ HnswParameters,
+ SearchableField,
SearchField,
SearchFieldDataType,
+ SearchIndex,
+ SemanticConfiguration,
+ SemanticField,
+ SemanticPrioritizedFields,
+ SemanticSearch,
SimpleField,
- SearchableField,
- VectorSearch,
- ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticField,
- SearchField,
- VectorSearch,
- HnswParameters,
VectorSearch,
+ VectorSearchAlgorithmConfiguration,
VectorSearchAlgorithmKind,
VectorSearchAlgorithmMetric,
VectorSearchProfile,
-)
-search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
-search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
+)
+from azure.search.documents.models import (
+ QueryAnswerResult,
+ QueryAnswerType,
+ QueryCaptionResult,
+ QueryCaptionType,
+ QueryType,
+ SemanticErrorMode,
+ SemanticErrorReason,
+ SemanticSearchResultsType,
+ VectorFilterMode,
+ VectorizedQuery,
+ VectorQuery,
+)
-openai.api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai.api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai.api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
+search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
-openai_api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai_api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai_api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+openai.api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai.api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai.api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
+
+openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai_api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
-# Set up your Azure Text Analytics service and credentials
-COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name,"COG-SERVICES-NAME")
-COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name,"COG-SERVICES-ENDPOINT")
-COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name,"COG-SERVICES-KEY")
+# Set up your Azure Text Analytics service and credentials
+COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name, "COG-SERVICES-NAME")
+COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name, "COG-SERVICES-ENDPOINT")
+COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name, "COG-SERVICES-KEY")
-cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
+cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
-# Create a TextAnalyticsClient using your endpoint and credentials
-cog_services_client = TextAnalyticsClient(endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential)
+# Create a TextAnalyticsClient using your endpoint and credentials
+cog_services_client = TextAnalyticsClient(
+ endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential
+)
-def get_named_entities(cog_services_client,input_text):
- # Call the named entity recognition API to extract named entities from your text
- input_text = input_text[:5000] #limit to 5000 characters
- result = cog_services_client.recognize_entities(documents=[input_text])
-
- # return the named entities for each document
- # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
- Person = []
+def get_named_entities(cog_services_client, input_text):
+ # Call the named entity recognition API to extract named entities from your text
+ input_text = input_text[:5000] # limit to 5000 characters
+ result = cog_services_client.recognize_entities(documents=[input_text])
+
+ # return the named entities for each document
+ # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
+
+ Person = []
Location = []
- Organization = []
+ Organization = []
DateTime = []
- URL = []
+ URL = []
Email = []
PersonType = []
Event = []
@@ -140,7 +124,7 @@ def get_named_entities(cog_services_client,input_text):
for idx, doc in enumerate(result):
if not doc.is_error:
- for entity in doc.entities:
+ for entity in doc.entities:
if entity.category == "DateTime":
DateTime.append(entity.text)
elif entity.category == "Person":
@@ -160,39 +144,53 @@ def get_named_entities(cog_services_client,input_text):
elif entity.category == "Quantity":
Quantity.append(entity.text)
- else:
- print(" Error: {}".format(doc.error.message))
- return(list(set(DateTime)),list(set(Person)),list(set(Location)),list(set(Organization)),list(set(URL)),list(set(Email)),list(set(PersonType)),list(set(Event)),list(set(Quantity)))
-
+ else:
+ print(" Error: {}".format(doc.error.message))
+ return (
+ list(set(DateTime)),
+ list(set(Person)),
+ list(set(Location)),
+ list(set(Organization)),
+ list(set(URL)),
+ list(set(Email)),
+ list(set(PersonType)),
+ list(set(Event)),
+ list(set(Quantity)),
+ )
+
from openai import AzureOpenAI
+
# Function: Get Embeddings
-def get_embeddings(text: str,openai_api_base,openai_api_version,openai_api_key):
+def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
model_id = "text-embedding-ada-002"
client = AzureOpenAI(
api_version=openai_api_version,
azure_endpoint=openai_api_base,
- api_key = openai_api_key
+ api_key=openai_api_key,
)
-
+
# embedding = openai.Embedding.create(input=text, deployment_id=model_id)["data"][0]["embedding"]
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
return embedding
+
# from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter
# import tiktoken
import re
+
def clean_spaces_with_regex(text):
# Use a regular expression to replace multiple spaces with a single space
- cleaned_text = re.sub(r'\s+', ' ', text)
+ cleaned_text = re.sub(r"\s+", " ", text)
# Use a regular expression to replace consecutive dots with a single dot
- cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+ cleaned_text = re.sub(r"\.{2,}", ".", cleaned_text)
return cleaned_text
+
# def estimate_tokens(text):
# GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")
# return(len(GPT2_TOKENIZER.encode(text)))
@@ -209,27 +207,28 @@ def clean_spaces_with_regex(text):
# return(splitter.split_text(text))
+
def chunk_data(text):
- tokens_per_chunk = 500 #1024
+ tokens_per_chunk = 500 # 1024
text = clean_spaces_with_regex(text)
SENTENCE_ENDINGS = [".", "!", "?"]
- WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
+ WORDS_BREAKS = ["\n", "\t", "}", "{", "]", "[", ")", "(", " ", ":", ";", ","]
- sentences = text.split('. ') # Split text into sentences
+ sentences = text.split(". ") # Split text into sentences
chunks = []
- current_chunk = ''
+ current_chunk = ""
current_chunk_token_count = 0
-
+
# Iterate through each sentence
for sentence in sentences:
# Split sentence into tokens
tokens = sentence.split()
-
+
# Check if adding the current sentence exceeds tokens_per_chunk
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
# Add the sentence to the current chunk
if current_chunk:
- current_chunk += '. ' + sentence
+ current_chunk += ". " + sentence
else:
current_chunk += sentence
current_chunk_token_count += len(tokens)
@@ -238,43 +237,114 @@ def chunk_data(text):
chunks.append(current_chunk)
current_chunk = sentence
current_chunk_token_count = len(tokens)
-
+
# Add the last chunk
if current_chunk:
chunks.append(current_chunk)
-
+
return chunks
+
# Create the search index
search_credential = AzureKeyCredential(search_key)
-index_client = SearchIndexClient(
- endpoint=search_endpoint, credential=search_credential)
+index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
fields = [
- SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
+ SimpleField(
+ name="id",
+ type=SearchFieldDataType.String,
+ key=True,
+ sortable=True,
+ filterable=True,
+ facetable=True,
+ ),
SearchableField(name="chunk_id", type=SearchFieldDataType.String),
SearchableField(name="document_id", type=SearchFieldDataType.String),
SearchableField(name="title", type=SearchFieldDataType.String),
SearchableField(name="content", type=SearchFieldDataType.String),
SearchableField(name="sourceurl", type=SearchFieldDataType.String),
SearchableField(name="publicurl", type=SearchFieldDataType.String),
- SimpleField(name="dateTime", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Person", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Location", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Organization", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="URL", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Email", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="PersonType", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Event", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Quantity", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
- SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
+ SimpleField(
+ name="dateTime",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Person",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Location",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Organization",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="URL",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Email",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="PersonType",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Event",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Quantity",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SearchField(
+ name="titleVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
+ SearchField(
+ name="contentVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
]
-# Configure the vector search configuration
+# Configure the vector search configuration
vector_search = VectorSearch(
algorithms=[
HnswAlgorithmConfiguration(
@@ -284,16 +354,16 @@ def chunk_data(text):
m=4,
ef_construction=400,
ef_search=500,
- metric=VectorSearchAlgorithmMetric.COSINE
- )
+ metric=VectorSearchAlgorithmMetric.COSINE,
+ ),
),
ExhaustiveKnnAlgorithmConfiguration(
name="myExhaustiveKnn",
kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
parameters=ExhaustiveKnnParameters(
metric=VectorSearchAlgorithmMetric.COSINE
- )
- )
+ ),
+ ),
],
profiles=[
VectorSearchProfile(
@@ -303,56 +373,62 @@ def chunk_data(text):
VectorSearchProfile(
name="myExhaustiveKnnProfile",
algorithm_configuration_name="myExhaustiveKnn",
- )
- ]
+ ),
+ ],
)
semantic_config = SemanticConfiguration(
name="my-semantic-config",
prioritized_fields=SemanticPrioritizedFields(
title_field=SemanticField(field_name="title"),
- content_fields=[SemanticField(field_name="content")]
- )
+ content_fields=[SemanticField(field_name="content")],
+ ),
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])
# Create the search index with the semantic settings
-index = SearchIndex(name=index_name, fields=fields,
- vector_search=vector_search, semantic_search=semantic_search)
+index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=semantic_search,
+)
result = index_client.create_or_update_index(index)
-print(f' {result.name} created')
+print(f" {result.name} created")
# #add documents to the index
-from azure.core.credentials import AzureKeyCredential
-from azure.storage.filedatalake import (
- DataLakeServiceClient,
- DataLakeDirectoryClient,
- FileSystemClient
-)
-from azure.identity import ClientSecretCredential
-import pypdf
-from io import BytesIO
import base64
import time
-import pandas as pd
+from io import BytesIO
+import pandas as pd
+import pypdf
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import ClientSecretCredential
+from azure.storage.filedatalake import (
+ DataLakeDirectoryClient,
+ DataLakeServiceClient,
+ FileSystemClient,
+)
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
-file_system_client = service_client.get_file_system_client(file_system_client_name)
-directory_name = directory + '/pdfs'
+file_system_client = service_client.get_file_system_client(file_system_client_name)
+directory_name = directory + "/pdfs"
paths = list(file_system_client.get_paths(path=directory_name))
-paths = paths + list(file_system_client.get_paths(path=directory2 + '/pdfs'))
-paths = paths + list(file_system_client.get_paths(path=directory3 + '/pdfs'))
+paths = paths + list(file_system_client.get_paths(path=directory2 + "/pdfs"))
+paths = paths + list(file_system_client.get_paths(path=directory3 + "/pdfs"))
# Azure Cognitive Search Vector Index
search_credential = AzureKeyCredential(search_key)
@@ -366,7 +442,7 @@ def chunk_data(text):
print(file_path)
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df_metadata = pd.read_csv(csv_file, encoding='utf-8')
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")
docs = []
num_pdfs = 0
@@ -378,57 +454,81 @@ def chunk_data(text):
stream = BytesIO()
pdf_file.readinto(stream)
pdf_reader = pypdf.PdfReader(stream)
- filename = path.name.split('/')[-1]
- document_id = filename.replace('.pdf','')
+ filename = path.name.split("/")[-1]
+ document_id = filename.replace(".pdf", "")
+
+ df_file_metadata = df_metadata[df_metadata["grant_id"] == document_id].iloc[0]
- df_file_metadata = df_metadata[df_metadata['grant_id']==document_id].iloc[0]
-
- text = ""
+ text = ""
- n = num_pages #len(pdf_reader.pages)
+ n = num_pages # len(pdf_reader.pages)
if len(pdf_reader.pages) < n:
n = len(pdf_reader.pages)
- for page_num in range(n): #range(len(pdf_reader.pages)):
- public_url = df_file_metadata['publicurl'] + '#page=' + str(page_num)
+ for page_num in range(n): # range(len(pdf_reader.pages)):
+ public_url = df_file_metadata["publicurl"] + "#page=" + str(page_num)
page = pdf_reader.pages[page_num]
- text = page.extract_text()
-
+ text = page.extract_text()
+
chunks = chunk_data(text)
chunk_num = 0
for chunk in chunks:
chunk_num += 1
d = {
- "chunk_id" : path.name.split('/')[-1] + '_' + str(page_num).zfill(2) + '_' + str(chunk_num).zfill(2),
- "document_id": str(df_file_metadata['grant_id']),
- "content": chunk,
- "title": df_file_metadata['title'] }
+ "chunk_id": path.name.split("/")[-1]
+ + "_"
+ + str(page_num).zfill(2)
+ + "_"
+ + str(chunk_num).zfill(2),
+ "document_id": str(df_file_metadata["grant_id"]),
+ "content": chunk,
+ "title": df_file_metadata["title"],
+ }
- d["dateTime"],d["Person"],d["Location"],d["Organization"],d["URL"],d["Email"],d["PersonType"],d["Event"],d["Quantity"] = get_named_entities(cog_services_client,d["content"])
+ (
+ d["dateTime"],
+ d["Person"],
+ d["Location"],
+ d["Organization"],
+ d["URL"],
+ d["Email"],
+ d["PersonType"],
+ d["Event"],
+ d["Quantity"],
+ ) = get_named_entities(cog_services_client, d["content"])
counter += 1
try:
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
+
try:
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
docs.append(
- {
- "id": base64.urlsafe_b64encode(bytes(d["chunk_id"], encoding='utf-8')).decode('utf-8'),
+ {
+ "id": base64.urlsafe_b64encode(
+ bytes(d["chunk_id"], encoding="utf-8")
+ ).decode("utf-8"),
"chunk_id": d["chunk_id"],
"document_id": d["document_id"],
"title": d["title"],
"content": d["content"],
- "sourceurl": path.name.split('/')[-1],
+ "sourceurl": path.name.split("/")[-1],
"publicurl": public_url,
"dateTime": d["dateTime"],
"Person": d["Person"],
@@ -440,14 +540,14 @@ def chunk_data(text):
"Event": d["Event"],
"Quantity": d["Quantity"],
"titleVector": v_titleVector,
- "contentVector": v_contentVector
- }
+ "contentVector": v_contentVector,
+ }
)
-
+
if counter % 10 == 0:
result = client.upload_documents(documents=docs)
docs = []
- print(f' {str(counter)} uploaded')
-#upload the last batch
+ print(f" {str(counter)} uploaded")
+# upload the last batch
if docs != []:
client.upload_documents(documents=docs)
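
For context on the embedding calls reformatted above: both index scripts call get_embeddings, and on failure they sleep 30 seconds and retry exactly once. A minimal sketch of that same pattern factored into a helper is shown below; the name get_embeddings_with_retry and the retries/delay parameters are illustrative additions, not part of the scripts, and the sketch assumes the scripts' existing get_embeddings function is in scope.

import time

def get_embeddings_with_retry(text, api_base, api_version, api_key, retries=1, delay=30):
    # Same behaviour as the inline try/except blocks in the scripts:
    # on failure, wait a fixed delay and try again, re-raising after the last attempt.
    for attempt in range(retries + 1):
        try:
            return get_embeddings(text, api_base, api_version, api_key)
        except Exception:
            if attempt == retries:
                raise
            time.sleep(delay)

# Usage mirroring the script (hypothetical refactor, not applied in this diff):
# v_titleVector = get_embeddings_with_retry(d["title"], openai_api_base, openai_api_version, openai_api_key)
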
diff --git a/ResearchAssistant/Deployment/scripts/index_scripts/create_grants_index.py b/ResearchAssistant/Deployment/scripts/index_scripts/create_grants_index.py
index a59871275..dcb7e2de7 100644
--- a/ResearchAssistant/Deployment/scripts/index_scripts/create_grants_index.py
+++ b/ResearchAssistant/Deployment/scripts/index_scripts/create_grants_index.py
@@ -1,136 +1,120 @@
-#Get Azure Key Vault Client
-key_vault_name = 'kv_to-be-replaced'
+# Get Azure Key Vault Client
+key_vault_name = "kv_to-be-replaced"
-#hardcoded values
+# hardcoded values
index_name = "grantsindex"
-drafts_index_name = 'draftsindex'
+drafts_index_name = "draftsindex"
file_system_client_name = "data"
-directory = 'demodata/nih_grants'
-csv_file_name = '/metadata/nih_grants.csv'
+directory = "demodata/nih_grants"
+csv_file_name = "/metadata/nih_grants.csv"
num_pages = 10
-from azure.keyvault.secrets import SecretClient
-from azure.identity import DefaultAzureCredential
+from azure.identity import DefaultAzureCredential
+from azure.keyvault.secrets import SecretClient
+
def get_secrets_from_kv(kv_name, secret_name):
-
- # Set the name of the Azure Key Vault
- key_vault_name = kv_name
-
- # Create a credential object using the default Azure credentials
- credential = DefaultAzureCredential()
-
- # Create a secret client object using the credential and Key Vault name
- secret_client = SecretClient(vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential)
-
- # Retrieve the secret value
- return(secret_client.get_secret(secret_name).value)
-
-
-#Utils
- # Import required libraries
-import os
-import json
-import openai
-import os
-from azure.core.credentials import AzureKeyCredential
-from azure.ai.textanalytics import TextAnalyticsClient
+ # Set the name of the Azure Key Vault
+ key_vault_name = kv_name
-from azure.core.credentials import AzureKeyCredential
-from azure.search.documents import SearchClient, SearchIndexingBufferedSender
-from azure.search.documents.indexes import SearchIndexClient
-from azure.search.documents.models import (
- QueryAnswerType,
- QueryCaptionType,
- QueryCaptionResult,
- QueryAnswerResult,
- SemanticErrorMode,
- SemanticErrorReason,
- SemanticSearchResultsType,
- QueryType,
- VectorizedQuery,
- VectorQuery,
- VectorFilterMode,
-)
-from azure.search.documents.indexes.models import (
+ # Create a credential object using the default Azure credentials
+ credential = DefaultAzureCredential()
+
+ # Create a secret client object using the credential and Key Vault name
+ secret_client = SecretClient(
+ vault_url=f"https://{key_vault_name}.vault.azure.net/", credential=credential
+ )
+
+ # Retrieve the secret value
+ return secret_client.get_secret(secret_name).value
+
+
+# Utils
+# Import required libraries
+import json
+import os
+
+import openai
+from azure.ai.textanalytics import TextAnalyticsClient
+from azure.core.credentials import AzureKeyCredential
+from azure.search.documents import SearchClient, SearchIndexingBufferedSender
+from azure.search.documents.indexes import SearchIndexClient
+from azure.search.documents.indexes.models import (
ExhaustiveKnnAlgorithmConfiguration,
ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticPrioritizedFields,
- SemanticField,
- SearchField,
- SemanticSearch,
- VectorSearch,
HnswAlgorithmConfiguration,
- HnswParameters,
- VectorSearch,
- VectorSearchAlgorithmConfiguration,
- VectorSearchAlgorithmKind,
- VectorSearchProfile,
- SearchIndex,
+ HnswParameters,
+ SearchableField,
SearchField,
SearchFieldDataType,
+ SearchIndex,
+ SemanticConfiguration,
+ SemanticField,
+ SemanticPrioritizedFields,
+ SemanticSearch,
SimpleField,
- SearchableField,
- VectorSearch,
- ExhaustiveKnnParameters,
- SearchIndex,
- SearchField,
- SearchFieldDataType,
- SimpleField,
- SearchableField,
- SearchIndex,
- SemanticConfiguration,
- SemanticField,
- SearchField,
- VectorSearch,
- HnswParameters,
VectorSearch,
+ VectorSearchAlgorithmConfiguration,
VectorSearchAlgorithmKind,
VectorSearchAlgorithmMetric,
VectorSearchProfile,
-)
-search_endpoint = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-ENDPOINT")
-search_key = get_secrets_from_kv(key_vault_name,"AZURE-SEARCH-KEY")
+)
+from azure.search.documents.models import (
+ QueryAnswerResult,
+ QueryAnswerType,
+ QueryCaptionResult,
+ QueryCaptionType,
+ QueryType,
+ SemanticErrorMode,
+ SemanticErrorReason,
+ SemanticSearchResultsType,
+ VectorFilterMode,
+ VectorizedQuery,
+ VectorQuery,
+)
+
+search_endpoint = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-ENDPOINT")
+search_key = get_secrets_from_kv(key_vault_name, "AZURE-SEARCH-KEY")
-openai.api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai.api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai.api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+openai.api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai.api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai.api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
+
+openai_api_key = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-KEY")
+openai_api_base = get_secrets_from_kv(key_vault_name, "AZURE-OPENAI-ENDPOINT")
+openai_api_version = get_secrets_from_kv(
+ key_vault_name, "AZURE-OPENAI-PREVIEW-API-VERSION"
+)
-openai_api_key = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-KEY")
-openai_api_base = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-ENDPOINT")
-openai_api_version = get_secrets_from_kv(key_vault_name,"AZURE-OPENAI-PREVIEW-API-VERSION")
+# Set up your Azure Text Analytics service and credentials
+COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name, "COG-SERVICES-NAME")
+COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name, "COG-SERVICES-ENDPOINT")
+COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name, "COG-SERVICES-KEY")
-# Set up your Azure Text Analytics service and credentials
-COG_SERVICES_NAME = get_secrets_from_kv(key_vault_name,"COG-SERVICES-NAME")
-COG_SERVICES_ENDPOINT = get_secrets_from_kv(key_vault_name,"COG-SERVICES-ENDPOINT")
-COG_SERVICES_KEY = get_secrets_from_kv(key_vault_name,"COG-SERVICES-KEY")
+cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
-cog_services_credential = AzureKeyCredential(COG_SERVICES_KEY)
+# Create a TextAnalyticsClient using your endpoint and credentials
+cog_services_client = TextAnalyticsClient(
+ endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential
+)
-# Create a TextAnalyticsClient using your endpoint and credentials
-cog_services_client = TextAnalyticsClient(endpoint=COG_SERVICES_ENDPOINT, credential=cog_services_credential)
-def get_named_entities(cog_services_client,input_text):
- # Call the named entity recognition API to extract named entities from your text
- result = cog_services_client.recognize_entities(documents=[input_text])
-
- # return the named entities for each document
- # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
+def get_named_entities(cog_services_client, input_text):
+ # Call the named entity recognition API to extract named entities from your text
+ result = cog_services_client.recognize_entities(documents=[input_text])
- Person = []
+ # return the named entities for each document
+ # full list of categories #https://learn.microsoft.com/en-us/azure/ai-services/language-service/named-entity-recognition/concepts/named-entity-categories?tabs=ga-api
+
+ Person = []
Location = []
- Organization = []
+ Organization = []
DateTime = []
- URL = []
+ URL = []
Email = []
PersonType = []
Event = []
@@ -138,7 +122,7 @@ def get_named_entities(cog_services_client,input_text):
for idx, doc in enumerate(result):
if not doc.is_error:
- for entity in doc.entities:
+ for entity in doc.entities:
if entity.category == "DateTime":
DateTime.append(entity.text)
elif entity.category == "Person":
@@ -158,39 +142,53 @@ def get_named_entities(cog_services_client,input_text):
elif entity.category == "Quantity":
Quantity.append(entity.text)
- else:
- print(" Error: {}".format(doc.error.message))
- return(list(set(DateTime)),list(set(Person)),list(set(Location)),list(set(Organization)),list(set(URL)),list(set(Email)),list(set(PersonType)),list(set(Event)),list(set(Quantity)))
-
+ else:
+ print(" Error: {}".format(doc.error.message))
+ return (
+ list(set(DateTime)),
+ list(set(Person)),
+ list(set(Location)),
+ list(set(Organization)),
+ list(set(URL)),
+ list(set(Email)),
+ list(set(PersonType)),
+ list(set(Event)),
+ list(set(Quantity)),
+ )
+
from openai import AzureOpenAI
+
# Function: Get Embeddings
-def get_embeddings(text: str,openai_api_base,openai_api_version,openai_api_key):
+def get_embeddings(text: str, openai_api_base, openai_api_version, openai_api_key):
model_id = "text-embedding-ada-002"
client = AzureOpenAI(
api_version=openai_api_version,
azure_endpoint=openai_api_base,
- api_key = openai_api_key
+ api_key=openai_api_key,
)
-
+
# embedding = openai.Embedding.create(input=text, deployment_id=model_id)["data"][0]["embedding"]
embedding = client.embeddings.create(input=text, model=model_id).data[0].embedding
return embedding
+
# from langchain.text_splitter import MarkdownTextSplitter, RecursiveCharacterTextSplitter, PythonCodeTextSplitter
# import tiktoken
import re
+
def clean_spaces_with_regex(text):
# Use a regular expression to replace multiple spaces with a single space
- cleaned_text = re.sub(r'\s+', ' ', text)
+ cleaned_text = re.sub(r"\s+", " ", text)
# Use a regular expression to replace consecutive dots with a single dot
- cleaned_text = re.sub(r'\.{2,}', '.', cleaned_text)
+ cleaned_text = re.sub(r"\.{2,}", ".", cleaned_text)
return cleaned_text
+
# def estimate_tokens(text):
# GPT2_TOKENIZER = tiktoken.get_encoding("gpt2")
# return(len(GPT2_TOKENIZER.encode(text)))
@@ -207,27 +205,28 @@ def clean_spaces_with_regex(text):
# return(splitter.split_text(text))
+
def chunk_data(text):
- tokens_per_chunk = 500 #1024
+ tokens_per_chunk = 500 # 1024
text = clean_spaces_with_regex(text)
SENTENCE_ENDINGS = [".", "!", "?"]
- WORDS_BREAKS = ['\n', '\t', '}', '{', ']', '[', ')', '(', ' ', ':', ';', ',']
+ WORDS_BREAKS = ["\n", "\t", "}", "{", "]", "[", ")", "(", " ", ":", ";", ","]
- sentences = text.split('. ') # Split text into sentences
+ sentences = text.split(". ") # Split text into sentences
chunks = []
- current_chunk = ''
+ current_chunk = ""
current_chunk_token_count = 0
-
+
# Iterate through each sentence
for sentence in sentences:
# Split sentence into tokens
tokens = sentence.split()
-
+
# Check if adding the current sentence exceeds tokens_per_chunk
if current_chunk_token_count + len(tokens) <= tokens_per_chunk:
# Add the sentence to the current chunk
if current_chunk:
- current_chunk += '. ' + sentence
+ current_chunk += ". " + sentence
else:
current_chunk += sentence
current_chunk_token_count += len(tokens)
@@ -236,43 +235,114 @@ def chunk_data(text):
chunks.append(current_chunk)
current_chunk = sentence
current_chunk_token_count = len(tokens)
-
+
# Add the last chunk
if current_chunk:
chunks.append(current_chunk)
-
+
return chunks
+
# Create the search index
search_credential = AzureKeyCredential(search_key)
-index_client = SearchIndexClient(
- endpoint=search_endpoint, credential=search_credential)
+index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
fields = [
- SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True),
+ SimpleField(
+ name="id",
+ type=SearchFieldDataType.String,
+ key=True,
+ sortable=True,
+ filterable=True,
+ facetable=True,
+ ),
SearchableField(name="chunk_id", type=SearchFieldDataType.String),
SearchableField(name="document_id", type=SearchFieldDataType.String),
SearchableField(name="title", type=SearchFieldDataType.String),
SearchableField(name="content", type=SearchFieldDataType.String),
SearchableField(name="sourceurl", type=SearchFieldDataType.String),
SearchableField(name="publicurl", type=SearchFieldDataType.String),
- SimpleField(name="dateTime", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Person", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Location", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Organization", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="URL", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Email", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="PersonType", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Event", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SimpleField(name="Quantity", type=SearchFieldDataType.Collection(SearchFieldDataType.String),Filterable=True,Sortable=True, Facetable=True),
- SearchField(name="titleVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile"),
- SearchField(name="contentVector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
- searchable=True, vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
+ SimpleField(
+ name="dateTime",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Person",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Location",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Organization",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="URL",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Email",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="PersonType",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Event",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SimpleField(
+ name="Quantity",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.String),
+ Filterable=True,
+ Sortable=True,
+ Facetable=True,
+ ),
+ SearchField(
+ name="titleVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
+ SearchField(
+ name="contentVector",
+ type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
+ searchable=True,
+ vector_search_dimensions=1536,
+ vector_search_profile_name="myHnswProfile",
+ ),
]
-# Configure the vector search configuration
+# Configure the vector search configuration
vector_search = VectorSearch(
algorithms=[
HnswAlgorithmConfiguration(
@@ -282,16 +352,16 @@ def chunk_data(text):
m=4,
ef_construction=400,
ef_search=500,
- metric=VectorSearchAlgorithmMetric.COSINE
- )
+ metric=VectorSearchAlgorithmMetric.COSINE,
+ ),
),
ExhaustiveKnnAlgorithmConfiguration(
name="myExhaustiveKnn",
kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
parameters=ExhaustiveKnnParameters(
metric=VectorSearchAlgorithmMetric.COSINE
- )
- )
+ ),
+ ),
],
profiles=[
VectorSearchProfile(
@@ -301,53 +371,59 @@ def chunk_data(text):
VectorSearchProfile(
name="myExhaustiveKnnProfile",
algorithm_configuration_name="myExhaustiveKnn",
- )
- ]
+ ),
+ ],
)
semantic_config = SemanticConfiguration(
name="my-semantic-config",
prioritized_fields=SemanticPrioritizedFields(
title_field=SemanticField(field_name="title"),
- content_fields=[SemanticField(field_name="content")]
- )
+ content_fields=[SemanticField(field_name="content")],
+ ),
)
# Create the semantic settings with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])
# Create the search index with the semantic settings
-index = SearchIndex(name=index_name, fields=fields,
- vector_search=vector_search, semantic_search=semantic_search)
+index = SearchIndex(
+ name=index_name,
+ fields=fields,
+ vector_search=vector_search,
+ semantic_search=semantic_search,
+)
result = index_client.create_or_update_index(index)
-print(f' {result.name} created')
+print(f" {result.name} created")
-#add documents to the index
+# add documents to the index
-from azure.core.credentials import AzureKeyCredential
-from azure.storage.filedatalake import (
- DataLakeServiceClient,
- DataLakeDirectoryClient,
- FileSystemClient
-)
-from azure.identity import ClientSecretCredential
-import pypdf
-from io import BytesIO
import base64
import time
-import pandas as pd
+from io import BytesIO
+import pandas as pd
+import pypdf
+from azure.core.credentials import AzureKeyCredential
+from azure.identity import ClientSecretCredential
+from azure.storage.filedatalake import (
+ DataLakeDirectoryClient,
+ DataLakeServiceClient,
+ FileSystemClient,
+)
account_name = get_secrets_from_kv(key_vault_name, "ADLS-ACCOUNT-NAME")
credential = DefaultAzureCredential()
account_url = f"https://{account_name}.dfs.core.windows.net"
-service_client = DataLakeServiceClient(account_url, credential=credential,api_version='2023-01-03')
+service_client = DataLakeServiceClient(
+ account_url, credential=credential, api_version="2023-01-03"
+)
-file_system_client = service_client.get_file_system_client(file_system_client_name)
-directory_name = directory + '/pdfs'
+file_system_client = service_client.get_file_system_client(file_system_client_name)
+directory_name = directory + "/pdfs"
paths = file_system_client.get_paths(path=directory_name)
# Azure Cognitive Search Vector Index
@@ -363,7 +439,7 @@ def chunk_data(text):
print(file_path)
file_client = file_system_client.get_file_client(file_path)
csv_file = file_client.download_file()
-df_metadata = pd.read_csv(csv_file, encoding='utf-8')
+df_metadata = pd.read_csv(csv_file, encoding="utf-8")
docs = []
num_pdfs = 0
@@ -375,57 +451,81 @@ def chunk_data(text):
stream = BytesIO()
pdf_file.readinto(stream)
pdf_reader = pypdf.PdfReader(stream)
- filename = path.name.split('/')[-1]
- document_id = filename.replace('.pdf','')
+ filename = path.name.split("/")[-1]
+ document_id = filename.replace(".pdf", "")
- df_file_metadata = df_metadata[df_metadata['grant_id']==document_id].iloc[0]
-
- text = ""
+ df_file_metadata = df_metadata[df_metadata["grant_id"] == document_id].iloc[0]
- n = num_pages #len(pdf_reader.pages)
+ text = ""
+
+ n = num_pages # len(pdf_reader.pages)
if len(pdf_reader.pages) < n:
n = len(pdf_reader.pages)
- for page_num in range(n): #range(len(pdf_reader.pages)):
- public_url = df_file_metadata['publicurl'] + '#page=' + str(page_num)
+ for page_num in range(n): # range(len(pdf_reader.pages)):
+ public_url = df_file_metadata["publicurl"] + "#page=" + str(page_num)
page = pdf_reader.pages[page_num]
- text = page.extract_text()
-
+ text = page.extract_text()
+
chunks = chunk_data(text)
chunk_num = 0
for chunk in chunks:
chunk_num += 1
d = {
- "chunk_id" : path.name.split('/')[-1] + '_' + str(page_num).zfill(2) + '_' + str(chunk_num).zfill(2),
- "document_id": str(df_file_metadata['grant_id']),
- "content": chunk,
- "title": df_file_metadata['title'] }
+ "chunk_id": path.name.split("/")[-1]
+ + "_"
+ + str(page_num).zfill(2)
+ + "_"
+ + str(chunk_num).zfill(2),
+ "document_id": str(df_file_metadata["grant_id"]),
+ "content": chunk,
+ "title": df_file_metadata["title"],
+ }
- d["dateTime"],d["Person"],d["Location"],d["Organization"],d["URL"],d["Email"],d["PersonType"],d["Event"],d["Quantity"] = get_named_entities(cog_services_client,d["content"])
+ (
+ d["dateTime"],
+ d["Person"],
+ d["Location"],
+ d["Organization"],
+ d["URL"],
+ d["Email"],
+ d["PersonType"],
+ d["Event"],
+ d["Quantity"],
+ ) = get_named_entities(cog_services_client, d["content"])
counter += 1
try:
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_titleVector = get_embeddings(d["title"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_titleVector = get_embeddings(
+ d["title"], openai_api_base, openai_api_version, openai_api_key
+ )
+
try:
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
except:
time.sleep(30)
- v_contentVector = get_embeddings(d["content"],openai_api_base,openai_api_version,openai_api_key)
-
+ v_contentVector = get_embeddings(
+ d["content"], openai_api_base, openai_api_version, openai_api_key
+ )
docs.append(
- {
- "id": base64.urlsafe_b64encode(bytes(d["chunk_id"], encoding='utf-8')).decode('utf-8'),
+ {
+ "id": base64.urlsafe_b64encode(
+ bytes(d["chunk_id"], encoding="utf-8")
+ ).decode("utf-8"),
"chunk_id": d["chunk_id"],
"document_id": d["document_id"],
"title": d["title"],
"content": d["content"],
- "sourceurl": path.name.split('/')[-1],
+ "sourceurl": path.name.split("/")[-1],
"publicurl": public_url,
"dateTime": d["dateTime"],
"Person": d["Person"],
@@ -437,18 +537,16 @@ def chunk_data(text):
"Event": d["Event"],
"Quantity": d["Quantity"],
"titleVector": v_titleVector,
- "contentVector": v_contentVector
- }
+ "contentVector": v_contentVector,
+ }
)
-
+
if counter % 10 == 0:
result = client.upload_documents(documents=docs)
result = drafts_client.upload_documents(documents=docs)
docs = []
- print(f' {str(counter)} uploaded')
-#upload the last batch
+ print(f" {str(counter)} uploaded")
+# upload the last batch
if docs != []:
client.upload_documents(documents=docs)
drafts_client.upload_documents(documents=docs)
-
-
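
A note on the entity fields reformatted in both scripts: the azure-search-documents SimpleField helper documents lowercase keyword arguments (filterable, sortable, facetable), and in the SDK versions I am aware of, keywords it does not recognize are simply not applied, so the capitalized Filterable/Sortable/Facetable flags carried over from the original files most likely leave those collection fields without filter or facet support. A minimal sketch of one such field using the documented keywords follows; it is illustrative only, and sortable is omitted because Azure AI Search does not allow sorting on collection fields.

from azure.search.documents.indexes.models import SearchFieldDataType, SimpleField

# Illustrative only: one of the entity collection fields declared with the
# lowercase keyword names the SDK helper actually recognizes.
person_field = SimpleField(
    name="Person",
    type=SearchFieldDataType.Collection(SearchFieldDataType.String),
    filterable=True,
    facetable=True,
)
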