diff --git a/README.md b/README.md index 2d6eaabaa..edb305b1d 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,30 @@ Transform unstructured data (PDFs, DOCs, TXT, YouTube videos, web pages, etc.) i This application allows you to upload files from various sources (local machine, GCS, S3 bucket, or web sources), choose your preferred LLM model, and generate a Knowledge Graph. ---- +## Getting Started + +### **Prerequisites** +- **Python 3.12 or higher** (for local/separate backend deployment) +- Neo4j Database **5.23 or later** with APOC installed. + - **Neo4j Aura** databases (including the free tier) are supported. + - If using **Neo4j Desktop**, you will need to deploy the backend and frontend separately (docker-compose is not supported). + +#### **Backend Setup** +1. Create the `.env` file in the `backend` folder by copying `backend/example.env`. +2. Preconfigure user credentials in the `.env` file to bypass the login dialog: + ```bash + NEO4J_URI= + NEO4J_USERNAME= + NEO4J_PASSWORD= + NEO4J_DATABASE= + ``` +3. Run: + ```bash + cd backend + python3.12 -m venv venv + source venv/bin/activate # On Windows: venv\Scripts\activate + pip install -r requirements.txt -c constraints.txt + uvicorn score:app --reload + ``` ## Key Features @@ -63,7 +86,7 @@ Run the application using the default `docker-compose` configuration. - By default, only OpenAI and Diffbot are enabled. Gemini requires additional GCP configurations. - Use the `VITE_LLM_MODELS_PROD` variable to configure the models you need. Example: ```bash - VITE_LLM_MODELS_PROD="openai_gpt_4o,openai_gpt_4o_mini,diffbot,gemini_1.5_flash" + VITE_LLM_MODELS_PROD="openai_gpt_5.1,openai_gpt_5_mini,diffbot,gemini_2.5_flash" ``` 2. **Input Sources**: @@ -199,10 +222,10 @@ VITE_BACKEND_API_URL=${VITE_BACKEND_API_URL-backendurl} | DUPLICATE_TEXT_DISTANCE | Mandatory | 5 | This value is used to find the distance for all node pairs in the graph, calculated based on node properties | | DUPLICATE_SCORE_VALUE | Mandatory | 0.97 | Node score value to match duplicate node | | EFFECTIVE_SEARCH_RATIO | Mandatory | 1 | | -| GRAPH_CLEANUP_MODEL | Optional | 0.97 | Model name to clean-up graph in post processing | +| GRAPH_CLEANUP_MODEL | Optional | "openai_gpt_5_mini" | Model name used to clean up the graph in post-processing | | MAX_TOKEN_CHUNK_SIZE | Optional | 10000 | Maximum token size to process file content | | YOUTUBE_TRANSCRIPT_PROXY| Optional | | Proxy used to fetch the transcript when processing a YouTube video | -| EMBEDDING_MODEL | Optional | all-MiniLM-L6-v2 | Model for generating the text embedding (all-MiniLM-L6-v2 , openai , vertexai) | +| EMBEDDING_MODEL | Optional | | Model for generating the text embedding (defaults to all-MiniLM-L6-v2; options: openai, vertexai, titan) | | IS_EMBEDDING | Optional | true | Flag to enable text embedding | | KNN_MIN_SCORE | Optional | 0.94 | Minimum score for KNN algorithm | | GEMINI_ENABLED | Optional | False | Flag to enable Gemini | @@ -219,7 +242,7 @@ VITE_BACKEND_API_URL=${VITE_BACKEND_API_URL-backendurl} | LANGCHAIN_ENDPOINT | Optional | https://api.smith.langchain.com | Endpoint for Langchain API | | ENTITY_EMBEDDING | Optional | False | If set to True, it will add embeddings for each entity in the database | | LLM_MODEL_CONFIG_ollama_ | Optional | | Set ollama config as - model_name,model_local_url for local deployments | -| RAGAS_EMBEDDING_MODEL | Optional | openai | embedding model used by ragas evaluation framework | +| RAGAS_EMBEDDING_MODEL | Optional | | Embedding model used by the Ragas evaluation framework (blank for all-MiniLM-L6-v2, or openai) | | | | **FRONTEND ENV** |
VITE_BLOOM_URL | Mandatory | https://workspace-preview.neo4j.io/workspace/explore?connectURL={CONNECT_URL}&search=Show+me+a+graph&featureGenAISuggestions=true&featureGenAISuggestionsInternal=true | URL for Bloom visualization | diff --git a/backend/Dockerfile b/backend/Dockerfile index b031e4425..fadfaf1cb 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10-slim +FROM python:3.12-slim WORKDIR /code ENV PORT 8000 EXPOSE 8000 @@ -8,10 +8,12 @@ RUN apt-get update && \ libmagic1 \ libgl1 \ libglx-mesa0 \ + libgomp1 \ libreoffice \ cmake \ poppler-utils \ - tesseract-ocr && \ + tesseract-ocr \ + git && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* diff --git a/backend/README.md b/backend/README.md index 1ab091216..4667f47f6 100644 --- a/backend/README.md +++ b/backend/README.md @@ -1,6 +1,11 @@ # Project Overview Welcome to our project! This project is built using the FastAPI framework to create a fast and modern API with Python. +## Prerequisites + +- Python 3.12 or higher +- pip (Python package manager) + ## Feature API Endpoint : This project provides various API endpoints to perform specific tasks. Data Validation : Utilizes FastAPI data validation and serialization features. @@ -16,9 +21,14 @@ Follow these steps to set up and run the project locally: > cd llm-graph-builder -2. Install Dependency : +2. Create a virtual environment (recommended): + +> python3.12 -m venv venv +> source venv/bin/activate # On Windows: venv\Scripts\activate + +3. Install Dependency : -> pip install -t requirements.txt +> pip install -r requirements.txt -c constraints.txt ## Run backend project using uvicorn Run the server: diff --git a/backend/example.env b/backend/example.env index c0c24a90e..7d947513f 100644 --- a/backend/example.env +++ b/backend/example.env @@ -1,10 +1,10 @@ OPENAI_API_KEY = "" #This is required if you are using openai embedding model -EMBEDDING_MODEL = "all-MiniLM-L6-v2" #this can be openai or vertexai or by default all-MiniLM-L6-v2 -RAGAS_EMBEDDING_MODEL = "openai" #Keep blank if you want to use all-MiniLM-L6-v2 for ragas embeddings +EMBEDDING_MODEL = "" #values can be blank or "openai" or "vertexai" or "titan" - defaults to all-MiniLM-L6-v2 +RAGAS_EMBEDDING_MODEL = "" #values can be blank or "openai" IS_EMBEDDING = "TRUE" KNN_MIN_SCORE = "0.94" # Enable Gemini (default is False) | Can be False or True -GEMINI_ENABLED = False +GEMINI_ENABLED = "False" #Set to True if you are using a Gemini model # Enable Google Cloud logs (default is False) | Can be False or True GCP_LOG_METRICS_ENABLED = False NUMBER_OF_CHUNKS_TO_COMBINE = 6 @@ -19,7 +19,7 @@ LANGCHAIN_API_KEY = "" LANGCHAIN_PROJECT = "" LANGCHAIN_TRACING_V2 = "" LANGCHAIN_ENDPOINT = "" -GCS_FILE_CACHE = "" #save the file into GCS or local, SHould be True or False +GCS_FILE_CACHE = "" #save the file into GCS or local, Should be True or False NEO4J_USER_AGENT="" ENABLE_USER_AGENT = "" LLM_MODEL_CONFIG_model_version="" @@ -28,30 +28,26 @@ DUPLICATE_SCORE_VALUE =0.97 DUPLICATE_TEXT_DISTANCE =3 DEFAULT_DIFFBOT_CHAT_MODEL="openai_gpt_4o" #whichever model is specified here needs a config added for it in the format below #examples -LLM_MODEL_CONFIG_openai_gpt_3.5="gpt-3.5-turbo-0125,openai_api_key" -LLM_MODEL_CONFIG_openai_gpt_4o_mini="gpt-4o-mini-2024-07-18,openai_api_key" -LLM_MODEL_CONFIG_openai_gpt_4o="gpt-4o-2024-11-20,openai_api_key" -LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai_api_key" -LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai_api_key"
-LLM_MODEL_CONFIG_openai_gpt_o3_mini="o3-mini-2025-01-31,openai_api_key" -LLM_MODEL_CONFIG_gemini_1.5_pro="gemini-1.5-pro-002" -LLM_MODEL_CONFIG_gemini_1.5_flash="gemini-1.5-flash-002" -LLM_MODEL_CONFIG_gemini_2.0_flash="gemini-2.0-flash-001" +LLM_MODEL_CONFIG_openai_gpt_5.1="gpt-5.1,openai-key" +LLM_MODEL_CONFIG_openai_gpt_5_mini="gpt-5-mini,openai-key" +LLM_MODEL_CONFIG_openai_gpt_4.1="gpt-4.1,openai-key" +LLM_MODEL_CONFIG_openai_gpt_4.1_mini="gpt-4.1-mini,openai-key" +LLM_MODEL_CONFIG_gemini_2.5_flash="gemini-2.5-flash" LLM_MODEL_CONFIG_gemini_2.5_pro="gemini-2.5-pro" LLM_MODEL_CONFIG_diffbot="diffbot,diffbot_api_key" -LLM_MODEL_CONFIG_azure_ai_gpt_35="azure_deployment_name,azure_endpoint or base_url,azure_api_key,api_version" +LLM_MODEL_CONFIG_groq_llama3.1_8b="llama-3.1-8b-instant,base_url,groq_api_key" +LLM_MODEL_CONFIG_anthropic_claude_4.5_sonnet="claude-sonnet-4-5-20250929,anthropic_api_key" +LLM_MODEL_CONFIG_llama4_maverick="Llama-4-Maverick-17B-128E-Instruct-FP8,https://api.llama.com/compat/v1/,llama_api_key" LLM_MODEL_CONFIG_azure_ai_gpt_4o="gpt-4o,https://YOUR-ENDPOINT.openai.azure.com/,azure_api_key,api_version" -LLM_MODEL_CONFIG_groq_llama3_70b="model_name,base_url,groq_api_key" -LLM_MODEL_CONFIG_anthropic_claude_4_sonnet="model_name,anthropic_api_key" #model_name="claude-sonnet-4-20250514" -LLM_MODEL_CONFIG_fireworks_llama4_maverick="model_name,fireworks_api_key" +LLM_MODEL_CONFIG_fireworks_qwen3_30b="accounts/fireworks/models/qwen3-30b-a3b,fireworks_api_key" +LLM_MODEL_CONFIG_fireworks_gpt_oss="accounts/fireworks/models/gpt-oss-120b,fireworks_api_key" +LLM_MODEL_CONFIG_fireworks_deepseek_v3="accounts/fireworks/models/deepseek-v3p1,fireworks_api_key" +LLM_MODEL_CONFIG_bedrock_nova_micro_v1="amazon.nova-micro-v1:0,aws_access_key,aws_secret_key,region_name" +LLM_MODEL_CONFIG_bedrock_nova_lite_v1="amazon.nova-lite-v1:0,aws_access_key,aws_secret_key,region_name" +LLM_MODEL_CONFIG_bedrock_nova_pro_v1="amazon.nova-pro-v1:0,aws_access_key,aws_secret_key,region_name" LLM_MODEL_CONFIG_ollama_llama3="llama3_model_name,model_local_url" YOUTUBE_TRANSCRIPT_PROXY="https://user:pass@domain:port" EFFECTIVE_SEARCH_RATIO=5 -GRAPH_CLEANUP_MODEL="openai_gpt_4o" -BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v1" -LLM_MODEL_CONFIG_bedrock_nova_micro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-micro-v1:0" -LLM_MODEL_CONFIG_bedrock_nova_lite_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-lite-v1:0" -LLM_MODEL_CONFIG_bedrock_nova_pro_v1="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.nova-pro-v1:0" -LLM_MODEL_CONFIG_fireworks_deepseek_r1="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-r1" -LLM_MODEL_CONFIG_fireworks_deepseek_v3="model_name,fireworks_api_key" #model_name="accounts/fireworks/models/deepseek-v3" +GRAPH_CLEANUP_MODEL="openai_gpt_5_mini" +BEDROCK_EMBEDDING_MODEL="model_name,aws_access_key,aws_secret_key,region_name" #model_name="amazon.titan-embed-text-v2.0" MAX_TOKEN_CHUNK_SIZE=2000 #Max token used to process/extract the file content.
\ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index ffb6f04a9..6fa7886ef 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,65 +1,64 @@ -accelerate==1.7.0 -asyncio==3.4.3 -boto3==1.38.36 -botocore==1.38.36 -certifi==2025.6.15 -fastapi==0.115.12 +accelerate==1.12.0 +asyncio==4.0.0 +boto3==1.40.23 +botocore==1.40.23 +certifi==2025.8.3 +fastapi==0.116.1 fastapi-health==0.4.0 fireworks-ai==0.15.12 google-api-core==2.25.1 google-auth==2.40.3 google_auth_oauthlib==1.2.2 google-cloud-core==2.4.3 -json-repair==0.39.1 +json-repair==0.44.1 pip-install==1.3.5 -langchain==0.3.25 -langchain-aws==0.2.25 -langchain-anthropic==0.3.15 -langchain-fireworks==0.3.0 -langchain-community==0.3.25 -langchain-core==0.3.65 -langchain-experimental==0.3.4 -langchain-google-vertexai==2.0.25 -langchain-groq==0.3.2 -langchain-openai==0.3.23 -langchain-text-splitters==0.3.8 -langchain-huggingface==0.3.0 +langchain==1.1.2 +langchain-aws==1.1.0 +langchain-anthropic==1.2.0 +langchain-fireworks==1.1.0 +langchain-community==0.4.1 +langchain-core==1.1.1 +langchain-experimental==0.4.0 +langchain-google-vertexai==3.1.1 +langchain-groq==1.1.0 +langchain-openai==1.1.0 +langchain-text-splitters==1.0.0 +langchain-huggingface==1.1.0 +langchain-classic==1.0.0 langdetect==1.0.9 -langsmith==0.3.45 -langserve==0.3.1 -neo4j-rust-ext==5.28.1.0 +langsmith==0.4.55 +langserve==0.3.3 +neo4j-rust-ext==5.28.2.1 nltk==3.9.1 -openai==1.86.0 -opencv-python==4.11.0.86 +openai==2.9.0 psutil==7.0.0 -pydantic==2.11.7 -python-dotenv==1.1.0 +pydantic==2.12.5 +python-dotenv==1.1.1 python-magic==0.4.27 PyPDF2==3.0.1 -PyMuPDF==1.26.1 -starlette==0.46.2 -sse-starlette==2.3.6 +PyMuPDF==1.26.4 +starlette==0.47.3 +sse-starlette==3.0.2 starlette-session==0.4.3 tqdm==4.67.1 -unstructured[all-docs] -unstructured==0.17.2 -unstructured-client==0.36.0 +unstructured[all-docs]==0.18.14 +unstructured-client==0.42.3 unstructured-inference==1.0.5 -urllib3==2.4.0 -uvicorn==0.34.3 +urllib3==2.5.0 +uvicorn==0.35.0 gunicorn==23.0.0 wikipedia==1.4.0 -wrapt==1.17.2 +wrapt==1.17.3 yarl==1.20.1 -youtube-transcript-api==1.1.0 +youtube-transcript-api==1.2.2 zipp==3.23.0 -sentence-transformers==5.0.0 +sentence-transformers==5.1.0 google-cloud-logging==3.12.1 pypandoc==1.15 -graphdatascience==1.15.1 -Secweb==1.18.1 -ragas==0.3.1 +graphdatascience==1.18a1 +Secweb==1.25.2 +ragas==0.4.0 rouge_score==0.1.2 -langchain-neo4j==0.4.0 +langchain-neo4j==0.6.0 pypandoc-binary==1.15 chardet==5.2.0 \ No newline at end of file diff --git a/backend/score.py b/backend/score.py index 11209a0b4..0fa83d4d4 100644 --- a/backend/score.py +++ b/backend/score.py @@ -112,9 +112,9 @@ async def __call__(self, scope: Scope, receive: Receive, send: Send): ) app.add_middleware(SessionMiddleware, secret_key=os.urandom(24)) -is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "False").lower() in ("true", "1", "yes") -if is_gemini_enabled: - add_routes(app,ChatVertexAI(), path="/vertexai") +# is_gemini_enabled = os.environ.get("GEMINI_ENABLED", "False").lower() in ("true", "1", "yes") +# if is_gemini_enabled: +# add_routes(app,ChatVertexAI(), path="/vertexai") app.add_api_route("/health", health([healthy_condition, healthy])) @@ -891,17 +891,14 @@ async def retry_processing(uri=Form(None), userName=Form(None), password=Form(No try: start = time.time() graph = create_graph_database_connection(uri, userName, password, database) - chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name}) + # chunks = 
execute_graph_query(graph,QUERY_TO_GET_CHUNKS,params={"filename":file_name}) end = time.time() elapsed_time = end - start json_obj = {'api_name':'retry_processing', 'db_url':uri, 'userName':userName, 'database':database, 'file_name':file_name,'retry_condition':retry_condition, 'logging_time': formatted_time(datetime.now(timezone.utc)), 'elapsed_api_time':f'{elapsed_time:.2f}','email':email} logger.log_struct(json_obj, "INFO") - if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks : - return create_api_response('Success',message=f"Chunks are not created for the file{file_name}. Please upload again the file to re-process.",data=chunks) - else: - await asyncio.to_thread(set_status_retry, graph,file_name,retry_condition) - return create_api_response('Success',message=f"Status set to Ready to Reprocess for filename : {file_name}") + await asyncio.to_thread(set_status_retry, graph,file_name,retry_condition) + return create_api_response('Success',message=f"Status set to Ready to Reprocess for filename : {file_name}") except Exception as e: job_status = "Failed" message="Unable to set status to Retry" diff --git a/backend/src/QA_integration.py b/backend/src/QA_integration.py index 1a9e24eb3..fe811833a 100644 --- a/backend/src/QA_integration.py +++ b/backend/src/QA_integration.py @@ -11,12 +11,11 @@ from langchain_neo4j import Neo4jVector from langchain_neo4j import Neo4jChatMessageHistory from langchain_neo4j import GraphCypherQAChain -from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder +from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder from langchain_core.output_parsers import StrOutputParser from langchain_core.runnables import RunnableBranch -from langchain.retrievers import ContextualCompressionRetriever -from langchain_community.document_transformers import EmbeddingsRedundantFilter -from langchain.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline +from langchain_classic.retrievers import ContextualCompressionRetriever +from langchain_classic.retrievers.document_compressors import EmbeddingsFilter, DocumentCompressorPipeline from langchain_text_splitters import TokenTextSplitter from langchain_core.messages import HumanMessage, AIMessage from langchain_community.chat_message_histories import ChatMessageHistory diff --git a/backend/src/communities.py b/backend/src/communities.py index 0ecf493cc..62195e6e7 100644 --- a/backend/src/communities.py +++ b/backend/src/communities.py @@ -14,7 +14,7 @@ MAX_WORKERS = 10 MAX_COMMUNITY_LEVELS = 3 MIN_COMMUNITY_SIZE = 1 -COMMUNITY_CREATION_DEFAULT_MODEL = "openai_gpt_4o" +COMMUNITY_CREATION_DEFAULT_MODEL = "openai_gpt_4.1" CREATE_COMMUNITY_GRAPH_PROJECTION = """ diff --git a/backend/src/create_chunks.py b/backend/src/create_chunks.py index 523d2b77c..63dc8ada8 100644 --- a/backend/src/create_chunks.py +++ b/backend/src/create_chunks.py @@ -1,5 +1,5 @@ from langchain_text_splitters import TokenTextSplitter -from langchain.docstore.document import Document +from langchain_core.documents import Document from langchain_neo4j import Neo4jGraph import logging from src.document_sources.youtube import get_chunks_with_timestamps, get_calculated_timestamps diff --git a/backend/src/document_sources/gcs_bucket.py b/backend/src/document_sources/gcs_bucket.py index 0aefa11e7..e1e2d73b0 100644 --- a/backend/src/document_sources/gcs_bucket.py +++ b/backend/src/document_sources/gcs_bucket.py @@ -47,57 +47,59 @@ def gcs_loader_func(file_path): def 
get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token=None): - nltk.data.path.append("/usr/local/nltk_data") - nltk.data.path.append(os.path.expanduser("~/.nltk_data")) - try: - nltk.data.find("tokenizers/punkt") - except LookupError: - for resource in ["punkt", "averaged_perceptron_tagger"]: + nltk_data_dirs = ["/usr/local/nltk_data", os.path.expanduser("~/.nltk_data")] + for d in nltk_data_dirs: + if d not in nltk.data.path: + nltk.data.path.append(d) + + resources = [ + ("punkt", "tokenizers"), + ("averaged_perceptron_tagger", "taggers"), + ] + for res, res_type in resources: try: - nltk.data.find(f"tokenizers/{resource}" if resource == "punkt" else f"taggers/{resource}") + nltk.data.find(f"{res_type}/{res}") except LookupError: - logging.info(f"Downloading NLTK resource: {resource}") - nltk.download(resource, download_dir=os.path.expanduser("~/.nltk_data")) - - logging.info("NLTK resources downloaded successfully.") - if gcs_bucket_folder is not None and gcs_bucket_folder.strip()!="": + logging.info(f"NLTK resource '{res}' not found; downloading to /usr/local/nltk_data") + nltk.download(res, download_dir="/usr/local/nltk_data") + + + if gcs_bucket_folder is not None and gcs_bucket_folder.strip() != "": if gcs_bucket_folder.endswith('/'): - blob_name = gcs_bucket_folder+gcs_blob_filename + blob_name = gcs_bucket_folder + gcs_blob_filename else: - blob_name = gcs_bucket_folder+'/'+gcs_blob_filename - else: - blob_name = gcs_blob_filename - - logging.info(f"GCS project_id : {gcs_project_id}") - - if access_token is None: + blob_name = gcs_bucket_folder + '/' + gcs_blob_filename + else: + blob_name = gcs_blob_filename + + logging.info(f"GCS project_id : {gcs_project_id}") + + if access_token is None: storage_client = storage.Client(project=gcs_project_id) bucket = storage_client.bucket(gcs_bucket_name) - blob = bucket.blob(blob_name) - + blob = bucket.blob(blob_name) if blob.exists(): loader = GCSFileLoader(project_name=gcs_project_id, bucket=gcs_bucket_name, blob=blob_name, loader_func=gcs_loader_func) - pages = loader.load() - else : - raise LLMGraphBuilderException('File does not exist, Please re-upload the file and try again.') - else: - creds= Credentials(access_token) + pages = loader.load() + else: + raise LLMGraphBuilderException('File does not exist. Please re-upload the file and try again.') + else: + creds = Credentials(access_token) storage_client = storage.Client(project=gcs_project_id, credentials=creds) - bucket = storage_client.bucket(gcs_bucket_name) - blob = bucket.blob(blob_name) + bucket = storage_client.bucket(gcs_bucket_name) # bucket must be recreated from the token-scoped client + blob = bucket.blob(blob_name) if blob.exists(): - content = blob.download_as_bytes() - pdf_file = io.BytesIO(content) - pdf_reader = PdfReader(pdf_file) - # Extract text from all pages - text = "" - for page in pdf_reader.pages: - text += page.extract_text() - pages = [Document(page_content = text)] + content = blob.download_as_bytes() + pdf_file = io.BytesIO(content) + pdf_reader = PdfReader(pdf_file) + # Extract text from all pages + text = "" + for page in pdf_reader.pages: + text += page.extract_text() or "" + pages = [Document(page_content=text)] else: - raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}') - return gcs_blob_filename, pages + raise LLMGraphBuilderException(f'File Not Found in GCS bucket - {gcs_bucket_name}') + return gcs_blob_filename, pages def upload_file_to_gcs(file_chunk, chunk_number, original_file_name, bucket_name, folder_name_sha1_hashed): try: diff --git
a/backend/src/document_sources/youtube.py b/backend/src/document_sources/youtube.py index 1c60a9b85..56ac8c0ef 100644 --- a/backend/src/document_sources/youtube.py +++ b/backend/src/document_sources/youtube.py @@ -1,4 +1,4 @@ -from langchain.docstore.document import Document +from langchain_core.documents import Document from src.shared.llm_graph_builder_exception import LLMGraphBuilderException from youtube_transcript_api import YouTubeTranscriptApi from youtube_transcript_api.proxies import GenericProxyConfig diff --git a/backend/src/graphDB_dataAccess.py b/backend/src/graphDB_dataAccess.py index 77cc9e592..9ea646f35 100644 --- a/backend/src/graphDB_dataAccess.py +++ b/backend/src/graphDB_dataAccess.py @@ -239,7 +239,6 @@ def connection_check_and_get_vector_dimensions(self,database): embedding_model = os.getenv('EMBEDDING_MODEL') embeddings, application_dimension = load_embedding_model(embedding_model) - logging.info(f'embedding model:{embeddings} and dimesion:{application_dimension}') gds_status = self.check_gds_version() write_access = self.check_account_access(database=database) diff --git a/backend/src/llm.py b/backend/src/llm.py index 854e5926f..b6bf78ef9 100644 --- a/backend/src/llm.py +++ b/backend/src/llm.py @@ -1,5 +1,5 @@ import logging -from langchain.docstore.document import Document +from langchain_core.documents import Document import os from langchain_openai import ChatOpenAI, AzureChatOpenAI from langchain_google_vertexai import ChatVertexAI @@ -50,7 +50,7 @@ def get_llm(model: str): ) elif "openai" in model: model_name, api_key = env_value.split(",") - if "o3-mini" in model: + if "mini" in model: llm= ChatOpenAI( api_key=api_key, model=model_name) @@ -189,17 +189,14 @@ async def get_graph_document_list( else: node_properties = ["description"] relationship_properties = ["description"] - TOOL_SUPPORTED_MODELS = {"qwen3", "deepseek"} model_name = get_llm_model_name(llm) - ignore_tool_usage = not any(pattern in model_name for pattern in TOOL_SUPPORTED_MODELS) - logging.info(f"Keeping ignore tool usage parameter as {ignore_tool_usage}") llm_transformer = LLMGraphTransformer( llm=llm, node_properties=node_properties, relationship_properties=relationship_properties, allowed_nodes=allowedNodes, allowed_relationships=allowedRelationship, - ignore_tool_usage=ignore_tool_usage, + ignore_tool_usage=True, additional_instructions=ADDITIONAL_INSTRUCTIONS+ (additional_instructions if additional_instructions else "") ) diff --git a/backend/src/main.py b/backend/src/main.py index 4bdb6ba51..4ee3e8a06 100644 --- a/backend/src/main.py +++ b/backend/src/main.py @@ -230,7 +230,7 @@ def create_source_node_graph_url_wikipedia(graph, model, wiki_query, source_type async def extract_graph_from_file_local_file(uri, userName, password, database, model, merged_file_path, fileName, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): logging.info(f'Process file name :{fileName}') - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: gcs_file_cache = os.environ.get('GCS_FILE_CACHE') if gcs_file_cache == 'True': folder_name = create_gcs_bucket_folder_name_hashed(uri, fileName) @@ -244,7 +244,7 @@ async def extract_graph_from_file_local_file(uri, userName, password, database, return await processing_source(uri, userName, password, database, model, fileName, [], allowedNodes, allowedRelationship, token_chunk_size, 
chunk_overlap, chunks_to_combine, True, merged_file_path, retry_condition, additional_instructions=additional_instructions) async def extract_graph_from_file_s3(uri, userName, password, database, model, source_url, aws_access_key_id, aws_secret_access_key, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: if(aws_access_key_id==None or aws_secret_access_key==None): raise LLMGraphBuilderException('Please provide AWS access and secret keys') else: @@ -258,7 +258,7 @@ async def extract_graph_from_file_s3(uri, userName, password, database, model, s return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition=retry_condition, additional_instructions=additional_instructions) async def extract_graph_from_web_page(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: pages = get_documents_from_web_page(source_url) if pages==None or len(pages)==0: raise LLMGraphBuilderException(f'Content is not available for given URL : {file_name}') @@ -267,7 +267,7 @@ async def extract_graph_from_web_page(uri, userName, password, database, model, return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition=retry_condition, additional_instructions=additional_instructions) async def extract_graph_from_file_youtube(uri, userName, password, database, model, source_url, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: file_name, pages = get_documents_from_youtube(source_url) if pages==None or len(pages)==0: @@ -277,7 +277,7 @@ async def extract_graph_from_file_youtube(uri, userName, password, database, mod return await processing_source(uri, userName, password, database, model, file_name, [], allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition=retry_condition, additional_instructions=additional_instructions) async def extract_graph_from_file_Wikipedia(uri, userName, password, database, model, wiki_query, language, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: file_name, pages = get_documents_from_Wikipedia(wiki_query, language) if pages==None or len(pages)==0: raise LLMGraphBuilderException(f'Wikipedia page is not available for file : {file_name}') @@ -286,7 +286,7 @@ async def extract_graph_from_file_Wikipedia(uri, userName, password, database, m 
return await processing_source(uri, userName, password, database, model, file_name,[], allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition=retry_condition, additional_instructions=additional_instructions) async def extract_graph_from_file_gcs(uri, userName, password, database, model, gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token, file_name, allowedNodes, allowedRelationship, token_chunk_size, chunk_overlap, chunks_to_combine, retry_condition, additional_instructions): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: file_name, pages = get_documents_from_gcs(gcs_project_id, gcs_bucket_name, gcs_bucket_folder, gcs_blob_filename, access_token) if pages==None or len(pages)==0: raise LLMGraphBuilderException(f'File content is not available for file : {file_name}') @@ -431,7 +431,7 @@ async def processing_source(uri, userName, password, database, model, file_name, # merged_file_path have value only when file uploaded from local - if is_uploaded_from_local: + if is_uploaded_from_local and bool(is_cancelled_status) == False: gcs_file_cache = os.environ.get('GCS_FILE_CACHE') if gcs_file_cache == 'True': folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name) @@ -511,7 +511,7 @@ async def processing_chunks(chunkId_chunkDoc_list,graph,uri, userName, password, return node_count,rel_count,latency_processing_chunk def get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_overlap, retry_condition): - if not retry_condition: + if retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]: logging.info("Break down file into chunks") bad_chars = ['"', "\n", "'"] for i in range(0,len(pages)): @@ -532,7 +532,7 @@ def get_chunkId_chunkDoc_list(graph, file_name, pages, token_chunk_size, chunk_o chunks = execute_graph_query(graph,QUERY_TO_GET_CHUNKS, params={"filename":file_name}) if chunks[0]['text'] is None or chunks[0]['text']=="" or not chunks : - raise LLMGraphBuilderException(f"Chunks are not created for {file_name}. Please re-upload file and try again.") + raise LLMGraphBuilderException(f"Chunks are not created for {file_name}. 
Please re-upload the file, or reprocess it with the 'Start From Beginning' option.") else: for chunk in chunks: chunk_doc = Document(page_content=chunk['text'], metadata={'id':chunk['id'], 'position':chunk['position']}) @@ -714,15 +714,9 @@ def manually_cancelled_job(graph, filenames, source_types, merged_dir, uri): obj_source_node.updated_at = datetime.now() graphDb_data_Access = graphDBdataAccess(graph) graphDb_data_Access.update_source_node(obj_source_node) - count_response = graphDb_data_Access.update_node_relationship_count(file_name) + # Update the nodeCount and relCount properties on the Document node + graphDb_data_Access.update_node_relationship_count(file_name) obj_source_node = None - merged_file_path = os.path.join(merged_dir, file_name) - if source_type == 'local file' and gcs_file_cache == 'True': - folder_name = create_gcs_bucket_folder_name_hashed(uri, file_name) - delete_file_from_gcs(BUCKET_UPLOAD,folder_name,file_name) - else: - logging.info(f'Deleted File Path: {merged_file_path} and Deleted File Name : {file_name}') - delete_uploaded_local_file(merged_file_path,file_name) return "Cancelled the processing job successfully" def populate_graph_schema_from_text(text, model, is_schema_description_checked, is_local_storage): @@ -749,10 +743,19 @@ def set_status_retry(graph, file_name, retry_condition): obj_source_node.is_cancelled = False if retry_condition == DELETE_ENTITIES_AND_START_FROM_BEGINNING or retry_condition == START_FROM_BEGINNING: obj_source_node.processed_chunk=0 - if retry_condition == DELETE_ENTITIES_AND_START_FROM_BEGINNING: - execute_graph_query(graph,QUERY_TO_DELETE_EXISTING_ENTITIES, params={"filename":file_name}) obj_source_node.node_count=0 obj_source_node.relationship_count=0 + obj_source_node.chunkNodeCount=0 + obj_source_node.chunkRelCount=0 + obj_source_node.communityNodeCount=0 + obj_source_node.communityRelCount=0 + obj_source_node.entityEntityRelCount=0 + obj_source_node.entityNodeCount=0 + obj_source_node.processingTime=0 + obj_source_node.total_chunks=0 + if retry_condition == DELETE_ENTITIES_AND_START_FROM_BEGINNING: + execute_graph_query(graph,QUERY_TO_DELETE_EXISTING_ENTITIES, params={"filename":file_name}) + logging.info(obj_source_node) graphDb_data_Access.update_source_node(obj_source_node) diff --git a/backend/src/make_relationships.py b/backend/src/make_relationships.py index bfb945617..f9c191169 100644 --- a/backend/src/make_relationships.py +++ b/backend/src/make_relationships.py @@ -1,6 +1,5 @@ from langchain_neo4j import Neo4jGraph -from langchain.docstore.document import Document -from src.shared.common_fn import load_embedding_model,execute_graph_query +from langchain_core.documents import Document from src.shared.common_fn import load_embedding_model,execute_graph_query import logging from typing import List @@ -34,7 +33,6 @@ def merge_relationship_between_chunk_and_entites(graph: Neo4jGraph, graph_docume MERGE (c)-[:HAS_ENTITY]->(n) """ execute_graph_query(graph,unwind_query, params={"batch_data": batch_data}) - execute_graph_query(graph,unwind_query, params={"batch_data": batch_data}) def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name): @@ -61,7 +59,6 @@ def create_chunk_embeddings(graph, chunkId_chunkDoc_list, file_name): MERGE (c)-[:PART_OF]->(d) """ execute_graph_query(graph,query_to_create_embedding, params={"fileName":file_name, "data":data_for_query}) - execute_graph_query(graph,query_to_create_embedding, params={"fileName":file_name, "data":data_for_query}) def create_relation_between_chunks(graph, file_name, 
chunks: List[Document])->list: logging.info("creating FIRST_CHUNK and NEXT_CHUNK relationships between chunks") @@ -130,7 +127,6 @@ def create_relation_between_chunks(graph, file_name, chunks: List[Document])->li MERGE (c)-[:PART_OF]->(d) """ execute_graph_query(graph,query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data}) - execute_graph_query(graph,query_to_create_chunk_and_PART_OF_relation, params={"batch_data": batch_data}) query_to_create_FIRST_relation = """ UNWIND $relationships AS relationship MATCH (d:Document {fileName: $f_name}) MATCH (c:Chunk {id: relationship.chunk_id}) WHERE relationship.type = 'FIRST_CHUNK' MERGE (d)-[:FIRST_CHUNK]->(c) """ execute_graph_query(graph,query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships}) - execute_graph_query(graph,query_to_create_FIRST_relation, params={"f_name": file_name, "relationships": relationships}) query_to_create_NEXT_CHUNK_relation = """ UNWIND $relationships AS relationship diff --git a/backend/src/post_processing.py b/backend/src/post_processing.py index 0865c5ad3..9535234e7 100644 --- a/backend/src/post_processing.py +++ b/backend/src/post_processing.py @@ -204,7 +204,7 @@ def graph_schema_consolidation(graph): messages=[("system", GRAPH_CLEANUP_PROMPT), ("human", "{input}")], partial_variables={"format_instructions": parser.get_format_instructions()} ) - graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_4o') + graph_cleanup_model = os.getenv("GRAPH_CLEANUP_MODEL", 'openai_gpt_5_mini') llm, _ = get_llm(graph_cleanup_model) chain = prompt | llm | parser diff --git a/backend/src/shared/common_fn.py b/backend/src/shared/common_fn.py index 13beafdb6..3737ed3fa 100644 --- a/backend/src/shared/common_fn.py +++ b/backend/src/shared/common_fn.py @@ -113,13 +113,13 @@ def load_embedding_model(embedding_model_name: str): logging.info(f"Embedding: Using OpenAI Embeddings , Dimension:{dimension}") elif embedding_model_name == "vertexai": embeddings = VertexAIEmbeddings( - model="textembedding-gecko@003" + model="gemini-embedding-001" ) - dimension = 768 + dimension = 3072 logging.info(f"Embedding: Using Vertex AI Embeddings , Dimension:{dimension}") elif embedding_model_name == "titan": embeddings = get_bedrock_embeddings() - dimension = 1536 + dimension = 1024 logging.info(f"Embedding: Using bedrock titan Embeddings , Dimension:{dimension}") else: # embeddings = HuggingFaceEmbeddings(model_name="./local_model") diff --git a/backend/test_integrationqa.py b/backend/test_integrationqa.py index f80ed2f0c..838feab5c 100644 --- a/backend/test_integrationqa.py +++ b/backend/test_integrationqa.py @@ -8,6 +8,12 @@ from dotenv import load_dotenv from src.main import * from src.QA_integration import QA_RAG +from pathlib import Path +import time +from concurrent.futures import ThreadPoolExecutor, as_completed +from tqdm import tqdm +from typing import Callable, List, Dict, Any, Tuple +import pandas as pd # Load environment variables load_dotenv() @@ -18,9 +24,12 @@ # Logging configuration logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") # Directory Paths -BASE_DIR = os.path.dirname(__file__) -CHUNK_DIR = os.path.join(BASE_DIR, "chunks") -MERGED_DIR = os.path.join(BASE_DIR, "merged_files") +BASE_DIR = Path(__file__).parent +CHUNK_DIR = BASE_DIR / "chunks" +MERGED_DIR = BASE_DIR / "merged_files" +RESULTS_DIR = BASE_DIR / "test_results" +RESULTS_DIR.mkdir(exist_ok=True) + # Initialize Neo4j connection graph = create_graph_database_connection(URI, USERNAME, PASSWORD, DATABASE)
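(Reviewer note on the test-harness refactor in the hunks below: results are now appended incrementally to per-model CSV files under `backend/test_results/` instead of being collected into one DataFrame at the end of the run. A minimal sketch of how those files might be combined afterwards — the file-name pattern and the `time_taken_sec` column come from this patch, while the aggregation helper itself is hypothetical:)

```python
# Hypothetical post-run aggregation of the per-model CSVs written by
# save_result_csv below; not part of this patch.
from pathlib import Path

import pandas as pd

RESULTS_DIR = Path("backend/test_results")  # assumed repo-relative location

def aggregate_extract_results() -> pd.DataFrame:
    frames = []
    for csv_file in sorted(RESULTS_DIR.glob("Extract_Integration_TestResult_*.csv")):
        df = pd.read_csv(csv_file)
        # The model name is encoded in the file name by the test harness.
        df["model"] = csv_file.stem.removeprefix("Extract_Integration_TestResult_")
        frames.append(df)
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

if __name__ == "__main__":
    summary = aggregate_extract_results()
    if not summary.empty:
        # Mean extraction time per model, slowest first.
        print(summary.groupby("model")["time_taken_sec"].mean().sort_values(ascending=False))
```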
@@ -37,18 +46,32 @@ def create_source_node_local(graph, model, file_name): graphDB_data_Access.create_source_node(source_node) return source_node +def save_result_csv(data: Dict[str, Any], filename: str): + file_path = RESULTS_DIR / filename + df = pd.DataFrame([data]) + if not file_path.exists(): + df.to_csv(file_path, index=False) + else: + df.to_csv(file_path, mode='a', header=False, index=False) + +def save_result_json(data: Any, filename: str): + file_path = RESULTS_DIR / filename + tmp_path = file_path.with_suffix('.tmp') + with open(tmp_path, "w") as f: + json.dump(data, f, indent=4) + tmp_path.rename(file_path) def test_graph_from_file_local(model_name): """Tests graph creation from a local file.""" try: file_name = 'About Amazon.pdf' - merged_file_path = os.path.join(MERGED_DIR, file_name) + merged_file_path = MERGED_DIR / file_name shutil.copyfile('/workspaces/llm-graph-builder/backend/files/About Amazon.pdf', merged_file_path) graph = create_graph_database_connection(URI, USERNAME, PASSWORD, DATABASE) create_source_node_local(graph, model_name, file_name) result = asyncio.run( extract_graph_from_file_local_file( - URI, USERNAME, PASSWORD, DATABASE, model_name, merged_file_path, file_name, '', '',100,20,1, None,'' + URI, USERNAME, PASSWORD, DATABASE, model_name, str(merged_file_path), file_name, '', '',100,20,1, None,'' ) ) logging.info(f"Local file test result: {result}") @@ -98,12 +121,13 @@ def test_graph_from_youtube_video(model_name): def test_graph_website(model_name): """Tests graph creation from a Website page.""" try: - source_url = 'https://www.cloudskillsboost.google/' + source_url = 'https://www.scrapethissite.com/pages/simple/' + file_name = 'Countries of the World: A Simple Example | Scrape This Site | A public sandbox for learning web scraping-simple' graph = create_graph_database_connection(URI, USERNAME, PASSWORD, DATABASE) create_source_node_graph_web_url(graph, model_name, source_url, "web-url") result = asyncio.run( extract_graph_from_web_page( - URI, USERNAME, PASSWORD, DATABASE, model_name, source_url, "Google Cloud Skills Boost-www", '', '',100,20,1, None,'' + URI, USERNAME, PASSWORD, DATABASE, model_name, source_url, file_name, '', '', 100, 20, 1, None, '' ) ) logging.info(f"Web URL test result: {result}") @@ -119,7 +143,6 @@ def test_chatbot_qna(model_name, mode='vector'): try: graph = create_graph_database_connection(URI, USERNAME, PASSWORD, DATABASE) result = QA_RAG(graph, model_name, 'Tell me about Amazon', '[]', 1, mode) - # assert len(result['message']) > 20 logging.info(f"Chatbot QnA test passed for mode: {mode}") final_result = {'model_name':model_name,'mode':mode,'result':result} return final_result @@ -155,7 +178,7 @@ def test_populate_graph_schema_from_text(model_name): """Tests schema population from text.""" try: schema_text = "Amazon was founded on July 5, 1994, by Jeff Bezos in Bellevue, Washington." 
- result_schema = populate_graph_schema_from_text(schema_text, model_name, True) + result_schema = populate_graph_schema_from_text(schema_text, model_name, True, False) logging.info(f"Schema test result: {result_schema}") return result_schema except Exception as e: @@ -163,7 +186,6 @@ def test_populate_graph_schema_from_text(model_name): return {"status": "Failed", "error": str(e)} def get_duplicate_nodes(): - #graph = create_graph_database_connection(uri, userName, password, database) graphDb_data_Access = graphDBdataAccess(graph) nodes_list, total_nodes = graphDb_data_Access.get_duplicate_nodes_list() if total_nodes['total']>0: @@ -190,92 +212,174 @@ def flatten_extract_dataframe(df: pd.DataFrame): flat_df = pd.DataFrame(rows) return flat_df -def run_tests(): - """Runs all integration tests and logs results.""" - extract_list = [] - extract_error_list = [] - chatbot_list = [] - chatbot_error_list = [] - other_api_list = [] - models = ['openai_gpt_4o','openai_gpt_4o_mini','openai_gpt_4.1','openai_gpt_4.1_mini','gemini_2.0_flash','fireworks_llama4_maverick','bedrock_nova_pro_v1'] - chatbot_modes = [ - "vector", - "graph+vector", - "fulltext", - "graph+vector+fulltext", - "entity search+vector" - ] - for model_name in models: - logging.info(f"Starting tests for model: {model_name}") - # Run each test independently to capture all errors - for test_func, test_args in [ - (test_graph_from_file_local, [model_name]), - (test_graph_from_wikipedia, [model_name]), - (test_graph_from_youtube_video,[model_name]), - (test_graph_website,[model_name]), - ]: - try: - result = test_func(*test_args) - if isinstance(result, dict) and result.get("status") == "Failed": - extract_error_list.append((model_name, test_func.__name__, result.get("error", "Unknown error"))) - else: - extract_list.append(result) - except Exception as e: - logging.error(f"Error in {test_func.__name__} for {model_name}: {e}") - extract_error_list.append((model_name, test_func.__name__, str(e))) - # Run all chatbot QnA modes - for mode in chatbot_modes: - try: - result = test_chatbot_qna(model_name,mode=mode) - if isinstance(result, dict) and result.get("status") == "Failed": - chatbot_error_list.append((model_name, f"test_chatbot_qna ({mode})", result.get("error", "Unknown error"))) - else: - chatbot_list.append(result) - except Exception as e: - logging.error(f"Error in test_chatbot_qna ({mode}) for {model_name}: {e}") - chatbot_error_list.append((model_name, f"test_chatbot_qna ({mode})", str(e))) - - try: +def run_model_tests(model_name: str, chatbot_modes: List[str]) -> Dict[str, Any]: + """ + Runs all test functions for a single model, saving results incrementally. + Returns a summary dict for reporting. 
+ """ + test_funcs: List[Tuple[Callable, List[Any], str]] = [ + (test_graph_from_file_local, [model_name], f"Extract_Integration_TestResult_{model_name}.csv"), + (test_graph_from_wikipedia, [model_name], f"Extract_Integration_TestResult_{model_name}.csv"), + (test_graph_from_youtube_video, [model_name], f"Extract_Integration_TestResult_{model_name}.csv"), + (test_graph_website, [model_name], f"Extract_Integration_TestResult_{model_name}.csv"), + ] + extract_error_list = [] + chatbot_error_list = [] + other_api_list = [] + test_results = [] + with tqdm(total=len(test_funcs), desc=f"Model: {model_name}", position=1, leave=False) as test_bar: + for test_func, test_args, result_file in test_funcs: + start_time = time.time() + try: + result = test_func(*test_args) + elapsed = time.time() - start_time + logging.info(f"{test_func.__name__} for {model_name} completed in {elapsed:.2f} seconds.") + result_with_time = result.copy() if isinstance(result, dict) else {"result": result} + result_with_time["time_taken_sec"] = round(elapsed, 2) + result_with_time["test_function"] = test_func.__name__ + save_result_csv(result_with_time, result_file) + test_results.append(result_with_time) + if isinstance(result, dict) and result.get("status") == "Failed": + extract_error_list.append((model_name, test_func.__name__, result.get("error", "Unknown error"), round(elapsed, 2))) + except Exception as e: + elapsed = time.time() - start_time + logging.error(f"Error in {test_func.__name__} for {model_name}: {e} (Time taken: {elapsed:.2f}s)") + extract_error_list.append((model_name, test_func.__name__, str(e), round(elapsed, 2))) + save_result_csv({"model": model_name, "function": test_func.__name__, "error": str(e), "time_taken_sec": round(elapsed, 2)}, result_file) + test_bar.update(1) + # Chatbot tests + with tqdm(total=len(chatbot_modes), desc=f"Chatbot: {model_name}", position=2, leave=False) as chatbot_bar: + for mode in chatbot_modes: + start_time = time.time() + try: + result = test_chatbot_qna(model_name, mode=mode) + elapsed = time.time() - start_time + logging.info(f"test_chatbot_qna ({mode}) for {model_name} completed in {elapsed:.2f} seconds.") + result_with_time = result.copy() if isinstance(result, dict) else {"result": result} + result_with_time["time_taken_sec"] = round(elapsed, 2) + result_with_time["mode"] = mode + save_result_csv(result_with_time, f"chatbot_Integration_TestResult_{model_name}.csv") + test_results.append(result_with_time) + if isinstance(result, dict) and result.get("status") == "Failed": + chatbot_error_list.append((model_name, f"test_chatbot_qna ({mode})", result.get("error", "Unknown error"), round(elapsed, 2))) + except Exception as e: + elapsed = time.time() - start_time + logging.error(f"Error in test_chatbot_qna ({mode}) for {model_name}: {e} (Time taken: {elapsed:.2f}s)") + chatbot_error_list.append((model_name, f"test_chatbot_qna ({mode})", str(e), round(elapsed, 2))) + save_result_csv({"model": model_name, "function": "test_chatbot_qna", "mode": mode, "error": str(e), "time_taken_sec": round(elapsed, 2)}, f"chatbot_Integration_TestResult_{model_name}.csv") + chatbot_bar.update(1) + # Schema test + start_time = time.time() + try: schema_result = test_populate_graph_schema_from_text(model_name) - other_api_list.append({f"{model_name}":schema_result}) - except Exception as e: - logging.error(f"Error in test_populate_graph_schema_from_text for {model_name}: {e}") - other_api_list.append({f"{model_name}":str(e)}) - # Handle disconnected nodes separately - try: - dis_elementid, 
dis_status = get_disconnected_nodes() - delete_status = delete_disconnected_nodes([dis_elementid]) if dis_elementid else "No disconnected nodes found" - except Exception as e: - dis_status, delete_status = "Error fetching nodes", "Error deleting nodes" - logging.error(f"Error handling disconnected nodes: {e}") - - try: - dup = get_duplicate_nodes() - except Exception as e: - dup = "Error getting duplicate nodes" - logging.error(f"Error getting duplicate nodes: {e}") - # Convert results to DataFrame - df_extract = pd.DataFrame(extract_list) - df_extract['execution_date'] = dt.today().strftime('%Y-%m-%d') - df_extract.to_csv(f"test_results/Extract_Integration_TestResult_{dt.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False) + elapsed = time.time() - start_time + logging.info(f"test_populate_graph_schema_from_text for {model_name} completed in {elapsed:.2f} seconds.") + schema_result_with_time = schema_result.copy() if isinstance(schema_result, dict) else {"result": schema_result} + schema_result_with_time["time_taken_sec"] = round(elapsed, 2) + save_result_json(schema_result_with_time, f"schema_result_{model_name}.json") + other_api_list.append({f"{model_name}": schema_result_with_time}) + except Exception as e: + elapsed = time.time() - start_time + logging.error(f"Error in test_populate_graph_schema_from_text for {model_name}: {e} (Time taken: {elapsed:.2f}s)") + other_api_list.append({f"{model_name}": str(e)}) + save_result_json({"model": model_name, "error": str(e), "time_taken_sec": round(elapsed, 2)}, f"schema_result_{model_name}.json") + return { + "model": model_name, + "extract_errors": extract_error_list, + "chatbot_errors": chatbot_error_list, + "other_api": other_api_list, + "test_results": test_results + } - df_chatbot = pd.DataFrame(chatbot_list) - df_chatbot['execution_date'] = dt.today().strftime('%Y-%m-%d') - df_chatbot.to_csv(f"test_results/chatbot_Integration_TestResult_{dt.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False) +def run_tests_sequential(models: List[str], chatbot_modes: List[str]) -> None: + """ + Runs all model tests sequentially, without progress bars, and generates a summary report. 
+ """ + all_summaries = [] + for idx, model in enumerate(models): + logging.info(f"Running tests for model {idx+1}/{len(models)}: {model}") + summary = run_model_tests(model, chatbot_modes) + all_summaries.append(summary) + # Handle disconnected nodes and duplicates (single-threaded, after all models) + start_time = time.time() + try: + dis_elementid, dis_status = get_disconnected_nodes() + delete_status = delete_disconnected_nodes([dis_elementid]) if dis_elementid else "No disconnected nodes found" + elapsed = time.time() - start_time + save_result_json({"disconnected_nodes": dis_status, "delete_status": delete_status, "time_taken_sec": round(elapsed, 2)}, "disconnected_nodes.json") + except Exception as e: + elapsed = time.time() - start_time + save_result_json({"error": str(e), "time_taken_sec": round(elapsed, 2)}, "disconnected_nodes.json") + start_time = time.time() + try: + dup = get_duplicate_nodes() + elapsed = time.time() - start_time + save_result_json({"duplicate_nodes": dup, "time_taken_sec": round(elapsed, 2)}, "duplicate_nodes.json") + except Exception as e: + elapsed = time.time() - start_time + save_result_json({"error": str(e), "time_taken_sec": round(elapsed, 2)}, "duplicate_nodes.json") + # Save errors incrementally + for summary in all_summaries: + if summary["extract_errors"]: + df_errors = pd.DataFrame(summary["extract_errors"], columns=['Model', 'Function', 'Error', 'TimeTakenSec']) + df_errors['execution_date'] = dt.today().strftime('%Y-%m-%d') + df_errors.to_csv(RESULTS_DIR / f"Extract_Error_details.csv", mode='a', header=not (RESULTS_DIR / f"Extract_Error_details.csv").exists(), index=False) + if summary["chatbot_errors"]: + df_errors = pd.DataFrame(summary["chatbot_errors"], columns=['Model', 'Function', 'Error', 'TimeTakenSec']) + df_errors['execution_date'] = dt.today().strftime('%Y-%m-%d') + df_errors.to_csv(RESULTS_DIR / f"chatbot_Error_details.csv", mode='a', header=not (RESULTS_DIR / f"chatbot_Error_details.csv").exists(), index=False) + # Generate summary report + generate_summary_report(all_summaries, RESULTS_DIR / "summary_report.md") + logging.info("All tests completed.") - other_api_dict = {'disconnected_nodes':dis_status,'delete_disconnected_nodes' : delete_status,'get_duplicate_nodes':dup,'test_populate_graph_schema_from_text':other_api_list} - with open(f"test_results/other_api_results_{dt.now().strftime('%Y%m%d_%H%M%S')}.txt", "w") as file: - file.write(json.dumps(other_api_dict, indent=4)) - # Save errors - if extract_error_list: - df_errors = pd.DataFrame(extract_error_list, columns=['Model', 'Function', 'Error']) - df_errors['execution_date'] = dt.today().strftime('%Y-%m-%d') - df_errors.to_csv(f"test_results/Extract_Error_details_{dt.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False) - if chatbot_error_list: - df_errors = pd.DataFrame(chatbot_error_list, columns=['Model', 'Function', 'Error']) - df_errors['execution_date'] = dt.today().strftime('%Y-%m-%d') - df_errors.to_csv(f"test_results/chatbot_Error_details_{dt.now().strftime('%Y%m%d_%H%M%S')}.csv", index=False) - logging.info("All tests completed.") +def generate_summary_report(summaries: List[Dict[str, Any]], report_path: Path) -> None: + """ + Generates a Markdown summary report from all model test summaries. 
+    """ +    lines = ["# Integration Test Summary Report\n"] +    for summary in summaries: +        lines.append(f"## Model: {summary['model']}\n") +        lines.append("### Test Results\n") +        for result in summary["test_results"]: +            status = result.get("status", "Success") +            func = result.get("test_function", result.get("mode", "")) +            time_taken = result.get("time_taken_sec", "") +            lines.append(f"- **{func}**: {status} (Time: {time_taken}s)") +        if summary["extract_errors"]: +            lines.append("\n### Extract Errors\n") +            for err in summary["extract_errors"]: +                lines.append(f"- {err}") +        if summary["chatbot_errors"]: +            lines.append("\n### Chatbot Errors\n") +            for err in summary["chatbot_errors"]: +                lines.append(f"- {err}") +        lines.append("\n---\n") +    with open(report_path, "w") as f: +        f.write("\n".join(lines)) +# Usage in main if __name__ == "__main__": -    run_tests() +    models = [ +        'openai_gpt_5.1', +        'openai_gpt_5_mini', +        'openai_gpt_4.1', +        'openai_gpt_4.1_mini', +        'gemini_2.5_flash', +        'gemini_2.5_pro', +        'groq_llama3.1_8b', +        'anthropic_claude_4.5_sonnet', +        'llama4_maverick', +        'fireworks_gpt_oss', +        'fireworks_deepseek_v3', +        'bedrock_nova_micro_v1', +        'bedrock_nova_lite_v1', +        'bedrock_nova_pro_v1' +    ] +    chatbot_modes = [ +        "vector", +        "graph+vector", +        "fulltext", +        "graph+vector+fulltext", +        "entity search+vector" +    ] +    run_tests_sequential(models, chatbot_modes) diff --git a/docs/frontend/frontend_docs.adoc b/docs/frontend/frontend_docs.adoc index d9f1333a8..21e51d848 100644 --- a/docs/frontend/frontend_docs.adoc +++ b/docs/frontend/frontend_docs.adoc @@ -141,8 +141,8 @@ Install necessary dependencies by running yarn install, such as axios for making == 2. Connect to the Neo4j Aura instance: Created a connection modal by adding details including protocol, URI, database name, username, and password. Added a submit button that triggers an API: ***/connect*** and accepts params like uri, password, username and database to establish a connection to the Neo4j Aura instance. Handled the authentication and error scenarios appropriately by displaying relevant messages. To check whether the backend connection is up and working we hit the API: ***/health.*** The user can now access both AURA DS and AURA DB instances. -* If GDS Connection is there icon is scientific molecule > Graph enhancement model > Post processing jobs > gives user the leverage to check and uncheck the communities checkbox. -* If AURA DB > icon is database icon > Graph enhancement model > Post processing jobs > communities checkbox is disabled. +* If a GDS connection is present, the icon is a scientific molecule > Graph settings modal > Post processing jobs > lets the user check and uncheck the communities checkbox. +* If AURA DB, the icon is a database icon > Graph settings modal > Post processing jobs > the communities checkbox is disabled. image::images/ConnectionModal.jpg[NoConnection, 600] @@ -304,7 +304,7 @@ image::images/ChatModesProd.jpg[ChatModesProd, 600] image::images/ChatModesDev.jpg[ChatModesDev, 600] -== 7. Graph Enhancement Settings: +== 7. Graph Settings: Users can now set their own schema for nodes and relations, or use an already existing schema; an illustrative example follows.
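For illustration only (these values are hypothetical, not from this patch), a user-defined schema of the kind referenced here might be:

....
Node labels: Person, Company, Location
Relationship types: CEO_OF, HEADQUARTERED_IN, LOCATED_IN
....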
* ***Entity Extraction Settings:*** diff --git a/docs/project_docs.adoc b/docs/project_docs.adoc index 22e997217..4721f4d63 100644 --- a/docs/project_docs.adoc +++ b/docs/project_docs.adoc @@ -21,6 +21,11 @@ This document provides comprehensive documentation for the Neo4j llm-graph-build == Local Setup and Execution +Prerequisites: +- Python 3.12 or higher +- Node.js 20 or higher +- Docker (optional, for containerized deployment) + Run Docker Compose to build and start all components: .... docker-compose up --build @@ -38,8 +43,8 @@ yarn run dev ** For backend .... cd backend -python -m venv envName -source envName/bin/activate +python3.12 -m venv venv +source venv/bin/activate pip install -r requirements.txt uvicorn score:app --reload .... diff --git a/frontend/src/components/ChatBot/Chatbot.tsx b/frontend/src/components/ChatBot/Chatbot.tsx index 4e1cab6e8..52ca34776 100644 --- a/frontend/src/components/ChatBot/Chatbot.tsx +++ b/frontend/src/components/ChatBot/Chatbot.tsx @@ -442,8 +442,8 @@ const Chatbot: FC = (props) => { size='large' source={ChatBotAvatar} status={connectionStatus ? 'online' : 'offline'} - shape='square' type='image' + shape='square' /> ) : ( = (props) => { name='KM' size='large' status={connectionStatus ? 'online' : 'offline'} - shape='square' type='image' + shape='square' /> )} diff --git a/frontend/src/components/ChatBot/ChunkInfo.tsx b/frontend/src/components/ChatBot/ChunkInfo.tsx index b56057503..782c6a82b 100644 --- a/frontend/src/components/ChatBot/ChunkInfo.tsx +++ b/frontend/src/components/ChatBot/ChunkInfo.tsx @@ -87,7 +87,9 @@ const ChunkInfo: FC = ({ loading, chunks, mode }) => { = ({ loading, chunks, mode }) => { <>
- + {chunk?.url}
diff --git a/frontend/src/components/ChatBot/SourcesInfo.tsx b/frontend/src/components/ChatBot/SourcesInfo.tsx index 2a967b864..2de2c9d98 100644 --- a/frontend/src/components/ChatBot/SourcesInfo.tsx +++ b/frontend/src/components/ChatBot/SourcesInfo.tsx @@ -71,7 +71,13 @@ const SourcesInfo: FC = ({ loading, mode, chunks, sources }) => { {isAllowedHost(link, ['wikipedia.org']) && (
Wikipedia Logo - + = ({ loading, mode, chunks, sources }) => { <>
youtube-source-logo - + = ({ loading, mode, chunks, sources }) => { !isAllowedHost(link, ['storage.googleapis.com', 'wikipedia.org', 'www.youtube.com']) && (
- + {link}
diff --git a/frontend/src/components/Content.tsx b/frontend/src/components/Content.tsx index d2a5e14ac..f8f6ac433 100644 --- a/frontend/src/components/Content.tsx +++ b/frontend/src/components/Content.tsx @@ -954,13 +954,13 @@ const Content: React.FC = ({ - Graph Enhancement + Graph Settings {!connectionStatus ? ( = ({
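(Closing reviewer note on the repeated `retry_condition` guard introduced across `backend/src/main.py` in this patch: the new predicate `retry_condition in ["", None] or retry_condition not in [DELETE_ENTITIES_AND_START_FROM_BEGINNING, START_FROM_LAST_PROCESSED_POSITION]` reduces to the second membership test alone, since `""` and `None` are not members of that list. A small helper would state the intent once; this is a sketch, not part of the patch, and the constant values below are placeholders for the ones already imported in `src/main.py`:)

```python
# Reviewer's sketch - not part of the patch. In the repo these constants come
# from the shared constants module used by backend/src/main.py; the literal
# values here are placeholders.
DELETE_ENTITIES_AND_START_FROM_BEGINNING = "delete_entities_and_start_from_beginning"
START_FROM_LAST_PROCESSED_POSITION = "start_from_last_processed_position"

def is_fresh_processing(retry_condition) -> bool:
    """True when the source should be fetched and chunked from scratch
    rather than resumed from an earlier run."""
    # "" and None are not members of the set, so a single membership test
    # also covers the empty/unset cases.
    return retry_condition not in {
        DELETE_ENTITIES_AND_START_FROM_BEGINNING,
        START_FROM_LAST_PROCESSED_POSITION,
    }

# Usage, replacing the repeated inline predicate:
# if is_fresh_processing(retry_condition):
#     pages = get_documents_from_web_page(source_url)
```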