Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 28 additions & 20 deletions app/main.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
import os
import time
import pandas as pd
import glob
import re

from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph

from nlp.gpt_utils import order_files
from generation.generate_notes import create_notes
from extraction.file_utils import handle_image, handle_pdf, get_file_type, handle_video, clear_output, split_text, move_file
from extraction.file_utils import get_file_type, clear_output, split_text, move_file
from nlp.topic_modelling import topic_model
from extraction.outline import create_outline
from extraction.line_embed import line_sort
from nlp.embedding_utils import embed_sections
from tasks import (
handle_image_task,
handle_pdf_task,
handle_video_task,
create_outline_task,
line_sort_task,
embed_sections_task,
create_notes_task,
)
from celery import group

from config import NOTE_INPUTS_DIR, RAW_TEXT, SECTIONS, COMPLETED_NOTES_FILE, PREVIOUS_INPUTS

Expand Down Expand Up @@ -84,38 +89,40 @@ def main():

# Work with the available files
time.sleep(3)
processing_tasks = []
for file in files:
file_path, file_type = get_file_type(file)
valid_video_types = {"MP4", "AVI", "MKV", "MOV", "WMV", "FLV", "WEBM", "MPEG", "MPG", "OGV", "3GP", "MTS"}
valid_image_types = {"PNG", "JPEG", "JPG", "BMP", "GIF", "TIFF", "WEBP"}
try:
# For Images
if file_type in valid_image_types:
handle_image(file, file_path)
# For PDFs
processing_tasks.append(handle_image_task.s(file, file_path))
elif file_type == 'PDF':
handle_pdf(file, file_path)
# For videos
processing_tasks.append(handle_pdf_task.s(file, file_path))
elif file_type in valid_video_types:
handle_video(file, file_path)

processing_tasks.append(handle_video_task.s(file, file_path))
except Exception as e:
print(f"Invalid file {e}")

if processing_tasks:
group(processing_tasks).apply_async().get()

time.sleep(3)

# Creata an outline from the raw text and return a df with info
df= create_outline(RAW_TEXT)
# Create an outline from the raw text and return a df with info
df_records = create_outline_task.delay(RAW_TEXT).get()
df = pd.DataFrame(df_records)

section_embeddings = {
row["filename"]: row["embedding"]
for _, row in df.iterrows()
for row in df_records
}

# Sort lines into their respective sections
line_sort(RAW_TEXT, section_embeddings)
line_sort_task.delay(RAW_TEXT, section_embeddings).get()

# Embed the new sorted sections
embed_sections(SECTIONS)
embed_sections_task.delay(SECTIONS).get()

stop_timer.set() # Signal the timer thread to stop (processing done)
timer_thread.join() # Wait for the timer thread to finish
Expand All @@ -131,8 +138,9 @@ def main():
)

# Create notes on chunks
for section in sections:
create_notes(section, df) # Send topics GPT to make notes on
note_tasks = [create_notes_task.s(section, df_records) for section in sections]
if note_tasks:
group(note_tasks).apply_async().get()

print("--------------")
print("Notes Completed")
Expand Down
52 changes: 52 additions & 0 deletions app/tasks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os
import pandas as pd
from celery import Celery
from extraction.file_utils import handle_image, handle_pdf, handle_video
from extraction.outline import create_outline
from extraction.line_embed import line_sort
from nlp.embedding_utils import embed_sections
from generation.generate_notes import create_notes


celery = Celery(
'tasks',
broker=os.environ.get('CELERY_BROKER_URL', 'redis://localhost:6379/0'),
backend=os.environ.get('CELERY_RESULT_BACKEND', 'redis://localhost:6379/0')
)


@celery.task
def handle_image_task(file, file_path):
return handle_image(file, file_path)


@celery.task
def handle_pdf_task(file, file_path):
return handle_pdf(file, file_path)


@celery.task
def handle_video_task(file, file_path):
return handle_video(file, file_path)


@celery.task
def create_outline_task(raw_text_path):
df = create_outline(raw_text_path)
return df.to_dict(orient='records')


@celery.task
def line_sort_task(raw_text_path, section_embeddings):
return line_sort(raw_text_path, section_embeddings)


@celery.task
def embed_sections_task(sections_dir):
return embed_sections(sections_dir)


@celery.task
def create_notes_task(section, df_records):
df = pd.DataFrame(df_records)
return create_notes(section, df)
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,7 @@ pytesseract
Pillow
PyMuPDF
vosk
markdown
pdf2image
markdown
pdf2image
celery
redis