@@ -69,11 +69,16 @@ def insert_document(doc_path: Path, *, config: RAGLiteConfig | None = None) -> N
6969 """Insert a document into the database and update the index."""
7070 # Use the default config if not provided.
7171 config = config or RAGLiteConfig ()
72- db_backend = make_url (config .db_url ).get_backend_name ()
7372 # Preprocess the document into chunks and chunk embeddings.
74- with tqdm (total = 5 , unit = "step" , dynamic_ncols = True ) as pbar :
73+ with tqdm (total = 6 , unit = "step" , dynamic_ncols = True ) as pbar :
7574 pbar .set_description ("Initializing database" )
7675 engine = create_database_engine (config )
76+ document_record = Document .from_path (doc_path )
77+ with Session (engine ) as session : # Exit early if the document is already in the database.
78+ if session .get (Document , document_record .id ) is not None :
79+ pbar .update (6 )
80+ pbar .close ()
81+ return
7782 pbar .update (1 )
7883 pbar .set_description ("Converting to Markdown" )
7984 doc = document_to_markdown (doc_path )
@@ -92,32 +97,20 @@ def insert_document(doc_path: Path, *, config: RAGLiteConfig | None = None) -> N
9297 max_size = config .chunk_max_size ,
9398 )
9499 pbar .update (1 )
95- # Create and store the chunk records.
96- with Session (engine ) as session :
97- # Add the document to the document table.
98- document_record = Document .from_path (doc_path )
99- if session .get (Document , document_record .id ) is None :
100+ pbar .set_description ("Updating database" )
101+ with Session (engine ) as session :
100102 session .add (document_record )
103+ for chunk_record , chunk_embedding_record_list in zip (
104+ * _create_chunk_records (document_record .id , chunks , chunk_embeddings , config ),
105+ strict = True ,
106+ ):
107+ session .add (chunk_record )
108+ session .add_all (chunk_embedding_record_list )
101109 session .commit ()
102- # Create the chunk records to insert into the chunk table.
103- chunk_records , chunk_embedding_records = _create_chunk_records (
104- document_record .id , chunks , chunk_embeddings , config
105- )
106- # Store the chunk and chunk embedding records.
107- for chunk_record , chunk_embedding_record_list in tqdm (
108- zip (chunk_records , chunk_embedding_records , strict = True ),
109- desc = "Inserting chunks" ,
110- total = len (chunk_records ),
111- unit = "chunk" ,
112- dynamic_ncols = True ,
113- ):
114- if session .get (Chunk , chunk_record .id ) is not None :
115- continue
116- session .add (chunk_record )
117- session .add_all (chunk_embedding_record_list )
118- session .commit ()
110+ pbar .update (1 )
111+ pbar .close ()
119112 # Manually update the vector search chunk index for SQLite.
120- if db_backend == "sqlite" :
113+ if make_url ( config . db_url ). get_backend_name () == "sqlite" :
121114 from pynndescent import NNDescent
122115
123116 with Session (engine ) as session :
0 commit comments