Skip to content

Commit 955cf18

Browse files
rahul-trip and Rahul Tripathi authored
community[patch]: Ingest source, owner and full_path if present in Document's metadata. (#20949)
Description: The PebbloSafeLoader should first check for owner, full_path and size in the Document's metadata, and fall back to its own logic only for values that are not present. Dependencies: None. Documentation: NA. Signed-off-by: Rahul Tripathi <rauhl.psit.ec@gmail.com> Co-authored-by: Rahul Tripathi <rauhl.psit.ec@gmail.com>
1 parent 790ea75 commit 955cf18

File tree

2 files changed

+13
-8
lines changed

2 files changed

+13
-8
lines changed

libs/community/langchain_community/document_loaders/pebblo.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -157,16 +157,19 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
157157
doc_content = [doc.dict() for doc in loaded_docs]
158158
docs = []
159159
for doc in doc_content:
160-
doc_authorized_identities = doc.get("metadata", {}).get(
161-
"authorized_identities", []
162-
)
160+
doc_metadata = doc.get("metadata", {})
161+
doc_authorized_identities = doc_metadata.get("authorized_identities", [])
163162
doc_source_path = get_full_path(
164-
doc.get("metadata", {}).get("source", self.source_path)
163+
doc_metadata.get(
164+
"full_path", doc_metadata.get("source", self.source_path)
165+
)
166+
)
167+
doc_source_owner = doc_metadata.get(
168+
"owner", PebbloSafeLoader.get_file_owner_from_path(doc_source_path)
165169
)
166-
doc_source_owner = PebbloSafeLoader.get_file_owner_from_path(
167-
doc_source_path
170+
doc_source_size = doc_metadata.get(
171+
"size", self.get_source_size(doc_source_path)
168172
)
169-
doc_source_size = self.get_source_size(doc_source_path)
170173
page_content = str(doc.get("page_content"))
171174
page_content_size = self.calculate_content_size(page_content)
172175
self.source_aggregate_size += page_content_size

libs/community/langchain_community/utilities/pebblo.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,9 @@ def get_full_path(path: str) -> str:
169169
or (path in ["unknown", "-", "in-memory"])
170170
):
171171
return path
172-
full_path = pathlib.Path(path).resolve()
172+
full_path = pathlib.Path(path)
173+
if full_path.exists():
174+
full_path = full_path.resolve()
173175
return str(full_path)
174176

175177

0 commit comments

Comments
 (0)