Skip to content

Commit 5636b1a

Browse files
authored
feat: reader.pdf_metadata for reading pdf metadata into a df (#182)
## What Changed: - Add a new reader function that extracts metadata from PDF globs and returns it in a dataframe - similar to reader.json() and reader.markdown() - Use PyMuPDF to grab metadata from the files - Add unit-test utilities for generating PDFs of varying complexity - Add tests
1 parent 883cb30 commit 5636b1a

File tree

8 files changed

+878
-360
lines changed

8 files changed

+878
-360
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ dependencies = [
3737
"types-protobuf==5.29.1.20250403",
3838
"zstandard>=0.23.0",
3939
"jsonschema>=4.0.0",
40+
"pymupdf>=1.26.4",
4041
]
4142

4243
[project.urls]

src/fenic/_backends/local/utils/doc_loader.py

Lines changed: 145 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,19 @@
66
import re
77
from concurrent.futures import ThreadPoolExecutor, as_completed
88
from pathlib import Path
9-
from typing import List, Optional, Tuple
9+
from typing import List, Literal, Optional, Tuple
1010

11+
import fitz # PyMuPDF
1112
import polars as pl
13+
from pydantic import ConfigDict, validate_call
1214

1315
from fenic._backends.local.utils.io_utils import PathScheme, get_path_scheme
16+
from fenic.core._utils.schema import convert_custom_schema_to_polars_schema
1417
from fenic.core.error import FileLoaderError, ValidationError
1518
from fenic.core.types import ColumnField, Schema
1619
from fenic.core.types.datatypes import (
20+
BooleanType,
21+
IntegerType,
1722
StringType,
1823
)
1924

@@ -30,9 +35,10 @@ class DocFolderLoader:
3035
"""
3136

3237
@staticmethod
38+
@validate_call(config=ConfigDict(strict=True))
3339
def load_docs_from_folder(
3440
paths: list[str],
35-
valid_file_extension: str,
41+
valid_file_extension: Literal["md", "json", "pdf"],
3642
exclude_pattern: Optional[str] = None,
3743
recursive: bool = False,
3844
) -> pl.DataFrame:
@@ -65,28 +71,47 @@ def load_docs_from_folder(
6571

6672
if not files:
6773
logger.debug(f"No files found in {paths}")
68-
return DocFolderLoader._build_no_files_dataframe()
74+
return DocFolderLoader._build_no_files_dataframe(file_extension=valid_file_extension)
6975

7076
# Calculate the batch size to ensure that each worker gets at least one file.
7177
max_workers = os.cpu_count() + 4
72-
return DocFolderLoader._process_files(files, max_workers)
78+
79+
# Process files with the appropriate handler based on extension
80+
return DocFolderLoader._process_files(files, max_workers, valid_file_extension)
7381

7482
@staticmethod
75-
def get_schema() -> Schema:
83+
def get_schema(file_extension: str = None) -> Schema:
7684
"""Get the schema for the data type.
7785
7886
Args:
79-
data_type: The data type of the files to load
87+
file_extension: The file extension to determine schema
8088
8189
Returns:
8290
Schema: The schema for the data type
8391
"""
92+
column_fields = [
93+
ColumnField(name="file_path", data_type=StringType),
94+
ColumnField(name="error", data_type=StringType),
95+
]
96+
if file_extension == "pdf":
97+
column_fields.extend([
98+
# additional file metadata fields
99+
ColumnField(name="size", data_type=IntegerType),
100+
# PDF metadata fields
101+
ColumnField(name="title", data_type=StringType),
102+
ColumnField(name="author", data_type=StringType),
103+
ColumnField(name="creation_date", data_type=StringType),
104+
ColumnField(name="mod_date", data_type=StringType),
105+
ColumnField(name="page_count", data_type=IntegerType),
106+
ColumnField(name="has_forms", data_type=BooleanType),
107+
ColumnField(name="has_signature_fields", data_type=BooleanType),
108+
ColumnField(name="image_count", data_type=IntegerType),
109+
ColumnField(name="is_encrypted", data_type=BooleanType),
110+
])
111+
else: # load file content directly
112+
column_fields.append(ColumnField(name="content", data_type=StringType))
84113
return Schema(
85-
column_fields=[
86-
ColumnField(name="file_path", data_type=StringType),
87-
ColumnField(name="error", data_type=StringType),
88-
ColumnField(name="content", data_type=StringType),
89-
]
114+
column_fields=column_fields
90115
)
91116

92117
@staticmethod
@@ -143,19 +168,29 @@ def _enumerate_files(
143168
def _process_files(
144169
files: List[str],
145170
max_workers: int,
171+
file_extension: str = None,
146172
) -> pl.DataFrame:
147173
"""Process files in parallel using a thread pool.
148174
149175
Args:
150176
files: List of file paths to process
151177
max_workers: Number of worker threads
178+
file_extension: File extension to determine processing type
152179
153180
Returns:
154181
DataFrame: A dataframe containing the files in the folder.
155182
"""
183+
# Determine which processing function and schema to use
184+
185+
schema = convert_custom_schema_to_polars_schema(DocFolderLoader.get_schema(file_extension=file_extension))
186+
if file_extension == "pdf":
187+
process_func = DocFolderLoader._process_single_pdf_metadata
188+
else:
189+
process_func = DocFolderLoader._process_single_file
190+
156191
with ThreadPoolExecutor(max_workers=max_workers) as executor:
157192
it = iter(files)
158-
pending = {executor.submit(DocFolderLoader._process_single_file, f)
193+
pending = {executor.submit(process_func, f)
159194
for _, f in zip(range(max_workers), it, strict=False)}
160195

161196
def results_generator():
@@ -164,12 +199,12 @@ def results_generator():
164199
pending.remove(future)
165200
yield future.result()
166201
try:
167-
pending.add(executor.submit(DocFolderLoader._process_single_file, next(it)))
202+
pending.add(executor.submit(process_func, next(it)))
168203
except StopIteration:
169204
pass
170205

171206
# Uses the iterator over the results to build the dataframe.
172-
return pl.DataFrame(results_generator(), schema=DocFolderLoader._get_polars_schema())
207+
return pl.DataFrame(results_generator(), schema=schema)
173208

174209
@staticmethod
175210
def _process_single_file(
@@ -203,17 +238,9 @@ def _process_single_file(
203238
return file_path, string_error, file_content
204239

205240
@staticmethod
206-
def _get_polars_schema() -> pl.Schema:
207-
return pl.Schema({
208-
"file_path": pl.Utf8,
209-
"error": pl.Utf8,
210-
"content": pl.Utf8,
211-
})
212-
213-
@staticmethod
214-
def _build_no_files_dataframe() -> pl.DataFrame:
215-
"""Build a dataframe from the file content."""
216-
return pl.DataFrame({}, schema=DocFolderLoader._get_polars_schema())
241+
def _build_no_files_dataframe(file_extension: str) -> pl.DataFrame:
242+
"""Build an empty dataframe with the appropriate schema."""
243+
return pl.DataFrame({}, schema=convert_custom_schema_to_polars_schema(DocFolderLoader.get_schema(file_extension=file_extension)))
217244

218245
@staticmethod
219246
def _enumerate_files_s3(
@@ -302,3 +329,96 @@ def _load_file_s3(file_path: str) -> str:
302329
def _load_file_hf(file_path: str) -> str:
303330
"""Load a file from HuggingFace."""
304331
raise NotImplementedError("HF file loading is not implemented yet.")
332+
333+
@staticmethod
334+
def _process_single_pdf_metadata(file_path: str) -> dict:
335+
"""Process a single PDF file to extract metadata.
336+
337+
Args:
338+
file_path: The path to the PDF file to process
339+
340+
Returns:
341+
dict: A dictionary containing PDF metadata and error information.
342+
"""
343+
344+
path_scheme = get_path_scheme(file_path)
345+
logger.debug(f"Processing PDF: {file_path} - {path_scheme}")
346+
347+
# Initialize the flat result dict with default values
348+
result = {
349+
"file_path": file_path,
350+
"error": None,
351+
"size": 0,
352+
"title": None,
353+
"author": None,
354+
"creation_date": None,
355+
"mod_date": None,
356+
"page_count": 0,
357+
"has_forms": False,
358+
"has_signature_fields": False,
359+
"image_count": 0,
360+
"is_encrypted": False,
361+
}
362+
363+
try:
364+
if path_scheme == PathScheme.S3:
365+
raise NotImplementedError("S3 PDF processing not implemented yet.")
366+
elif path_scheme == PathScheme.HF:
367+
raise NotImplementedError("HF PDF processing not implemented yet.")
368+
else:
369+
result["size"] = os.path.getsize(file_path)
370+
doc = fitz.open(file_path)
371+
372+
# Extract basic document info
373+
doc_metadata = doc.metadata
374+
result.update({
375+
"title": doc_metadata.get("title") or "",
376+
"author": doc_metadata.get("author") or "",
377+
"creation_date": doc_metadata.get("creationDate") or "",
378+
"mod_date": doc_metadata.get("modDate") or "",
379+
"page_count": len(doc),
380+
"is_encrypted": doc.needs_pass,
381+
})
382+
383+
# Analyze document structure
384+
image_count = 0
385+
has_forms = False
386+
has_signature_fields = False
387+
388+
for page_num in range(len(doc)):
389+
page = doc[page_num]
390+
391+
# Count raster images
392+
page_images = page.get_images()
393+
if page_images:
394+
image_count += len(page_images)
395+
396+
# Vector drawings are represented as drawings in PyMuPDF
397+
drawings = page.get_drawings()
398+
if drawings:
399+
image_count += len(drawings)
400+
401+
# Check for forms and signature fields
402+
if not has_forms or not has_signature_fields:
403+
widgets = list(page.widgets())
404+
if len(widgets) > 0:
405+
has_forms = True
406+
for widget in widgets:
407+
if widget.field_type == fitz.PDF_WIDGET_TYPE_SIGNATURE:
408+
has_signature_fields = True
409+
break
410+
411+
result.update({
412+
"has_forms": has_forms,
413+
"has_signature_fields": has_signature_fields,
414+
"image_count": image_count,
415+
})
416+
417+
doc.close()
418+
logger.debug(f"PDF processed successfully: {file_path}")
419+
420+
except Exception as e:
421+
logger.warning(f"Error processing PDF {file_path}: {str(e)}")
422+
result["error"] = str(e)
423+
424+
return result

src/fenic/api/io/reader.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,4 +332,86 @@ def docs(
332332
col("error"),
333333
col("content").cast(data_type).alias("content"),
334334
)
335+
return df
336+
337+
def pdf_metadata(
338+
self,
339+
paths: Union[str, list[str]],
340+
exclude: Optional[str] = None,
341+
recursive: bool = False,
342+
) -> DataFrame:
343+
r"""Load a DataFrame with metadata from PDF files.
344+
345+
Args:
346+
paths: Glob pattern (or list of glob patterns) to the folder(s) to load.
347+
exclude: A regex pattern to exclude files.
348+
If it is not provided no files will be excluded.
349+
recursive: Whether to recursively load files from the folder.
350+
351+
Returns:
352+
DataFrame: A dataframe with the metadata of all the PDF files found in the paths.
353+
the metadata from a single PDF document is a row in the dataframe.
354+
355+
Raises:
356+
ValidationError: If any file does not have a `.pdf` extension.
357+
358+
Notes:
359+
- Each row in the dataframe corresponds to a file in the list of paths.
360+
- The metadata columns are:
361+
- doc_path: The path to the document.
362+
- error: The error message if the file failed to be loaded.
363+
- size: Size of the PDF file in bytes.
364+
- title: Title of the PDF document.
365+
- author: Author of the PDF document.
366+
- creation_date: Creation date of the PDF.
367+
- mod_date: Modification date of the PDF.
368+
- page_count: Number of pages in the PDF.
369+
- has_forms: Whether the PDF contains form fields, or fields that accept user input.
370+
- has_signature_fields: Whether the PDF contains signature fields.
371+
- image_count: Number of images in the PDF.
372+
- is_encrypted: Whether the PDF is encrypted.
373+
- Recursive loading is supported in conjunction with the '**' glob pattern,
374+
e.g. `data/**/*.pdf` will load all PDF files in the `data` folder and all subfolders
375+
when recursive is set to True.
376+
Without recursive = True, then ** behaves like a single '*' pattern.
377+
378+
Example: Read the metadata of all the PDF files in a folder and all its subfolders.
379+
```python
380+
df = session.read.pdf_metadata("data/docs/**/*.pdf", recursive=True)
381+
```
382+
383+
Example: Read a metadata of PDFS in a folder, excluding some files.
384+
```python
385+
df = session.read.pdf_metadata("data/docs/*.pdf", exclude=r"\.backup.pdf$")
386+
```
387+
388+
"""
389+
if isinstance(paths, str):
390+
paths = [paths]
391+
392+
logical_node = DocSource.from_session_state(
393+
paths=paths,
394+
valid_file_extension="pdf",
395+
exclude=exclude,
396+
recursive=recursive,
397+
session_state=self._session_state,
398+
)
399+
from fenic.api.dataframe import DataFrame
400+
401+
df = DataFrame._from_logical_plan(logical_node, self._session_state)
402+
# Rename file_path to doc_path for consistency with other doc readers
403+
df = df.select(
404+
col("file_path").alias("doc_path"),
405+
col("error"),
406+
col("size"),
407+
col("title"),
408+
col("author"),
409+
col("creation_date"),
410+
col("mod_date"),
411+
col("page_count"),
412+
col("has_forms"),
413+
col("has_signature_fields"),
414+
col("image_count"),
415+
col("is_encrypted"),
416+
)
335417
return df

src/fenic/core/_logical_plan/plans/source.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ class DocSource(LogicalPlan):
197197
def __init__(
198198
self,
199199
paths: list[str],
200-
valid_file_extension: Literal["md", "json"],
200+
valid_file_extension: Literal["md", "json", "pdf"],
201201
exclude: Optional[str] = None,
202202
recursive: bool = False,
203203
session_state: Optional[BaseSessionState] = None,
@@ -212,7 +212,7 @@ def __init__(
212212
def from_session_state(
213213
cls,
214214
paths: list[str],
215-
valid_file_extension: Literal["md", "json"],
215+
valid_file_extension: Literal["md", "json", "pdf"],
216216
exclude: Optional[str] = None,
217217
recursive: bool = False,
218218
session_state: Optional[BaseSessionState] = None,
@@ -223,7 +223,7 @@ def from_session_state(
223223
def from_schema(
224224
cls,
225225
paths: list[str],
226-
valid_file_extension: Literal["md", "json"],
226+
valid_file_extension: Literal["md", "json", "pdf"],
227227
exclude: Optional[str] = None,
228228
recursive: bool = False,
229229
schema: Optional[Schema] = None,
@@ -232,7 +232,7 @@ def from_schema(
232232

233233
    def _build_schema(self, session_state: BaseSessionState) -> Schema:
        """Validate the source paths, then return the schema for this source.

        The schema depends on the configured file extension: "pdf" yields the
        PDF-metadata columns, other extensions yield the content schema.
        """
        DocFolderLoader.validate_paths(self._paths)
        return DocFolderLoader.get_schema(self._valid_file_extension)
236236

237237
def children(self) -> List[LogicalPlan]:
238238
return []

0 commit comments

Comments
 (0)