feat: Introduce ContentProcessor enum for content processing options (#8)

mbeacom · web-flow · commit 82cc59fdc2d5 · 2025-07-18T10:37:03.000-04:00
diff --git a/README.md b/README.md
@@ -45,13 +45,13 @@ uv add genai-processors-url-fetch[markitdown]
 
 ```python
 from genai_processors import processor
-from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
+from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
 
 # Basic usage with defaults (BeautifulSoup text extraction)
 fetcher = UrlFetchProcessor()
 
 # Use markitdown for richer content processing
-config = FetchConfig(content_processor="markitdown")
+config = FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
 markitdown_fetcher = UrlFetchProcessor(config)
 
 # Process text containing URLs
@@ -81,7 +81,7 @@ All security features are enabled by default but can be configured via the Fetch
 The processor uses a dataclass-based configuration system for clean, type-safe settings. You can customize the processor's behavior by passing a FetchConfig object during initialization.
 
 ```python
-from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
+from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
 
 # Example of a customized security configuration
 config = FetchConfig(
@@ -104,12 +104,13 @@ The `FetchConfig` dataclass provides comprehensive configuration options organiz
 * **user_agent** (str, default: "GenAI-Processors/UrlFetchProcessor"): The User-Agent string to send with HTTP requests.
 * **include_original_part** (bool, default: True): If True, the original ProcessorPart that contained the URL(s) will be yielded at the end of processing.
 * **fail_on_error** (bool, default: False): If True, the processor will raise a RuntimeError on the first failed fetch.
-* **content_processor** (Literal["beautifulsoup", "markitdown", "raw"], default: "beautifulsoup"): Content processing method.
-  - `"beautifulsoup"`: Extract clean text using BeautifulSoup (fastest, good for simple HTML)
-  - `"markitdown"`: Convert content to markdown using Microsoft's markitdown library (best for rich content, requires optional dependency)
-  - `"raw"`: Return the raw HTML content without processing
+* **content_processor** (ContentProcessor, default: ContentProcessor.BEAUTIFULSOUP): Content processing method.
+  * `ContentProcessor.BEAUTIFULSOUP`: Extract clean text using BeautifulSoup (fastest, good for simple HTML)
+  * `ContentProcessor.MARKITDOWN`: Convert content to markdown using Microsoft's markitdown library (best for rich content, requires optional dependency)
+  * `ContentProcessor.RAW`: Return the raw HTML content without processing
+  * **Note:** String values ("beautifulsoup", "markitdown", "raw") are automatically converted to enum values for backward compatibility.
 * **markitdown_options** (dict[str, Any], default: {}): Options passed to the markitdown MarkItDown constructor when using markitdown processor.
-* **extract_text_only** (bool | None, default: None): **Deprecated.** Use `content_processor` instead. For backward compatibility: `True` maps to `"beautifulsoup"`, `False` maps to `"raw"`.
+* **extract_text_only** (bool | None, default: None): **Deprecated.** Use `content_processor` instead. For backward compatibility: `True` maps to `ContentProcessor.BEAUTIFULSOUP`, `False` maps to `ContentProcessor.RAW`.
 
 ##### Security Controls
 
@@ -128,7 +129,7 @@ The UrlFetchProcessor supports three content processing methods via the `content
 #### BeautifulSoup (Default)
 
 ```python
-config = FetchConfig(content_processor="beautifulsoup")
+config = FetchConfig(content_processor=ContentProcessor.BEAUTIFULSOUP)
 fetcher = UrlFetchProcessor(config)
 # Returns: Clean text extracted from HTML, fastest processing
 # Mimetype: "text/plain; charset=utf-8"
@@ -140,7 +141,7 @@ The markitdown processor provides the richest content extraction by converting H
 
 ```python
 config = FetchConfig(
-    content_processor="markitdown",
+    content_processor=ContentProcessor.MARKITDOWN,
     markitdown_options={
         "extract_tables": True,     # Preserve table structure
         "preserve_links": True,     # Keep link formatting
@@ -169,7 +170,7 @@ fetcher = UrlFetchProcessor(config)
 #### Raw HTML
 
 ```python
-config = FetchConfig(content_processor="raw")
+config = FetchConfig(content_processor=ContentProcessor.RAW)
 fetcher = UrlFetchProcessor(config)
 # Returns: Original HTML content without processing
 # Mimetype: "text/html; charset=utf-8"
@@ -229,11 +230,11 @@ for content_part in successful_content:
 
 ```python
 from genai_processors import streams
-from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
+from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
 
 # Configure markitdown processor for rich content extraction
 config = FetchConfig(
-    content_processor="markitdown",
+    content_processor=ContentProcessor.MARKITDOWN,
     include_original_part=False,
     markitdown_options={
         "extract_tables": True,
diff --git a/examples/markitdown_example.py b/examples/markitdown_example.py
@@ -9,7 +9,7 @@
 
 from genai_processors import processor
 
-from genai_processors_url_fetch import FetchConfig, UrlFetchProcessor
+from genai_processors_url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
 
 
 async def main() -> None:
@@ -22,7 +22,7 @@ async def main() -> None:
 
     # Configure with markitdown processor
     config = FetchConfig(
-        content_processor="markitdown",
+        content_processor=ContentProcessor.MARKITDOWN,
         include_original_part=False,
         timeout=10.0,
         markitdown_options={
diff --git a/examples/url_content_summarizer.py b/examples/url_content_summarizer.py
@@ -32,7 +32,11 @@
 from genai_processors.core import genai_model
 from google.genai import types as genai_types
 
-from genai_processors_url_fetch.url_fetch import FetchConfig, UrlFetchProcessor
+from genai_processors_url_fetch.url_fetch import (
+    ContentProcessor,
+    FetchConfig,
+    UrlFetchProcessor,
+)
 
 # Get API key from environment
 API_KEY = os.environ.get("GEMINI_API_KEY", "")
@@ -77,7 +81,7 @@ async def run_url_summarizer() -> None:
         timeout=10.0,
         max_response_size=2 * 1024 * 1024,  # 2MB limit
         include_original_part=False,  # Only show fetched content
-        extract_text_only=True,  # Convert HTML to text
+        content_processor=ContentProcessor.BEAUTIFULSOUP,  # Extract clean text
         # Security: Only allow HTTPS and block private IPs
         allowed_schemes=["https"],
         block_private_ips=True,
diff --git a/genai_processors_url_fetch/__init__.py b/genai_processors_url_fetch/__init__.py
@@ -8,7 +8,7 @@
 This is an independent contrib processor for the genai-processors ecosystem.
 """
 
-from .url_fetch import FetchConfig, UrlFetchProcessor
+from .url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
 
-__version__ = "0.2.0"
-__all__ = ["UrlFetchProcessor", "FetchConfig"]
+__version__ = "0.3.0"
+__all__ = ["UrlFetchProcessor", "FetchConfig", "ContentProcessor"]
diff --git a/genai_processors_url_fetch/tests/test_markitdown.py b/genai_processors_url_fetch/tests/test_markitdown.py
@@ -6,7 +6,7 @@
 import pytest
 from genai_processors import processor
 
-from genai_processors_url_fetch import FetchConfig, UrlFetchProcessor
+from genai_processors_url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
 
 
 class TestMarkitdownIntegration:
@@ -18,12 +18,12 @@ def test_markitdown_config_validation(self) -> None:
         try:
             import markitdown  # noqa: F401
 
-            config = FetchConfig(content_processor="markitdown")
-            assert config.content_processor == "markitdown"
+            config = FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
+            assert config.content_processor == ContentProcessor.MARKITDOWN
         except ImportError:
             # Should raise ImportError if markitdown is not available
             with pytest.raises(ImportError, match="markitdown is required"):
-                FetchConfig(content_processor="markitdown")
+                FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
 
     def test_markitdown_options_passed_correctly(self) -> None:
         """Test markitdown options are passed to MarkItDown constructor."""
@@ -145,15 +145,15 @@ def test_backward_compatibility_extract_text_only(self) -> None:
             assert len(w) == 1
             assert issubclass(w[0].category, DeprecationWarning)
             assert "extract_text_only is deprecated" in str(w[0].message)
-            assert config1.content_processor == "beautifulsoup"
+            assert config1.content_processor == ContentProcessor.BEAUTIFULSOUP
 
         # Test extract_text_only=False maps to raw
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
             config2 = FetchConfig(extract_text_only=False)
             assert len(w) == 1
             assert issubclass(w[0].category, DeprecationWarning)
-            assert config2.content_processor == "raw"
+            assert config2.content_processor == ContentProcessor.RAW
 
     @pytest.mark.anyio
     async def test_invalid_content_processor_raises_error(self) -> None:
@@ -217,9 +217,9 @@ def test_markitdown_config_import_error_when_not_available(self) -> None:
         # Mock HAS_MARKITDOWN to be False to simulate markitdown not available
         patch_path = "genai_processors_url_fetch.url_fetch.HAS_MARKITDOWN"
         with patch(patch_path, new=False):
-            expected_msg = "markitdown is required for content_processor"
+            expected_msg = "markitdown is required for ContentProcessor.MARKITDOWN"
             with pytest.raises(ImportError, match=expected_msg):
-                FetchConfig(content_processor="markitdown")
+                FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
 
     def test_extract_text_only_overrides_markitdown(self) -> None:
         """Test extract_text_only parameter overrides markitdown setting."""
@@ -236,12 +236,12 @@ def test_extract_text_only_overrides_markitdown(self) -> None:
                 assert len(w) == 1
                 assert issubclass(w[0].category, DeprecationWarning)
                 assert "extract_text_only is deprecated" in str(w[0].message)
-                assert config.content_processor == "raw"  # Overridden
+                assert config.content_processor == ContentProcessor.RAW  # Overridden
 
         except ImportError:
             # If markitdown not available, test override with default
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter("always")
                 config = FetchConfig(extract_text_only=False)
                 assert len(w) == 1
-                assert config.content_processor == "raw"
+                assert config.content_processor == ContentProcessor.RAW
diff --git a/genai_processors_url_fetch/tests/test_url_fetch.py b/genai_processors_url_fetch/tests/test_url_fetch.py
@@ -9,6 +9,7 @@
 
 from genai_processors_url_fetch.url_fetch import (
     URL_REGEX,
+    ContentProcessor,
     FetchConfig,
     UrlFetchProcessor,
 )
@@ -200,9 +201,9 @@ async def test_fail_on_error_config(self) -> None:
 
     @pytest.mark.anyio
     async def test_content_processor_raw_config(self) -> None:
-        """Test the content_processor='raw' configuration option."""
+        """Test the content_processor=ContentProcessor.RAW configuration option."""
         config = FetchConfig(
-            content_processor="raw",
+            content_processor=ContentProcessor.RAW,
             include_original_part=False,
         )
         p = UrlFetchProcessor(config)
@@ -243,20 +244,20 @@ def test_fetch_config_initialization(self) -> None:
         assert config.timeout == 15.0
         assert config.include_original_part is True
         assert config.fail_on_error is False
-        assert config.content_processor == "beautifulsoup"
+        assert config.content_processor == ContentProcessor.BEAUTIFULSOUP
         assert config.extract_text_only is None  # deprecated field
 
         # Custom config
         config = FetchConfig(
             timeout=30.0,
             include_original_part=False,
             fail_on_error=True,
-            content_processor="raw",
+            content_processor=ContentProcessor.RAW,
         )
         assert config.timeout == 30.0
         assert config.include_original_part is False
         assert config.fail_on_error is True
-        assert config.content_processor == "raw"
+        assert config.content_processor == ContentProcessor.RAW
 
     def test_backward_compatibility_extract_text_only(self) -> None:
         """Test backward compatibility for extract_text_only parameter."""
@@ -266,13 +267,13 @@ def test_backward_compatibility_extract_text_only(self) -> None:
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", DeprecationWarning)
             config1 = FetchConfig(extract_text_only=True)
-            assert config1.content_processor == "beautifulsoup"
+            assert config1.content_processor == ContentProcessor.BEAUTIFULSOUP
 
         # Test extract_text_only=False maps to raw
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", DeprecationWarning)
             config2 = FetchConfig(extract_text_only=False)
-            assert config2.content_processor == "raw"
+            assert config2.content_processor == ContentProcessor.RAW
 
     def test_processor_initialization(self) -> None:
         """Test UrlFetchProcessor initialization."""
@@ -285,6 +286,34 @@ def test_processor_initialization(self) -> None:
         p = UrlFetchProcessor(config)
         assert p.config.timeout == 45.0
 
+    def test_content_processor_enum(self) -> None:
+        """Test ContentProcessor enum functionality."""
+        # MARKITDOWN is checked on the enum itself: FetchConfig raises
+        # ImportError for it when the optional markitdown extra is missing.
+        assert ContentProcessor.MARKITDOWN.value == "markitdown"
+        assert isinstance(ContentProcessor.MARKITDOWN, ContentProcessor)
+
+        config2 = FetchConfig(content_processor=ContentProcessor.RAW)
+        assert config2.content_processor == ContentProcessor.RAW
+        assert config2.content_processor.value == "raw"
+
+        config3 = FetchConfig(content_processor=ContentProcessor.BEAUTIFULSOUP)
+        assert config3.content_processor == ContentProcessor.BEAUTIFULSOUP
+        assert config3.content_processor.value == "beautifulsoup"
+
+        # String-to-enum conversion (backward compatibility); "raw" is used
+        config4 = FetchConfig(content_processor="raw")  # no optional deps
+        assert config4.content_processor == ContentProcessor.RAW
+        assert isinstance(config4.content_processor, ContentProcessor)
+
+        # Test invalid string raises ValueError
+        expected_error = (
+            r"Invalid content_processor 'invalid_processor'\. "
+            r"Valid values are: 'beautifulsoup', 'markitdown', 'raw'\."
+        )
+        with pytest.raises(ValueError, match=expected_error):
+            FetchConfig(content_processor="invalid_processor")
+
     @pytest.mark.anyio
     async def test_default_security_settings(self) -> None:
         """Test that security is enabled by default."""
diff --git a/genai_processors_url_fetch/url_fetch.py b/genai_processors_url_fetch/url_fetch.py
diff --git a/pyproject.toml b/pyproject.toml
diff --git a/uv.lock b/uv.lock