Skip to content

Commit 82cc59f

Browse files
authored
feat: Introduce ContentProcessor enum for content processing options (#8)
1 parent 298d253 commit 82cc59f

File tree

9 files changed

+115
-53
lines changed

9 files changed

+115
-53
lines changed

README.md

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@ uv add genai-processors-url-fetch[markitdown]
4545

4646
```python
4747
from genai_processors import processor
48-
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
48+
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
4949

5050
# Basic usage with defaults (BeautifulSoup text extraction)
5151
fetcher = UrlFetchProcessor()
5252

5353
# Use markitdown for richer content processing
54-
config = FetchConfig(content_processor="markitdown")
54+
config = FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
5555
markitdown_fetcher = UrlFetchProcessor(config)
5656

5757
# Process text containing URLs
@@ -81,7 +81,7 @@ All security features are enabled by default but can be configured via the Fetch
8181
The processor uses a dataclass-based configuration system for clean, type-safe settings. You can customize the processor's behavior by passing a FetchConfig object during initialization.
8282

8383
```python
84-
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
84+
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
8585

8686
# Example of a customized security configuration
8787
config = FetchConfig(
@@ -104,12 +104,13 @@ The `FetchConfig` dataclass provides comprehensive configuration options organiz
104104
* **user_agent** (str, default: "GenAI-Processors/UrlFetchProcessor"): The User-Agent string to send with HTTP requests.
105105
* **include_original_part** (bool, default: True): If True, the original ProcessorPart that contained the URL(s) will be yielded at the end of processing.
106106
* **fail_on_error** (bool, default: False): If True, the processor will raise a RuntimeError on the first failed fetch.
107-
* **content_processor** (Literal["beautifulsoup", "markitdown", "raw"], default: "beautifulsoup"): Content processing method.
108-
- `"beautifulsoup"`: Extract clean text using BeautifulSoup (fastest, good for simple HTML)
109-
- `"markitdown"`: Convert content to markdown using Microsoft's markitdown library (best for rich content, requires optional dependency)
110-
- `"raw"`: Return the raw HTML content without processing
107+
* **content_processor** (ContentProcessor, default: ContentProcessor.BEAUTIFULSOUP): Content processing method.
108+
* `ContentProcessor.BEAUTIFULSOUP`: Extract clean text using BeautifulSoup (fastest, good for simple HTML)
109+
* `ContentProcessor.MARKITDOWN`: Convert content to markdown using Microsoft's markitdown library (best for rich content, requires optional dependency)
110+
* `ContentProcessor.RAW`: Return the raw HTML content without processing
111+
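* **Note:** For backward compatibility, the string values `"beautifulsoup"`, `"markitdown"`, and `"raw"` are still accepted and are automatically converted to the corresponding `ContentProcessor` members. Any other string raises a `ValueError` listing the valid values.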
111112
* **markitdown_options** (dict[str, Any], default: {}): Options passed to the `MarkItDown` constructor of the markitdown library when the markitdown content processor is used.
112-
* **extract_text_only** (bool | None, default: None): **Deprecated.** Use `content_processor` instead. For backward compatibility: `True` maps to `"beautifulsoup"`, `False` maps to `"raw"`.
113+
* **extract_text_only** (bool | None, default: None): **Deprecated.** Use `content_processor` instead. For backward compatibility: `True` maps to `ContentProcessor.BEAUTIFULSOUP`, `False` maps to `ContentProcessor.RAW`.
113114

114115
##### Security Controls
115116

@@ -128,7 +129,7 @@ The UrlFetchProcessor supports three content processing methods via the `content
128129
#### BeautifulSoup (Default)
129130

130131
```python
131-
config = FetchConfig(content_processor="beautifulsoup")
132+
config = FetchConfig(content_processor=ContentProcessor.BEAUTIFULSOUP)
132133
fetcher = UrlFetchProcessor(config)
133134
# Returns: Clean text extracted from HTML, fastest processing
134135
# Mimetype: "text/plain; charset=utf-8"
@@ -140,7 +141,7 @@ The markitdown processor provides the richest content extraction by converting H
140141

141142
```python
142143
config = FetchConfig(
143-
content_processor="markitdown",
144+
content_processor=ContentProcessor.MARKITDOWN,
144145
markitdown_options={
145146
"extract_tables": True, # Preserve table structure
146147
"preserve_links": True, # Keep link formatting
@@ -169,7 +170,7 @@ fetcher = UrlFetchProcessor(config)
169170
#### Raw HTML
170171

171172
```python
172-
config = FetchConfig(content_processor="raw")
173+
config = FetchConfig(content_processor=ContentProcessor.RAW)
173174
fetcher = UrlFetchProcessor(config)
174175
# Returns: Original HTML content without processing
175176
# Mimetype: "text/html; charset=utf-8"
@@ -229,11 +230,11 @@ for content_part in successful_content:
229230

230231
```python
231232
from genai_processors import streams
232-
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig
233+
from genai_processors_url_fetch import UrlFetchProcessor, FetchConfig, ContentProcessor
233234

234235
# Configure markitdown processor for rich content extraction
235236
config = FetchConfig(
236-
content_processor="markitdown",
237+
content_processor=ContentProcessor.MARKITDOWN,
237238
include_original_part=False,
238239
markitdown_options={
239240
"extract_tables": True,

examples/markitdown_example.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010
from genai_processors import processor
1111

12-
from genai_processors_url_fetch import FetchConfig, UrlFetchProcessor
12+
from genai_processors_url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
1313

1414

1515
async def main() -> None:
@@ -22,7 +22,7 @@ async def main() -> None:
2222

2323
# Configure with markitdown processor
2424
config = FetchConfig(
25-
content_processor="markitdown",
25+
content_processor=ContentProcessor.MARKITDOWN,
2626
include_original_part=False,
2727
timeout=10.0,
2828
markitdown_options={

examples/url_content_summarizer.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@
3232
from genai_processors.core import genai_model
3333
from google.genai import types as genai_types
3434

35-
from genai_processors_url_fetch.url_fetch import FetchConfig, UrlFetchProcessor
35+
from genai_processors_url_fetch.url_fetch import (
36+
ContentProcessor,
37+
FetchConfig,
38+
UrlFetchProcessor,
39+
)
3640

3741
# Get API key from environment
3842
API_KEY = os.environ.get("GEMINI_API_KEY", "")
@@ -77,7 +81,7 @@ async def run_url_summarizer() -> None:
7781
timeout=10.0,
7882
max_response_size=2 * 1024 * 1024, # 2MB limit
7983
include_original_part=False, # Only show fetched content
80-
extract_text_only=True, # Convert HTML to text
84+
content_processor=ContentProcessor.BEAUTIFULSOUP, # Extract clean text
8185
# Security: Only allow HTTPS and block private IPs
8286
allowed_schemes=["https"],
8387
block_private_ips=True,

genai_processors_url_fetch/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
This is an independent contrib processor for the genai-processors ecosystem.
99
"""
1010

11-
from .url_fetch import FetchConfig, UrlFetchProcessor
11+
from .url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
1212

13-
__version__ = "0.2.0"
14-
__all__ = ["UrlFetchProcessor", "FetchConfig"]
13+
__version__ = "0.3.0"
14+
__all__ = ["UrlFetchProcessor", "FetchConfig", "ContentProcessor"]

genai_processors_url_fetch/tests/test_markitdown.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import pytest
77
from genai_processors import processor
88

9-
from genai_processors_url_fetch import FetchConfig, UrlFetchProcessor
9+
from genai_processors_url_fetch import ContentProcessor, FetchConfig, UrlFetchProcessor
1010

1111

1212
class TestMarkitdownIntegration:
@@ -18,12 +18,12 @@ def test_markitdown_config_validation(self) -> None:
1818
try:
1919
import markitdown # noqa: F401
2020

21-
config = FetchConfig(content_processor="markitdown")
22-
assert config.content_processor == "markitdown"
21+
config = FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
22+
assert config.content_processor == ContentProcessor.MARKITDOWN
2323
except ImportError:
2424
# Should raise ImportError if markitdown is not available
2525
with pytest.raises(ImportError, match="markitdown is required"):
26-
FetchConfig(content_processor="markitdown")
26+
FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
2727

2828
def test_markitdown_options_passed_correctly(self) -> None:
2929
"""Test markitdown options are passed to MarkItDown constructor."""
@@ -145,15 +145,15 @@ def test_backward_compatibility_extract_text_only(self) -> None:
145145
assert len(w) == 1
146146
assert issubclass(w[0].category, DeprecationWarning)
147147
assert "extract_text_only is deprecated" in str(w[0].message)
148-
assert config1.content_processor == "beautifulsoup"
148+
assert config1.content_processor == ContentProcessor.BEAUTIFULSOUP
149149

150150
# Test extract_text_only=False maps to raw
151151
with warnings.catch_warnings(record=True) as w:
152152
warnings.simplefilter("always")
153153
config2 = FetchConfig(extract_text_only=False)
154154
assert len(w) == 1
155155
assert issubclass(w[0].category, DeprecationWarning)
156-
assert config2.content_processor == "raw"
156+
assert config2.content_processor == ContentProcessor.RAW
157157

158158
@pytest.mark.anyio
159159
async def test_invalid_content_processor_raises_error(self) -> None:
@@ -217,9 +217,9 @@ def test_markitdown_config_import_error_when_not_available(self) -> None:
217217
# Mock HAS_MARKITDOWN to be False to simulate markitdown not available
218218
patch_path = "genai_processors_url_fetch.url_fetch.HAS_MARKITDOWN"
219219
with patch(patch_path, new=False):
220-
expected_msg = "markitdown is required for content_processor"
220+
expected_msg = "markitdown is required for ContentProcessor.MARKITDOWN"
221221
with pytest.raises(ImportError, match=expected_msg):
222-
FetchConfig(content_processor="markitdown")
222+
FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
223223

224224
def test_extract_text_only_overrides_markitdown(self) -> None:
225225
"""Test extract_text_only parameter overrides markitdown setting."""
@@ -236,12 +236,12 @@ def test_extract_text_only_overrides_markitdown(self) -> None:
236236
assert len(w) == 1
237237
assert issubclass(w[0].category, DeprecationWarning)
238238
assert "extract_text_only is deprecated" in str(w[0].message)
239-
assert config.content_processor == "raw" # Overridden
239+
assert config.content_processor == ContentProcessor.RAW # Overridden
240240

241241
except ImportError:
242242
# If markitdown not available, test override with default
243243
with warnings.catch_warnings(record=True) as w:
244244
warnings.simplefilter("always")
245245
config = FetchConfig(extract_text_only=False)
246246
assert len(w) == 1
247-
assert config.content_processor == "raw"
247+
assert config.content_processor == ContentProcessor.RAW

genai_processors_url_fetch/tests/test_url_fetch.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
from genai_processors_url_fetch.url_fetch import (
1111
URL_REGEX,
12+
ContentProcessor,
1213
FetchConfig,
1314
UrlFetchProcessor,
1415
)
@@ -200,9 +201,9 @@ async def test_fail_on_error_config(self) -> None:
200201

201202
@pytest.mark.anyio
202203
async def test_content_processor_raw_config(self) -> None:
203-
"""Test the content_processor='raw' configuration option."""
204+
"""Test the content_processor=ContentProcessor.RAW configuration option."""
204205
config = FetchConfig(
205-
content_processor="raw",
206+
content_processor=ContentProcessor.RAW,
206207
include_original_part=False,
207208
)
208209
p = UrlFetchProcessor(config)
@@ -243,20 +244,20 @@ def test_fetch_config_initialization(self) -> None:
243244
assert config.timeout == 15.0
244245
assert config.include_original_part is True
245246
assert config.fail_on_error is False
246-
assert config.content_processor == "beautifulsoup"
247+
assert config.content_processor == ContentProcessor.BEAUTIFULSOUP
247248
assert config.extract_text_only is None # deprecated field
248249

249250
# Custom config
250251
config = FetchConfig(
251252
timeout=30.0,
252253
include_original_part=False,
253254
fail_on_error=True,
254-
content_processor="raw",
255+
content_processor=ContentProcessor.RAW,
255256
)
256257
assert config.timeout == 30.0
257258
assert config.include_original_part is False
258259
assert config.fail_on_error is True
259-
assert config.content_processor == "raw"
260+
assert config.content_processor == ContentProcessor.RAW
260261

261262
def test_backward_compatibility_extract_text_only(self) -> None:
262263
"""Test backward compatibility for extract_text_only parameter."""
@@ -266,13 +267,13 @@ def test_backward_compatibility_extract_text_only(self) -> None:
266267
with warnings.catch_warnings():
267268
warnings.simplefilter("ignore", DeprecationWarning)
268269
config1 = FetchConfig(extract_text_only=True)
269-
assert config1.content_processor == "beautifulsoup"
270+
assert config1.content_processor == ContentProcessor.BEAUTIFULSOUP
270271

271272
# Test extract_text_only=False maps to raw
272273
with warnings.catch_warnings():
273274
warnings.simplefilter("ignore", DeprecationWarning)
274275
config2 = FetchConfig(extract_text_only=False)
275-
assert config2.content_processor == "raw"
276+
assert config2.content_processor == ContentProcessor.RAW
276277

277278
def test_processor_initialization(self) -> None:
278279
"""Test UrlFetchProcessor initialization."""
@@ -285,6 +286,34 @@ def test_processor_initialization(self) -> None:
285286
p = UrlFetchProcessor(config)
286287
assert p.config.timeout == 45.0
287288

289+
def test_content_processor_enum(self) -> None:
290+
"""Test ContentProcessor enum functionality."""
291+
# Test enum usage
292+
config1 = FetchConfig(content_processor=ContentProcessor.MARKITDOWN)
293+
assert config1.content_processor == ContentProcessor.MARKITDOWN
294+
assert config1.content_processor.value == "markitdown"
295+
296+
config2 = FetchConfig(content_processor=ContentProcessor.RAW)
297+
assert config2.content_processor == ContentProcessor.RAW
298+
assert config2.content_processor.value == "raw"
299+
300+
config3 = FetchConfig(content_processor=ContentProcessor.BEAUTIFULSOUP)
301+
assert config3.content_processor == ContentProcessor.BEAUTIFULSOUP
302+
assert config3.content_processor.value == "beautifulsoup"
303+
304+
# Test string-to-enum conversion (backward compatibility)
305+
config4 = FetchConfig(content_processor="markitdown")
306+
assert config4.content_processor == ContentProcessor.MARKITDOWN
307+
assert isinstance(config4.content_processor, ContentProcessor)
308+
309+
# Test invalid string raises ValueError
310+
expected_error = (
311+
r"Invalid content_processor 'invalid_processor'\. "
312+
r"Valid values are: 'beautifulsoup', 'markitdown', 'raw'\."
313+
)
314+
with pytest.raises(ValueError, match=expected_error):
315+
FetchConfig(content_processor="invalid_processor")
316+
288317
@pytest.mark.anyio
289318
async def test_default_security_settings(self) -> None:
290319
"""Test that security is enabled by default."""

0 commit comments

Comments
 (0)