diff --git a/examples/firecrawl_example.py b/examples/firecrawl_example.py
new file mode 100644
index 0000000000..76797be2a5
--- /dev/null
+++ b/examples/firecrawl_example.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Example usage of the Firecrawl tool with MetaGPT."""
+
+import asyncio
+import os
+import sys
+from pathlib import Path
+
+# Add the project root to the Python path so the example runs from a checkout
+project_root = Path(__file__).parent.parent
+sys.path.append(str(project_root))
+
+from metagpt.tools.libs.firecrawl import Firecrawl
+
+
+async def main():
+    """Run example usage of the Firecrawl tool."""
+    # Set up the API key if it is not already in the environment
+    if "FIRECRAWL_API_KEY" not in os.environ:
+        os.environ["FIRECRAWL_API_KEY"] = "YOUR-FIRECRAWL-API-KEY"
+
+    # Create a Firecrawl instance
+    firecrawl = Firecrawl()
+
+    # Example 1: Search for information
+    print("\nExample 1: Search for a therapist in Portugal by name")
+    search_results = await firecrawl.search("Psicologa Clínica Mairí Stumpf")
+    print("Search Results:", search_results)
+
+    # Example 2: Map and crawl a website
+    print("\nExample 2: Map and crawl a website")
+    map_results = await firecrawl.map_url("https://docs.firecrawl.dev")
+    print("Map Results:", map_results)
+
+    if map_results.get("links"):
+        crawl_job = await firecrawl.crawl_url(map_results["links"][0])
+        print("Crawl Job:", crawl_job)
+
+        if crawl_job.get("id"):
+            status = await firecrawl.get_crawl_status(crawl_job["id"])
+            print("Crawl Status:", status)
+            # Poll until the crawl job reports "completed"
+            while status.get("status") != "completed":
+                await asyncio.sleep(5)
+                status = await firecrawl.get_crawl_status(crawl_job["id"])
+                print("Crawl Status:", status)
+
+    # Example 3: Scrape a specific URL
+    print("\nExample 3: Scrape a URL")
+    scrape_results = await firecrawl.scrape_url("https://example.com")
+    print("Scrape Results:", scrape_results)
+
+    # Example 4: Extract information from URLs
+    print("\nExample 4: Extract information")
+    extract_job = await firecrawl.extract(
+        ["https://www.imdb.com/chart/starmeter/"],
+        params={"prompt": "Extract the top five most popular celebs' names and their popularity scores, if available"},
+    )
+    print("Extract Job:", extract_job)
+
+    if extract_job.get("id"):
+        extract_status = await firecrawl.get_extract_status(extract_job["id"])
+        print("\nExtract Status:", extract_status)
+        # Poll until the extract job reports "completed"
+        while extract_status.get("status") != "completed":
+            await asyncio.sleep(10)
+            extract_status = await firecrawl.get_extract_status(extract_job["id"])
+            print("\nUpdated Status:", extract_status)
+
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/metagpt/tools/libs/firecrawl.py b/metagpt/tools/libs/firecrawl.py
new file mode 100644
index 0000000000..f67ac0e388
--- /dev/null
+++ b/metagpt/tools/libs/firecrawl.py
@@ -0,0 +1,314 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+"""Firecrawl Tool for MetaGPT.
+
+This module provides a tool for interacting with the Firecrawl API, enabling web scraping,
+crawling, searching, and information extraction capabilities.
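+
+Example (illustrative sketch; assumes a valid Firecrawl API key):
+
+    firecrawl = Firecrawl(api_key="YOUR-FIRECRAWL-API-KEY")
+    page = await firecrawl.scrape_url("https://example.com")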
+
+Author: Ademílson Tonato
+"""
+
+import os
+from typing import Any, Dict, List, Optional
+
+import aiohttp
+
+from metagpt.tools.tool_registry import register_tool
+
+
+@register_tool(tags=["web", "scraping", "search"], include_functions=["map_url", "scrape_url", "search", "crawl_url", "extract"])
+class Firecrawl:
+    """A tool for web scraping, crawling, searching, and information extraction using the Firecrawl API.
+
+    This tool provides methods to interact with the Firecrawl API for various web data collection
+    and processing tasks. It supports URL mapping, scraping, searching, crawling, and information
+    extraction.
+
+    Attributes:
+        api_key (str): The API key for authenticating with the Firecrawl API.
+        api_url (str): The base URL for the Firecrawl API.
+    """
+
+    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None):
+        """Initialize the Firecrawl tool.
+
+        Args:
+            api_key (Optional[str]): API key for Firecrawl. Defaults to the FIRECRAWL_API_KEY environment variable.
+            api_url (Optional[str]): Base URL for the Firecrawl API. Defaults to the production URL.
+        """
+        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
+        if not self.api_key:
+            raise ValueError('No API key provided')
+        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
+        self.request_timeout = aiohttp.ClientTimeout(total=60)
+
+    def _prepare_headers(self) -> Dict[str, str]:
+        """Prepare headers for API requests.
+
+        Returns:
+            Dict[str, str]: Headers including content type and authorization.
+        """
+        return {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {self.api_key}',
+        }
+
+    def _prepare_request_data(self, data: Dict[str, Any]) -> Dict[str, Any]:
+        """Prepare request data with the integration parameter.
+
+        Args:
+            data (Dict[str, Any]): The original request data.
+
+        Returns:
+            Dict[str, Any]: Request data with the integration parameter added.
+        """
+        data['integration'] = 'metagpt'
+        return data
+
+    async def _handle_error(self, response: aiohttp.ClientResponse, action: str) -> None:
+        """Handle API errors.
+
+        Args:
+            response (aiohttp.ClientResponse): The response from the API.
+            action (str): Description of the action being performed.
+
+        Raises:
+            aiohttp.ClientResponseError: If the API request fails.
+        """
+        try:
+            error_data = await response.json()
+            error_message = error_data.get('error', 'No error message provided.')
+            error_details = error_data.get('details', 'No additional error details provided.')
+        except (aiohttp.ContentTypeError, ValueError):
+            raise aiohttp.ClientResponseError(
+                response.request_info,
+                response.history,
+                status=response.status,
+                message=f'Failed to parse Firecrawl error response as JSON. Status code: {response.status}'
+            )
+
+        message = f"Error during {action}: Status code {response.status}. {error_message} - {error_details}"
+        raise aiohttp.ClientResponseError(
+            response.request_info,
+            response.history,
+            status=response.status,
+            message=message
+        )
+
+    async def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
+        """Map a URL to discover all available links.
+
+        Args:
+            url (str): The URL to map.
+            params (Optional[Dict[str, Any]]): Additional parameters for the mapping operation.
+
+        Returns:
+            Dict[str, Any]: A dictionary containing the mapped URLs and related information.
+
+        Raises:
+            aiohttp.ClientResponseError: If the API request fails.
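+
+        Example:
+            Illustrative usage; the "links" key follows the response shape used
+            in the bundled example script:
+
+                result = await firecrawl.map_url("https://docs.firecrawl.dev")
+                links = result.get("links", [])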
+ """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + json_data = self._prepare_request_data(json_data) + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/map', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'map URL') + + async def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Scrape content from a specific URL. + + Args: + url (str): The URL to scrape. + params (Optional[Dict[str, Any]]): Additional parameters for the scraping operation. + + Returns: + Dict[str, Any]: A dictionary containing the scraped content and metadata. + + Raises: + aiohttp.ClientResponseError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + json_data = self._prepare_request_data(json_data) + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/scrape', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'scrape URL') + + async def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Perform a web search using Firecrawl. + + Args: + query (str): The search query. + params (Optional[Dict[str, Any]]): Additional parameters for the search operation. + + Returns: + Dict[str, Any]: A dictionary containing search results and metadata. + + Raises: + aiohttp.ClientResponseError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'query': query} + if params: + json_data.update(params) + json_data = self._prepare_request_data(json_data) + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/search', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'search') + + async def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Start a crawl job for a given URL. + + Args: + url (str): The URL to crawl. + params (Optional[Dict[str, Any]]): Additional parameters for the crawl operation. + + Returns: + Dict[str, Any]: A dictionary containing the crawl results and metadata. + + Raises: + aiohttp.ClientResponseError: If the API request fails. 
+ """ + headers = self._prepare_headers() + json_data = {'url': url} + if params: + json_data.update(params) + json_data = self._prepare_request_data(json_data) + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/crawl', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'start crawl job') + + async def get_crawl_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of a crawl job. + + Args: + job_id (str): The ID of the crawl job. + + Returns: + Dict[str, Any]: A dictionary containing the crawl job status and results. + + Raises: + aiohttp.ClientResponseError: If the API request fails. + """ + headers = self._prepare_headers() + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.get( + f'{self.api_url}/v1/crawl/{job_id}', + headers=headers + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'check crawl status') + + async def extract(self, urls: List[str], params: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: + """Extract structured information from URLs. + + Args: + urls (List[str]): List of URLs to extract information from. + params (Optional[Dict[str, Any]]): Additional parameters for the extraction operation. + + Returns: + Dict[str, Any]: A dictionary containing the extracted information and metadata. + + Raises: + aiohttp.ClientResponseError: If the API request fails. + """ + headers = self._prepare_headers() + json_data = {'urls': urls} + if params: + json_data.update(params) + json_data = self._prepare_request_data(json_data) + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.post( + f'{self.api_url}/v1/extract', + headers=headers, + json=json_data + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'extract') + + async def get_extract_status(self, job_id: str) -> Dict[str, Any]: + """Get the status of an extract job. + + Args: + job_id (str): The ID of the extract job. + + Returns: + Dict[str, Any]: A dictionary containing the extract job status and results. + + Raises: + aiohttp.ClientResponseError: If the API request fails. 
+ """ + headers = self._prepare_headers() + + async with aiohttp.ClientSession(timeout=self.request_timeout) as session: + async with session.get( + f'{self.api_url}/v1/extract/{job_id}', + headers=headers + ) as response: + if response.status == 200: + try: + return await response.json() + except: + raise Exception('Failed to parse Firecrawl response as JSON.') + else: + await self._handle_error(response, 'check extract status') \ No newline at end of file diff --git a/tests/metagpt/tools/libs/test_firecrawl.py b/tests/metagpt/tools/libs/test_firecrawl.py new file mode 100644 index 0000000000..7226628017 --- /dev/null +++ b/tests/metagpt/tools/libs/test_firecrawl.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +"""Test module for the Firecrawl tool.""" + +import os +import pytest +from unittest.mock import MagicMock, patch, AsyncMock +import aiohttp + +from metagpt.tools.libs.firecrawl import Firecrawl + +API_KEY = "YOUR-FIRECRAWL-API-KEY" +API_URL = "https://api.firecrawl.dev" + +EXPECTED_HEADERS = { + 'Content-Type': 'application/json', + 'Authorization': f'Bearer {API_KEY}', +} + +@pytest.fixture +def firecrawl(): + """Create a Firecrawl instance for testing.""" + return Firecrawl(api_key=API_KEY, api_url=API_URL) + +def test_initialization(): + tool = Firecrawl(api_key=API_KEY, api_url=API_URL) + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + +def test_initialization_with_env_vars(): + os.environ["FIRECRAWL_API_KEY"] = API_KEY + os.environ["FIRECRAWL_API_URL"] = API_URL + tool = Firecrawl() + assert tool.api_key == API_KEY + assert tool.api_url == API_URL + del os.environ["FIRECRAWL_API_KEY"] + del os.environ["FIRECRAWL_API_URL"] + +def test_initialization_without_api_key(): + with pytest.raises(ValueError, match="No API key provided"): + Firecrawl() + +def mock_aiohttp_session(method: str, mock_response_data: dict, status: int = 200): + mock_response = AsyncMock() + mock_response.status = status + mock_response.json = AsyncMock(return_value=mock_response_data) + + mock_cm_response = MagicMock() + mock_cm_response.__aenter__ = AsyncMock(return_value=mock_response) + mock_cm_response.__aexit__ = AsyncMock(return_value=None) + + mock_method = MagicMock(return_value=mock_cm_response) + + mock_session = MagicMock() + setattr(mock_session, method, mock_method) + + mock_session_cm = MagicMock() + mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session) + mock_session_cm.__aexit__ = AsyncMock(return_value=None) + + return mock_session_cm + +@pytest.mark.asyncio +async def test_map_url(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "links": ["http://example.com/page1"]})): + result = await firecrawl.map_url("http://example.com") + assert result == {"success": True, "links": ["http://example.com/page1"]} + +@pytest.mark.asyncio +async def test_scrape_url(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "data": {"title": "Example"}})): + result = await firecrawl.scrape_url("http://example.com") + assert result == {"success": True, "data": {"title": "Example"}} + +@pytest.mark.asyncio +async def test_search(firecrawl): + with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "results": [{"title": "Test Result"}]})): + result = await firecrawl.search("test query") + assert result == {"success": True, "results": [{"title": "Test Result"}]} + +@pytest.mark.asyncio +async def 
+async def test_crawl_url(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "id": "test_job_id"})):
+        result = await firecrawl.crawl_url("http://example.com")
+        assert result == {"success": True, "id": "test_job_id"}
+
+
+@pytest.mark.asyncio
+async def test_get_crawl_status(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("get", {"success": True, "status": "completed"})):
+        result = await firecrawl.get_crawl_status("test_job_id")
+        assert result == {"success": True, "status": "completed"}
+
+
+@pytest.mark.asyncio
+async def test_extract(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("post", {"success": True, "data": {"extracted": "content"}})):
+        result = await firecrawl.extract(["http://example.com"])
+        assert result == {"success": True, "data": {"extracted": "content"}}
+
+
+@pytest.mark.asyncio
+async def test_get_extract_status(firecrawl):
+    with patch("aiohttp.ClientSession", return_value=mock_aiohttp_session("get", {"success": True, "status": "completed"})):
+        result = await firecrawl.get_extract_status("test_job_id")
+        assert result == {"success": True, "status": "completed"}
+
+
+@pytest.mark.asyncio
+async def test_error_handling(firecrawl):
+    mock_response = AsyncMock()
+    mock_response.status = 400
+    mock_response.json = AsyncMock(return_value={"error": "Test error", "details": "Test error details"})
+    mock_response.request_info = MagicMock()
+    mock_response.history = []
+
+    mock_context = MagicMock()
+    mock_context.__aenter__ = AsyncMock(return_value=mock_response)
+    mock_context.__aexit__ = AsyncMock(return_value=None)
+
+    def mock_post(*args, **kwargs):
+        return mock_context
+
+    mock_session = MagicMock()
+    mock_session.post = mock_post
+
+    mock_session_cm = MagicMock()
+    mock_session_cm.__aenter__ = AsyncMock(return_value=mock_session)
+    mock_session_cm.__aexit__ = AsyncMock(return_value=None)
+
+    with patch("aiohttp.ClientSession", return_value=mock_session_cm):
+        with pytest.raises(aiohttp.ClientResponseError):
+            await firecrawl.map_url("http://example.com")
+
+
+@pytest.mark.asyncio
+async def test_params_integration(firecrawl):
+    session_cm = mock_aiohttp_session("post", {"success": True})
+    with patch("aiohttp.ClientSession", return_value=session_cm):
+        params = {"param1": "value1", "param2": "value2"}
+        result = await firecrawl.map_url("http://example.com", params=params)
+        assert result == {"success": True}
+        # Verify the caller's params and the integration tag were merged into the payload
+        sent_json = session_cm.__aenter__.return_value.post.call_args.kwargs["json"]
+        assert sent_json["param1"] == "value1"
+        assert sent_json["param2"] == "value2"
+        assert sent_json["integration"] == "metagpt"
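+
+
+def test_prepare_request_data_adds_integration_tag(firecrawl):
+    # Illustrative extra check, not part of the original suite: the internal
+    # _prepare_request_data helper tags every outgoing payload with the
+    # "metagpt" integration marker before it is sent.
+    data = firecrawl._prepare_request_data({"url": "http://example.com"})
+    assert data["integration"] == "metagpt"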