Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 15 additions & 15 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,21 @@ This framework supports APIs for Firecrawl, Apify, ScraperAPI, ScrapingBee, Zyte

Below are evaluation results across different engines.

| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-----------------|-----------------------------|--------------|
| Firecrawl | 80.9 | 0.68 |
| Exa | 76.3 | 0.53 |
| Tavily | 67.6 | 0.50 |
| ScraperAPI | 63.5 | 0.45 |
| Zyte | 62.9 | 0.47 |
| ScrapingBee | 60.6 | 0.45 |
| Apify | 60.2 | 0.42 |
| Crawl4ai | 58.0 | 0.45 |
| Selenium | 55.0 | 0.40 |
| Scrapy | 54.0 | 0.43 |
| Puppeteer | 53.7 | 0.41 |
| Rest (requests) | 50.6 | 0.36 |
| Playwright | 39.5 | 0.34 |
| Engine | Coverage (Success Rate) (%) | Quality (F1) |
|-----------------|------------------------------|--------------|
| Firecrawl | 80.9 | 0.68 |
| Exa | 76.3 | 0.53 |
| Apify | 75.8 | 0.51 |
| Tavily | 67.6 | 0.50 |
| ScraperAPI | 63.5 | 0.45 |
| Zyte | 62.9 | 0.47 |
| ScrapingBee | 60.6 | 0.45 |
| Crawl4ai | 58.0 | 0.45 |
| Selenium | 55.0 | 0.40 |
| Scrapy | 54.0 | 0.43 |
| Puppeteer | 53.7 | 0.41 |
| Rest (requests) | 50.6 | 0.36 |
| Playwright | 39.5 | 0.34 |

## Install

Expand Down
44 changes: 20 additions & 24 deletions engines/apify_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,33 +24,28 @@ def __init__(self):
if not self.api_token:
raise RuntimeError("APIFY_API_TOKEN environment variable not set.")
self.client = ApifyClient(self.api_token)
self.actor_id = "apify/web-scraper"
self.actor_id = "apify/website-content-crawler"

def scrape(self, url: str, run_id: str) -> ScrapeResult:
error = None
html = ""
markdown = ""
content_size = 0
status_code = 500
status_code = 500
try:
# Start the actor and wait for it to finish
actor_client = self.client.actor(self.actor_id)
run_result = actor_client.call(
run_input={
"startUrls": [{"url": url}],
"maxRequestsPerCrawl": 1,
"pseudoUrls": [],
"linkSelector": "",
"proxyConfiguration": {"useApifyProxy": True},
"crawlerType": "chrome",
"pageFunction": """
async function(context) {
const $ = context.jQuery;
return {
html: $('body').html(),
status_code: context.response ? context.response.status : null
};
}
"""
"crawlerType": "playwright:adaptive",
"maxCrawlPages": 1,
"saveFiles": False,
"saveHtml": False,
"saveHtmlAsFile": False,
"saveMarkdown": True,
"saveScreenshots": False,
"signHttpRequests": False,
"proxyConfiguration": {"useApifyProxy": True}
},
timeout_secs=120 # Wait up to 2 minutes
)
Expand All @@ -60,12 +55,13 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
dataset_id = run_result["defaultDatasetId"]
dataset_client = self.client.dataset(dataset_id)
items = dataset_client.list_items().items
if items and "html" in items[0]:
html = items[0]["html"] or ""
status_code = items[0].get("status_code")
content_size = len(html.encode("utf-8")) if html else 0
if items and "markdown" in items[0]:
markdown = items[0]["markdown"] or ""
crawl_data = items[0].get("crawl")
status_code = crawl_data.get("httpStatusCode")
content_size = len(markdown.encode("utf-8")) if markdown else 0
else:
error = "No HTML found in Apify dataset result."
error = "No markdown found in Apify dataset result."
except Exception as e:
error = str(e)

Expand All @@ -76,7 +72,7 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
status_code=status_code or 500,
error=error,
content_size=content_size,
format="html",
format="markdown",
created_at=datetime.now().isoformat(),
content=html,
content=markdown,
)
10 changes: 5 additions & 5 deletions runs/results/apify_api_quality.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"avg_recall": 0.4088987780290431,
"avg_precision": 0.4309147557081136,
"avg_f1": 0.4166200898332274,
"success_rate": 0.6021505376344086
}
"success_rate": 0.758,
"avg_recall": 0.490671096073996,
"avg_precision": 0.5579099299255283,
"avg_f1": 0.5082330459356168
}