diff --git a/.gitignore b/.gitignore
index 02a7e102..48c8c098 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ node_modules
 apify_storage
 crawlee_storage
 storage
+output.json
\ No newline at end of file
diff --git a/config.ts b/config.ts
index 84c15ba5..5b20aafd 100644
--- a/config.ts
+++ b/config.ts
@@ -5,6 +5,8 @@ type Config = {
   url: string;
   /** Pattern to match against for links on a page to subsequently crawl */
   match: string;
+  /** Optional regex; URLs matching this pattern will NOT be crawled */
+  exclude?: string;
   /** Selector to grab the inner text from */
   selector: string;
   /** Don't crawl more than this many pages */
@@ -23,6 +25,7 @@ type Config = {
 export const config: Config = {
   url: "https://www.builder.io/c/docs/developers",
   match: "https://www.builder.io/c/docs/**",
+  exclude: "integrate",
   selector: `.docs-builder-container`,
   maxPagesToCrawl: 50,
   outputFileName: "output.json",
diff --git a/src/main.ts b/src/main.ts
index c217d7cf..9d3a8529 100644
--- a/src/main.ts
+++ b/src/main.ts
@@ -4,6 +4,7 @@ import { readFile, writeFile } from "fs/promises";
 import { glob } from "glob";
 import { config } from "../config.js";
 import { Page } from "playwright";
+import { URL } from "url";
 
 export function getPageHtml(page: Page) {
   return page.evaluate((selector) => {
@@ -24,7 +25,7 @@ if (process.env.NO_CRAWL !== "true") {
       const cookie = {
         name: config.cookie.name,
        value: config.cookie.value,
-        url: request.loadedUrl,
+        url: request.loadedUrl,
      };
       await page.context().addCookies([cookie]);
     }
@@ -47,10 +48,20 @@ if (process.env.NO_CRAWL !== "true") {
       // Extract links from the current page
       // and add them to the crawling queue.
+      const links = await page.$$eval("a", (as) => as.map((a) => a.href));
+      const filteredLinks = links.filter((link) => {
+        // Keep the link unless it matches the optional exclude pattern
+        if (!config.exclude) return true;
+        return !new RegExp(config.exclude).test(link);
+      });
+
+      // Enqueue only the filtered links
       await enqueueLinks({
+        urls: filteredLinks,
         globs: [config.match],
       });
     },
+    // Comment out this option to scrape the full website.
     maxRequestsPerCrawl: config.maxPagesToCrawl,
     // Uncomment this option to see the browser window.