diff --git a/.gitignore b/.gitignore
index c132d3b4..4daa0eb2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,9 @@ storage
 # any output from the crawler
 *.json
 pnpm-lock.yaml
+
+# Final outputs folder
+outputs
+
+# VS Code workspace files
+*.code-workspace
diff --git a/.prettierignore b/.prettierignore
new file mode 100644
index 00000000..fe827dbd
--- /dev/null
+++ b/.prettierignore
@@ -0,0 +1,30 @@
+# Ignore artifacts
+
+node_modules
+.github
+storage
+outputs
+*.code-workspace
+
+## This file tells which files shouldn't be added to source control
+
+.idea
+dist
+node_modules
+apify_storage
+crawlee_storage
+storage
+.DS_Store
+
+## any output from the crawler
+
+*.json
+pnpm-lock.yaml
+
+## Final outputs folder
+
+outputs
+
+## VS Code workspace files
+
+*.code-workspace
diff --git a/README.md b/README.md
index 43bfe4c7..33c67488 100644
--- a/README.md
+++ b/README.md
@@ -64,32 +64,112 @@ export const defaultConfig: Config = {
 };
 ```
 
-See [config.ts](src/config.ts) for all available options. Here is a sample of the common configu options:
+See [config.ts](src/config.ts) for all available options. Here is a sample of the common config options:
 
-```ts
+````ts
 type Config = {
-  /** URL to start the crawl, if sitemap is provided then it will be used instead and download all pages in the sitemap */
+  /**
+   * URL to start the crawl, if url is a sitemap, it will crawl all pages in the sitemap
+   * @example "https://www.builder.io/c/docs/developers"
+   * @example "https://www.builder.io/sitemap.xml"
+   * @default ""
+   * @required
+   */
   url: string;
-  /** Pattern to match against for links on a page to subsequently crawl */
+  /**
+   * Pattern to match against for links on a page to subsequently crawl
+   * @example "https://www.builder.io/c/docs/**"
+   * @default ""
+   */
   match: string;
-  /** Selector to grab the inner text from */
+  /**
+   * Selector to grab the inner text from
+   * @example ".docs-builder-container"
+   * @default ""
+   * @required
+   */
   selector: string;
-  /** Don't crawl more than this many pages */
+  /**
+   * Don't crawl more than this many pages
+   * @default 50
+   */
   maxPagesToCrawl: number;
-  /** File name for the finished data */
+  /**
+   * File name for the finished data
+   * @example "output.json"
+   */
   outputFileName: string;
-  /** Optional resources to exclude
-   *
+  /**
+   * Cookie to be set. E.g. for Cookie Consent
+   */
+  cookie?: {
+    name: string,
+    value: string,
+    url: string,
+  };
+  /**
+   * Function to run for each page found
+   */
+  onVisitPage?: (page: object, data: string) => Promise<void>;
+  /**
+   * Timeout to wait for a selector to appear
+   */
+  waitForSelectorTimeout?: number;
+  /**
+   * Resource file extensions to exclude from crawl
    * @example
    * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
    */
   resourceExclusions?: string[];
-  /** Optional maximum file size in megabytes to include in the output file */
+  /**
+   * Maximum file size in megabytes to include in the output file
+   * @example 1
+   */
   maxFileSize?: number;
-  /** Optional maximum number tokens to include in the output file */
+  /**
+   * The maximum number of tokens to include in the output file
+   * @example 5000
+   */
   maxTokens?: number;
+  /**
+   * Maximum number of concurrent parallel requests at a time
+   * @example
+   * Specific number of parallel requests
+   * ```ts
+   * maxConcurrency: 2;
+   * ```
+   * @example
+   * 0 = Unlimited; doesn't stop until cancelled
+   * ```ts
+   * maxConcurrency: 0;
+   * ```
+   * @example
+   * undefined = the maximum number of parallel requests possible
+   * ```ts
+   * maxConcurrency: undefined;
+   * ```
+   * @default 1
+   */
+  maxConcurrency?: number;
+  /**
+   * Range for random number of milliseconds between **min** and **max** to wait after each page crawl
+   * @default {min:1000,max:1000}
+   * @example {min:1000, max:2000}
+   */
+  waitPerPageCrawlTimeoutRange?: {
+    min: number,
+    max: number,
+  };
+
+  /**
+   * Headless mode: run Playwright without a displayed browser window
+   * @default true
+   */
+  headless?: boolean;
 };
-```
+````
 
 #### Run your crawler
 
@@ -103,6 +183,22 @@ npm start
 
 To obtain the `output.json` with a containerized execution. Go into the `containerapp` directory. Modify the `config.ts` same as above, the `output.json`file should be generated in the data folder. Note : the `outputFileName` property in the `config.ts` file in containerapp folder is configured to work with the container.
 
+#### [Running as a CLI](#running-as-a-cli)
+
+To run the `./dist/cli.ts` command line interface, follow these instructions:
+
+1. Open a terminal.
+2. Navigate to the root directory of the project.
+3. Run the following command: `./dist/cli.ts [arguments]`
+   Replace `[arguments]` with the appropriate command line arguments for your use case.
+4. The CLI will execute the specified command and display the output in the terminal.
+
+> Note: Make sure you have the necessary dependencies installed and the project has been built before running the CLI.
+
+#### [Development](#development)
+
+> Instructions for Development will go here...
+
 ### Upload your data to OpenAI
 
 The crawl will generate a file called `output.json` at the root of this project. Upload that [to OpenAI](https://platform.openai.com/docs/assistants/overview) to create your custom assistant or custom GPT.
diff --git a/config.ts b/config.ts
index bc2d22e0..e289244f 100644
--- a/config.ts
+++ b/config.ts
@@ -1,8 +1,33 @@
 import { Config } from "./src/config";
+import { fileURLToPath } from "url";
+import { dirname } from "path";
+
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const startingUrl = "https://www.builder.io/c/docs/developers";
+const urlPrefix = "https://";
+const domain = "www.builder.io";
+const urlSuffix = "/c/docs";
+const baseUrl = urlPrefix + domain;
+const matchUrl_prefix = baseUrl + urlSuffix;
+const matchUrl = matchUrl_prefix + "/**";
+
+// Date/time stamp for the output file name
+const now = new Date();
+const date = now.toISOString().split("T")[0];
+const time = now.toTimeString().split(" ")[0];
+const outputs_dir = __dirname.split("/").slice(0, -1).join("/") + "/outputs";
+
+const outputFileName =
+  outputs_dir + "/" + domain + "-" + date + "-" + time + ".json";
 
 export const defaultConfig: Config = {
-  url: "https://www.builder.io/c/docs/developers",
-  match: "https://www.builder.io/c/docs/**",
+  url: startingUrl,
+  match: matchUrl,
   maxPagesToCrawl: 50,
-  outputFileName: "output.json",
+  outputFileName: outputFileName,
+  waitPerPageCrawlTimeoutRange: { min: 1000, max: 1000 },
+  headless: true,
+  maxConcurrency: 1,
 };
diff --git a/containerapp/Dockerfile b/containerapp/Dockerfile
index 876a9a10..1edbf8cf 100644
--- a/containerapp/Dockerfile
+++ b/containerapp/Dockerfile
@@ -28,8 +28,9 @@ RUN cd /home && git clone https://github.com/builderio/gpt-crawler && cd gpt-cra
     npx playwright install && \
     npx playwright install-deps
 
-# Directory to mount in the docker container to get the output.json data
+# Directories to mount in the docker container to get the output JSON data
 RUN cd /home && mkdir data
-
+# Final output directory
+RUN cd /home && mkdir outputs
 WORKDIR /home
\ No newline at end of file
diff --git a/containerapp/data/config.ts b/containerapp/data/config.ts
index eb923667..45b81fd7 100644
--- a/containerapp/data/config.ts
+++ b/containerapp/data/config.ts
@@ -1,3 +1,4 @@
+// @ts-ignore
 import { Config } from "./src/config";
 
 export const defaultConfig: Config = {
diff --git a/src/config.ts b/src/config.ts
index 7e5f5fbf..d5417738 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -10,15 +10,16 @@ export const configSchema = z.object({
    * @example "https://www.builder.io/c/docs/developers"
    * @example "https://www.builder.io/sitemap.xml"
    * @default ""
+   * @required
    */
   url: z.string(),
   /**
    * Pattern to match against for links on a page to subsequently crawl
    * @example "https://www.builder.io/c/docs/**"
    * @default ""
+   * @required
    */
   match: z.string().or(z.array(z.string())),
-
   /**
    * Selector to grab the inner text from
    * @example ".docs-builder-container"
    * @default ""
    */
   selector: z.string(),
   /**
    * Don't crawl more than this many pages
    * @default 50
    */
-  maxPagesToCrawl: z.number().int().positive(),
+  maxPagesToCrawl: z.number().int().nonnegative().optional(),
   /**
    * File name for the finished data
-   * @default "output.json"
+   * @example "output.json"
    */
   outputFileName: z.string(),
-  /** Optional cookie to be set. E.g. for Cookie Consent */
+  /**
+   * Cookie to be set. E.g. for Cookie Consent
+   */
   cookie: z
     .object({
       name: z.string(),
       value: z.string(),
     })
     .optional(),
-  /** Optional function to run for each page found */
+  /**
+   * Function to run for each page found
+   */
   onVisitPage: z
     .function()
     .args(
@@ -53,23 +58,60 @@
     )
     .returns(z.promise(z.void()))
     .optional(),
-  /** Optional timeout for waiting for a selector to appear */
-  waitForSelectorTimeout: z.number().int().nonnegative().optional(),
-  /** Optional resources to exclude
-   *
+  /**
+   * Resources to exclude
    * @example
    * ['png','jpg','jpeg','gif','svg','css','js','ico','woff','woff2','ttf','eot','otf','mp4','mp3','webm','ogg','wav','flac','aac','zip','tar','gz','rar','7z','exe','dmg','apk','csv','xls','xlsx','doc','docx','pdf','epub','iso','dmg','bin','ppt','pptx','odt','avi','mkv','xml','json','yml','yaml','rss','atom','swf','txt','dart','webp','bmp','tif','psd','ai','indd','eps','ps','zipx','srt','wasm','m4v','m4a','webp','weba','m4b','opus','ogv','ogm','oga','spx','ogx','flv','3gp','3g2','jxr','wdp','jng','hief','avif','apng','avifs','heif','heic','cur','ico','ani','jp2','jpm','jpx','mj2','wmv','wma','aac','tif','tiff','mpg','mpeg','mov','avi','wmv','flv','swf','mkv','m4v','m4p','m4b','m4r','m4a','mp3','wav','wma','ogg','oga','webm','3gp','3g2','flac','spx','amr','mid','midi','mka','dts','ac3','eac3','weba','m3u','m3u8','ts','wpl','pls','vob','ifo','bup','svcd','drc','dsm','dsv','dsa','dss','vivo','ivf','dvd','fli','flc','flic','flic','mng','asf','m2v','asx','ram','ra','rm','rpm','roq','smi','smil','wmf','wmz','wmd','wvx','wmx','movie','wri','ins','isp','acsm','djvu','fb2','xps','oxps','ps','eps','ai','prn','svg','dwg','dxf','ttf','fnt','fon','otf','cab']
    */
   resourceExclusions: z.array(z.string()).optional(),
-
-  /** Optional maximum file size in megabytes to include in the output file
+  /**
+   * Maximum file size in megabytes to include in the output file
    * @example 1
    */
   maxFileSize: z.number().int().positive().optional(),
-  /** Optional maximum number tokens to include in the output file
+  /**
+   * The maximum number of tokens to include in the output file
    * @example 5000
    */
   maxTokens: z.number().int().positive().optional(),
+  /**
+   * Maximum number of concurrent parallel requests at a time
+   * @example
+   * Specific number of parallel requests
+   * ```ts
+   * maxConcurrency: 2;
+   * ```
+   * @example
+   * 0 = Unlimited; doesn't stop until cancelled
+   * ```ts
+   * maxConcurrency: 0;
+   * ```
+   * @example
+   * undefined = the maximum number of parallel requests possible
+   * ```ts
+   * maxConcurrency: undefined;
+   * ```
+   * @default 1
+   */
+  maxConcurrency: z.number().int().nonnegative().optional(),
+  /**
+   * Timeout to wait for a selector to appear
+   */
+  waitForSelectorTimeout: z.number().int().nonnegative().optional(),
+  /**
+   * Range for random number of milliseconds between **min** and **max** to wait after each page crawl
+   * @default {min:1000,max:1000}
+   * @example {min:1000,max:2000}
+   */
+  waitPerPageCrawlTimeoutRange: z
+    .object({
+      min: z.number().int().nonnegative(),
+      max: z.number().int().nonnegative(),
+    })
+    .optional(),
+  /**
+   * Headless mode
+   * @default true
+   */
+  headless: z.boolean().optional(),
 });
 
 export type Config = z.infer<typeof configSchema>;
diff --git a/src/core.ts b/src/core.ts
index 8e03bbe5..7741c108 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -47,6 +47,13 @@ export async function waitForXPath(page: Page, xpath: string, timeout: number) {
 }
 
 export async function crawl(config: Config) {
+  // Function to delay the next crawl
+  function delay(time: number) {
+    return new Promise(function (resolve) {
+      setTimeout(resolve, time);
+    });
+  }
+
   configSchema.parse(config);
 
   if (process.env.NO_CRAWL !== "true") {
@@ -55,6 +62,14 @@
     const crawler = new PlaywrightCrawler({
       // Use the requestHandler to process each of the crawled pages.
       async requestHandler({ request, page, enqueueLinks, log, pushData }) {
+        // Warn if unlimited crawling is enabled
+        if (config.maxPagesToCrawl == 0) {
+          log.warningOnce(
+            `maxPagesToCrawl is set to 0, which means the crawl will continue until it cannot find any more links matching: ${config.match}`,
+          );
+          config.maxPagesToCrawl = undefined;
+        }
+
         if (config.cookie) {
           // Set the cookie for the specific URL
           const cookie = {
@@ -66,9 +81,12 @@
         }
 
         const title = await page.title();
+        // Display pageCounter/maxPagesToCrawl, or pageCounter/∞ when maxPagesToCrawl is unlimited
+        const maxPagesToCrawlDisplay =
+          config.maxPagesToCrawl == undefined ? "∞" : config.maxPagesToCrawl;
         pageCounter++;
         log.info(
-          `Crawling: Page ${pageCounter} / ${config.maxPagesToCrawl} - URL: ${request.loadedUrl}...`,
+          `Crawling: Page ${pageCounter} / ${maxPagesToCrawlDisplay} - URL: ${request.loadedUrl}...`,
         );
 
         // Use custom handling for XPath selector
@@ -101,11 +119,29 @@
           globs:
             typeof config.match === "string" ? [config.match] : config.match,
         });
+
+        // Use waitPerPageCrawlTimeoutRange to handle rate limiting
+        if (config.waitPerPageCrawlTimeoutRange) {
+          // Create a random number between min and max
+          const randomTimeout = Math.floor(
+            Math.random() *
+              (config.waitPerPageCrawlTimeoutRange.max -
+                config.waitPerPageCrawlTimeoutRange.min +
+                1) +
+              config.waitPerPageCrawlTimeoutRange.min,
+          );
+          log.info(
+            `Waiting ${randomTimeout} milliseconds before next crawl to avoid rate limiting...`,
+          );
+          // Wait for the random amount of time before crawling the next page
+          await delay(randomTimeout);
+        } else {
+          // Wait for 1 second before crawling the next page
+          await delay(1000);
+        }
       },
-      // Comment this option to scrape the full website.
-      maxRequestsPerCrawl: config.maxPagesToCrawl,
-      // Uncomment this option to see the browser window.
-      // headless: false,
+      maxConcurrency: config.maxConcurrency || 1, // Set the max concurrency
+      maxRequestsPerCrawl: config.maxPagesToCrawl, // Set the max pages to crawl, or 0 to scrape the full website
+      headless: config.headless ?? true, // Set to false to see the browser in action
       preNavigationHooks: [
         // Abort requests for certain resource types
        async ({ page, log }) => {
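As a rough illustration of how the options introduced in this diff fit together, a `config.ts` along the following lines (the values are only examples, not part of the change) would crawl with a single concurrent request, pause a randomized 1-2 seconds after each page, and show the browser window while crawling:

```ts
import { Config } from "./src/config";

// Illustrative values only: exercises the new maxConcurrency,
// waitPerPageCrawlTimeoutRange, and headless options.
export const defaultConfig: Config = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  maxConcurrency: 1, // one request at a time
  waitPerPageCrawlTimeoutRange: { min: 1000, max: 2000 }, // wait 1-2 s between pages
  headless: false, // show the browser while crawling
};
```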