diff --git a/.gitignore b/.gitignore
index 3c3629e..4f228ac 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,9 @@
 node_modules
+**/dist/
+**/examples/
+**/*.traineddata
+.env
+.env.local
+tmp/
+!tmp/.gitkeep
+*.tsbuildinfo
\ No newline at end of file
diff --git a/cli/README.md b/cli/README.md
new file mode 100644
index 0000000..cb67c7c
--- /dev/null
+++ b/cli/README.md
@@ -0,0 +1,254 @@
+# Documind CLI
+
+Command-line interface for intelligent document processing and structured data extraction using Documind.
+
+## Installation
+
+```bash
+# Install dependencies
+npm install
+
+# Build the CLI
+npm run build
+```
+
+## Usage
+
+The CLI provides several commands to work with documents:
+
+### Extract Command
+
+Extract structured data from documents using schemas, templates, or auto-generation.
+
+```bash
+documind extract [options]
+```
+
+#### Options
+
+- `-f, --file <path>` - Path to the document file (required)
+- `-m, --model <model>` - LLM model to use (default: gpt-4o-mini)
+- `-s, --schema <path>` - Path to JSON schema file
+- `-t, --template <name>` - Name of a predefined template
+- `-a, --auto-schema` - Auto-generate schema from document
+- `-i, --instructions <text>` - Instructions for auto-schema generation
+- `-o, --output <path>` - Output file path for results (JSON)
+- `--base-url <url>` - Base URL for local LLM (for Ollama)
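+- `-l, --language <code>` - Language code for OCR (e.g., eng, deu, fra; default: eng)
+- `--image-quality <quality>` - Image quality for compression (1-100, lower = smaller file; default: 85)
+- `--max-image-width <width>` - Max width for image resizing in pixels (default: 2048)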
+
+#### Examples
+
+**Extract with auto-generated schema:**
+```bash
+documind extract -f invoice.pdf --auto-schema
+```
+
+**Extract with custom schema:**
+```bash
+documind extract -f document.pdf -s schema.json -o output.json
+```
+
+**Extract using a predefined template:**
+```bash
+documind extract -f invoice.pdf -t invoice -o result.json
+```
+
+**Extract with local LLM (Ollama):**
+```bash
+documind extract -f doc.pdf -m llama3.2-vision --base-url http://localhost:11434/v1 --auto-schema
+```
+
+**Extract with specific instructions:**
+```bash
+documind extract -f contract.pdf --auto-schema -i "Extract party names, dates, and monetary amounts"
+```
+
+### Convert Command
+
+Convert documents to markdown or plain text.
+
+```bash
+documind convert [options]
+```
+
+#### Options
+
+- `-f, --file <path>` - Path to the document file (required)
+- `-m, --model <model>` - LLM model to use (default: gpt-4o-mini)
+- `-t, --format <format>` - Output format: markdown or plaintext (default: markdown)
+- `-o, --output <path>` - Output file path
+- `--base-url <url>` - Base URL for local LLM (for Ollama)
+
+#### Examples
+
+**Convert to markdown:**
+```bash
+documind convert -f document.pdf -t markdown -o output.md
+```
+
+**Convert to plain text:**
+```bash
+documind convert -f document.pdf -t plaintext -o output.txt
+```
+
+### Templates Command
+
+List all available predefined templates.
+
+```bash
+documind templates
+```
+
+## Supported Models
+
+### OpenAI Models
+- `gpt-4o`
+- `gpt-4o-mini` (default)
+- `gpt-4.1`
+- `gpt-4.1-mini`
+
+### Google Models
+- `gemini-2.0-flash-001`
+- `gemini-2.0-flash-lite-preview-02-05`
+- `gemini-1.5-flash`
+- `gemini-1.5-flash-8b`
+- `gemini-1.5-pro`
+
+### Local Models (Ollama)
+- `llama3.2-vision`
+
+## Environment Variables
+
+Set these environment variables for API access:
+
+```bash
+# OpenAI API key for GPT models
+export OPENAI_API_KEY="your-openai-api-key"
+
+# Google Gemini API key
+export GEMINI_API_KEY="your-gemini-api-key"
+
+# Base URL for local LLM (Ollama)
+export BASE_URL="http://localhost:11434/v1"
+```
+
+Or create a `.env` file:
+
+```env
+OPENAI_API_KEY=your-openai-api-key
+GEMINI_API_KEY=your-gemini-api-key
+BASE_URL=http://localhost:11434/v1
+```
+
+## Schema Format
+
+Schemas should be in JSON format with the following structure:
+
+```json
+[
+  {
+    "name": "field_name",
+    "type": "string",
+    "description": "Field description"
+  },
+  {
+    "name": "nested_object",
+    "type": "object",
+    "description": "An object field",
+    "children": [
+      {
+        "name": "child_field",
+        "type": "number",
+        "description": "Child field description"
+      }
+    ]
+  },
+  {
+    "name": "items_list",
+    "type": "array",
+    "description": "A list of items",
+    "children": [
+      {
+        "name": "item_name",
+        "type": "string",
+        "description": "Item name"
+      }
+    ]
+  }
+]
+```
+
+### Supported Field Types
+
+- `string` - Text values
+- `number` - Numeric values
+- `boolean` - True/false values
+- `enum` - Predefined set of values (requires `values` array)
+- `object` - Nested objects (requires `children` array)
+- `array` - Lists (requires `children` array)
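+
+The `enum` type is the only one not shown in the example above; it pairs a `values` array with the field definition. A minimal illustrative sketch (the field name and values below are placeholders, not from any predefined template):
+
+```bash
+# Write a one-field schema that restricts extraction to known values
+cat > status-schema.json <<'EOF'
+[
+  {
+    "name": "document_status",
+    "type": "enum",
+    "description": "Overall status of the document",
+    "values": ["draft", "final", "amended"]
+  }
+]
+EOF
+
+# Then pass it to the extract command
+documind extract -f document.pdf -s status-schema.json -o status.json
+```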
+
+## Using with Local LLMs
+
+To use Ollama or other local LLMs:
+
+1. Install and start Ollama:
+```bash
+# Install Ollama
+curl https://ollama.ai/install.sh | sh
+
+# Pull the vision model
+ollama pull llama3.2-vision
+
+# Start Ollama (if not already running)
+ollama serve
+```
+
+2. Use with the CLI:
+```bash
+documind extract \
+  -f document.pdf \
+  -m llama3.2-vision \
+  --base-url http://localhost:11434/v1 \
+  --auto-schema \
+  -o result.json
+```
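+
+If the model name is rejected, you can confirm it was pulled correctly before retrying (assumes a standard local Ollama install):
+
+```bash
+# llama3.2-vision should appear in the list
+ollama list
+```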
+
+## Development
+
+```bash
+# Watch mode
+npm run dev
+
+# Build
+npm run build
+
+# Run
+npm start -- extract -f example.pdf --auto-schema
+```
+
+## Troubleshooting
+
+### "Cannot find module" errors
+
+Make sure you've built the project:
+```bash
+npm run build
+```
+
+### API Key errors
+
+Ensure your API keys are set:
+```bash
+echo $OPENAI_API_KEY
+echo $GEMINI_API_KEY
+```
+
+### Local LLM connection issues
+
+Check if Ollama is running:
+```bash
+curl http://localhost:11434/v1/models
+```
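+
+### Debugging extraction errors
+
+The extract command prints only the error message by default; setting the `DEBUG` environment variable makes it dump the full error object as well (any non-empty value works):
+
+```bash
+DEBUG=1 documind extract -f document.pdf --auto-schema
+```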
+
+## License
+
+AGPL-3.0
diff --git a/cli/package.json b/cli/package.json
new file mode 100644
index 0000000..3a0397d
--- /dev/null
+++ b/cli/package.json
@@ -0,0 +1,37 @@
+{
+  "name": "cli",
+  "version": "1.0.0",
+  "description": "Command-line interface for Documind extractor",
+  "type": "module",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "bin": {
+    "documind": "./dist/index.js"
+  },
+  "files": [
+    "dist"
+  ],
+  "scripts": {
+    "build": "tsc --build && tsc-alias -p tsconfig.json",
+    "start": "node dist/index.js",
+    "dev": "tsx src/index.ts"
+  },
+  "keywords": [
+    "cli",
+    "document",
+    "extraction",
+    "llm"
+  ],
+  "dependencies": {
+    "commander": "^12.0.0",
+    "chalk": "^5.3.0",
+    "ora": "^8.0.1",
+    "extractor": "*",
+    "dotenv": "^16.4.5"
+  },
+  "devDependencies": {
+    "@types/node": "^20.14.11",
+    "tsc-alias": "^1.8.8",
+    "typescript": "^5.6.3"
+  }
+}
diff --git a/cli/src/commands/convert.ts b/cli/src/commands/convert.ts
new file mode 100644
index 0000000..b277819
--- /dev/null
+++ b/cli/src/commands/convert.ts
@@ -0,0 +1,73 @@
+import { Command } from 'commander';
+import { formatter } from 'extractor';
+import ora from 'ora';
+import { logger } from '../utils/logger.js';
+import { fileExists, writeJsonFile, resolveFilePath } from '../utils/file-helper.js';
+import fs from 'fs';
+
+export function createConvertCommand(): Command {
+  const cmd = new Command('convert');
+
+  cmd
+    .description('Convert a document to markdown or plain text')
+    .requiredOption('-f, --file <path>', 'Path to the document file')
+    .option('-m, --model <model>', 'LLM model to use', 'gpt-4o-mini')
+    .option('-t, --format <format>', 'Output format (markdown or plaintext)', 'markdown')
+    .option('-o, --output <path>', 'Output file path')
+    .option('--base-url <url>', 'Base URL for local LLM (for Ollama)')
+    .action(async (options: ConvertOptions) => {
+      await handleConvert(options);
+    });
+
+  return cmd;
+}
+
+interface ConvertOptions {
+  file: string;
+  model: string;
+  format: 'markdown' | 'plaintext';
+  output?: string;
+  baseUrl?: string;
+}
+
+async function handleConvert(options: ConvertOptions): Promise<void> {
+  const spinner = ora('Starting conversion...').start();
+
+  try {
+    const filePath = resolveFilePath(options.file);
+
+    if (!fileExists(filePath)) {
+      spinner.fail(`File not found: ${filePath}`);
+      process.exit(1);
+    }
+
+    if (options.baseUrl) {
+      process.env.BASE_URL = options.baseUrl;
+    }
+
+    spinner.text = `Converting document to ${options.format}...`;
+
+    let result: string;
+
+    if (options.format === 'markdown') {
+      result = await formatter.markdown({ file: filePath, model: options.model });
+    } else {
+      result = await formatter.plaintext({ file: filePath, model: options.model });
+    }
+
+    spinner.succeed('Conversion completed!');
+
+    if (options.output) {
+      const outputPath = resolveFilePath(options.output);
+      fs.writeFileSync(outputPath, result, 'utf-8');
+      logger.success(`Output saved to: ${outputPath}`);
+    } else {
+      console.log('\n' + result);
+    }
+
+  } catch (error) {
+    spinner.fail('Conversion failed');
+    logger.error((error as Error).message);
+    process.exit(1);
+  }
+}
diff --git a/cli/src/commands/extract.ts b/cli/src/commands/extract.ts
new file mode 100644
index 0000000..20fb44b
--- /dev/null
+++ b/cli/src/commands/extract.ts
@@ -0,0 +1,147 @@
+import { Command } from 'commander';
+import { extract, type SchemaField, type ExtractResult } from 'extractor';
+import ora from 'ora';
+import { logger } from '../utils/logger.js';
+import { fileExists, readJsonFile, writeJsonFile, resolveFilePath } from '../utils/file-helper.js';
+import path from 'path';
+import chalk from 'chalk';
+
+export function createExtractCommand(): Command {
+  const cmd = new Command('extract');
+
+  cmd
+    .description('Extract structured data from a document')
+    .requiredOption('-f, --file <path>', 'Path to the document file')
+    .option('-m, --model <model>', 'LLM model to use (e.g., gpt-4o-mini, llama3.2-vision, gemini-1.5-flash)', 'gpt-4o-mini')
+    .option('-s, --schema <path>', 'Path to JSON schema file')
+    .option('-t, --template <name>', 'Name of a predefined template')
+    .option('-a, --auto-schema', 'Auto-generate schema from document')
+    .option('-i, --instructions <text>', 'Instructions for auto-schema generation')
+    .option('-o, --output <path>', 'Output file path for results (JSON)')
+    .option('--base-url <url>', 'Base URL for local LLM (for Ollama)')
+    .option('-l, --language <code>', 'Language code for OCR (e.g., eng, deu, fra)', 'eng')
+    .option('--image-quality <quality>', 'Image quality for compression (1-100, lower=smaller file)', '85')
+    .option('--max-image-width <width>', 'Max width for image resizing in pixels', '2048')
+    .action(async (options) => {
+      await handleExtract(options);
+    });
+
+  return cmd;
+}
+
+interface ExtractOptions {
+  file: string;
+  model: string;
+  schema?: string;
+  template?: string;
+  autoSchema?: boolean;
+  instructions?: string;
+  output?: string;
+  baseUrl?: string;
+  language?: string;
+  imageQuality?: string;
+  maxImageWidth?: string;
+}
+
+async function handleExtract(options: ExtractOptions): Promise<void> {
+  const spinner = ora('Starting extraction...').start();
+
+  try {
+    // Resolve file path
+    const filePath = resolveFilePath(options.file);
+
+    if (!fileExists(filePath)) {
+      spinner.fail(`File not found: ${filePath}`);
+      process.exit(1);
+    }
+
+    spinner.text = 'File found. Preparing extraction...';
+
+    // Set base URL for local LLM if provided
+    if (options.baseUrl) {
+      process.env.BASE_URL = options.baseUrl;
+      logger.info(`Using local LLM at: ${options.baseUrl}`);
+    }
+
+    // Prepare extraction parameters
+    let schema: SchemaField[] | undefined;
+    let autoSchemaOption: boolean | { instructions: string } | undefined;
+
+    if (options.schema) {
+      const schemaPath = resolveFilePath(options.schema);
+      if (!fileExists(schemaPath)) {
+        spinner.fail(`Schema file not found: ${schemaPath}`);
+        process.exit(1);
+      }
+      schema = readJsonFile(schemaPath);
+      spinner.text = 'Schema loaded from file';
+    }
+
+    if (options.autoSchema) {
+      if (options.instructions) {
+        autoSchemaOption = { instructions: options.instructions };
+        spinner.text = 'Auto-generating schema with instructions...';
+      } else {
+        autoSchemaOption = true;
+        spinner.text = 'Auto-generating schema...';
+      }
+    }
+
+    // Validate options
+    if (!schema && !options.template && !autoSchemaOption) {
+      spinner.fail('You must provide either --schema, --template, or --auto-schema');
+      process.exit(1);
+    }
+
+    spinner.text = `Extracting data using ${options.model}...`;
+
+    // Parse numeric options
+    const imageQuality = parseInt(options.imageQuality || '85', 10);
+    const maxImageWidth = parseInt(options.maxImageWidth || '2048', 10);
+
+    // Perform extraction
+    const result: ExtractResult = await extract({
+      file: filePath,
+      model: options.model,
+      schema,
+      template: options.template,
+      autoSchema: autoSchemaOption,
+      language: options.language || 'eng',
+      imageQuality,
+      maxImageWidth,
+    });
+
+    spinner.succeed('Extraction completed successfully!');
+
+    // Display results
+    logger.header('Extraction Results');
+    logger.info(`Document: ${result.fileName}`);
+    logger.info(`Pages processed: ${result.pages}`);
+
+    console.log('\n' + chalk.bold('Extracted Data:'));
+    console.log(JSON.stringify(result.data, null, 2));
+
+    // Save to file if output path provided
+    if (options.output) {
+      const outputPath = resolveFilePath(options.output);
+      writeJsonFile(outputPath, {
+        fileName: result.fileName,
+        pages: result.pages,
+        data: result.data,
+        extractedAt: new Date().toISOString(),
+        model: options.model,
+      });
+      logger.success(`Results saved to: ${outputPath}`);
+    }
+
+  } catch (error) {
+    spinner.fail('Extraction failed');
+    logger.error((error as Error).message);
+
+    if (process.env.DEBUG) {
+      console.error(error);
+    }
+
+    process.exit(1);
+  }
+}
diff --git a/cli/src/commands/templates.ts b/cli/src/commands/templates.ts
new file mode 100644
index 0000000..9970bcd
--- /dev/null
+++ b/cli/src/commands/templates.ts
@@ -0,0 +1,39 @@
+import { Command } from 'commander';
+import { templates } from 'extractor';
+import { logger } from '../utils/logger.js';
+import chalk from 'chalk';
+
+export function createTemplatesCommand(): Command {
+  const cmd = new Command('templates');
+
+  cmd
+    .description('List available templates')
+    .action(() => {
+      handleTemplates();
+    });
+
+  return cmd;
+}
+
+function handleTemplates(): void {
+  try {
+    logger.header('Available Templates');
+
+    const availableTemplates = templates.list();
+
+    if (availableTemplates.length === 0) {
+      logger.warn('No templates found');
+      return;
+    }
+
+    availableTemplates.forEach((template: string) => {
+      console.log(chalk.cyan(' •'), chalk.bold(template));
+    });
+
+    console.log('\n' + chalk.dim('Use with: documind extract -f <file> -t <template>'));
+
+  } catch (error) {
+    logger.error(`Failed to list templates: ${(error as Error).message}`);
+    process.exit(1);
+  }
+}
diff --git a/cli/src/index.ts b/cli/src/index.ts
new file mode 100644
index 0000000..dae5642
--- /dev/null
+++ b/cli/src/index.ts
@@ -0,0 +1,66 @@
+#!/usr/bin/env node
+
+import { Command } from 'commander';
+import { config } from 'dotenv';
+import { createExtractCommand } from './commands/extract.js';
+import { createConvertCommand } from './commands/convert.js';
+import { createTemplatesCommand } from './commands/templates.js';
+import chalk from 'chalk';
+
+// Load environment variables
+config();
+
+const program = new Command();
+
+program
+  .name('documind')
+  .description('CLI tool for intelligent document processing and extraction')
+  .version('1.0.0');
+
+// Add commands
+program.addCommand(createExtractCommand());
+program.addCommand(createConvertCommand());
+program.addCommand(createTemplatesCommand());
+
+// Add help examples
+program.addHelpText('after', `
+${chalk.bold('Examples:')}
+
+  ${chalk.dim('# Extract data with auto-generated schema')}
+  ${chalk.cyan('$ documind extract -f invoice.pdf --auto-schema')}
+
+  ${chalk.dim('# Extract data with custom schema')}
+  ${chalk.cyan('$ documind extract -f document.pdf -s schema.json -o output.json')}
+
+  ${chalk.dim('# Extract data using a template')}
+  ${chalk.cyan('$ documind extract -f invoice.pdf -t invoice -o result.json')}
+
+  ${chalk.dim('# Extract data with local LLM (Ollama)')}
+  ${chalk.cyan('$ documind extract -f doc.pdf -m llama3.2-vision --base-url http://localhost:11434/v1 --auto-schema')}
+
+  ${chalk.dim('# Convert document to markdown')}
+  ${chalk.cyan('$ documind convert -f document.pdf -t markdown -o output.md')}
+
+  ${chalk.dim('# List available templates')}
+  ${chalk.cyan('$ documind templates')}
+
+${chalk.bold('Environment Variables:')}
+
+  ${chalk.yellow('OPENAI_API_KEY')} - OpenAI API key for GPT models
+  ${chalk.yellow('GEMINI_API_KEY')} - Google Gemini API key
+  ${chalk.yellow('BASE_URL')} - Base URL for local LLM (Ollama)
+
+${chalk.bold('Supported Models:')}
+
+  ${chalk.green('OpenAI:')} gpt-4o, gpt-4o-mini, gpt-4.1, gpt-4.1-mini
+  ${chalk.green('Google:')} gemini-2.0-flash-001, gemini-1.5-flash, gemini-1.5-pro
+  ${chalk.green('Local:')} llama3.2-vision (requires Ollama)
+`);
+
+// Parse arguments
+program.parse(process.argv);
+
+// Show help if no command provided
+if (!process.argv.slice(2).length) {
+  program.outputHelp();
+}
diff --git a/cli/src/utils/file-helper.ts b/cli/src/utils/file-helper.ts
new file mode 100644
index 0000000..e48473f
--- /dev/null
+++ b/cli/src/utils/file-helper.ts
@@ -0,0 +1,38 @@
+import fs from 'fs';
+import path from 'path';
+
+export function fileExists(filePath: string): boolean {
+  try {
+    return fs.existsSync(filePath);
+  } catch {
+    return false;
+  }
+}
+
+export function readJsonFile(filePath: string): any {
+  try {
+    const content = fs.readFileSync(filePath, 'utf-8');
+    return JSON.parse(content);
+  } catch (error) {
+    throw new Error(`Failed to read JSON file: ${(error as Error).message}`);
+  }
+}
+
+export function writeJsonFile(filePath: string, data: any): void {
+  try {
+    const dirPath = path.dirname(filePath);
+    if (!fs.existsSync(dirPath)) {
+      fs.mkdirSync(dirPath, { recursive: true });
+    }
+    fs.writeFileSync(filePath, JSON.stringify(data, null, 2), 'utf-8');
+  } catch (error) {
+    throw new Error(`Failed to write JSON file: ${(error as Error).message}`);
+  }
+}
+
+export function resolveFilePath(filePath: string): string {
+  if (path.isAbsolute(filePath)) {
+    return filePath;
+  }
+  return path.resolve(process.cwd(), filePath);
+}
diff --git a/cli/src/utils/logger.ts b/cli/src/utils/logger.ts
new file mode 100644
index 0000000..718c1b0
--- /dev/null
+++ b/cli/src/utils/logger.ts
@@ -0,0 +1,23 @@
+import chalk from 'chalk';
+
+export const logger = {
+  info: (message: string) => {
+    console.log(chalk.blue('ℹ'), message);
+  },
+
+  success: (message: string) => {
+    console.log(chalk.green('✔'), message);
+  },
+
+  error: (message: string) => {
+    console.error(chalk.red('✖'), message);
+  },
+
+  warn: (message: string) => {
+    console.warn(chalk.yellow('⚠'), message);
+  },
+
+  header: (message: string) => {
+    console.log(chalk.bold.cyan(`\n${message}\n${'='.repeat(message.length)}`));
+  }
+};
diff --git a/cli/tsconfig.json b/cli/tsconfig.json
new file mode 100644
index 0000000..3f3ca2f
--- /dev/null
+++ b/cli/tsconfig.json
@@ -0,0 +1,26 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "module": "ES2020",
+    "moduleResolution": "node",
+    "declaration": true,
+    "declarationMap": true,
+    "outDir": "./dist",
+    "rootDir": "./src",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "forceConsistentCasingInFileNames": true,
+    "resolveJsonModule": true,
+    "allowSyntheticDefaultImports": true,
+    "baseUrl": ".",
+    "paths": {
+      "extractor": ["../extractor/src/index.ts"]
+    }
+  },
+  "include": ["src/**/*"],
+  "exclude": ["node_modules", "dist"],
+  "references": [
+    { "path": "../extractor" }
+  ]
+}
diff --git a/core/dist/index.d.ts b/core/dist/index.d.ts
deleted file mode 100644
index 4db31a0..0000000
--- a/core/dist/index.d.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-import { DocumindArgs, DocumindOutput } from "./types";
-export declare const documind: ({ cleanup, concurrency, filePath, llmParams, maintainFormat, model, outputDir, pagesToConvertAsImages, tempDir, }: DocumindArgs) => Promise<DocumindOutput>;
diff --git a/core/dist/index.js b/core/dist/index.js
deleted file mode 100644
index c0523f7..0000000
--- a/core/dist/index.js
+++ /dev/null
@@ -1,174 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.documind = void 0;
-const utils_1 = require("./utils");
-const fs_extra_1 = __importDefault(require("fs-extra"));
-const os_1 = __importDefault(require("os"));
-const path_1 = __importDefault(require("path"));
-const p_limit_1 = __importDefault(require("p-limit"));
-const types_1 = require("./types");
-const providers_1 = require("./providers");
-const documind = async ({ cleanup = true, concurrency = 10, filePath, llmParams = {}, maintainFormat = false, model, //= ModelOptions.gpt_4o_mini,
-outputDir, pagesToConvertAsImages = -1, tempDir = os_1.default.tmpdir(), }) => {
-    let inputTokenCount = 0;
-    let outputTokenCount = 0;
-    let priorPage = "";
-    const aggregatedMarkdown = [];
-    const startTime = new Date();
-    // Basic checks
-    if (!filePath || !filePath.length) {
-        throw new Error("Missing file path");
-    }
-    const defaultModel = model ?? types_1.OpenAIModels.GPT_4O_MINI;
-    const validatedParams = (0, utils_1.validateLLMParams)(llmParams);
-    const providerInstance = providers_1.getModel.getProviderForModel(defaultModel);
-    // Ensure temp directory exists + create temp folder
-    const rand = Math.floor(1000 + Math.random() * 9000).toString();
-    const tempDirectory = path_1.default.join(tempDir || os_1.default.tmpdir(), `documind-file-${rand}`);
-    await fs_extra_1.default.ensureDir(tempDirectory);
-    // Download the PDF. Get file name.
-    const { extension, localPath } = await (0, utils_1.downloadFile)({
-        filePath,
-        tempDir: tempDirectory,
-    });
-    if (!localPath)
-        throw "Failed to save file to local drive";
-    // Sort the `pagesToConvertAsImages` array to make sure we use the right index
-    // for `formattedPages` as `pdf2pic` always returns images in order
-    if (Array.isArray(pagesToConvertAsImages)) {
-        pagesToConvertAsImages.sort((a, b) => a - b);
-    }
-    // Convert file to PDF if necessary
-    if (extension !== ".png") {
-        let pdfPath;
-        if (extension === ".pdf") {
-            pdfPath = localPath;
-        }
-        else {
-            pdfPath = await (0, utils_1.convertFileToPdf)({
-                extension,
-                localPath,
-                tempDir: tempDirectory,
-            });
-        }
-        // Convert the file to a series of images
-        await (0, utils_1.convertPdfToImages)({
-            localPath: pdfPath,
-            pagesToConvertAsImages,
-            tempDir: tempDirectory,
-        });
-    }
-    const endOfPath = localPath.split("/")[localPath.split("/").length - 1];
-    const rawFileName = endOfPath.split(".")[0];
-    const fileName = rawFileName
-        .replace(/[^\w\s]/g, "")
-        .replace(/\s+/g, "_")
-        .toLowerCase()
-        .substring(0, 255); // Truncate file name to 255 characters to prevent ENAMETOOLONG errors
-    // Get list of converted images
-    const files = await fs_extra_1.default.readdir(tempDirectory);
-    const images = files.filter((file) => file.endsWith(".png"));
-    if (maintainFormat) {
-        // Use synchronous processing
-        for (const image of images) {
-            const imagePath = path_1.default.join(tempDirectory, image);
-            try {
-                const { content, inputTokens, outputTokens } = await providerInstance.getCompletion({
-                    imagePath,
-                    llmParams: validatedParams,
-                    maintainFormat,
-                    model: defaultModel,
-                    priorPage,
-                });
-                const formattedMarkdown = (0, utils_1.formatMarkdown)(content);
-                inputTokenCount += inputTokens;
-                outputTokenCount += outputTokens;
-                // Update prior page to result from last processing step
-                priorPage = formattedMarkdown;
-                // Add all markdown results to array
-                aggregatedMarkdown.push(formattedMarkdown);
-            }
-            catch (error) {
-                console.error(`Failed to process image ${image}:`, error);
-                throw error;
-            }
-        }
-    }
-    else {
-        // Process in parallel with a limit on concurrent pages
-        const processPage = async (image) => {
-            const imagePath = path_1.default.join(tempDirectory, image);
-            try {
-                const { content, inputTokens, outputTokens } = await providerInstance.getCompletion({
-                    imagePath,
-                    llmParams: validatedParams,
-                    maintainFormat,
-                    model: defaultModel,
-                    priorPage,
-                });
-                const formattedMarkdown = (0, utils_1.formatMarkdown)(content);
-                inputTokenCount += inputTokens;
-                outputTokenCount += outputTokens;
-                // Update prior page to result from last processing step
-                priorPage = formattedMarkdown;
-                // Add all markdown results to array
-                return formattedMarkdown;
-            }
-            catch (error) {
-                console.error(`Failed to process image ${image}:`, error);
-                throw error;
-            }
-        };
-        // Function to process pages with concurrency limit
-        const processPagesInBatches = async (images, limit) => {
-            const results = [];
-            const promises = images.map((image, index) => limit(() => processPage(image).then((result) => {
-                results[index] = result;
-            })));
-            await Promise.all(promises);
-            return results;
-        };
-        const limit = (0, p_limit_1.default)(concurrency);
-        const results = await processPagesInBatches(images, limit);
-        const filteredResults = results.filter(utils_1.isString);
-        aggregatedMarkdown.push(...filteredResults);
-    }
-    // Write the aggregated markdown to a file
-    if (outputDir) {
-        const resultFilePath = path_1.default.join(outputDir, `${fileName}.md`);
-        await fs_extra_1.default.writeFile(resultFilePath, aggregatedMarkdown.join("\n\n"));
-    }
-    // Cleanup the downloaded PDF file
-    if (cleanup)
-        await fs_extra_1.default.remove(tempDirectory);
-    // Format JSON response
-    const endTime = new Date();
-    const completionTime = endTime.getTime() - startTime.getTime();
-    const formattedPages = aggregatedMarkdown.map((el, i) => {
-        let pageNumber;
-        // If we convert all pages, just use the array index
-        if (pagesToConvertAsImages === -1) {
-            pageNumber = i + 1;
-        }
-        // Else if we convert specific pages, use the page number from the parameter
-        else if (Array.isArray(pagesToConvertAsImages)) {
-            pageNumber = pagesToConvertAsImages[i];
-        }
-        // Else, the parameter is a number and use it for the page number
-        else {
-            pageNumber = pagesToConvertAsImages;
-        }
-        return { content: el, page: pageNumber, contentLength: el.length };
-    });
-    return {
-        completionTime,
-        fileName,
-        inputTokens: inputTokenCount,
-        outputTokens: outputTokenCount,
-        pages: formattedPages,
-    };
-};
-exports.documind = documind;
diff --git a/core/dist/openAI.d.ts b/core/dist/openAI.d.ts
deleted file mode 100644
index 2e390ed..0000000
--- a/core/dist/openAI.d.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-import { CompletionArgs, CompletionResponse } from "./types";
-export declare const getCompletion: ({ apiKey, imagePath, llmParams, maintainFormat, model, priorPage, }: CompletionArgs) => Promise<CompletionResponse>;
diff --git a/core/dist/openAI.js b/core/dist/openAI.js
deleted file mode 100644
index 7c22118..0000000
--- a/core/dist/openAI.js
+++ /dev/null
@@ -1,75 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getCompletion = void 0;
-const utils_1 = require("./utils");
-const axios_1 = __importDefault(require("axios"));
-const getCompletion = async ({ apiKey, imagePath, llmParams, maintainFormat, model, priorPage, }) => {
-    const validModelsForCustomBaseUrl = [
-        "llava",
-        "llama3.2-vision",
-    ];
-    const validModelsForOpenAi = ["gpt-4o", "gpt-4o-mini"];
-    const baseUrl = process.env.BASE_URL || "https://api.openai.com/v1";
-    if (baseUrl !== "https://api.openai.com/v1") {
-        if (!validModelsForCustomBaseUrl.includes(model)) {
-            throw new Error(`Invalid model "${model}" for custom base URL. Valid options are: ${validModelsForCustomBaseUrl.join(", ")}.`);
-        }
-    }
-    else {
-        if (!validModelsForOpenAi.includes(model)) {
-            throw new Error(`Invalid model "${model}" for OpenAI. Valid options are: ${validModelsForOpenAi.join(", ")}.`);
-        }
-    }
-    const systemPrompt = `
-    Convert the following image/document to markdown.
-    Return only the markdown with no explanation text. Do not include deliminators like '''markdown.
-    You must include all information on the page. Do not exclude headers, footers, or subtext.
-  `;
-    // Default system message.
-    const messages = [{ role: "system", content: systemPrompt }];
-    // If content has already been generated, add it to context.
-    // This helps maintain the same format across pages
-    if (maintainFormat && priorPage && priorPage.length) {
-        messages.push({
-            role: "system",
-            content: `Markdown must maintain consistent formatting with the following page: \n\n """${priorPage}"""`,
-        });
-    }
-    // Add Image to request
-    const base64Image = await (0, utils_1.encodeImageToBase64)(imagePath);
-    messages.push({
-        role: "user",
-        content: [
-            {
-                type: "image_url",
-                image_url: { url: `data:image/png;base64,${base64Image}` },
-            },
-        ],
-    });
-    try {
-        const response = await axios_1.default.post(`${baseUrl}/chat/completions`, {
-            messages,
-            model,
-            ...(0, utils_1.convertKeysToSnakeCase)(llmParams ?? null),
-        }, {
-            headers: {
-                Authorization: `Bearer ${apiKey}`,
-                "Content-Type": "application/json",
-            },
-        });
-        const data = response.data;
-        return {
-            content: data.choices[0].message.content,
-            inputTokens: data.usage.prompt_tokens,
-            outputTokens: data.usage.completion_tokens,
-        };
-    }
-    catch (err) {
-        console.error("Error in OpenAI completion", err);
-        throw err;
-    }
-};
-exports.getCompletion = getCompletion;
diff --git a/core/dist/providers/google.d.ts b/core/dist/providers/google.d.ts
deleted file mode 100644
index da5acb0..0000000
--- a/core/dist/providers/google.d.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-import { Completion } from "./utils/completion";
-import { CompletionArgs, CompletionResponse } from "../types";
-export declare class Google implements Completion {
-    getCompletion(args: CompletionArgs): Promise<CompletionResponse>;
-}
diff --git a/core/dist/providers/google.js b/core/dist/providers/google.js
deleted file mode 100644
index bf0e9d5..0000000
--- a/core/dist/providers/google.js
+++ /dev/null
@@ -1,71 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.Google = void 0;
-const axios_1 = __importDefault(require("axios"));
-const types_1 = require("../types");
-const utils_1 = require("../utils");
-class Google {
-    async getCompletion(args) {
-        const { imagePath, llmParams, maintainFormat, model, priorPage, } = args;
-        if (!process.env.GEMINI_API_KEY) {
-            throw new Error("Missing GEMINI_API_KEY in environment variables.");
-        }
-        const apiKey = process.env.GEMINI_API_KEY;
-        const validModels = Object.values(types_1.GoogleModels);
-        if (!validModels.includes(model)) {
-            throw new Error(`Model "${model}" is not a google model.`);
-        }
-        const systemPrompt = `
-    Convert the following image/document to markdown.
-    Return only the markdown with no explanation text. Do not include deliminators like '''markdown.
-    You must include all information on the page. Do not exclude headers, footers, or subtext.
-  `;
-        const messages = [{ role: "system", content: systemPrompt }];
-        if (maintainFormat && priorPage) {
-            messages.push({
-                role: "system",
-                content: `Please ensure markdown formatting remains consistent with the prior page:\n\n"""${priorPage}"""`,
-            });
-        }
-        const base64Image = await (0, utils_1.encodeImageToBase64)(imagePath);
-        messages.push({
-            role: "user",
-            content: [
-                {
-                    "type": "text", //Using Gemini via openai requires text parameter
-                    "text": ""
-                },
-                {
-                    type: "image_url",
-                    image_url: { url: `data:image/png;base64,${base64Image}` },
-                },
-            ],
-        });
-        try {
-            const response = await axios_1.default.post("https://generativelanguage.googleapis.com/v1beta/openai/chat/completions", {
-                messages,
-                model,
-                // ...convertKeysToSnakeCase(llmParams ?? null),
-            }, {
-                headers: {
-                    Authorization: `Bearer ${apiKey}`,
-                    "Content-Type": "application/json",
-                },
-            });
-            const data = response.data;
-            return {
-                content: data.choices[0].message.content,
-                inputTokens: data.usage?.prompt_tokens ?? 0,
-                outputTokens: data.usage?.completion_tokens ?? 0,
-            };
-        }
-        catch (err) {
-            console.error("Google provider error:", err);
-            throw err;
-        }
-    }
-}
-exports.Google = Google;
diff --git a/core/dist/providers/index.d.ts b/core/dist/providers/index.d.ts
deleted file mode 100644
index 9503a8c..0000000
--- a/core/dist/providers/index.d.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-import { Completion } from "./utils/completion";
-import { ModelOptions } from "../types";
-export declare class getModel {
-    static getProviderForModel(model: ModelOptions): Completion;
-}
diff --git a/core/dist/providers/index.js b/core/dist/providers/index.js
deleted file mode 100644
index de6439d..0000000
--- a/core/dist/providers/index.js
+++ /dev/null
@@ -1,22 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.getModel = void 0;
-const openAI_1 = require("./openAI");
-const ollama_1 = require("./ollama");
-const google_1 = require("./google");
-const types_1 = require("../types");
-class getModel {
-    static getProviderForModel(model) {
-        if (Object.values(types_1.OpenAIModels).includes(model)) {
-            return new openAI_1.OpenAI();
-        }
-        if (Object.values(types_1.GoogleModels).includes(model)) {
-            return new google_1.Google();
-        }
-        if (Object.values(types_1.LocalModels).includes(model)) {
-            return new ollama_1.Ollama();
-        }
-        throw new Error(`No provider found for model "${model}"`);
-    }
-}
-exports.getModel = getModel;
diff --git a/core/dist/providers/ollama.d.ts b/core/dist/providers/ollama.d.ts
deleted file mode 100644
index 3079696..0000000
--- a/core/dist/providers/ollama.d.ts
+++ /dev/null
@@ -1,5 +0,0 @@
-import { Completion } from "./utils/completion";
-import { CompletionArgs, CompletionResponse } from "../types";
-export declare class Ollama implements Completion {
-    getCompletion(args: CompletionArgs): Promise<CompletionResponse>;
-}
mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.Ollama = void 0; -const axios_1 = __importDefault(require("axios")); -const types_1 = require("../types"); -const utils_1 = require("../utils"); -class Ollama { - async getCompletion(args) { - const { imagePath, llmParams, maintainFormat, model, priorPage, } = args; - const baseUrl = process.env.BASE_URL; - if (!baseUrl) { - throw new Error("Missing BASE_URL in environment variables."); - } - const validModels = Object.values(types_1.LocalModels); - if (!validModels.includes(model)) { - throw new Error(`Model "${model}" is not a local model.`); - } - const systemPrompt = ` - Convert the following image/document to markdown. - Return only the markdown with no explanation text. Do not include deliminators like '''markdown. - You must include all information on the page. Do not exclude headers, footers, or subtext. - `; - const messages = [{ role: "system", content: systemPrompt }]; - if (maintainFormat && priorPage) { - messages.push({ - role: "system", - content: `Please ensure markdown formatting remains consistent with the prior page:\n\n"""${priorPage}"""`, - }); - } - const base64Image = await (0, utils_1.encodeImageToBase64)(imagePath); - messages.push({ - role: "user", - content: [ - { - type: "image_url", - image_url: { url: `data:image/png;base64,${base64Image}` }, - }, - ], - }); - try { - const response = await axios_1.default.post(`${baseUrl}/chat/completions`, { - messages, - model, - ...(0, utils_1.convertKeysToSnakeCase)(llmParams ?? null), - }, { - headers: { - // Authorization: "ollama", - "Content-Type": "application/json", - }, - }); - const data = response.data; - return { - content: data.choices[0].message.content, - inputTokens: data.usage?.prompt_tokens ?? 0, - outputTokens: data.usage?.completion_tokens ?? 0, - }; - } - catch (err) { - console.error("Local provider error:", err); - throw err; - } - } -} -exports.Ollama = Ollama; diff --git a/core/dist/providers/openAI.d.ts b/core/dist/providers/openAI.d.ts deleted file mode 100644 index 45e2f52..0000000 --- a/core/dist/providers/openAI.d.ts +++ /dev/null @@ -1,5 +0,0 @@ -import { Completion } from "./utils/completion"; -import { CompletionArgs, CompletionResponse } from "../types"; -export declare class OpenAI implements Completion { - getCompletion(args: CompletionArgs): Promise; -} diff --git a/core/dist/providers/openAI.js b/core/dist/providers/openAI.js deleted file mode 100644 index 8e1e649..0000000 --- a/core/dist/providers/openAI.js +++ /dev/null @@ -1,67 +0,0 @@ -"use strict"; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -exports.OpenAI = void 0; -const axios_1 = __importDefault(require("axios")); -const types_1 = require("../types"); -const utils_1 = require("../utils"); -class OpenAI { - async getCompletion(args) { - const { imagePath, llmParams, maintainFormat, model, priorPage, } = args; - if (!process.env.OPENAI_API_KEY) { - throw new Error("Missing OPENAI_API_KEY in environment variables."); - } - const apiKey = process.env.OPENAI_API_KEY; - const validModels = Object.values(types_1.OpenAIModels); - if (!validModels.includes(model)) { - throw new Error(`Model "${model}" is not an OpenAI model.`); - } - const systemPrompt = ` - Convert the following image/document to markdown. - Return only the markdown with no explanation text. 
diff --git a/core/dist/providers/openAI.js b/core/dist/providers/openAI.js
deleted file mode 100644
index 8e1e649..0000000
--- a/core/dist/providers/openAI.js
+++ /dev/null
@@ -1,67 +0,0 @@
-"use strict";
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.OpenAI = void 0;
-const axios_1 = __importDefault(require("axios"));
-const types_1 = require("../types");
-const utils_1 = require("../utils");
-class OpenAI {
-    async getCompletion(args) {
-        const { imagePath, llmParams, maintainFormat, model, priorPage, } = args;
-        if (!process.env.OPENAI_API_KEY) {
-            throw new Error("Missing OPENAI_API_KEY in environment variables.");
-        }
-        const apiKey = process.env.OPENAI_API_KEY;
-        const validModels = Object.values(types_1.OpenAIModels);
-        if (!validModels.includes(model)) {
-            throw new Error(`Model "${model}" is not an OpenAI model.`);
-        }
-        const systemPrompt = `
-    Convert the following image/document to markdown.
-    Return only the markdown with no explanation text. Do not include deliminators like '''markdown.
-    You must include all information on the page. Do not exclude headers, footers, or subtext.
-  `;
-        const messages = [{ role: "system", content: systemPrompt }];
-        if (maintainFormat && priorPage) {
-            messages.push({
-                role: "system",
-                content: `Please ensure markdown formatting remains consistent with the prior page:\n\n"""${priorPage}"""`,
-            });
-        }
-        const base64Image = await (0, utils_1.encodeImageToBase64)(imagePath);
-        messages.push({
-            role: "user",
-            content: [
-                {
-                    type: "image_url",
-                    image_url: { url: `data:image/png;base64,${base64Image}` },
-                },
-            ],
-        });
-        try {
-            const response = await axios_1.default.post("https://api.openai.com/v1/chat/completions", {
-                messages,
-                model,
-                ...(0, utils_1.convertKeysToSnakeCase)(llmParams ?? null),
-            }, {
-                headers: {
-                    Authorization: `Bearer ${apiKey}`,
-                    "Content-Type": "application/json",
-                },
-            });
-            const data = response.data;
-            return {
-                content: data.choices[0].message.content,
-                inputTokens: data.usage?.prompt_tokens ?? 0,
-                outputTokens: data.usage?.completion_tokens ?? 0,
-            };
-        }
-        catch (err) {
-            console.error("OpenAI error:", err);
-            throw err;
-        }
-    }
-}
-exports.OpenAI = OpenAI;
diff --git a/core/dist/providers/utils/completion.d.ts b/core/dist/providers/utils/completion.d.ts
deleted file mode 100644
index 7753fb5..0000000
--- a/core/dist/providers/utils/completion.d.ts
+++ /dev/null
@@ -1,4 +0,0 @@
-import { CompletionArgs, CompletionResponse } from "../../types";
-export interface Completion {
-    getCompletion(args: CompletionArgs): Promise<CompletionResponse>;
-}
diff --git a/core/dist/providers/utils/completion.js b/core/dist/providers/utils/completion.js
deleted file mode 100644
index c8ad2e5..0000000
--- a/core/dist/providers/utils/completion.js
+++ /dev/null
@@ -1,2 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
diff --git a/core/dist/types.d.ts b/core/dist/types.d.ts
deleted file mode 100644
index 3f16a1c..0000000
--- a/core/dist/types.d.ts
+++ /dev/null
@@ -1,59 +0,0 @@
-export declare enum OpenAIModels {
-    GPT_4O = "gpt-4o",
-    GPT_4O_MINI = "gpt-4o-mini",
-    GPT_4_1 = "gpt-4.1",
-    GPT_4_1_MINI = "gpt-4.1-mini"
-}
-export declare enum LocalModels {
-    LLAMA3_2_VISION = "llama3.2-vision"
-}
-export declare enum GoogleModels {
-    GEMINI_2_FLASH = "gemini-2.0-flash-001",
-    GEMINI_2_FLASH_LITE = "gemini-2.0-flash-lite-preview-02-05",
-    GEMINI_1_5_FLASH = "gemini-1.5-flash",
-    GEMINI_1_5_FLASH_8B = "gemini-1.5-flash-8b",
-    GEMINI_1_5_PRO = "gemini-1.5-pro"
-}
-export type ModelOptions = OpenAIModels | GoogleModels | LocalModels;
-export interface DocumindArgs {
-    cleanup?: boolean;
-    concurrency?: number;
-    filePath: string;
-    llmParams?: LLMParams;
-    maintainFormat?: boolean;
-    model?: ModelOptions;
-    outputDir?: string;
-    pagesToConvertAsImages?: number | number[];
-    tempDir?: string;
-}
-export interface Page {
-    content: string;
-    contentLength: number;
-    page: number;
-}
-export interface DocumindOutput {
-    completionTime: number;
-    fileName: string;
-    inputTokens: number;
-    outputTokens: number;
-    pages: Page[];
-}
-export interface CompletionResponse {
-    content: string;
-    inputTokens: number;
-    outputTokens: number;
-}
-export interface CompletionArgs {
-    imagePath: string;
-    llmParams?: LLMParams;
-    maintainFormat: boolean;
-    model: ModelOptions;
-    priorPage: string;
-}
-export interface LLMParams {
-    frequencyPenalty?: number;
-    maxTokens?: number;
-    presencePenalty?: number;
-    temperature?: number;
-    topP?: number;
-}
diff --git a/core/dist/types.js b/core/dist/types.js
deleted file mode 100644
index 2828a4b..0000000
--- a/core/dist/types.js
+++ /dev/null
@@ -1,23 +0,0 @@
-"use strict";
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.GoogleModels = exports.LocalModels = exports.OpenAIModels = void 0;
-var OpenAIModels;
-(function (OpenAIModels) {
-    OpenAIModels["GPT_4O"] = "gpt-4o";
-    OpenAIModels["GPT_4O_MINI"] = "gpt-4o-mini";
-    OpenAIModels["GPT_4_1"] = "gpt-4.1";
-    OpenAIModels["GPT_4_1_MINI"] = "gpt-4.1-mini";
-})(OpenAIModels || (exports.OpenAIModels = OpenAIModels = {}));
-var LocalModels;
-(function (LocalModels) {
-    //LLAVA = "llava",
-    LocalModels["LLAMA3_2_VISION"] = "llama3.2-vision";
-})(LocalModels || (exports.LocalModels = LocalModels = {}));
-var GoogleModels;
-(function (GoogleModels) {
-    GoogleModels["GEMINI_2_FLASH"] = "gemini-2.0-flash-001";
-    GoogleModels["GEMINI_2_FLASH_LITE"] = "gemini-2.0-flash-lite-preview-02-05";
-    GoogleModels["GEMINI_1_5_FLASH"] = "gemini-1.5-flash";
-    GoogleModels["GEMINI_1_5_FLASH_8B"] = "gemini-1.5-flash-8b";
-    GoogleModels["GEMINI_1_5_PRO"] = "gemini-1.5-pro";
-})(GoogleModels || (exports.GoogleModels = GoogleModels = {}));
diff --git a/core/dist/utils.d.ts b/core/dist/utils.d.ts
deleted file mode 100644
index aeab40c..0000000
--- a/core/dist/utils.d.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-import { LLMParams } from "./types";
-export declare const validateLLMParams: (params: Partial<LLMParams>) => LLMParams;
-export declare const encodeImageToBase64: (imagePath: string) => Promise<string>;
-export declare const formatMarkdown: (text: string) => string;
-export declare const isString: (value: string | null) => value is string;
-export declare const isValidUrl: (string: string) => boolean;
-export declare const downloadFile: ({ filePath, tempDir, }: {
-    filePath: string;
-    tempDir: string;
-}) => Promise<{
-    extension: string;
-    localPath: string;
-}>;
-export declare const getTextFromImage: (buffer: Buffer) => Promise<{
-    confidence: number;
-}>;
-export declare const convertPdfToImages: ({ localPath, pagesToConvertAsImages, tempDir, }: {
-    localPath: string;
-    pagesToConvertAsImages: number | number[];
-    tempDir: string;
-}) => Promise;
-export declare const convertFileToPdf: ({ extension, localPath, tempDir, }: {
-    extension: string;
-    localPath: string;
-    tempDir: string;
-}) => Promise<string>;
-export declare const convertKeysToSnakeCase: (obj: Record<string, any> | null) => Record<string, any>;
diff --git a/core/dist/utils.js b/core/dist/utils.js
deleted file mode 100644
index 0885a39..0000000
--- a/core/dist/utils.js
+++ /dev/null
@@ -1,258 +0,0 @@
-"use strict";
-var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    var desc = Object.getOwnPropertyDescriptor(m, k);
-    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
-        desc = { enumerable: true, get: function() { return m[k]; } };
-    }
-    Object.defineProperty(o, k2, desc);
-}) : (function(o, m, k, k2) {
-    if (k2 === undefined) k2 = k;
-    o[k2] = m[k];
-}));
-var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
-    Object.defineProperty(o, "default", { enumerable: true, value: v });
-}) : function(o, v) {
-    o["default"] = v;
-});
-var __importStar = (this && this.__importStar) || function (mod) {
-    if (mod && mod.__esModule) return mod;
-    var result = {};
-    if (mod != null) for (var k in mod) if (k !== "default" && Object.prototype.hasOwnProperty.call(mod, k)) __createBinding(result, mod, k);
-    __setModuleDefault(result, mod);
-    return result;
-};
-var __importDefault = (this && this.__importDefault) || function (mod) {
-    return (mod && mod.__esModule) ? mod : { "default": mod };
-};
-Object.defineProperty(exports, "__esModule", { value: true });
-exports.convertKeysToSnakeCase = exports.convertFileToPdf = exports.convertPdfToImages = exports.getTextFromImage = exports.downloadFile = exports.isValidUrl = exports.isString = exports.formatMarkdown = exports.encodeImageToBase64 = exports.validateLLMParams = void 0;
-const libreoffice_convert_1 = require("libreoffice-convert");
-const pdf2pic_1 = require("pdf2pic");
-const promises_1 = require("stream/promises");
-const util_1 = require("util");
-const Tesseract = __importStar(require("tesseract.js"));
-const axios_1 = __importDefault(require("axios"));
-const fs_extra_1 = __importDefault(require("fs-extra"));
-const mime_types_1 = __importDefault(require("mime-types"));
-const path_1 = __importDefault(require("path"));
-const sharp_1 = __importDefault(require("sharp"));
-const convertAsync = (0, util_1.promisify)(libreoffice_convert_1.convert);
-const defaultLLMParams = {
-    frequencyPenalty: 0, // OpenAI defaults to 0
-    maxTokens: 4000,
-    presencePenalty: 0, // OpenAI defaults to 0
-    temperature: 0,
-    topP: 1, // OpenAI defaults to 1
-};
-const validateLLMParams = (params) => {
-    const validKeys = Object.keys(defaultLLMParams);
-    for (const [key, value] of Object.entries(params)) {
-        if (!validKeys.includes(key)) {
-            throw new Error(`Invalid LLM parameter: ${key}`);
-        }
-        if (typeof value !== "number") {
-            throw new Error(`Value for '${key}' must be a number`);
-        }
-    }
-    return { ...defaultLLMParams, ...params };
-};
-exports.validateLLMParams = validateLLMParams;
-const encodeImageToBase64 = async (imagePath) => {
-    const imageBuffer = await fs_extra_1.default.readFile(imagePath);
-    return imageBuffer.toString("base64");
-};
-exports.encodeImageToBase64 = encodeImageToBase64;
-// Strip out the ```markdown wrapper
-const formatMarkdown = (text) => {
-    let formattedMarkdown = text?.trim();
-    let loopCount = 0;
-    const maxLoops = 3;
-    const startsWithMarkdown = formattedMarkdown.startsWith("```markdown");
-    while (startsWithMarkdown && loopCount < maxLoops) {
-        const endsWithClosing = formattedMarkdown.endsWith("```");
-        if (startsWithMarkdown && endsWithClosing) {
-            const outermostBlockRegex = /^```markdown\n([\s\S]*?)\n```$/;
-            const match = outermostBlockRegex.exec(formattedMarkdown);
-            if (match) {
-                formattedMarkdown = match[1].trim();
-                loopCount++;
-            }
-            else {
-                break;
-            }
-        }
-        else {
-            break;
-        }
-    }
-    return formattedMarkdown;
-};
-exports.formatMarkdown = formatMarkdown;
-const isString = (value) => {
-    return value !== null;
-};
-exports.isString = isString;
-const isValidUrl = (string) => {
-    let url;
-    try {
-        url = new URL(string);
-    }
-    catch (_) {
-        return false;
-    }
-    return url.protocol === "http:" || url.protocol === "https:";
-};
-exports.isValidUrl = isValidUrl;
-// Save file to local tmp directory
-const downloadFile = async ({ filePath, tempDir, }) => {
-    // Shorten the file name by removing URL parameters
-    const baseFileName = path_1.default.basename(filePath.split("?")[0]);
-    const localPath = path_1.default.join(tempDir, baseFileName);
-    let mimetype;
-    // Check if filePath is a URL
-    if ((0, exports.isValidUrl)(filePath)) {
-        const writer = fs_extra_1.default.createWriteStream(localPath);
-        const response = await (0, axios_1.default)({
-            url: filePath,
-            method: "GET",
-            responseType: "stream",
-        });
-        if (response.status !== 200) {
-            throw new Error(`HTTP error! Status: ${response.status}`);
-        }
-        mimetype = response.headers?.["content-type"];
-        await (0, promises_1.pipeline)(response.data, writer);
-    }
-    else {
-        // If filePath is a local file, copy it to the temp directory
-        await fs_extra_1.default.copyFile(filePath, localPath);
-    }
-    if (!mimetype) {
-        mimetype = mime_types_1.default.lookup(localPath);
-    }
-    let extension = mime_types_1.default.extension(mimetype) || "";
-    if (!extension) {
-        if (mimetype === "binary/octet-stream") {
-            extension = ".bin";
-        }
-        else {
-            throw new Error("File extension missing");
-        }
-    }
-    if (!extension.startsWith(".")) {
-        extension = `.${extension}`;
-    }
-    return { extension, localPath };
-};
-exports.downloadFile = downloadFile;
-// Extract text confidence from image buffer using Tesseract
-const getTextFromImage = async (buffer) => {
-    try {
-        // Get image and metadata
-        const image = (0, sharp_1.default)(buffer);
-        const metadata = await image.metadata();
-        // Crop to a 150px wide column in the center of the document.
-        // This section produced the highest confidence/speed tradeoffs.
-        const cropWidth = 150;
-        const cropHeight = metadata.height || 0;
-        const left = Math.max(0, Math.floor((metadata.width - cropWidth) / 2));
-        const top = 0;
-        // Extract the cropped image
-        const croppedBuffer = await image
-            .extract({ left, top, width: cropWidth, height: cropHeight })
-            .toBuffer();
-        // Pass the croppedBuffer to Tesseract.recognize
-        // @TODO: How can we generalize this to non eng languages?
-        const { data: { confidence }, } = await Tesseract.recognize(croppedBuffer, "eng");
-        return { confidence };
-    }
-    catch (error) {
-        console.error("Error during OCR:", error);
-        return { confidence: 0 };
-    }
-};
-exports.getTextFromImage = getTextFromImage;
-// Correct image orientation based on OCR confidence
-// Run Tesseract on 4 different orientations of the image and compare the output
-const correctImageOrientation = async (buffer) => {
-    const image = (0, sharp_1.default)(buffer);
-    const rotations = [0, 90, 180, 270];
-    const results = await Promise.all(rotations.map(async (rotation) => {
-        const rotatedImageBuffer = await image
-            .clone()
-            .rotate(rotation)
-            .toBuffer();
-        const { confidence } = await (0, exports.getTextFromImage)(rotatedImageBuffer);
-        return { rotation, confidence };
-    }));
-    // Find the rotation with the best confidence score
-    const bestResult = results.reduce((best, current) => current.confidence > best.confidence ? current : best);
-    if (bestResult.rotation !== 0) {
-        console.log(`Reorienting image ${bestResult.rotation} degrees (Confidence: ${bestResult.confidence}%).`);
-    }
-    // Rotate the image to the best orientation
-    const correctedImageBuffer = await image
-        .rotate(bestResult.rotation)
-        .toBuffer();
-    return correctedImageBuffer;
-};
-// Convert each page to a png, correct orientation, and save that image to tmp
-const convertPdfToImages = async ({ localPath, pagesToConvertAsImages, tempDir, }) => {
-    const options = {
-        density: 300,
-        format: "png",
-        height: 2048,
-        preserveAspectRatio: true,
-        saveFilename: path_1.default.basename(localPath, path_1.default.extname(localPath)),
-        savePath: tempDir,
-    };
-    const storeAsImage = (0, pdf2pic_1.fromPath)(localPath, options);
-    try {
-        const convertResults = await storeAsImage.bulk(pagesToConvertAsImages, {
-            responseType: "buffer",
-        });
-        await Promise.all(convertResults.map(async (result) => {
-            if (!result || !result.buffer) {
-                throw new Error("Could not convert page to image buffer");
-            }
-            if (!result.page)
-                throw new Error("Could not identify page data");
-            const paddedPageNumber = result.page.toString().padStart(5, "0");
-            // Correct the image orientation
-            const correctedBuffer = await correctImageOrientation(result.buffer);
-            const imagePath = path_1.default.join(tempDir, `${options.saveFilename}_page_${paddedPageNumber}.png`);
-            await fs_extra_1.default.writeFile(imagePath, correctedBuffer);
-        }));
-        return convertResults;
-    }
-    catch (err) {
-        console.error("Error during PDF conversion:", err);
-        throw err;
-    }
-};
-exports.convertPdfToImages = convertPdfToImages;
-// Convert each page (from other formats like docx) to a png and save that image to tmp
-const convertFileToPdf = async ({ extension, localPath, tempDir, }) => {
-    const inputBuffer = await fs_extra_1.default.readFile(localPath);
-    const outputFilename = path_1.default.basename(localPath, extension) + ".pdf";
-    const outputPath = path_1.default.join(tempDir, outputFilename);
-    try {
-        const pdfBuffer = await convertAsync(inputBuffer, ".pdf", undefined);
-        await fs_extra_1.default.writeFile(outputPath, pdfBuffer);
-        return outputPath;
-    }
-    catch (err) {
-        console.error(`Error converting ${extension} to .pdf:`, err);
-        throw err;
-    }
-};
-exports.convertFileToPdf = convertFileToPdf;
-const camelToSnakeCase = (str) => str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
-const convertKeysToSnakeCase = (obj) => {
-    if (typeof obj !== "object" || obj === null) {
-        return obj ?? {};
-    }
-    return Object.fromEntries(Object.entries(obj).map(([key, value]) => [camelToSnakeCase(key), value]));
-};
-exports.convertKeysToSnakeCase = convertKeysToSnakeCase;
diff --git a/core/package.json b/core/package.json
index 5efc90a..696b4e8 100644
--- a/core/package.json
+++ b/core/package.json
@@ -3,12 +3,13 @@
   "version": "1.0.0",
   "description": "Core package for document conversion",
   "main": "dist/index.js",
-  "type": "commonjs",
+  "types": "dist/index.d.ts",
+  "type": "module",
   "files": [
     "dist"
   ],
   "scripts": {
-    "build": "tsc"
+    "build": "tsc --build && tsc-alias -p tsconfig.json"
   },
   "dependencies": {
     "axios": "^1.7.2",
diff --git a/core/src/index.ts b/core/src/index.ts
index fd87076..001c399 100644
--- a/core/src/index.ts
+++ b/core/src/index.ts
@@ -5,7 +5,7 @@ import {
   formatMarkdown,
   isString,
   validateLLMParams
-} from "./utils";
+} from "./utils.js";
 import fs from "fs-extra";
 import os from "os";
 import path from "path";
@@ -15,9 +15,9 @@ import {
   DocumindOutput,
   ModelOptions,
   OpenAIModels,
-} from "./types";
-import { getModel } from "./providers";
-import { Completion } from "./providers/utils/completion";
+} from "./types.js";
+import { getModel } from "./providers/index.js";
+import { Completion } from "./providers/utils/completion.js";
 
 export const documind = async ({
   cleanup = true,
@@ -29,6 +29,9 @@ export const documind = async ({
   outputDir,
   pagesToConvertAsImages = -1,
   tempDir = os.tmpdir(),
+  language = "eng",
+  imageQuality = 85,
+  maxImageWidth = 2048,
 }: DocumindArgs): Promise<DocumindOutput> => {
 
   let inputTokenCount = 0;
@@ -83,6 +86,9 @@ export const documind = async ({
       localPath: pdfPath,
       pagesToConvertAsImages,
       tempDir: tempDirectory,
+      language,
+      imageQuality,
+      maxImageWidth,
     });
   }
 
@@ -94,9 +100,9 @@ export const documind = async ({
     .toLowerCase()
     .substring(0, 255); // Truncate file name to 255 characters to prevent ENAMETOOLONG errors
 
-  // Get list of converted images
+  // Get list of converted images (now in JPG format for better compression)
   const files = await fs.readdir(tempDirectory);
-  const images = files.filter((file) => file.endsWith(".png"));
+  const images = files.filter((file) => file.endsWith(".jpg") || file.endsWith(".png"));
 
   if (maintainFormat) {
     // Use synchronous processing
@@ -211,3 +217,6 @@ export const documind = async ({
     pages: formattedPages,
   };
 };
+
+export type { ModelOptions, DocumindArgs, DocumindOutput } from "./types.js";
+export { OpenAIModels, GoogleModels, LocalModels } from "./types.js";
diff --git a/core/src/providers/google.ts b/core/src/providers/google.ts
index 8052be8..febb15b 100644
--- a/core/src/providers/google.ts
+++ b/core/src/providers/google.ts
@@ -1,7 +1,7 @@
 import axios from "axios";
-import { Completion } from "./utils/completion";
-import { CompletionArgs, CompletionResponse, GoogleModels } from "../types";
-import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils";
+import { Completion } from "./utils/completion.js";
+import { CompletionArgs, CompletionResponse, GoogleModels } from "../types.js";
+import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils.js";
 
 export class Google implements Completion {
   public async getCompletion(args: CompletionArgs): Promise<CompletionResponse> {
"./google"; -import { ModelOptions, OpenAIModels, LocalModels, GoogleModels } from "../types"; +import { Completion } from "./utils/completion.js"; +import { OpenAI } from "./openAI.js"; +import { Ollama } from "./ollama.js"; +import { Google } from "./google.js"; +import { ModelOptions, OpenAIModels, LocalModels, GoogleModels } from "../types.js"; export class getModel { public static getProviderForModel(model: ModelOptions): Completion { diff --git a/core/src/providers/ollama.ts b/core/src/providers/ollama.ts index ece3559..22ed612 100644 --- a/core/src/providers/ollama.ts +++ b/core/src/providers/ollama.ts @@ -1,7 +1,7 @@ import axios from "axios"; -import { Completion } from "./utils/completion"; -import { CompletionArgs, CompletionResponse, LocalModels } from "../types"; -import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils"; +import { Completion } from "./utils/completion.js"; +import { CompletionArgs, CompletionResponse, LocalModels } from "../types.js"; +import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils.js"; export class Ollama implements Completion { public async getCompletion(args: CompletionArgs): Promise { diff --git a/core/src/providers/openAI.ts b/core/src/providers/openAI.ts index 90431fa..594a924 100644 --- a/core/src/providers/openAI.ts +++ b/core/src/providers/openAI.ts @@ -1,7 +1,7 @@ import axios from "axios"; -import { Completion } from "./utils/completion"; -import { CompletionArgs, CompletionResponse, OpenAIModels } from "../types"; -import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils"; +import { Completion } from "./utils/completion.js"; +import { CompletionArgs, CompletionResponse, OpenAIModels } from "../types.js"; +import { convertKeysToSnakeCase, encodeImageToBase64 } from "../utils.js"; export class OpenAI implements Completion { public async getCompletion(args: CompletionArgs): Promise { diff --git a/core/src/providers/utils/completion.ts b/core/src/providers/utils/completion.ts index bbce5ba..3d2ac52 100644 --- a/core/src/providers/utils/completion.ts +++ b/core/src/providers/utils/completion.ts @@ -1,4 +1,4 @@ -import { CompletionArgs, CompletionResponse } from "../../types"; +import { CompletionArgs, CompletionResponse } from "../../types.js"; export interface Completion { getCompletion(args: CompletionArgs): Promise; diff --git a/core/src/types.ts b/core/src/types.ts index 67abc8b..25dbf59 100644 --- a/core/src/types.ts +++ b/core/src/types.ts @@ -6,8 +6,10 @@ export enum OpenAIModels { } export enum LocalModels { - //LLAVA = "llava", + LLAVA13 = "llava:13b", + LLAVA = "llava", LLAMA3_2_VISION = "llama3.2-vision", + QWEN2_5VL_7B = "qwen2.5vl:7b", } export enum GoogleModels { @@ -30,6 +32,9 @@ export interface DocumindArgs { outputDir?: string; pagesToConvertAsImages?: number | number[]; tempDir?: string; + language?: string; // Language code for OCR (e.g., 'eng', 'deu', 'fra') + imageQuality?: number; // Image quality for compression (1-100, default 85) + maxImageWidth?: number; // Max width for image resizing (default 2048) } export interface Page { diff --git a/core/src/utils.ts b/core/src/utils.ts index 9377fe4..e06e075 100644 --- a/core/src/utils.ts +++ b/core/src/utils.ts @@ -1,9 +1,9 @@ import { convert } from "libreoffice-convert"; import { fromPath } from "pdf2pic"; -import { LLMParams } from "./types"; +import { LLMParams } from "./types.js"; import { pipeline } from "stream/promises"; import { promisify } from "util"; -import * as Tesseract from "tesseract.js"; +import Tesseract from 
"tesseract.js"; import axios from "axios"; import fs from "fs-extra"; import mime from "mime-types"; @@ -137,7 +137,8 @@ export const downloadFile = async ({ // Extract text confidence from image buffer using Tesseract export const getTextFromImage = async ( - buffer: Buffer + buffer: Buffer, + language: string = "eng" ): Promise<{ confidence: number }> => { try { // Get image and metadata @@ -156,11 +157,10 @@ export const getTextFromImage = async ( .extract({ left, top, width: cropWidth, height: cropHeight }) .toBuffer(); - // Pass the croppedBuffer to Tesseract.recognize - // @TODO: How can we generalize this to non eng languages? + // Pass the croppedBuffer to Tesseract.recognize with language parameter const { data: { confidence }, - } = await Tesseract.recognize(croppedBuffer, "eng"); + } = await Tesseract.recognize(croppedBuffer, language); return { confidence }; } catch (error) { @@ -171,7 +171,10 @@ export const getTextFromImage = async ( // Correct image orientation based on OCR confidence // Run Tesseract on 4 different orientations of the image and compare the output -const correctImageOrientation = async (buffer: Buffer): Promise => { +const correctImageOrientation = async ( + buffer: Buffer, + language: string = "eng" +): Promise => { const image = sharp(buffer); const rotations = [0, 90, 180, 270]; @@ -181,7 +184,7 @@ const correctImageOrientation = async (buffer: Buffer): Promise => { .clone() .rotate(rotation) .toBuffer(); - const { confidence } = await getTextFromImage(rotatedImageBuffer); + const { confidence } = await getTextFromImage(rotatedImageBuffer, language); return { rotation, confidence }; }) ); @@ -205,20 +208,27 @@ const correctImageOrientation = async (buffer: Buffer): Promise => { return correctedImageBuffer; }; -// Convert each page to a png, correct orientation, and save that image to tmp +// Convert each page to a png, correct orientation, optimize, and save that image to tmp export const convertPdfToImages = async ({ localPath, pagesToConvertAsImages, tempDir, + language = "eng", + imageQuality = 85, + maxImageWidth = 2048, }: { localPath: string; pagesToConvertAsImages: number | number[]; tempDir: string; + language?: string; + imageQuality?: number; + maxImageWidth?: number; }) => { + // Use lower density (72 DPI is standard screen resolution, sufficient for LLM vision) const options = { - density: 300, + density: 72, // Reduced to 72 DPI to avoid high DPI warnings and reduce file size format: "png", - height: 2048, + width: maxImageWidth, // Use width instead of height for better control preserveAspectRatio: true, saveFilename: path.basename(localPath, path.extname(localPath)), savePath: tempDir, @@ -237,14 +247,29 @@ export const convertPdfToImages = async ({ if (!result.page) throw new Error("Could not identify page data"); const paddedPageNumber = result.page.toString().padStart(5, "0"); - // Correct the image orientation - const correctedBuffer = await correctImageOrientation(result.buffer); - + // Correct the image orientation with language parameter + const correctedBuffer = await correctImageOrientation(result.buffer, language); + + // Optimize image for LLM: resize and compress aggressively + // Convert to JPEG for better compression (much smaller than PNG) + const optimizedBuffer = await sharp(correctedBuffer) + .resize(maxImageWidth, null, { + fit: 'inside', + withoutEnlargement: true, + }) + .jpeg({ + quality: imageQuality, + progressive: true, + mozjpeg: true // Use mozjpeg for better compression + }) + .toBuffer(); + + // Save as JPG 
instead of PNG for better compression const imagePath = path.join( tempDir, - `${options.saveFilename}_page_${paddedPageNumber}.png` + `${options.saveFilename}_page_${paddedPageNumber}.jpg` ); - await fs.writeFile(imagePath, correctedBuffer); + await fs.writeFile(imagePath, optimizedBuffer); }) ); return convertResults; diff --git a/core/tsconfig.json b/core/tsconfig.json index b962015..a96cc4b 100644 --- a/core/tsconfig.json +++ b/core/tsconfig.json @@ -1,12 +1,16 @@ { "compilerOptions": { "target": "ES2020", - "module": "CommonJS", + "module": "ES2020", + "moduleResolution": "node", "declaration": true, + "declarationMap": true, "outDir": "./dist", + "rootDir": "./src", "strict": true, "esModuleInterop": true, - "skipLibCheck": true + "skipLibCheck": true, + "composite": true }, "include": ["src/**/*"], "exclude": ["node_modules", "dist"] diff --git a/extractor/package.json b/extractor/package.json index 7953dd3..440346e 100644 --- a/extractor/package.json +++ b/extractor/package.json @@ -3,9 +3,15 @@ "version": "1.0.0", "description": "Document extraction and processing package.", "type": "module", + "main": "dist/index.js", + "types": "dist/index.d.ts", + "files": [ + "dist" + ], "scripts": { - "start": "node src/index.js", - "dev": "nodemon src/index.js" + "build": "tsc --build && tsc-alias -p tsconfig.json", + "start": "node dist/index.js", + "dev": "tsc --watch" }, "dependencies": { "@ai-sdk/google": "^1.1.14", @@ -21,6 +27,10 @@ "zod-to-json-schema": "^3.24.2" }, "devDependencies": { - "nodemon": "^3.1.7" + "@types/node": "^20.14.11", + "@types/uuid": "^11.0.0", + "nodemon": "^3.1.7", + "tsc-alias": "^1.8.8", + "typescript": "^5.6.3" } } diff --git a/extractor/src/autoschema/autogenerateSchema.d.ts b/extractor/src/autoschema/autogenerateSchema.d.ts new file mode 100644 index 0000000..b6d677c --- /dev/null +++ b/extractor/src/autoschema/autogenerateSchema.d.ts @@ -0,0 +1,8 @@ +import { SchemaField } from '../utils/convertToZodSchema.js'; +type AutoSchemaOption = boolean | { + instructions: string; +}; +export declare function autogenerateSchema(markdown: string, model: string, autoSchema: AutoSchemaOption): Promise; +export declare const autoschema: typeof autogenerateSchema; +export {}; +//# sourceMappingURL=autogenerateSchema.d.ts.map \ No newline at end of file diff --git a/extractor/src/autoschema/autogenerateSchema.d.ts.map b/extractor/src/autoschema/autogenerateSchema.d.ts.map new file mode 100644 index 0000000..c6156d2 --- /dev/null +++ b/extractor/src/autoschema/autogenerateSchema.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"autogenerateSchema.d.ts","sourceRoot":"","sources":["autogenerateSchema.ts"],"names":[],"mappings":"AAMA,OAAO,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAG7D,KAAK,gBAAgB,GAAG,OAAO,GAAG;IAAE,YAAY,EAAE,MAAM,CAAA;CAAE,CAAC;AAE3D,wBAAsB,kBAAkB,CACtC,QAAQ,EAAE,MAAM,EAChB,KAAK,EAAE,MAAM,EACb,UAAU,EAAE,gBAAgB,GAC3B,OAAO,CAAC,WAAW,EAAE,CAAC,CA0BxB;AAiED,eAAO,MAAM,UAAU,2BAAqB,CAAC"} \ No newline at end of file diff --git a/extractor/src/autoschema/autogenerateSchema.js b/extractor/src/autoschema/autogenerateSchema.js index 46cfd0f..26bc9fc 100644 --- a/extractor/src/autoschema/autogenerateSchema.js +++ b/extractor/src/autoschema/autogenerateSchema.js @@ -5,93 +5,65 @@ import { baseSchema } from './generation-schemas/base.js'; import { secondarySchema } from './generation-schemas/secondary.js'; import { cleanSchemaFields } from "./cleanSchemaFields.js"; import { z } from 'zod'; - export async function autogenerateSchema(markdown, model, autoSchema) { - if (autoSchema === 
true) { - return await blanketSchema(markdown, model); - } - - if ( - typeof autoSchema === "object" && - autoSchema !== null - ) { - const keys = Object.keys(autoSchema); - if (keys.length !== 1 || keys[0] !== "instructions") { - throw new Error("autoSchema object must only have a single 'instructions' property"); + if (autoSchema === true) { + return await blanketSchema(markdown, model); } - - if (typeof autoSchema.instructions !== "string" || !autoSchema.instructions.trim()) { - throw new Error("Instructions can't be empty"); + if (typeof autoSchema === "object" && + autoSchema !== null) { + const keys = Object.keys(autoSchema); + if (keys.length !== 1 || keys[0] !== "instructions") { + throw new Error("autoSchema object must only have a single 'instructions' property"); + } + if (typeof autoSchema.instructions !== "string" || !autoSchema.instructions.trim()) { + throw new Error("Instructions can't be empty"); + } + return await instructionBasedSchema(markdown, model, autoSchema.instructions); } - - - return await instructionBasedSchema( - markdown, - model, - autoSchema.instructions - ); - } - - return await blanketSchema(markdown, model); + return await blanketSchema(markdown, model); } - async function blanketSchema(markdown, model) { - const extraction = getExtractor(model); - const schemaToUse = extraction === googleExtractor ? secondarySchema : baseSchema; - - const result = await extraction({ - markdown, - zodSchema: schemaToUse, - prompt: AUTO_SCHEMA_PROMPT(markdown), - model: model, - }); - - if (!result || !result.fields) { - throw new Error("Error auto generating default schema."); - } - - return cleanSchemaFields(result.fields); + const extraction = getExtractor(model); + const schemaToUse = extraction === googleExtractor ? secondarySchema : baseSchema; + const result = await extraction({ + markdown, + zodSchema: schemaToUse, + prompt: AUTO_SCHEMA_PROMPT(markdown), + model: model, + }); + if (!result || !result.fields) { + throw new Error("Error auto generating default schema."); + } + return cleanSchemaFields(result.fields); } - async function instructionBasedSchema(markdown, model, instructions) { - - const instructionsZod = z.object({ - fields: z.array(z.string()), - }); - - const instructionPrompt = ` + const instructionsZod = z.object({ + fields: z.array(z.string()), + }); + const instructionPrompt = ` Extract the name of the fields the user wants to extract. - ` - - const extraction = getExtractor(model); - - const instructionFields = await extraction({ - markdown: instructions, - zodSchema: instructionsZod, - prompt: instructionPrompt, - model: model, - }); - - if (!instructionFields || !instructionFields.fields) { - throw new Error("Error identifying the fields to be extracted."); - } - - const data = instructionFields.fields; - - const schemaToUse = extraction === googleExtractor ? 
secondarySchema : baseSchema; - - const result = await extraction({ - markdown, - zodSchema: schemaToUse, - prompt: INSTRUCTIONS_SCHEMA_PROMPT(markdown, data), - model: model, - }); - - if (!result || !result.fields) { - throw new Error("Error auto generating specified schema."); - } - - return cleanSchemaFields(result.fields); + `; + const extraction = getExtractor(model); + const instructionFields = await extraction({ + markdown: instructions, + zodSchema: instructionsZod, + prompt: instructionPrompt, + model: model, + }); + if (!instructionFields || !instructionFields.fields) { + throw new Error("Error identifying the fields to be extracted."); + } + const data = instructionFields.fields; + const schemaToUse = extraction === googleExtractor ? secondarySchema : baseSchema; + const result = await extraction({ + markdown, + zodSchema: schemaToUse, + prompt: INSTRUCTIONS_SCHEMA_PROMPT(markdown, data), + model: model, + }); + if (!result || !result.fields) { + throw new Error("Error auto generating specified schema."); + } + return cleanSchemaFields(result.fields); } - -export const autoschema = autogenerateSchema; \ No newline at end of file +export const autoschema = autogenerateSchema; diff --git a/extractor/src/autoschema/autogenerateSchema.ts b/extractor/src/autoschema/autogenerateSchema.ts new file mode 100644 index 0000000..8d2844e --- /dev/null +++ b/extractor/src/autoschema/autogenerateSchema.ts @@ -0,0 +1,107 @@ +import { getExtractor } from '../extractors/index.js'; +import { googleExtractor } from '../extractors/google.js'; +import { AUTO_SCHEMA_PROMPT, INSTRUCTIONS_SCHEMA_PROMPT } from "../prompts.js"; +import { baseSchema } from './generation-schemas/base.js'; +import { secondarySchema } from './generation-schemas/secondary.js'; +import { cleanSchemaFields } from "./cleanSchemaFields.js"; +import { SchemaField } from '../utils/convertToZodSchema.js'; +import { z } from 'zod'; + +type AutoSchemaOption = boolean | { instructions: string }; + +export async function autogenerateSchema( + markdown: string, + model: string, + autoSchema: AutoSchemaOption +): Promise { + if (autoSchema === true) { + return await blanketSchema(markdown, model); + } + + if ( + typeof autoSchema === "object" && + autoSchema !== null + ) { + const keys = Object.keys(autoSchema); + if (keys.length !== 1 || keys[0] !== "instructions") { + throw new Error("autoSchema object must only have a single 'instructions' property"); + } + + if (typeof autoSchema.instructions !== "string" || !autoSchema.instructions.trim()) { + throw new Error("Instructions can't be empty"); + } + + return await instructionBasedSchema( + markdown, + model, + autoSchema.instructions + ); + } + + return await blanketSchema(markdown, model); +} + +async function blanketSchema(markdown: string, model: string): Promise { + const extraction = getExtractor(model); + const schemaToUse = extraction === googleExtractor ? secondarySchema : baseSchema; + + const result = await extraction({ + markdown, + zodSchema: schemaToUse, + prompt: AUTO_SCHEMA_PROMPT(markdown), + model: model, + }); + + if (!result || !result.fields) { + throw new Error("Error auto generating default schema."); + } + + return cleanSchemaFields(result.fields); +} + +async function instructionBasedSchema( + markdown: string, + model: string, + instructions: string +): Promise { + + const instructionsZod = z.object({ + fields: z.array(z.string()), + }); + + const instructionPrompt = ` + Extract the name of the fields the user wants to extract. 
+ `; + + const extraction = getExtractor(model); + + const instructionFields = await extraction({ + markdown: instructions, + zodSchema: instructionsZod, + prompt: instructionPrompt, + model: model, + }); + + if (!instructionFields || !instructionFields.fields) { + throw new Error("Error identifying the fields to be extracted."); + } + + const data = instructionFields.fields; + + const schemaToUse = extraction === googleExtractor ? secondarySchema : baseSchema; + + const result = await extraction({ + markdown, + zodSchema: schemaToUse, + prompt: INSTRUCTIONS_SCHEMA_PROMPT(markdown, data), + model: model, + }); + + if (!result || !result.fields) { + throw new Error("Error auto generating specified schema."); + } + + return cleanSchemaFields(result.fields); +} + +export const autoschema = autogenerateSchema; diff --git a/extractor/src/autoschema/cleanSchemaFields.d.ts b/extractor/src/autoschema/cleanSchemaFields.d.ts new file mode 100644 index 0000000..2e23e52 --- /dev/null +++ b/extractor/src/autoschema/cleanSchemaFields.d.ts @@ -0,0 +1,3 @@ +import { SchemaField } from '../utils/convertToZodSchema.js'; +export declare function cleanSchemaFields(fields: SchemaField[]): SchemaField[]; +//# sourceMappingURL=cleanSchemaFields.d.ts.map \ No newline at end of file diff --git a/extractor/src/autoschema/cleanSchemaFields.d.ts.map b/extractor/src/autoschema/cleanSchemaFields.d.ts.map new file mode 100644 index 0000000..097aaca --- /dev/null +++ b/extractor/src/autoschema/cleanSchemaFields.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"cleanSchemaFields.d.ts","sourceRoot":"","sources":["cleanSchemaFields.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAE7D,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,WAAW,EAAE,CASpE"} \ No newline at end of file diff --git a/extractor/src/autoschema/cleanSchemaFields.js b/extractor/src/autoschema/cleanSchemaFields.js index 84e754c..3d0a7bc 100644 --- a/extractor/src/autoschema/cleanSchemaFields.js +++ b/extractor/src/autoschema/cleanSchemaFields.js @@ -1,10 +1,11 @@ export function cleanSchemaFields(fields) { return fields.map((f) => { - if (f.children && f.children.length === 0) { - delete f.children; - } else if (f.children) { - f.children = cleanSchemaFields(f.children); - } - return f; + if (f.children && f.children.length === 0) { + delete f.children; + } + else if (f.children) { + f.children = cleanSchemaFields(f.children); + } + return f; }); - } \ No newline at end of file +} diff --git a/extractor/src/autoschema/cleanSchemaFields.ts b/extractor/src/autoschema/cleanSchemaFields.ts new file mode 100644 index 0000000..a936a0d --- /dev/null +++ b/extractor/src/autoschema/cleanSchemaFields.ts @@ -0,0 +1,12 @@ +import { SchemaField } from '../utils/convertToZodSchema.js'; + +export function cleanSchemaFields(fields: SchemaField[]): SchemaField[] { + return fields.map((f) => { + if (f.children && f.children.length === 0) { + delete f.children; + } else if (f.children) { + f.children = cleanSchemaFields(f.children); + } + return f; + }); + } diff --git a/extractor/src/autoschema/generation-schemas/base.d.ts b/extractor/src/autoschema/generation-schemas/base.d.ts new file mode 100644 index 0000000..0e1755b --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/base.d.ts @@ -0,0 +1,9 @@ +import { z } from "zod"; +export declare const baseSchema: z.ZodObject<{ + fields: z.ZodArray, "many">; +}, "strip", z.ZodTypeAny, { + fields: any[]; +}, { + fields: any[]; +}>; +//# sourceMappingURL=base.d.ts.map \ No newline at end of file diff --git 
a/extractor/src/autoschema/generation-schemas/base.d.ts.map b/extractor/src/autoschema/generation-schemas/base.d.ts.map new file mode 100644 index 0000000..8ce60da --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/base.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"base.d.ts","sourceRoot":"","sources":["base.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAWxB,eAAO,MAAM,UAAU;;;;;;EAErB,CAAC"} \ No newline at end of file diff --git a/extractor/src/autoschema/generation-schemas/base.js b/extractor/src/autoschema/generation-schemas/base.js index 80e144b..d8294f3 100644 --- a/extractor/src/autoschema/generation-schemas/base.js +++ b/extractor/src/autoschema/generation-schemas/base.js @@ -1,15 +1,11 @@ // Used for non-google models that don't have schema limitations import { z } from "zod"; - -const SchemaField = z.lazy(() => - z.object({ +const SchemaField = z.lazy(() => z.object({ name: z.string(), type: z.enum(["string", "number", "array", "object"]), description: z.string().optional(), children: z.array(SchemaField).optional(), - }) -); - +})); export const baseSchema = z.object({ - fields: z.array(SchemaField), -}); \ No newline at end of file + fields: z.array(SchemaField), +}); diff --git a/extractor/src/autoschema/generation-schemas/base.ts b/extractor/src/autoschema/generation-schemas/base.ts new file mode 100644 index 0000000..c367c60 --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/base.ts @@ -0,0 +1,15 @@ +// Used for non-google models that don't have schema limitations +import { z } from "zod"; + +const SchemaField: z.ZodType = z.lazy(() => + z.object({ + name: z.string(), + type: z.enum(["string", "number", "array", "object"]), + description: z.string().optional(), + children: z.array(SchemaField).optional(), + }) +); + +export const baseSchema = z.object({ + fields: z.array(SchemaField), +}); diff --git a/extractor/src/autoschema/generation-schemas/secondary.d.ts b/extractor/src/autoschema/generation-schemas/secondary.d.ts new file mode 100644 index 0000000..c735a31 --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/secondary.d.ts @@ -0,0 +1,63 @@ +import { z } from "zod"; +export declare const secondarySchema: z.ZodObject<{ + fields: z.ZodArray; + description: z.ZodOptional; + }, { + children: z.ZodOptional; + description: z.ZodOptional; + }, "strip", z.ZodTypeAny, { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }, { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }>, "many">>; + }>, "strip", z.ZodTypeAny, { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + children?: { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }[] | undefined; + }, { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + children?: { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }[] | undefined; + }>, "many">; +}, "strip", z.ZodTypeAny, { + fields: { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + children?: { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }[] | undefined; + }[]; +}, { + fields: { + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + children?: 
{ + type: "string" | "number" | "object" | "array"; + name: string; + description?: string | undefined; + }[] | undefined; + }[]; +}>; +//# sourceMappingURL=secondary.d.ts.map \ No newline at end of file diff --git a/extractor/src/autoschema/generation-schemas/secondary.d.ts.map b/extractor/src/autoschema/generation-schemas/secondary.d.ts.map new file mode 100644 index 0000000..a49cfdf --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/secondary.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"secondary.d.ts","sourceRoot":"","sources":["secondary.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAYxB,eAAO,MAAM,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAE1B,CAAC"} \ No newline at end of file diff --git a/extractor/src/autoschema/generation-schemas/secondary.js b/extractor/src/autoschema/generation-schemas/secondary.js index 4a99c86..a8ea0eb 100644 --- a/extractor/src/autoschema/generation-schemas/secondary.js +++ b/extractor/src/autoschema/generation-schemas/secondary.js @@ -1,16 +1,13 @@ // Used for google models import { z } from "zod"; - const BaseSchemaField = z.object({ - name: z.string(), - type: z.enum(["string", "number", "array", "object"]), - description: z.string().optional(), + name: z.string(), + type: z.enum(["string", "number", "array", "object"]), + description: z.string().optional(), }); - const SchemaField = BaseSchemaField.extend({ - children: z.array(BaseSchemaField).optional(), + children: z.array(BaseSchemaField).optional(), }); - export const secondarySchema = z.object({ - fields: z.array(SchemaField), -}); \ No newline at end of file + fields: z.array(SchemaField), +}); diff --git a/extractor/src/autoschema/generation-schemas/secondary.ts b/extractor/src/autoschema/generation-schemas/secondary.ts new file mode 100644 index 0000000..5d1baab --- /dev/null +++ b/extractor/src/autoschema/generation-schemas/secondary.ts @@ -0,0 +1,16 @@ +// Used for google models +import { z } from "zod"; + +const BaseSchemaField = z.object({ + name: z.string(), + type: z.enum(["string", "number", "array", "object"]), + description: z.string().optional(), +}); + +const SchemaField = BaseSchemaField.extend({ + children: z.array(BaseSchemaField).optional(), +}); + +export const secondarySchema = z.object({ + fields: z.array(SchemaField), +}); diff --git a/extractor/src/converter.js b/extractor/src/converter.js deleted file mode 100644 index 62df582..0000000 --- a/extractor/src/converter.js +++ /dev/null @@ -1,21 +0,0 @@ -import { documind } from 'core'; -import { generateMarkdownDocument } from './utils/generateMarkdown.js'; - -export const convertFile = async (filePath, model) => { - try { - const result = await documind({ - filePath, - model, - }); - - const { pages, fileName } = result; - const totalPages = pages.length; - - const markdown = await generateMarkdownDocument(pages); - //console.log('Markdown generated', markdown); - - return { markdown, totalPages, fileName }; - } catch (error) { - console.error('Error running documind core:', error); - } -}; diff --git a/extractor/src/converter.ts b/extractor/src/converter.ts new file mode 100644 index 0000000..31929eb --- /dev/null +++ b/extractor/src/converter.ts @@ -0,0 +1,37 @@ +import { documind, type ModelOptions } from 'core'; +import { generateMarkdownDocument } from './utils/generateMarkdown.js'; + +interface ConvertFileResult { + markdown: string; + totalPages: number; + fileName: string; +} + +export const convertFile = async ( + filePath: string, + model: string, + language: 
string = "eng", + imageQuality: number = 85, + maxImageWidth: number = 2048 +): Promise => { + try { + const result = await documind({ + filePath, + model: model as ModelOptions, + language, + imageQuality, + maxImageWidth, + }); + + const { pages, fileName } = result; + const totalPages = pages.length; + + const markdown = await generateMarkdownDocument(pages); + //console.log('Markdown generated', markdown); + + return { markdown, totalPages, fileName }; + } catch (error) { + console.error('Error running documind core:', error); + throw error; + } +}; diff --git a/extractor/src/extractors/google.d.ts b/extractor/src/extractors/google.d.ts new file mode 100644 index 0000000..0103748 --- /dev/null +++ b/extractor/src/extractors/google.d.ts @@ -0,0 +1,10 @@ +import { ZodObject, ZodRawShape } from "zod"; +interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} +export declare const googleExtractor: ({ markdown, zodSchema, prompt, model }: ExtractorParams) => Promise; +export {}; +//# sourceMappingURL=google.d.ts.map \ No newline at end of file diff --git a/extractor/src/extractors/google.d.ts.map b/extractor/src/extractors/google.d.ts.map new file mode 100644 index 0000000..7959d61 --- /dev/null +++ b/extractor/src/extractors/google.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"google.d.ts","sourceRoot":"","sources":["google.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,KAAK,CAAC;AAE7C,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC;IAClC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,eAAO,MAAM,eAAe,2CAAkD,eAAe,KAAG,OAAO,CAAC,GAAG,CAyC1G,CAAA"} \ No newline at end of file diff --git a/extractor/src/extractors/google.js b/extractor/src/extractors/google.js index 777fae1..7cb7e42 100644 --- a/extractor/src/extractors/google.js +++ b/extractor/src/extractors/google.js @@ -1,49 +1,36 @@ import { GoogleGenerativeAI } from "@google/generative-ai"; import { zodToJsonSchema } from "zod-to-json-schema"; - -const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY); - export const googleExtractor = async ({ markdown, zodSchema, prompt, model }) => { if (!process.env.GEMINI_API_KEY) { - throw new Error("Missing GEMINI_API_KEY"); + throw new Error("Missing GEMINI_API_KEY"); } - -const googleModel = model - -// Convert Zod schema to JSON schema -let jsonSchema = zodToJsonSchema(zodSchema); - -// Remove additionalProperties and $schema keys -const removeKeys = (obj) => { - if (Array.isArray(obj)) { - return obj.map(removeKeys); - } else if (typeof obj === "object" && obj !== null) { - return Object.fromEntries( - Object.entries(obj) + const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY); + const googleModel = model; + // Convert Zod schema to JSON schema + let jsonSchema = zodToJsonSchema(zodSchema); + // Remove additionalProperties and $schema keys + const removeKeys = (obj) => { + if (Array.isArray(obj)) { + return obj.map(removeKeys); + } + else if (typeof obj === "object" && obj !== null) { + return Object.fromEntries(Object.entries(obj) .filter(([key]) => key !== "additionalProperties" && key !== "$schema") - .map(([key, value]) => [key, removeKeys(value)]) - ); - } - return obj; -}; - -jsonSchema = removeKeys(jsonSchema); - -const modelToUse = genAI.getGenerativeModel({ - model: googleModel, - systemInstruction: prompt, - generationConfig: { - responseMimeType: "application/json", - responseSchema: jsonSchema, - }, + .map(([key, value]) => [key, removeKeys(value)])); + } 
+ return obj; + }; + jsonSchema = removeKeys(jsonSchema); + const modelToUse = genAI.getGenerativeModel({ + model: googleModel, + systemInstruction: prompt, + generationConfig: { + responseMimeType: "application/json", + responseSchema: jsonSchema, + }, }); - -const result = await modelToUse.generateContent( - markdown, - ); - -//console.log(result.response.text()); -const event = JSON.parse(result.response.text()) -return event; -} - + const result = await modelToUse.generateContent(markdown); + //console.log(result.response.text()); + const event = JSON.parse(result.response.text()); + return event; +}; diff --git a/extractor/src/extractors/google.ts b/extractor/src/extractors/google.ts new file mode 100644 index 0000000..c6648c1 --- /dev/null +++ b/extractor/src/extractors/google.ts @@ -0,0 +1,53 @@ +import { GoogleGenerativeAI } from "@google/generative-ai"; +import { zodToJsonSchema } from "zod-to-json-schema"; +import { ZodObject, ZodRawShape } from "zod"; + +interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} + +export const googleExtractor = async ({ markdown, zodSchema, prompt, model }: ExtractorParams): Promise => { + if (!process.env.GEMINI_API_KEY) { + throw new Error("Missing GEMINI_API_KEY"); + } + + const genAI = new GoogleGenerativeAI(process.env.GEMINI_API_KEY); + const googleModel = model; + + // Convert Zod schema to JSON schema + let jsonSchema = zodToJsonSchema(zodSchema); + + // Remove additionalProperties and $schema keys + const removeKeys = (obj: any): any => { + if (Array.isArray(obj)) { + return obj.map(removeKeys); + } else if (typeof obj === "object" && obj !== null) { + return Object.fromEntries( + Object.entries(obj) + .filter(([key]) => key !== "additionalProperties" && key !== "$schema") + .map(([key, value]) => [key, removeKeys(value)]) + ); + } + return obj; + }; + + jsonSchema = removeKeys(jsonSchema); + + const modelToUse = genAI.getGenerativeModel({ + model: googleModel, + systemInstruction: prompt, + generationConfig: { + responseMimeType: "application/json", + responseSchema: jsonSchema as any, + }, + }); + + const result = await modelToUse.generateContent(markdown); + + //console.log(result.response.text()); + const event = JSON.parse(result.response.text()); + return event; +} diff --git a/extractor/src/extractors/index.d.ts b/extractor/src/extractors/index.d.ts new file mode 100644 index 0000000..c34a6a7 --- /dev/null +++ b/extractor/src/extractors/index.d.ts @@ -0,0 +1,13 @@ +import { ZodObject, ZodRawShape } from "zod"; +export declare const OpenAIModels: string[]; +export declare const LocalModels: string[]; +export declare const GoogleModels: string[]; +export interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} +export type ExtractorFunction = (params: ExtractorParams) => Promise; +export declare function getExtractor(model: string): ExtractorFunction; +//# sourceMappingURL=index.d.ts.map \ No newline at end of file diff --git a/extractor/src/extractors/index.d.ts.map b/extractor/src/extractors/index.d.ts.map new file mode 100644 index 0000000..d03881d --- /dev/null +++ b/extractor/src/extractors/index.d.ts.map @@ -0,0 +1 @@ 
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["index.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,KAAK,CAAC;AAE7C,eAAO,MAAM,YAAY,UAAuD,CAAC;AACjF,eAAO,MAAM,WAAW,UAAsB,CAAC;AAC/C,eAAO,MAAM,YAAY,UAMxB,CAAC;AAEF,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC;IAClC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,MAAM,iBAAiB,GAAG,CAAC,MAAM,EAAE,eAAe,KAAK,OAAO,CAAC,GAAG,CAAC,CAAC;AAE1E,wBAAgB,YAAY,CAAC,KAAK,EAAE,MAAM,GAAG,iBAAiB,CAc7D"} \ No newline at end of file diff --git a/extractor/src/extractors/index.js b/extractor/src/extractors/index.js index 485ad2a..b4bb0c2 100644 --- a/extractor/src/extractors/index.js +++ b/extractor/src/extractors/index.js @@ -1,29 +1,24 @@ import { ollamaExtractor } from "./ollama.js"; import { openAIExtractor } from "./openAI.js"; import { googleExtractor } from "./google.js"; - export const OpenAIModels = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini"]; export const LocalModels = ["llama3.2-vision"]; export const GoogleModels = [ - "gemini-2.0-flash-001", - "gemini-2.0-flash-lite-preview-02-05", - "gemini-1.5-flash", - "gemini-1.5-flash-8b", - "gemini-1.5-pro" + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-preview-02-05", + "gemini-1.5-flash", + "gemini-1.5-flash-8b", + "gemini-1.5-pro" ]; - export function getExtractor(model) { - if (OpenAIModels.includes(model)) { - return openAIExtractor; - } - - if (GoogleModels.includes(model)) { - return googleExtractor; - } - - if (LocalModels.includes(model)) { - return ollamaExtractor; - } - - throw new Error(`Unrecognised model '${model}'.`); + if (OpenAIModels.includes(model)) { + return openAIExtractor; + } + if (GoogleModels.includes(model)) { + return googleExtractor; + } + if (LocalModels.includes(model)) { + return ollamaExtractor; + } + throw new Error(`Unrecognised model '${model}'.`); } diff --git a/extractor/src/extractors/index.ts b/extractor/src/extractors/index.ts new file mode 100644 index 0000000..95411ba --- /dev/null +++ b/extractor/src/extractors/index.ts @@ -0,0 +1,39 @@ +import { ollamaExtractor } from "./ollama.js"; +import { openAIExtractor } from "./openAI.js"; +import { googleExtractor } from "./google.js"; +import { ZodObject, ZodRawShape } from "zod"; + +export const OpenAIModels = ["gpt-4o", "gpt-4o-mini", "gpt-4.1", "gpt-4.1-mini"]; +export const LocalModels = ["llama3.2-vision", "llava", "llava:13b", "qwen2.5vl:7b"]; +export const GoogleModels = [ + "gemini-2.0-flash-001", + "gemini-2.0-flash-lite-preview-02-05", + "gemini-1.5-flash", + "gemini-1.5-flash-8b", + "gemini-1.5-pro" +]; + +export interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} + +export type ExtractorFunction = (params: ExtractorParams) => Promise; + +export function getExtractor(model: string): ExtractorFunction { + if (OpenAIModels.includes(model)) { + return openAIExtractor; + } + + if (GoogleModels.includes(model)) { + return googleExtractor; + } + + if (LocalModels.includes(model)) { + return ollamaExtractor; + } + + throw new Error(`Unrecognised model '${model}'.`); +} diff --git a/extractor/src/extractors/ollama.d.ts b/extractor/src/extractors/ollama.d.ts new file mode 100644 index 0000000..e1019a9 --- /dev/null +++ b/extractor/src/extractors/ollama.d.ts @@ -0,0 +1,10 @@ +import { ZodObject, ZodRawShape } from "zod"; +interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} +export declare const ollamaExtractor: ({ 
markdown, zodSchema, prompt, model }: ExtractorParams) => Promise; +export {}; +//# sourceMappingURL=ollama.d.ts.map \ No newline at end of file diff --git a/extractor/src/extractors/ollama.d.ts.map b/extractor/src/extractors/ollama.d.ts.map new file mode 100644 index 0000000..1a12e7b --- /dev/null +++ b/extractor/src/extractors/ollama.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"ollama.d.ts","sourceRoot":"","sources":["ollama.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,SAAS,EAAE,WAAW,EAAE,MAAM,KAAK,CAAC;AAE7C,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC;IAClC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,eAAO,MAAM,eAAe,2CAAkD,eAAe,KAAG,OAAO,CAAC,GAAG,CAuB1G,CAAA"} \ No newline at end of file diff --git a/extractor/src/extractors/ollama.js b/extractor/src/extractors/ollama.js index a4fedc7..9868034 100644 --- a/extractor/src/extractors/ollama.js +++ b/extractor/src/extractors/ollama.js @@ -1,27 +1,22 @@ import OpenAI from "openai"; import { zodResponseFormat } from "openai/helpers/zod"; - export const ollamaExtractor = async ({ markdown, zodSchema, prompt, model }) => { - if (!process.env.BASE_URL) { - throw new Error("Missing BASE_URL"); - } - - const openai = new OpenAI({ - baseURL: process.env.BASE_URL, - apiKey: 'ollama' - }); - - const ollamaModel = model; - - const completion = await openai.beta.chat.completions.parse({ - model: ollamaModel, - messages: [ - { role: "system", content: prompt }, - { role: "user", content: markdown }, - ], - response_format: zodResponseFormat(zodSchema, "event"), - }); - - const event = completion.choices[0].message.parsed; - return event; -} + if (!process.env.BASE_URL) { + throw new Error("Missing BASE_URL"); + } + const openai = new OpenAI({ + baseURL: process.env.BASE_URL, + apiKey: 'ollama' + }); + const ollamaModel = model; + const completion = await openai.beta.chat.completions.parse({ + model: ollamaModel, + messages: [ + { role: "system", content: prompt }, + { role: "user", content: markdown }, + ], + response_format: zodResponseFormat(zodSchema, "event"), + }); + const event = completion.choices[0].message.parsed; + return event; +}; diff --git a/extractor/src/extractors/ollama.ts b/extractor/src/extractors/ollama.ts new file mode 100644 index 0000000..320bc20 --- /dev/null +++ b/extractor/src/extractors/ollama.ts @@ -0,0 +1,35 @@ +import OpenAI from "openai"; +import { zodResponseFormat } from "openai/helpers/zod"; +import { ZodObject, ZodRawShape } from "zod"; + +interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} + +export const ollamaExtractor = async ({ markdown, zodSchema, prompt, model }: ExtractorParams): Promise => { + if (!process.env.BASE_URL) { + throw new Error("Missing BASE_URL"); + } + + const openai = new OpenAI({ + baseURL: process.env.BASE_URL, + apiKey: 'ollama' + }); + + const ollamaModel = model; + + const completion = await openai.beta.chat.completions.parse({ + model: ollamaModel, + messages: [ + { role: "system", content: prompt }, + { role: "user", content: markdown }, + ], + response_format: zodResponseFormat(zodSchema, "event"), + }); + + const event = completion.choices[0].message.parsed; + return event; +} diff --git a/extractor/src/extractors/openAI.d.ts b/extractor/src/extractors/openAI.d.ts new file mode 100644 index 0000000..7ec05d0 --- /dev/null +++ b/extractor/src/extractors/openAI.d.ts @@ -0,0 +1,10 @@ +import { ZodObject, ZodRawShape } from "zod"; +interface ExtractorParams { + markdown: string; + zodSchema: 
ZodObject; + prompt: string; + model: string; +} +export declare const openAIExtractor: ({ markdown, zodSchema, prompt, model }: ExtractorParams) => Promise; +export {}; +//# sourceMappingURL=openAI.d.ts.map \ No newline at end of file diff --git a/extractor/src/extractors/openAI.d.ts.map b/extractor/src/extractors/openAI.d.ts.map new file mode 100644 index 0000000..ea66b3f --- /dev/null +++ b/extractor/src/extractors/openAI.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"openAI.d.ts","sourceRoot":"","sources":["openAI.ts"],"names":[],"mappings":"AAEA,OAAO,EAAK,SAAS,EAAE,WAAW,EAAE,MAAM,KAAK,CAAC;AAEhD,UAAU,eAAe;IACvB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,SAAS,CAAC,WAAW,CAAC,CAAC;IAClC,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;CACf;AAED,eAAO,MAAM,eAAe,2CAAkD,eAAe,KAAG,OAAO,CAAC,GAAG,CAmB1G,CAAA"} \ No newline at end of file diff --git a/extractor/src/extractors/openAI.js b/extractor/src/extractors/openAI.js index 62577db..ad33755 100644 --- a/extractor/src/extractors/openAI.js +++ b/extractor/src/extractors/openAI.js @@ -1,23 +1,19 @@ import OpenAI from "openai"; import { zodResponseFormat } from "openai/helpers/zod"; - export const openAIExtractor = async ({ markdown, zodSchema, prompt, model }) => { - if (!process.env.OPENAI_API_KEY) { - throw new Error("Missing OPENAI_API_KEY"); - } - - const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); - const openAiModel = model; - - const completion = await openai.beta.chat.completions.parse({ - model: openAiModel, - messages: [ - { role: "system", content: prompt }, - { role: "user", content: markdown }, - ], - response_format: zodResponseFormat(zodSchema, "event"), - }); - - const event = completion.choices[0].message.parsed; - return event; -} + if (!process.env.OPENAI_API_KEY) { + throw new Error("Missing OPENAI_API_KEY"); + } + const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); + const openAiModel = model; + const completion = await openai.beta.chat.completions.parse({ + model: openAiModel, + messages: [ + { role: "system", content: prompt }, + { role: "user", content: markdown }, + ], + response_format: zodResponseFormat(zodSchema, "event"), + }); + const event = completion.choices[0].message.parsed; + return event; +}; diff --git a/extractor/src/extractors/openAI.ts b/extractor/src/extractors/openAI.ts new file mode 100644 index 0000000..bac7ffb --- /dev/null +++ b/extractor/src/extractors/openAI.ts @@ -0,0 +1,31 @@ +import OpenAI from "openai"; +import { zodResponseFormat } from "openai/helpers/zod"; +import { z, ZodObject, ZodRawShape } from "zod"; + +interface ExtractorParams { + markdown: string; + zodSchema: ZodObject; + prompt: string; + model: string; +} + +export const openAIExtractor = async ({ markdown, zodSchema, prompt, model }: ExtractorParams): Promise => { + if (!process.env.OPENAI_API_KEY) { + throw new Error("Missing OPENAI_API_KEY"); + } + + const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY }); + const openAiModel = model; + + const completion = await openai.beta.chat.completions.parse({ + model: openAiModel, + messages: [ + { role: "system", content: prompt }, + { role: "user", content: markdown }, + ], + response_format: zodResponseFormat(zodSchema, "event"), + }); + + const event = completion.choices[0].message.parsed; + return event; +} diff --git a/extractor/src/index.js b/extractor/src/index.ts similarity index 59% rename from extractor/src/index.js rename to extractor/src/index.ts index 8c19a2d..8bbbdcd 100644 --- a/extractor/src/index.js +++ b/extractor/src/index.ts @@ -4,4 
+4,7 @@ export { templates } from './services/templates.js'; export { formatter } from './services/formatter.js'; export { convertToZodSchema } from './utils/convertToZodSchema.js'; -export { getExtractor } from './extractors/index.js'; \ No newline at end of file +export { getExtractor } from './extractors/index.js'; + +export type { SchemaField } from './utils/convertToZodSchema.js'; +export type { ExtractOptions, ExtractResult } from './services/extract.js'; diff --git a/extractor/src/prompts.js b/extractor/src/prompts.ts similarity index 89% rename from extractor/src/prompts.js rename to extractor/src/prompts.ts index 7537a3f..2fd00f2 100644 --- a/extractor/src/prompts.js +++ b/extractor/src/prompts.ts @@ -6,16 +6,16 @@ You are an expert in structured data extraction. Your task is to extract informa - **Do not use substitutes such as "unknown," "missing," or any other placeholder for missing or unknown data. The value **must** always be explicitly null. `; -export const AUTO_SCHEMA_PROMPT = (markdown) => ` +export const AUTO_SCHEMA_PROMPT = (markdown: string): string => ` Read the following markdown content and generate a schema of useful structured data that can be extracted from it. Follow these rules strictly: - The \`children\` field **must only be present if the \`type\` is \`object\` or \`array\`. It should never exist for other types. - \`description\` fields should be concise, no longer than one sentence. """${markdown}""" `; -export const INSTRUCTIONS_SCHEMA_PROMPT = (markdown, data) => ` +export const INSTRUCTIONS_SCHEMA_PROMPT = (markdown: string, data: string[]): string => ` Read the following markdown content and generate a schema for the structured data I require: """${data}""". Use only the fields listed, and follow these rules strictly: - The \`children\` field **must only be present if the \`type\` is \`object\` or \`array\`. It should never exist for other types. - \`description\` fields should be concise, no longer than one sentence. """${markdown}""" -`; \ No newline at end of file +`; diff --git a/extractor/src/services/extract.d.ts b/extractor/src/services/extract.d.ts new file mode 100644 index 0000000..f8d2095 --- /dev/null +++ b/extractor/src/services/extract.d.ts @@ -0,0 +1,24 @@ +import { SchemaField } from '../utils/convertToZodSchema.js'; +export interface ExtractOptions { + file: string; + schema?: SchemaField[]; + template?: string; + model?: string; + autoSchema?: boolean | { + instructions?: string; + }; +} +export interface ExtractResult { + success: boolean; + pages: number; + data: any; + fileName: string; + markdown: string; +} +/** + * Extracts data from a document based on a provided schema. + * @param options - Options for the extraction process. + * @returns - The result of the extraction, including pages, extracted data, and file name. 
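+ *
+ * A minimal usage sketch (the file path here is illustrative, not part of the API):
+ * @example
+ * const result = await extract({
+ *   file: "./report.pdf",
+ *   model: "gpt-4o-mini",
+ *   autoSchema: true,
+ * });
+ * console.log(result.pages, result.data);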
+ */
+export declare function extract({ file, schema, template, model, autoSchema }: ExtractOptions): Promise<ExtractResult>;
+//# sourceMappingURL=extract.d.ts.map
\ No newline at end of file
diff --git a/extractor/src/services/extract.d.ts.map b/extractor/src/services/extract.d.ts.map
new file mode 100644
index 0000000..c33ae6f
--- /dev/null
+++ b/extractor/src/services/extract.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"extract.d.ts","sourceRoot":"","sources":["extract.ts"],"names":[],"mappings":"AAGA,OAAO,EAAsB,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAMjF,MAAM,WAAW,cAAc;IAC7B,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,EAAE,CAAC;IACvB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,UAAU,CAAC,EAAE,OAAO,GAAG;QAAE,YAAY,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;CAClD;AAED,MAAM,WAAW,aAAa;IAC5B,OAAO,EAAE,OAAO,CAAC;IACjB,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,GAAG,CAAC;IACV,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED;;;;GAIG;AACH,wBAAsB,OAAO,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,KAAK,EAAE,UAAU,EAAE,EAAE,cAAc,GAAG,OAAO,CAAC,aAAa,CAAC,CA6DnH"}
\ No newline at end of file
diff --git a/extractor/src/services/extract.js b/extractor/src/services/extract.js
index add8284..1c3fee6 100644
--- a/extractor/src/services/extract.js
+++ b/extractor/src/services/extract.js
@@ -6,71 +6,63 @@ import { autogenerateSchema } from "../autoschema/autogenerateSchema.js";
 import { convertFile } from '../converter.js';
 import { BASE_EXTRACTION_PROMPT } from "../prompts.js";
 import { getExtractor } from '../extractors/index.js';
-
 /**
  * Extracts data from a document based on a provided schema.
- * @param {object} options - Options for the extraction process.
- * @param {string} options.file - The file path to the document.
- * @param {object} options.schema - The schema definition for data extraction.
- * @param {string} [options.template] - Name of a pre-defined template.
- * @param {string} [options.model] - The llm model to use if a base url is set.
- * @param {boolean | object} [options.autoSchema] - Option to auto-generate the schema.
- * @returns {Promise} - The result of the extraction, including pages, extracted data, and file name.
+ * @param options - Options for the extraction process.
+ * @returns - The result of the extraction, including pages, extracted data, and file name.
 */
 export async function extract({ file, schema, template, model, autoSchema }) {
-  try {
-
-    const defaultModel = model || "gpt-4o-mini";
-
-    if (!file) {
-      throw new Error("File is required.");
+    try {
+        const defaultModel = model || "gpt-4o-mini";
+        if (!file) {
+            throw new Error("File is required.");
+        }
+        if (!(await isValidFile(file))) {
+            throw new Error("Invalid file type.");
+        }
+        let finalSchema = null;
+        if (template) {
+            finalSchema = getTemplate(template);
+        }
+        else if (schema) {
+            const { isValid, errors } = validateSchema(schema);
+            if (!isValid) {
+                throw new Error(`Invalid schema: ${errors.join(", ")}`);
+            }
+            finalSchema = schema;
+        }
+        else if (!autoSchema) {
+            throw new Error("You must provide a schema, template, or enable autoSchema.");
+        }
+        const { markdown, totalPages, fileName } = await convertFile(file, defaultModel);
+        if (autoSchema) {
+            const autoSchemaOption = autoSchema === true ? 
true : { instructions: autoSchema.instructions || "" }; + finalSchema = await autogenerateSchema(markdown, defaultModel, autoSchemaOption); + if (!finalSchema) { + throw new Error("Failed to auto-generate schema."); + } + } + if (!finalSchema) { + throw new Error("No schema available for extraction."); + } + const dynamicZodSchema = convertToZodSchema(finalSchema); + const extraction = getExtractor(defaultModel); + const event = await extraction({ + markdown, + zodSchema: dynamicZodSchema, + prompt: BASE_EXTRACTION_PROMPT, + model: defaultModel, + }); + return { + success: true, + pages: totalPages, + data: event, + fileName, + markdown, + }; } - - if (!isValidFile(file)) { - throw new Error("Invalid file type."); + catch (error) { + console.error("Error processing document:", error); + throw new Error(`Failed to process document: ${error.message}`); } - - let finalSchema = null; - if (template) { - finalSchema = getTemplate(template); - } else if (schema) { - const { isValid, errors } = validateSchema(schema); - if (!isValid) { - throw new Error(`Invalid schema: ${errors.join(", ")}`); - } - finalSchema = schema; - } else if (!autoSchema) { - throw new Error("You must provide a schema, template, or enable autoSchema."); - } - - const { markdown, totalPages, fileName } = await convertFile(file, defaultModel); - - if (autoSchema) { - finalSchema = await autogenerateSchema(markdown, defaultModel, autoSchema); - if (!finalSchema) { - throw new Error("Failed to auto-generate schema."); - } - } - - const dynamicZodSchema = convertToZodSchema(finalSchema); - const extraction = getExtractor(defaultModel); - - const event = await extraction({ - markdown, - zodSchema: dynamicZodSchema, - prompt: BASE_EXTRACTION_PROMPT, - model: defaultModel, - }); - - return { - success: true, - pages: totalPages, - data: event, - fileName, - markdown, - }; - } catch (error) { - console.error("Error processing document:", error); - throw new Error(`Failed to process document: ${error.message}`); - } } diff --git a/extractor/src/services/extract.ts b/extractor/src/services/extract.ts new file mode 100644 index 0000000..492c0a8 --- /dev/null +++ b/extractor/src/services/extract.ts @@ -0,0 +1,110 @@ +import { isValidFile } from '../utils/fileValidator.js'; +import { validateSchema } from '../utils/schemaValidator.js'; +import { getTemplate } from './templates.js'; +import { convertToZodSchema, SchemaField } from '../utils/convertToZodSchema.js'; +import { autogenerateSchema } from "../autoschema/autogenerateSchema.js"; +import { convertFile } from '../converter.js'; +import { BASE_EXTRACTION_PROMPT } from "../prompts.js"; +import { getExtractor } from '../extractors/index.js'; + +export interface ExtractOptions { + file: string; + schema?: SchemaField[]; + template?: string; + model?: string; + autoSchema?: boolean | { instructions?: string }; + language?: string; // Language code for OCR (e.g., 'eng', 'deu', 'fra') + imageQuality?: number; // Image quality for compression (1-100, default 85) + maxImageWidth?: number; // Max width for image resizing (default 2048) +} + +export interface ExtractResult { + success: boolean; + pages: number; + data: any; + fileName: string; + markdown: string; +} + +/** + * Extracts data from a document based on a provided schema. + * @param options - Options for the extraction process. + * @returns - The result of the extraction, including pages, extracted data, and file name. 
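+ *
+ * A hypothetical instruction-based call (the file path and instruction text
+ * are placeholders):
+ * @example
+ * const result = await extract({
+ *   file: "./invoice.pdf",
+ *   autoSchema: { instructions: "Extract the invoice number and total amount" },
+ * });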
+ */ +export async function extract({ + file, + schema, + template, + model, + autoSchema, + language = "eng", + imageQuality = 85, + maxImageWidth = 2048, +}: ExtractOptions): Promise { + try { + + const defaultModel = model || "gpt-4o-mini"; + + if (!file) { + throw new Error("File is required."); + } + + if (!(await isValidFile(file))) { + throw new Error("Invalid file type."); + } + + let finalSchema: SchemaField[] | null = null; + if (template) { + finalSchema = getTemplate(template); + } else if (schema) { + const { isValid, errors } = validateSchema(schema); + if (!isValid) { + throw new Error(`Invalid schema: ${errors.join(", ")}`); + } + finalSchema = schema; + } else if (!autoSchema) { + throw new Error("You must provide a schema, template, or enable autoSchema."); + } + + const { markdown, totalPages, fileName } = await convertFile( + file, + defaultModel, + language, + imageQuality, + maxImageWidth + ); + + if (autoSchema) { + const autoSchemaOption = autoSchema === true ? true : { instructions: autoSchema.instructions || "" }; + finalSchema = await autogenerateSchema(markdown, defaultModel, autoSchemaOption); + if (!finalSchema) { + throw new Error("Failed to auto-generate schema."); + } + } + + if (!finalSchema) { + throw new Error("No schema available for extraction."); + } + + const dynamicZodSchema = convertToZodSchema(finalSchema); + const extraction = getExtractor(defaultModel); + + const event = await extraction({ + markdown, + zodSchema: dynamicZodSchema, + prompt: BASE_EXTRACTION_PROMPT, + model: defaultModel, + }); + + return { + success: true, + pages: totalPages, + data: event, + fileName, + markdown, + }; + } catch (error) { + console.error("Error processing document:", error); + throw new Error(`Failed to process document: ${(error as Error).message}`); + } +} diff --git a/extractor/src/services/formatter.d.ts b/extractor/src/services/formatter.d.ts new file mode 100644 index 0000000..e1071ad --- /dev/null +++ b/extractor/src/services/formatter.d.ts @@ -0,0 +1,13 @@ +interface FormatterOptions { + file: string; + model?: string; +} +/** + * Formatter object for various formats. + */ +export declare const formatter: { + markdown: ({ file, model }: FormatterOptions) => Promise; + plaintext: ({ file, model }: FormatterOptions) => Promise; +}; +export {}; +//# sourceMappingURL=formatter.d.ts.map \ No newline at end of file diff --git a/extractor/src/services/formatter.d.ts.map b/extractor/src/services/formatter.d.ts.map new file mode 100644 index 0000000..6cd09b4 --- /dev/null +++ b/extractor/src/services/formatter.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"formatter.d.ts","sourceRoot":"","sources":["formatter.ts"],"names":[],"mappings":"AAIA,UAAU,gBAAgB;IACxB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AA8CD;;GAEG;AACH,eAAO,MAAM,SAAS;gCA1CsB,gBAAgB,KAAG,OAAO,CAAC,MAAM,CAAC;iCA6BjC,gBAAgB,KAAG,OAAO,CAAC,MAAM,CAAC;CAgB9E,CAAC"} \ No newline at end of file diff --git a/extractor/src/services/formatter.js b/extractor/src/services/formatter.js index 07c395b..3c69578 100644 --- a/extractor/src/services/formatter.js +++ b/extractor/src/services/formatter.js @@ -1,58 +1,50 @@ import { convertFile } from '../converter.js'; import { isPdfFile } from '../utils/pdfValidator.js'; import { convertToText } from '../utils/convertToText.js'; - /** * Extracts markdown content from a PDF. - * @param {object} options - Options for extracting the markdown. - * @param {string} options.file - The PDF file. - * @param {string} [options.model] - The LLM model to use. 
- * @returns {Promise} - The markdown content. + * @param options - Options for extracting the markdown. + * @returns - The markdown content. */ const getMarkdown = async ({ file, model }) => { - try { - if (!file) { - throw new Error('File is required.'); - } - - if (!isPdfFile(file)) { - throw new Error('File must be a PDF.'); - } - - const { markdown } = await convertFile(file, model); - - if (!markdown) { - throw new Error("Failed to extract markdown."); + try { + if (!file) { + throw new Error('File is required.'); + } + const isPdf = await isPdfFile(file); + if (!isPdf) { + throw new Error('File must be a PDF.'); + } + const { markdown } = await convertFile(file, model || "gpt-4o-mini"); + if (!markdown) { + throw new Error("Failed to extract markdown."); + } + return markdown; + } + catch (error) { + console.error("Error extracting markdown:", error); + throw error; } - - return markdown; - } catch (error) { - console.error("Error extracting markdown:", error); - throw error; - } }; - /** * Extracts plain text from a PDF by converting markdown to text. - * @param {object} options - Options for extracting the plain text. - * @param {string} options.file - The path to the PDF file. - * @param {string} [options.model] - The LLM model to use. - * @returns {Promise} - The plain text content. + * @param options - Options for extracting the plain text. + * @returns - The plain text content. */ const getPlainText = async ({ file, model }) => { - try { - const markdown = await getMarkdown({ file, model }); - return convertToText(markdown); - } catch (error) { - console.error("Error extracting plain text:", error); - throw error; - } + try { + const markdown = await getMarkdown({ file, model }); + return convertToText(markdown); + } + catch (error) { + console.error("Error extracting plain text:", error); + throw error; + } }; - /** * Formatter object for various formats. */ export const formatter = { - markdown: getMarkdown, - plaintext: getPlainText, + markdown: getMarkdown, + plaintext: getPlainText, }; diff --git a/extractor/src/services/formatter.ts b/extractor/src/services/formatter.ts new file mode 100644 index 0000000..d0fbca6 --- /dev/null +++ b/extractor/src/services/formatter.ts @@ -0,0 +1,60 @@ +import { convertFile } from '../converter.js'; +import { isPdfFile } from '../utils/pdfValidator.js'; +import { convertToText } from '../utils/convertToText.js'; + +interface FormatterOptions { + file: string; + model?: string; +} + +/** + * Extracts markdown content from a PDF. + * @param options - Options for extracting the markdown. + * @returns - The markdown content. + */ +const getMarkdown = async ({ file, model }: FormatterOptions): Promise => { + try { + if (!file) { + throw new Error('File is required.'); + } + + const isPdf = await isPdfFile(file); + if (!isPdf) { + throw new Error('File must be a PDF.'); + } + + const { markdown } = await convertFile(file, model || "gpt-4o-mini"); + + if (!markdown) { + throw new Error("Failed to extract markdown."); + } + + return markdown; + } catch (error) { + console.error("Error extracting markdown:", error); + throw error; + } +}; + +/** + * Extracts plain text from a PDF by converting markdown to text. + * @param options - Options for extracting the plain text. + * @returns - The plain text content. 
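+ *
+ * Minimal sketch (the path is a placeholder; model falls back to gpt-4o-mini):
+ * @example
+ * const text = await formatter.plaintext({ file: "./manual.pdf" });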
+ */ +const getPlainText = async ({ file, model }: FormatterOptions): Promise<string> => { + try { + const markdown = await getMarkdown({ file, model }); + return convertToText(markdown); + } catch (error) { + console.error("Error extracting plain text:", error); + throw error; + } +}; + +/** + * Formatter object for various formats. + */ +export const formatter = { + markdown: getMarkdown, + plaintext: getPlainText, +}; diff --git a/extractor/src/services/templates.d.ts b/extractor/src/services/templates.d.ts new file mode 100644 index 0000000..8bb35b7 --- /dev/null +++ b/extractor/src/services/templates.d.ts @@ -0,0 +1,21 @@ +import { SchemaField } from '../utils/convertToZodSchema.js'; +/** + * Lists all available templates. + * @returns - Array of template names. + */ +export declare function listTemplates(): string[]; +/** + * Retrieves a specific template. + * @param name - The name of the template. + * @returns - The template content. + * @throws - If the template is not found. + */ +export declare function getTemplate(name: string): SchemaField[]; +/** + * Exports available templates. + */ +export declare const templates: { + list: typeof listTemplates; + get: typeof getTemplate; +}; +//# sourceMappingURL=templates.d.ts.map \ No newline at end of file diff --git a/extractor/src/services/templates.d.ts.map b/extractor/src/services/templates.d.ts.map new file mode 100644 index 0000000..de1ddea --- /dev/null +++ b/extractor/src/services/templates.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"templates.d.ts","sourceRoot":"","sources":["templates.ts"],"names":[],"mappings":"AAGA,OAAO,EAAE,WAAW,EAAE,MAAM,gCAAgC,CAAC;AAO7D;;;GAGG;AACH,wBAAgB,aAAa,IAAI,MAAM,EAAE,CAKxC;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,WAAW,EAAE,CAMvD;AAED;;GAEG;AACH,eAAO,MAAM,SAAS;;;CAGrB,CAAC"} \ No newline at end of file diff --git a/extractor/src/services/templates.js b/extractor/src/services/templates.js index dea34b5..1b6f29c 100644 --- a/extractor/src/services/templates.js +++ b/extractor/src/services/templates.js @@ -1,41 +1,36 @@ import fs from 'fs'; import * as path from 'path'; import { fileURLToPath } from 'url'; - const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); - const templatesDirectory = path.resolve(__dirname, '../templates'); - /** * Lists all available templates. - * @returns {string[]} - Array of template names. + * @returns - Array of template names. */ export function listTemplates() { - return fs - .readdirSync(templatesDirectory) - .filter((file) => file.endsWith('.json')) - .map((file) => file.replace('.json', '')); + return fs + .readdirSync(templatesDirectory) + .filter((file) => file.endsWith('.json')) + .map((file) => file.replace('.json', '')); } - /** * Retrieves a specific template. - * @param {string} name - The name of the template. - * @returns {object} - The template content. - * @throws {Error} - If the template is not found. + * @param name - The name of the template. + * @returns - The template content. + * @throws - If the template is not found.
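+ * @example + * // e.g. getTemplate("invoice") reads invoice.json from the templates directory + * // (the "invoice" name is illustrative; any <name>.json shipped there works).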
*/ export function getTemplate(name) { - const templatePath = path.join(templatesDirectory, `${name}.json`); - if (!fs.existsSync(templatePath)) { - throw new Error(`Template "${name}" not found`); - } - return JSON.parse(fs.readFileSync(templatePath, 'utf8')); + const templatePath = path.join(templatesDirectory, `${name}.json`); + if (!fs.existsSync(templatePath)) { + throw new Error(`Template "${name}" not found`); + } + return JSON.parse(fs.readFileSync(templatePath, 'utf8')); } - /** * Exports available templates. */ export const templates = { - list: listTemplates, - get: getTemplate, + list: listTemplates, + get: getTemplate, }; diff --git a/extractor/src/services/templates.ts b/extractor/src/services/templates.ts new file mode 100644 index 0000000..49c5f40 --- /dev/null +++ b/extractor/src/services/templates.ts @@ -0,0 +1,42 @@ +import fs from 'fs'; +import * as path from 'path'; +import { fileURLToPath } from 'url'; +import { SchemaField } from '../utils/convertToZodSchema.js'; + +const __filename = fileURLToPath(import.meta.url); +const __dirname = path.dirname(__filename); + +const templatesDirectory = path.resolve(__dirname, '../templates'); + +/** + * Lists all available templates. + * @returns - Array of template names. + */ +export function listTemplates(): string[] { + return fs + .readdirSync(templatesDirectory) + .filter((file) => file.endsWith('.json')) + .map((file) => file.replace('.json', '')); +} + +/** + * Retrieves a specific template. + * @param name - The name of the template. + * @returns - The template content. + * @throws - If the template is not found. + */ +export function getTemplate(name: string): SchemaField[] { + const templatePath = path.join(templatesDirectory, `${name}.json`); + if (!fs.existsSync(templatePath)) { + throw new Error(`Template "${name}" not found`); + } + return JSON.parse(fs.readFileSync(templatePath, 'utf8')); +} + +/** + * Exports available templates. 
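+ * A usage sketch: templates.list() returns the available template names and + * templates.get(name) loads one, mirroring listTemplates and getTemplate above.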
+ */ +export const templates = { + list: listTemplates, + get: getTemplate, +}; diff --git a/extractor/src/utils/convertToText.d.ts b/extractor/src/utils/convertToText.d.ts new file mode 100644 index 0000000..9623778 --- /dev/null +++ b/extractor/src/utils/convertToText.d.ts @@ -0,0 +1,2 @@ +export declare const convertToText: (markdown: string) => string; +//# sourceMappingURL=convertToText.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/convertToText.d.ts.map b/extractor/src/utils/convertToText.d.ts.map new file mode 100644 index 0000000..f216108 --- /dev/null +++ b/extractor/src/utils/convertToText.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"convertToText.d.ts","sourceRoot":"","sources":["convertToText.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,aAAa,aAAc,MAAM,KAAG,MAoB9C,CAAC"} \ No newline at end of file diff --git a/extractor/src/utils/convertToText.js b/extractor/src/utils/convertToText.js index 8366f31..65c1219 100644 --- a/extractor/src/utils/convertToText.js +++ b/extractor/src/utils/convertToText.js @@ -1,22 +1,19 @@ export const convertToText = (markdown) => { if (!markdown || typeof markdown !== "string") { - throw new Error("Valid markdown content is required."); + throw new Error("Valid markdown content is required."); } - // Strip markdown syntax and handle tables const plainText = markdown - .replace(/(\*\*|__)(.*?)\1/g, "$2") // Bold - .replace(/(\*|_)(.*?)\1/g, "$2") // Italic - .replace(/(#+\s)/g, "") // Headings - .replace(/\[(.*?)\]\(.*?\)/g, "$1") // Links - .replace(/!\[(.*?)\]\(.*?\)/g, "$1") // Images - .replace(/(```.*?\n[\s\S]*?\n```|`.*?`)/g, "") // Code blocks/inline - .replace(/>+/g, "") // Blockquotes - .replace(/\n{2,}/g, "\n") // Excess newlines - .replace(/\|([^|]*)\|/g, (_, row) => row.trim()) // Table rows - .replace(/-+/g, "") // Table dividers (---|---) - .trim(); - + .replace(/(\*\*|__)(.*?)\1/g, "$2") // Bold + .replace(/(\*|_)(.*?)\1/g, "$2") // Italic + .replace(/(#+\s)/g, "") // Headings + .replace(/\[(.*?)\]\(.*?\)/g, "$1") // Links + .replace(/!\[(.*?)\]\(.*?\)/g, "$1") // Images + .replace(/(```.*?\n[\s\S]*?\n```|`.*?`)/g, "") // Code blocks/inline + .replace(/>+/g, "") // Blockquotes + .replace(/\n{2,}/g, "\n") // Excess newlines + .replace(/\|([^|]*)\|/g, (_, row) => row.trim()) // Table rows + .replace(/-+/g, "") // Table dividers (---|---) + .trim(); return plainText; - }; - \ No newline at end of file +}; diff --git a/extractor/src/utils/convertToText.ts b/extractor/src/utils/convertToText.ts new file mode 100644 index 0000000..7b960e0 --- /dev/null +++ b/extractor/src/utils/convertToText.ts @@ -0,0 +1,21 @@ +export const convertToText = (markdown: string): string => { + if (!markdown || typeof markdown !== "string") { + throw new Error("Valid markdown content is required."); + } + + // Strip markdown syntax and handle tables + const plainText = markdown + .replace(/(\*\*|__)(.*?)\1/g, "$2") // Bold + .replace(/(\*|_)(.*?)\1/g, "$2") // Italic + .replace(/(#+\s)/g, "") // Headings + .replace(/\[(.*?)\]\(.*?\)/g, "$1") // Links + .replace(/!\[(.*?)\]\(.*?\)/g, "$1") // Images + .replace(/(```.*?\n[\s\S]*?\n```|`.*?`)/g, "") // Code blocks/inline + .replace(/>+/g, "") // Blockquotes + .replace(/\n{2,}/g, "\n") // Excess newlines + .replace(/\|([^|]*)\|/g, (_, row) => row.trim()) // Table rows + .replace(/-+/g, "") // Table dividers (---|---) + .trim(); + + return plainText; + }; diff --git a/extractor/src/utils/convertToZodSchema.d.ts b/extractor/src/utils/convertToZodSchema.d.ts new file mode 100644 index 
0000000..175b13d --- /dev/null +++ b/extractor/src/utils/convertToZodSchema.d.ts @@ -0,0 +1,15 @@ +import { z, ZodTypeAny } from 'zod'; +export interface SchemaField { + name: string; + type: 'string' | 'number' | 'boolean' | 'enum' | 'object' | 'array'; + description?: string; + values?: string[]; + children?: SchemaField[]; +} +/** + * Converts an array of field definitions into a Zod schema. + * @param object - Array of field definitions. + * @returns - A Zod object schema. + */ +export declare const convertToZodSchema: (object: SchemaField[]) => z.ZodObject<Record<string, ZodTypeAny>>; +//# sourceMappingURL=convertToZodSchema.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/convertToZodSchema.d.ts.map b/extractor/src/utils/convertToZodSchema.d.ts.map new file mode 100644 index 0000000..cfa4634 --- /dev/null +++ b/extractor/src/utils/convertToZodSchema.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"convertToZodSchema.d.ts","sourceRoot":"","sources":["convertToZodSchema.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,UAAU,EAAE,MAAM,KAAK,CAAC;AAEpC,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,QAAQ,GAAG,QAAQ,GAAG,SAAS,GAAG,MAAM,GAAG,QAAQ,GAAG,OAAO,CAAC;IACpE,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,MAAM,CAAC,EAAE,MAAM,EAAE,CAAC;IAClB,QAAQ,CAAC,EAAE,WAAW,EAAE,CAAC;CAC1B;AAED;;;;GAIG;AACH,eAAO,MAAM,kBAAkB,WAAY,WAAW,EAAE,KAAG,CAAC,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,EAAE,UAAU,CAAC,CAsEhG,CAAC"} \ No newline at end of file diff --git a/extractor/src/utils/convertToZodSchema.js b/extractor/src/utils/convertToZodSchema.js index 8f8f4f0..70a6e8d 100644 --- a/extractor/src/utils/convertToZodSchema.js +++ b/extractor/src/utils/convertToZodSchema.js @@ -1,78 +1,66 @@ import { z } from 'zod'; - /** * Converts an array of field definitions into a Zod schema. - * @param {Array} object - Array of field definitions. - * @returns {ZodObject} - A Zod object schema. + * @param object - Array of field definitions. + * @returns - A Zod object schema.
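+ * @example + * // Illustrative field definition (not part of this diff): + * // convertToZodSchema([{ name: "total", type: "number", description: "Invoice total" }]) + * // builds the equivalent of z.object({ total: z.number().describe("Invoice total") })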
*/ export const convertToZodSchema = (object) => { - const createZodSchema = (fields) => { - const schema = {}; - - fields.forEach((item) => { - let zodType; - - switch (item.type) { - case 'string': - zodType = z.string(); - break; - - case 'number': - zodType = z.number(); - break; - - case 'boolean': - zodType = z.boolean(); - break; - - case 'enum': - if (item.values && Array.isArray(item.values)) { - zodType = z.enum(item.values); - } else { - throw new Error(`Invalid "enum" type definition for ${item.name}`); - } - break; - - case 'object': - if (item.children) { - zodType = z.object(createZodSchema(item.children)); - } else { - throw new Error(`Invalid "object" type definition for ${item.name}`); - } - break; - - case 'array': - if (item.children && item.children.length > 0) { - if (item.children.length === 1) { - const singleChild = item.children[0]; - const childSchema = createZodSchema([singleChild]); - const childType = childSchema[singleChild.name]; - - zodType = z.array(childType); - } else { - const arraySchema = createZodSchema(item.children); - zodType = z.array(z.object(arraySchema)); + const createZodSchema = (fields) => { + const schema = {}; + fields.forEach((item) => { + let zodType; + switch (item.type) { + case 'string': + zodType = z.string(); + break; + case 'number': + zodType = z.number(); + break; + case 'boolean': + zodType = z.boolean(); + break; + case 'enum': + if (item.values && Array.isArray(item.values) && item.values.length > 0) { + zodType = z.enum(item.values); + } + else { + throw new Error(`Invalid "enum" type definition for ${item.name}`); + } + break; + case 'object': + if (item.children) { + zodType = z.object(createZodSchema(item.children)); + } + else { + throw new Error(`Invalid "object" type definition for ${item.name}`); + } + break; + case 'array': + if (item.children && item.children.length > 0) { + if (item.children.length === 1) { + const singleChild = item.children[0]; + const childSchema = createZodSchema([singleChild]); + const childType = childSchema[singleChild.name]; + zodType = z.array(childType); + } + else { + const arraySchema = createZodSchema(item.children); + zodType = z.array(z.object(arraySchema)); + } + } + else { + throw new Error(`Invalid or unsupported "array" type definition for ${item.name}`); + } + break; + default: + throw new Error(`Unsupported type "${item.type}" for field ${item.name}`); } - } else { - throw new Error( - `Invalid or unsupported "array" type definition for ${item.name}` - ); - } - break; - - default: - throw new Error(`Unsupported type "${item.type}" for field ${item.name}`); - } - - if (item.description) { - zodType = zodType.describe(item.description); - } - - schema[item.name] = zodType; - }); - - return schema; - }; - - return z.object(createZodSchema(object)); + if (item.description) { + zodType = zodType.describe(item.description); + } + schema[item.name] = zodType; + }); + return schema; + }; + return z.object(createZodSchema(object)); }; diff --git a/extractor/src/utils/convertToZodSchema.ts b/extractor/src/utils/convertToZodSchema.ts new file mode 100644 index 0000000..b77e9be --- /dev/null +++ b/extractor/src/utils/convertToZodSchema.ts @@ -0,0 +1,86 @@ +import { z, ZodTypeAny } from 'zod'; + +export interface SchemaField { + name: string; + type: 'string' | 'number' | 'boolean' | 'enum' | 'object' | 'array'; + description?: string; + values?: string[]; + children?: SchemaField[]; +} + +/** + * Converts an array of field definitions into a Zod schema. 
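+ * Nested "object" and "array" types recurse through createZodSchema below; an array + * with a single child becomes an array of that child's type, otherwise an array of objects.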
+ * @param object - Array of field definitions. + * @returns - A Zod object schema. + */ +export const convertToZodSchema = (object: SchemaField[]): z.ZodObject<Record<string, ZodTypeAny>> => { + const createZodSchema = (fields: SchemaField[]): Record<string, ZodTypeAny> => { + const schema: Record<string, ZodTypeAny> = {}; + + fields.forEach((item) => { + let zodType: ZodTypeAny; + + switch (item.type) { + case 'string': + zodType = z.string(); + break; + + case 'number': + zodType = z.number(); + break; + + case 'boolean': + zodType = z.boolean(); + break; + + case 'enum': + if (item.values && Array.isArray(item.values) && item.values.length > 0) { + zodType = z.enum(item.values as [string, ...string[]]); + } else { + throw new Error(`Invalid "enum" type definition for ${item.name}`); + } + break; + + case 'object': + if (item.children) { + zodType = z.object(createZodSchema(item.children)); + } else { + throw new Error(`Invalid "object" type definition for ${item.name}`); + } + break; + + case 'array': + if (item.children && item.children.length > 0) { + if (item.children.length === 1) { + const singleChild = item.children[0]; + const childSchema = createZodSchema([singleChild]); + const childType = childSchema[singleChild.name]; + + zodType = z.array(childType); + } else { + const arraySchema = createZodSchema(item.children); + zodType = z.array(z.object(arraySchema)); + } + } else { + throw new Error( + `Invalid or unsupported "array" type definition for ${item.name}` + ); + } + break; + + default: + throw new Error(`Unsupported type "${item.type}" for field ${item.name}`); + } + + if (item.description) { + zodType = zodType.describe(item.description); + } + + schema[item.name] = zodType; + }); + + return schema; + }; + + return z.object(createZodSchema(object)); +}; diff --git a/extractor/src/utils/fileValidator.d.ts b/extractor/src/utils/fileValidator.d.ts new file mode 100644 index 0000000..efeb781 --- /dev/null +++ b/extractor/src/utils/fileValidator.d.ts @@ -0,0 +1,7 @@ +/** + * Function to check if a file is valid based on its URL or MIME type + * @param file - The URL to the file + * @returns - Resolves to true if the file is valid, false otherwise + */ +export declare function isValidFile(file: string): Promise<boolean>; +//# sourceMappingURL=fileValidator.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/fileValidator.d.ts.map b/extractor/src/utils/fileValidator.d.ts.map new file mode 100644 index 0000000..886cd87 --- /dev/null +++ b/extractor/src/utils/fileValidator.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"fileValidator.d.ts","sourceRoot":"","sources":["fileValidator.ts"],"names":[],"mappings":"AAEA;;;;GAIG;AACH,wBAAsB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CA4BhE"} \ No newline at end of file diff --git a/extractor/src/utils/fileValidator.js b/extractor/src/utils/fileValidator.js index 65fd347..d311027 100644 --- a/extractor/src/utils/fileValidator.js +++ b/extractor/src/utils/fileValidator.js @@ -1,9 +1,8 @@ import axios from 'axios'; - /** * Function to check if a file is valid based on its URL or MIME type - * @param {string} file - The URL to the file - * @returns {Promise<boolean>} - Resolves to true if the file is valid, false otherwise + * @param file - The URL to the file + * @returns - Resolves to true if the file is valid, false otherwise */ export async function isValidFile(file) { const allowedExtensions = ['pdf', 'png', 'jpg', 'jpeg', 'txt', 'docx', 'html']; @@ -16,20 +15,18 @@ export async function isValidFile(file) { docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', html:
'text/html', }; - const urlPath = new URL(file).pathname; const extensionRegex = new RegExp(`\\.(${allowedExtensions.join('|')})$`, 'i'); - if (!extensionRegex.test(urlPath)) { return false; } - // Optional: Check the MIME type if query parameters are used try { const response = await axios.head(file); const contentType = response.headers['content-type']; return Object.values(allowedMimeTypes).some(mime => contentType.startsWith(mime)); - } catch (error) { + } + catch (error) { console.error('Error checking MIME type:', error); return false; } diff --git a/extractor/src/utils/fileValidator.ts b/extractor/src/utils/fileValidator.ts new file mode 100644 index 0000000..83b6945 --- /dev/null +++ b/extractor/src/utils/fileValidator.ts @@ -0,0 +1,57 @@ +import axios from 'axios'; +import path from 'path'; +import fs from 'fs-extra'; + +/** + * Function to check if a file is valid based on its URL or file path + * @param file - The URL or file path to the file + * @returns - Resolves to true if the file is valid, false otherwise + */ +export async function isValidFile(file: string): Promise<boolean> { + const allowedExtensions = ['pdf', 'png', 'jpg', 'jpeg', 'txt', 'docx', 'html']; + const allowedMimeTypes: Record<string, string> = { + pdf: 'application/pdf', + png: 'image/png', + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + txt: 'text/plain', + docx: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + html: 'text/html', + }; + + // Check if it's a local file path + if (!file.startsWith('http://') && !file.startsWith('https://')) { + // Handle local file path + const extension = path.extname(file).substring(1).toLowerCase(); + + if (!allowedExtensions.includes(extension)) { + return false; + } + + // Check if file exists + try { + await fs.access(file); + return true; + } catch { + return false; + } + } + + // Handle URL + const urlPath = new URL(file).pathname; + const extensionRegex = new RegExp(`\\.(${allowedExtensions.join('|')})$`, 'i'); + + if (!extensionRegex.test(urlPath)) { + return false; + } + + // Optional: Check the MIME type if query parameters are used + try { + const response = await axios.head(file); + const contentType = response.headers['content-type']; + return Object.values(allowedMimeTypes).some(mime => contentType.startsWith(mime)); + } catch (error) { + console.error('Error checking MIME type:', error); + return false; + } +} diff --git a/extractor/src/utils/generateMarkdown.d.ts b/extractor/src/utils/generateMarkdown.d.ts new file mode 100644 index 0000000..c70d6cd --- /dev/null +++ b/extractor/src/utils/generateMarkdown.d.ts @@ -0,0 +1,8 @@ +interface Page { + content: string; + page: number; + contentLength: number; +} +export declare const generateMarkdownDocument: (pages: Page[]) => Promise<string>; +export {}; +//# sourceMappingURL=generateMarkdown.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/generateMarkdown.d.ts.map b/extractor/src/utils/generateMarkdown.d.ts.map new file mode 100644 index 0000000..cd07449 --- /dev/null +++ b/extractor/src/utils/generateMarkdown.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"generateMarkdown.d.ts","sourceRoot":"","sources":["generateMarkdown.ts"],"names":[],"mappings":"AAAA,UAAU,IAAI;IACZ,OAAO,EAAE,MAAM,CAAC;IAChB,IAAI,EAAE,MAAM,CAAC;IACb,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,eAAO,MAAM,wBAAwB,UAAiB,IAAI,EAAE,KAAG,OAAO,CAAC,MAAM,CAW5E,CAAC"} \ No newline at end of file diff --git a/extractor/src/utils/generateMarkdown.js b/extractor/src/utils/generateMarkdown.js index 91a9a84..a32a148 100644 ---
a/extractor/src/utils/generateMarkdown.js +++ b/extractor/src/utils/generateMarkdown.js @@ -1,12 +1,12 @@ export const generateMarkdownDocument = async (pages) => { - try { - // Combine all markdown pages into a single string - const markdownContent = pages.map((page) => page.content).join("\n\n---\n\n"); - - // Return the combined markdown string directly - return markdownContent; - } catch (error) { - console.error('Error generating markdown:', error); - throw error; - } + try { + // Combine all markdown pages into a single string + const markdownContent = pages.map((page) => page.content).join("\n\n---\n\n"); + // Return the combined markdown string directly + return markdownContent; + } + catch (error) { + console.error('Error generating markdown:', error); + throw error; + } }; diff --git a/extractor/src/utils/generateMarkdown.ts b/extractor/src/utils/generateMarkdown.ts new file mode 100644 index 0000000..ba5eb78 --- /dev/null +++ b/extractor/src/utils/generateMarkdown.ts @@ -0,0 +1,18 @@ +interface Page { + content: string; + page: number; + contentLength: number; +} + +export const generateMarkdownDocument = async (pages: Page[]): Promise<string> => { + try { + // Combine all markdown pages into a single string + const markdownContent = pages.map((page) => page.content).join("\n\n---\n\n"); + + // Return the combined markdown string directly + return markdownContent; + } catch (error) { + console.error('Error generating markdown:', error); + throw error; + } +}; diff --git a/extractor/src/utils/pdfValidator.d.ts b/extractor/src/utils/pdfValidator.d.ts new file mode 100644 index 0000000..92c6319 --- /dev/null +++ b/extractor/src/utils/pdfValidator.d.ts @@ -0,0 +1,7 @@ +/** + * Function to check if a file is a PDF based on its URL or MIME type + * @param file - The URL to the file + * @returns - Resolves to true if the file is a PDF, false otherwise + */ +export declare function isPdfFile(file: string): Promise<boolean>; +//# sourceMappingURL=pdfValidator.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/pdfValidator.d.ts.map b/extractor/src/utils/pdfValidator.d.ts.map new file mode 100644 index 0000000..ef2e75d --- /dev/null +++ b/extractor/src/utils/pdfValidator.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"pdfValidator.d.ts","sourceRoot":"","sources":["pdfValidator.ts"],"names":[],"mappings":"AAEA;;;;GAIG;AACH,wBAAsB,SAAS,CAAC,IAAI,EAAE,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,CAgB9D"} \ No newline at end of file diff --git a/extractor/src/utils/pdfValidator.js b/extractor/src/utils/pdfValidator.js index f1dc67e..4d13190 100644 --- a/extractor/src/utils/pdfValidator.js +++ b/extractor/src/utils/pdfValidator.js @@ -1,9 +1,8 @@ import axios from 'axios'; - /** * Function to check if a file is a PDF based on its URL or MIME type - * @param {string} file - The URL to the file - * @returns {Promise<boolean>} - Resolves to true if the file is a PDF, false otherwise + * @param file - The URL to the file + * @returns - Resolves to true if the file is a PDF, false otherwise */ export async function isPdfFile(file) { const urlPath = new URL(file).pathname; @@ -11,13 +10,13 @@ export async function isPdfFile(file) { if (pdfExtensionRegex.test(urlPath)) { return true; } - // Optional: Check the MIME type if query parameters are used try { const response = await axios.head(file); const contentType = response.headers['content-type']; return contentType === 'application/pdf'; - } catch (error) { + } + catch (error) { console.error('Error checking MIME type:', error); return false; } diff --git
a/extractor/src/utils/pdfValidator.ts b/extractor/src/utils/pdfValidator.ts new file mode 100644 index 0000000..9d65cf4 --- /dev/null +++ b/extractor/src/utils/pdfValidator.ts @@ -0,0 +1,24 @@ +import axios from 'axios'; + +/** + * Function to check if a file is a PDF based on its URL or MIME type + * @param file - The URL to the file + * @returns - Resolves to true if the file is a PDF, false otherwise + */ +export async function isPdfFile(file: string): Promise<boolean> { + const urlPath = new URL(file).pathname; + const pdfExtensionRegex = /\.pdf$/i; + if (pdfExtensionRegex.test(urlPath)) { + return true; + } + + // Optional: Check the MIME type if query parameters are used + try { + const response = await axios.head(file); + const contentType = response.headers['content-type']; + return contentType === 'application/pdf'; + } catch (error) { + console.error('Error checking MIME type:', error); + return false; + } +} diff --git a/extractor/src/utils/schemaValidator.d.ts b/extractor/src/utils/schemaValidator.d.ts new file mode 100644 index 0000000..7a507fe --- /dev/null +++ b/extractor/src/utils/schemaValidator.d.ts @@ -0,0 +1,13 @@ +import { SchemaField } from './convertToZodSchema.js'; +interface ValidationResult { + isValid: boolean; + errors: string[]; +} +/** + * Validates the schema format to ensure it meets the required structure. + * @param schema - The schema to validate. + * @returns - { isValid: boolean, errors: Array } + */ +export declare function validateSchema(schema: SchemaField[]): ValidationResult; +export {}; +//# sourceMappingURL=schemaValidator.d.ts.map \ No newline at end of file diff --git a/extractor/src/utils/schemaValidator.d.ts.map b/extractor/src/utils/schemaValidator.d.ts.map new file mode 100644 index 0000000..62e2fe7 --- /dev/null +++ b/extractor/src/utils/schemaValidator.d.ts.map @@ -0,0 +1 @@ +{"version":3,"file":"schemaValidator.d.ts","sourceRoot":"","sources":["schemaValidator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,MAAM,yBAAyB,CAAC;AAEtD,UAAU,gBAAgB;IACxB,OAAO,EAAE,OAAO,CAAC;IACjB,MAAM,EAAE,MAAM,EAAE,CAAC;CAClB;AAED;;;;GAIG;AACH,wBAAgB,cAAc,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,gBAAgB,CAyDtE"} \ No newline at end of file diff --git a/extractor/src/utils/schemaValidator.js b/extractor/src/utils/schemaValidator.js index 5b7cd00..330fedb 100644 --- a/extractor/src/utils/schemaValidator.js +++ b/extractor/src/utils/schemaValidator.js @@ -1,64 +1,59 @@ /** * Validates the schema format to ensure it meets the required structure. - * @param {Array} schema - The schema to validate. - * @returns {Object} - { isValid: boolean, errors: Array } + * @param schema - The schema to validate.
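+ * (Every field must carry a name, a supported type and a description; enum values + * and object/array children are checked recursively via validateField below.)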
+ * @returns - { isValid: boolean, errors: Array } */ - export function validateSchema(schema) { - const validTypes = ["string", "number", "array", "object", "boolean", "enum"]; - let errors = []; - - if (!Array.isArray(schema)) { - errors.push("Schema must be an array."); - return { isValid: false, errors }; - } - - function validateField(field, path) { - if (!field.hasOwnProperty("name") || typeof field.name !== "string" || field.name.trim() === "") { - errors.push(`"name" is required and should be a non-empty string at ${path}`); + const validTypes = ["string", "number", "array", "object", "boolean", "enum"]; + let errors = []; + if (!Array.isArray(schema)) { + errors.push("Schema must be an array."); + return { isValid: false, errors }; } - - if (!field.hasOwnProperty("type") || !validTypes.includes(field.type)) { - errors.push(`"type" is required and must be one of ${validTypes.join(", ")} at ${path}`); + function validateField(field, path) { + if (!field.hasOwnProperty("name") || typeof field.name !== "string" || field.name.trim() === "") { + errors.push(`"name" is required and should be a non-empty string at ${path}`); + } + if (!field.hasOwnProperty("type") || !validTypes.includes(field.type)) { + errors.push(`"type" is required and must be one of ${validTypes.join(", ")} at ${path}`); + } + if (!field.hasOwnProperty("description") || typeof field.description !== "string" || field.description.trim() === "") { + errors.push(`"description" is required and should be a non-empty string at ${path}`); + } + if (field.type === "enum") { + if (!field.hasOwnProperty("values") || !Array.isArray(field.values) || field.values.length === 0) { + errors.push(`"values" is required and must be a non-empty array for enum "${field.name}" at ${path}`); + } + else if (!field.values.every((value) => typeof value === "string")) { + errors.push(`"values" for enum "${field.name}" at ${path} must be an array of strings`); + } + } + if (field.type === "array") { + if (!field.hasOwnProperty("children")) { + errors.push(`"children" property is required for array "${field.name}" at ${path}`); + } + else if (!Array.isArray(field.children) || field.children.length === 0) { + errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); + } + else { + field.children.forEach((child, index) => validateField(child, `${path}.children[${index}]`)); + } + } + if (field.type === "object") { + if (!field.hasOwnProperty("children")) { + errors.push(`"children" property is required for object "${field.name}" at ${path}`); + } + else if (!Array.isArray(field.children) || field.children.length === 0) { + errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); + } + else { + field.children.forEach((child, index) => validateField(child, `${path}.children[${index}]`)); + } + } } - - if (!field.hasOwnProperty("description") || typeof field.description !== "string" || field.description.trim() === "") { - errors.push(`"description" is required and should be a non-empty string at ${path}`); - } - - if (field.type === "enum") { - if (!field.hasOwnProperty("values") || !Array.isArray(field.values) || field.values.length === 0) { - errors.push(`"values" is required and must be a non-empty array for enum "${field.name}" at ${path}`); - } else if (!field.values.every((value) => typeof value === "string")) { - errors.push(`"values" for enum "${field.name}" at ${path} must be an array of strings`); - } - } - - if (field.type === "array") { - if (!field.hasOwnProperty("children")) { - 
errors.push(`"children" property is required for array "${field.name}" at ${path}`); - } else if (!Array.isArray(field.children) || field.children.length === 0) { - errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); - } else { - field.children.forEach((child, index) => validateField(child, `${path}.children[${index}]`)); - } - } - - if (field.type === "object") { - if (!field.hasOwnProperty("children")) { - errors.push(`"children" property is required for object "${field.name}" at ${path}`); - } else if (!Array.isArray(field.children) || field.children.length === 0) { - errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); - } else { - field.children.forEach((child, index) => validateField(child, `${path}.children[${index}]`)); - } - } - } - - schema.forEach((field, index) => validateField(field, `schema[${index}]`)); - - return { - isValid: errors.length === 0, - errors, - }; -} \ No newline at end of file + schema.forEach((field, index) => validateField(field, `schema[${index}]`)); + return { + isValid: errors.length === 0, + errors, + }; +} diff --git a/extractor/src/utils/schemaValidator.ts b/extractor/src/utils/schemaValidator.ts new file mode 100644 index 0000000..3dd462a --- /dev/null +++ b/extractor/src/utils/schemaValidator.ts @@ -0,0 +1,70 @@ +import { SchemaField } from './convertToZodSchema.js'; + +interface ValidationResult { + isValid: boolean; + errors: string[]; +} + +/** + * Validates the schema format to ensure it meets the required structure. + * @param schema - The schema to validate. + * @returns - { isValid: boolean, errors: Array } + */ +export function validateSchema(schema: SchemaField[]): ValidationResult { + const validTypes = ["string", "number", "array", "object", "boolean", "enum"]; + let errors: string[] = []; + + if (!Array.isArray(schema)) { + errors.push("Schema must be an array."); + return { isValid: false, errors }; + } + + function validateField(field: any, path: string): void { + if (!field.hasOwnProperty("name") || typeof field.name !== "string" || field.name.trim() === "") { + errors.push(`"name" is required and should be a non-empty string at ${path}`); + } + + if (!field.hasOwnProperty("type") || !validTypes.includes(field.type)) { + errors.push(`"type" is required and must be one of ${validTypes.join(", ")} at ${path}`); + } + + if (!field.hasOwnProperty("description") || typeof field.description !== "string" || field.description.trim() === "") { + errors.push(`"description" is required and should be a non-empty string at ${path}`); + } + + if (field.type === "enum") { + if (!field.hasOwnProperty("values") || !Array.isArray(field.values) || field.values.length === 0) { + errors.push(`"values" is required and must be a non-empty array for enum "${field.name}" at ${path}`); + } else if (!field.values.every((value: any) => typeof value === "string")) { + errors.push(`"values" for enum "${field.name}" at ${path} must be an array of strings`); + } + } + + if (field.type === "array") { + if (!field.hasOwnProperty("children")) { + errors.push(`"children" property is required for array "${field.name}" at ${path}`); + } else if (!Array.isArray(field.children) || field.children.length === 0) { + errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); + } else { + field.children.forEach((child: any, index: number) => validateField(child, `${path}.children[${index}]`)); + } + } + + if (field.type === "object") { + if (!field.hasOwnProperty("children")) { + 
errors.push(`"children" property is required for object "${field.name}" at ${path}`); + } else if (!Array.isArray(field.children) || field.children.length === 0) { + errors.push(`"children" must be a non-empty array for "${field.name}" at ${path}`); + } else { + field.children.forEach((child: any, index: number) => validateField(child, `${path}.children[${index}]`)); + } + } + } + + schema.forEach((field, index) => validateField(field, `schema[${index}]`)); + + return { + isValid: errors.length === 0, + errors, + }; +} diff --git a/extractor/tsconfig.json b/extractor/tsconfig.json new file mode 100644 index 0000000..42e9d4b --- /dev/null +++ b/extractor/tsconfig.json @@ -0,0 +1,27 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "ES2020", + "moduleResolution": "node", + "declaration": true, + "declarationMap": true, + "outDir": "./dist", + "rootDir": "./src", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "allowSyntheticDefaultImports": true, + "composite": true, + "baseUrl": ".", + "paths": { + "core": ["../core/src/index.ts"] + } + }, + "include": ["src/**/*"], + "exclude": ["node_modules", "dist"], + "references": [ + { "path": "../core" } + ] +} diff --git a/package-lock.json b/package-lock.json index c5db782..4565ea4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,10 +10,11 @@ "license": "AGPL-3.0", "workspaces": [ "core", - "extractor" + "extractor", + "cli" ], "dependencies": { - "axios": "^1.7.7", + "axios": "^1.12.2", "child_process": "^1.0.2", "core": "file:./core", "dotenv": "^16.4.5", @@ -33,6 +34,41 @@ "zod": "^3.23.8" } }, + "cli": { + "version": "1.0.0", + "dependencies": { + "chalk": "^5.3.0", + "commander": "^12.0.0", + "dotenv": "^16.4.5", + "extractor": "*", + "ora": "^8.0.1" + }, + "bin": { + "documind": "dist/index.js" + }, + "devDependencies": { + "@types/node": "^20.14.11", + "tsc-alias": "^1.8.8", + "typescript": "^5.6.3" + } + }, + "cli/node_modules/@types/node": { + "version": "20.19.20", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.20.tgz", + "integrity": "sha512-2Q7WS25j4pS1cS8yw3d6buNCVJukOTeQ39bAnwR6sOJbaxvyCGebzTMypDFN82CxBLnl+lSWVdCCWbRY6y9yZQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" + } + }, + "cli/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": true, + "license": "MIT" + }, "core": { "version": "1.0.0", "dependencies": { @@ -85,9 +121,30 @@ "zod-to-json-schema": "^3.24.2" }, "devDependencies": { - "nodemon": "^3.1.7" + "@types/node": "^20.14.11", + "@types/uuid": "^11.0.0", + "nodemon": "^3.1.7", + "tsc-alias": "^1.8.8", + "typescript": "^5.6.3" + } + }, + "extractor/node_modules/@types/node": { + "version": "20.19.20", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.19.20.tgz", + "integrity": "sha512-2Q7WS25j4pS1cS8yw3d6buNCVJukOTeQ39bAnwR6sOJbaxvyCGebzTMypDFN82CxBLnl+lSWVdCCWbRY6y9yZQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~6.21.0" } }, + "extractor/node_modules/undici-types": { + "version": "6.21.0", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-6.21.0.tgz", + "integrity": "sha512-iwDZqg0QAGrg9Rav5H4n0M64c3mkR59cJ6wQp+7C4nI0gsmExaedaYLNO44eT4AtBBwjbTiGPMlt2Md0T9H9JQ==", + "dev": 
true, + "license": "MIT" + }, "node_modules/@ai-sdk/google": { "version": "1.1.14", "resolved": "https://registry.npmjs.org/@ai-sdk/google/-/google-1.1.14.tgz", @@ -575,6 +632,44 @@ "url": "https://opencollective.com/libvips" } }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, "node_modules/@opentelemetry/api": { "version": "1.9.0", "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", @@ -655,6 +750,17 @@ "form-data": "^4.0.0" } }, + "node_modules/@types/uuid": { + "version": "11.0.0", + "resolved": "https://registry.npmjs.org/@types/uuid/-/uuid-11.0.0.tgz", + "integrity": "sha512-HVyk8nj2m+jcFRNazzqyVKiZezyhDKrGUA3jlEcg/nZ6Ms+qHwocba1Y/AaVaznJTAM9xpdFSh+ptbNrhOGvZA==", + "deprecated": "This is a stub types definition. 
uuid provides its own type definitions, so you do not need this installed.", + "dev": true, + "license": "MIT", + "dependencies": { + "uuid": "*" + } + }, "node_modules/abort-controller": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz", @@ -708,6 +814,18 @@ } } }, + "node_modules/ansi-regex": { + "version": "6.2.2", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-6.2.2.tgz", + "integrity": "sha512-Bq3SmSpyFHaWjPk8If9yc6svM8c56dB5BAtW4Qbw5jHTwwXXcTLoRMkpDJp6VL0XzlWaCHTXrkFURMYmD0sLqg==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/ansi-regex?sponsor=1" + } + }, "node_modules/anymatch": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", @@ -734,6 +852,16 @@ "integrity": "sha512-L0XlBwfx9QetHOsbLDrE/vh2t018w9462HM3iaFfxRiK83aJjAt/Ja3NMkOW7FICwWTlQBa3ZbL5FKhuQWkDrg==", "license": "MIT" }, + "node_modules/array-union": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/array-union/-/array-union-2.1.0.tgz", + "integrity": "sha512-HGyxoOTYUyCM6stUe6EJgnd4EoewAI7zMdfqO+kGjnlZmBDz/cR5pf8r/cR4Wq60sL/p0IkcjUEEPwS3GFrIyw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/async": { "version": "3.2.6", "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", @@ -762,13 +890,13 @@ } }, "node_modules/axios": { - "version": "1.9.0", - "resolved": "https://registry.npmjs.org/axios/-/axios-1.9.0.tgz", - "integrity": "sha512-re4CqKTJaURpzbLHtIi6XpDv20/CnpXOtjRY5/CU32L8gU8ek9UIivcfvSWvmKEngmVbrUtPpdDwWDWL7DNHvg==", + "version": "1.12.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.12.2.tgz", + "integrity": "sha512-vMJzPewAlRyOgxV2dU0Cuz2O8zzzx9VYtbJOaBgXFeLc4IV/Eg50n4LowmehOOR61S8ZMpc2K5Sa7g6A4jfkUw==", "license": "MIT", "dependencies": { "follow-redirects": "^1.15.6", - "form-data": "^4.0.0", + "form-data": "^4.0.4", "proxy-from-env": "^1.1.0" } }, @@ -830,6 +958,19 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/call-bind-apply-helpers": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz", + "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, "node_modules/chalk": { "version": "5.4.1", "resolved": "https://registry.npmjs.org/chalk/-/chalk-5.4.1.tgz", @@ -873,6 +1014,37 @@ "fsevents": "~2.3.2" } }, + "node_modules/cli": { + "resolved": "cli", + "link": true + }, + "node_modules/cli-cursor": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-5.0.0.tgz", + "integrity": "sha512-aCj4O5wKyszjMmDT4tZj93kxyydN/K5zPWSCe6/0AV/AA1pqe5ZBIw0a2ZfPQV7lL5/yb5HsUreJ6UFAF1tEQw==", + "license": "MIT", + "dependencies": { + "restore-cursor": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/cli-spinners": { + "version": "2.9.2", + "resolved": "https://registry.npmjs.org/cli-spinners/-/cli-spinners-2.9.2.tgz", + "integrity": "sha512-ywqV+5MmyL4E7ybXgKys4DugZbX0FC6LnwrhjuykIjnK9k8OQacQ7axGKnjDXWNhns0xot3bZI5h55H8yo9cJg==", + "license": "MIT", + "engines": { + "node": ">=6" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + 
}, "node_modules/color": { "version": "4.2.3", "resolved": "https://registry.npmjs.org/color/-/color-4.2.3.tgz", @@ -926,6 +1098,15 @@ "node": ">= 0.8" } }, + "node_modules/commander": { + "version": "12.1.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-12.1.0.tgz", + "integrity": "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/concat-map": { "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", @@ -1006,6 +1187,19 @@ "integrity": "sha512-IayShXAgj/QMXgB0IWmKx+rOPuGMhqm5w6jvFxmVenXKIzRqTAAsbBPT3kWQeGANj3jGgvcvv4yK6SxqYmikgw==", "license": "Apache-2.0" }, + "node_modules/dir-glob": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/dir-glob/-/dir-glob-3.0.1.tgz", + "integrity": "sha512-WkrWp9GR4KXfKGYzOLmTuGVi1UWFfws377n9cc55/tb6DuqyF6pcQ5AbiHEshaDpY9v6oaSr2XCDidGmMwdzIA==", + "dev": true, + "license": "MIT", + "dependencies": { + "path-type": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, "node_modules/dotenv": { "version": "16.4.5", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz", @@ -1018,18 +1212,35 @@ "url": "https://dotenvx.com" } }, - "node_modules/es-define-property": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", - "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "node_modules/dunder-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", + "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==", "license": "MIT", "dependencies": { - "get-intrinsic": "^1.2.4" + "call-bind-apply-helpers": "^1.0.1", + "es-errors": "^1.3.0", + "gopd": "^1.2.0" }, "engines": { "node": ">= 0.4" } }, + "node_modules/emoji-regex": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-10.5.0.tgz", + "integrity": "sha512-lb49vf1Xzfx080OKA0o6l8DQQpV+6Vg95zyCJX9VB/BqKYlhG7N4wgROUUHRA+ZPUefLnteQOad7z1kT2bV7bg==", + "license": "MIT" + }, + "node_modules/es-define-property": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz", + "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, "node_modules/es-errors": { "version": "1.3.0", "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", @@ -1039,6 +1250,33 @@ "node": ">= 0.4" } }, + "node_modules/es-object-atoms": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz", + "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-set-tostringtag": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz", + "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==", + "license": "MIT", + "dependencies": { + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.6", + "has-tostringtag": "^1.0.2", + "hasown": "^2.0.2" + }, 
+ "engines": { + "node": ">= 0.4" + } + }, "node_modules/event-target-shim": { "version": "5.0.1", "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz", @@ -1061,6 +1299,33 @@ "resolved": "extractor", "link": true }, + "node_modules/fast-glob": { + "version": "3.3.3", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.3.tgz", + "integrity": "sha512-7MptL8U0cqcFdzIzwOTHoilX9x5BrNqye7Z/LuC7kCMRio1EMSyqRK3BEAUD7sXRq4iT4AzTVuZdhgQ2TCvYLg==", + "dev": true, + "license": "MIT", + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.8" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fastq": { + "version": "1.19.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.19.1.tgz", + "integrity": "sha512-GwLTyxkCXjXbxqIhTsMI2Nui8huMPtnxg7krajPJAjnEG/iiOS7i+zCtWGZR9G0NBKbXKh6X9m9UIsYX/N6vvQ==", + "dev": true, + "license": "ISC", + "dependencies": { + "reusify": "^1.0.4" + } + }, "node_modules/fill-range": { "version": "7.1.1", "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz", @@ -1104,13 +1369,15 @@ } }, "node_modules/form-data": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.1.tgz", - "integrity": "sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.4.tgz", + "integrity": "sha512-KrGhL9Q4zjj0kiUt5OO4Mr/A/jlI2jDYs5eHBpYHPcBEVSiipAvn2Ko2HnPe20rmcuuvMHNdZFp+4IlGTMF0Ow==", "license": "MIT", "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", + "es-set-tostringtag": "^2.1.0", + "hasown": "^2.0.2", "mime-types": "^2.1.12" }, "engines": { @@ -1174,17 +1441,34 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-east-asian-width": { + "version": "1.4.0", + "resolved": "https://registry.npmjs.org/get-east-asian-width/-/get-east-asian-width-1.4.0.tgz", + "integrity": "sha512-QZjmEOC+IT1uk6Rx0sX22V6uHWVwbdbxf1faPqJ1QhLdGgsRGCZoyaQBm/piRdJy/D2um6hM1UP7ZEeQ4EkP+Q==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/get-intrinsic": { - "version": "1.2.4", - "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", - "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz", + "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==", "license": "MIT", "dependencies": { + "call-bind-apply-helpers": "^1.0.2", + "es-define-property": "^1.0.1", "es-errors": "^1.3.0", + "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", - "has-proto": "^1.0.1", - "has-symbols": "^1.0.3", - "hasown": "^2.0.0" + "get-proto": "^1.0.1", + "gopd": "^1.2.0", + "has-symbols": "^1.1.0", + "hasown": "^2.0.2", + "math-intrinsics": "^1.1.0" }, "engines": { "node": ">= 0.4" @@ -1193,6 +1477,32 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/get-proto": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz", + "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==", + "license": "MIT", + 
"dependencies": { + "dunder-proto": "^1.0.1", + "es-object-atoms": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/get-tsconfig": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/get-tsconfig/-/get-tsconfig-4.12.0.tgz", + "integrity": "sha512-LScr2aNr2FbjAjZh2C6X6BxRx1/x+aTDExct/xyq2XKbYOiG5c0aK7pMsSuyc0brz3ibr/lbQiHD9jzt4lccJw==", + "dev": true, + "license": "MIT", + "dependencies": { + "resolve-pkg-maps": "^1.0.0" + }, + "funding": { + "url": "https://github.com/privatenumber/get-tsconfig?sponsor=1" + } + }, "node_modules/glob-parent": { "version": "5.1.2", "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", @@ -1206,6 +1516,27 @@ "node": ">= 6" } }, + "node_modules/globby": { + "version": "11.1.0", + "resolved": "https://registry.npmjs.org/globby/-/globby-11.1.0.tgz", + "integrity": "sha512-jhIXaOzy1sb8IyocaruWSn1TjmnBVs8Ayhcy83rmxNJ8q2uWKCAj3CnJY+KpGSXCueAPc0i05kVvVKtP1t9S3g==", + "dev": true, + "license": "MIT", + "dependencies": { + "array-union": "^2.1.0", + "dir-glob": "^3.0.1", + "fast-glob": "^3.2.9", + "ignore": "^5.2.0", + "merge2": "^1.4.1", + "slash": "^3.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/gm": { "version": "1.25.0", "resolved": "https://registry.npmjs.org/gm/-/gm-1.25.0.tgz", @@ -1222,12 +1553,12 @@ } }, "node_modules/gopd": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", - "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz", + "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==", "license": "MIT", - "dependencies": { - "get-intrinsic": "^1.1.3" + "engines": { + "node": ">= 0.4" }, "funding": { "url": "https://github.com/sponsors/ljharb" @@ -1261,22 +1592,10 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/has-proto": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", - "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", - "license": "MIT", - "engines": { - "node": ">= 0.4" - }, - "funding": { - "url": "https://github.com/sponsors/ljharb" - } - }, "node_modules/has-symbols": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz", - "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz", + "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==", "license": "MIT", "engines": { "node": ">= 0.4" @@ -1327,6 +1646,16 @@ "integrity": "sha512-8Sb3veuYCyrZL+VBt9LJfZjLUPWVvqn8tG28VqYNFCo43KHcKuq+b4EiXGeuaLAQWL2YmyDgMp2aSpH9JHsEQg==", "license": "Apache-2.0" }, + "node_modules/ignore": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/ignore/-/ignore-5.3.2.tgz", + "integrity": "sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 4" + } + }, "node_modules/ignore-by-default": { "version": "1.0.1", "resolved": 
"https://registry.npmjs.org/ignore-by-default/-/ignore-by-default-1.0.1.tgz", @@ -1431,6 +1760,18 @@ "node": ">=0.10.0" } }, + "node_modules/is-interactive": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/is-interactive/-/is-interactive-2.0.0.tgz", + "integrity": "sha512-qP1vozQRI+BMOPcjFzrjXuQvdak2pHNUMZoeG2eRbiSqyvbEf/wQtEOTOX1guk6E3t36RkaqiSt8A/6YElNxLQ==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/is-number": { "version": "7.0.0", "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", @@ -1456,6 +1797,18 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-unicode-supported": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-2.1.0.tgz", + "integrity": "sha512-mE00Gnza5EEB3Ds0HfMyllZzbBrmLOX3vfWoj9A9PEnTfratQ/BcaJOuMhnkhjXvb2+FkY3VuHqtAGpTPmglFQ==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/is-url": { "version": "1.2.4", "resolved": "https://registry.npmjs.org/is-url/-/is-url-1.2.4.tgz", @@ -1516,6 +1869,34 @@ "node": ">=6" } }, + "node_modules/log-symbols": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/log-symbols/-/log-symbols-6.0.0.tgz", + "integrity": "sha512-i24m8rpwhmPIS4zscNzK6MSEhk0DUWa/8iYQWxhffV8jkI4Phvs3F+quL5xvS0gdQR0FyTCMMH33Y78dDTzzIw==", + "license": "MIT", + "dependencies": { + "chalk": "^5.3.0", + "is-unicode-supported": "^1.3.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/log-symbols/node_modules/is-unicode-supported": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/is-unicode-supported/-/is-unicode-supported-1.3.0.tgz", + "integrity": "sha512-43r2mRvz+8JRIKnWJ+3j8JtjRKZ6GmjzfaE/qiBJnikNnYv/6bagRJ1kUhNk8R5EX/GkobD+r+sfxCPJsiKBLQ==", + "license": "MIT", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/lru-cache": { "version": "4.1.5", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.5.tgz", @@ -1526,6 +1907,39 @@ "yallist": "^2.1.2" } }, + "node_modules/math-intrinsics": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz", + "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==", + "license": "MIT", + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromatch": { + "version": "4.0.8", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.8.tgz", + "integrity": "sha512-PXwfBhYu0hBCPw8Dn0E+WDYb7af3dSLVWKi3HGv84IdF4TyFoC0ysxFd0Goxw7nSv4T/PzEJQxsYsEiFCKo2BA==", + "dev": true, + "license": "MIT", + "dependencies": { + "braces": "^3.0.3", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, "node_modules/mime-db": { "version": "1.52.0", "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", @@ -1547,12 +1961,38 @@ "node": ">= 0.6" } }, + 
"node_modules/mimic-function": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/mimic-function/-/mimic-function-5.0.1.tgz", + "integrity": "sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/ms": { "version": "2.1.3", "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz", "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==", "license": "MIT" }, + "node_modules/mylas": { + "version": "2.1.13", + "resolved": "https://registry.npmjs.org/mylas/-/mylas-2.1.13.tgz", + "integrity": "sha512-+MrqnJRtxdF+xngFfUUkIMQrUUL0KsxbADUkn23Z/4ibGg192Q+z+CQyiYwvWTsYjJygmMR8+w3ZDa98Zh6ESg==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12.0.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/raouldeheer" + } + }, "node_modules/nanoid": { "version": "3.3.8", "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.8.tgz", @@ -1691,6 +2131,21 @@ "node": ">=0.10.0" } }, + "node_modules/onetime": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/onetime/-/onetime-7.0.0.tgz", + "integrity": "sha512-VXJjc87FScF88uafS3JllDgvAm+c/Slfz06lorj2uAY34rlUu0Nt+v8wreiImcrgAjjIHp1rXpTDlLOGw29WwQ==", + "license": "MIT", + "dependencies": { + "mimic-function": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/openai": { "version": "4.72.0", "resolved": "https://registry.npmjs.org/openai/-/openai-4.72.0.tgz", @@ -1741,6 +2196,29 @@ "opencollective-postinstall": "index.js" } }, + "node_modules/ora": { + "version": "8.2.0", + "resolved": "https://registry.npmjs.org/ora/-/ora-8.2.0.tgz", + "integrity": "sha512-weP+BZ8MVNnlCm8c0Qdc1WSWq4Qn7I+9CJGm7Qali6g44e/PUzbjNqJX5NJ9ljlNMosfJvg1fKEGILklK9cwnw==", + "license": "MIT", + "dependencies": { + "chalk": "^5.3.0", + "cli-cursor": "^5.0.0", + "cli-spinners": "^2.9.2", + "is-interactive": "^2.0.0", + "is-unicode-supported": "^2.0.0", + "log-symbols": "^6.0.0", + "stdin-discarder": "^0.2.2", + "string-width": "^7.2.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/os": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/os/-/os-0.1.2.tgz", @@ -1778,6 +2256,16 @@ "util": "^0.10.3" } }, + "node_modules/path-type": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-4.0.0.tgz", + "integrity": "sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, "node_modules/path/node_modules/inherits": { "version": "2.0.3", "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", @@ -1834,6 +2322,19 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/plimit-lit": { + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/plimit-lit/-/plimit-lit-1.6.1.tgz", + "integrity": "sha512-B7+VDyb8Tl6oMJT9oSO2CW8XC/T4UcJGrwOVoNGwOQsQYhlpfajmrMj5xeejqaASq3V/EqThyOeATEOMuSEXiA==", + "dev": true, + "license": "MIT", + "dependencies": { + "queue-lit": "^1.5.1" + }, + "engines": { + "node": ">=12" + } + }, "node_modules/possible-typed-array-names": { "version": "1.0.0", "resolved": 
"https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", @@ -1871,6 +2372,37 @@ "dev": true, "license": "MIT" }, + "node_modules/queue-lit": { + "version": "1.5.2", + "resolved": "https://registry.npmjs.org/queue-lit/-/queue-lit-1.5.2.tgz", + "integrity": "sha512-tLc36IOPeMAubu8BkW8YDBV+WyIgKlYU7zUNs0J5Vk9skSZ4JfGlPOqplP0aHdfv7HL0B2Pg6nwiq60Qc6M2Hw==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=12" + } + }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT" + }, "node_modules/react": { "version": "19.0.0", "resolved": "https://registry.npmjs.org/react/-/react-19.0.0.tgz", @@ -1900,6 +2432,67 @@ "integrity": "sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==", "license": "MIT" }, + "node_modules/resolve-pkg-maps": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/resolve-pkg-maps/-/resolve-pkg-maps-1.0.0.tgz", + "integrity": "sha512-seS2Tj26TBVOC2NIc2rOe2y2ZO7efxITtLZcGSOnHHNOQ7CkiUBfw0Iw2ck6xkIhPwLhKNLS8BO+hEpngQlqzw==", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/privatenumber/resolve-pkg-maps?sponsor=1" + } + }, + "node_modules/restore-cursor": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/restore-cursor/-/restore-cursor-5.1.0.tgz", + "integrity": "sha512-oMA2dcrw6u0YfxJQXm342bFKX/E4sG9rbTzO9ptUcR/e8A33cHuvStiYOwH7fszkZlZ1z/ta9AAoPk2F4qIOHA==", + "license": "MIT", + "dependencies": { + "onetime": "^7.0.0", + "signal-exit": "^4.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/reusify": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.1.0.tgz", + "integrity": "sha512-g6QUff04oZpHs0eG5p83rFLhHeV00ug/Yf9nZM6fLeUrPguBTkTQOdpAWWspMh55TZfVQDPaN3NQJfbVRAxdIw==", + "dev": true, + "license": "MIT", + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, "node_modules/secure-json-parse": { "version": "2.7.0", "resolved": "https://registry.npmjs.org/secure-json-parse/-/secure-json-parse-2.7.0.tgz", @@ -1974,6 +2567,18 @@ "@img/sharp-win32-x64": "0.33.5" } }, + "node_modules/signal-exit": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-4.1.0.tgz", + "integrity": "sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==", + "license": "ISC", + "engines": { + "node": 
">=14" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/simple-swizzle": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", @@ -1996,6 +2601,60 @@ "node": ">=10" } }, + "node_modules/slash": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/slash/-/slash-3.0.0.tgz", + "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=8" + } + }, + "node_modules/stdin-discarder": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/stdin-discarder/-/stdin-discarder-0.2.2.tgz", + "integrity": "sha512-UhDfHmA92YAlNnCfhmq0VeNL5bDbiZGg7sZ2IvPsXubGkiNa9EC+tUTsjBRsYUAz87btI6/1wf4XoVvQ3uRnmQ==", + "license": "MIT", + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/string-width": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-7.2.0.tgz", + "integrity": "sha512-tsaTIkKW9b4N+AEj+SVA+WhJzV7/zMhcSu78mLKWSk7cXMOSHsBKFWUs0fWwq8QyK3MgJBQRX6Gbi4kYbdvGkQ==", + "license": "MIT", + "dependencies": { + "emoji-regex": "^10.3.0", + "get-east-asian-width": "^1.0.0", + "strip-ansi": "^7.1.0" + }, + "engines": { + "node": ">=18" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/strip-ansi": { + "version": "7.1.2", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-7.1.2.tgz", + "integrity": "sha512-gmBGslpoQJtgnMAvOVqGZpEz9dyoKTCzy2nfz/n8aIFhN/jCE/rCmcxabB6jOOHV+0WNnylOxaxBQPSvcWklhA==", + "license": "MIT", + "dependencies": { + "ansi-regex": "^6.0.1" + }, + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/chalk/strip-ansi?sponsor=1" + } + }, "node_modules/supports-color": { "version": "5.5.0", "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-5.5.0.tgz", @@ -2097,6 +2756,38 @@ "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", "license": "MIT" }, + "node_modules/tsc-alias": { + "version": "1.8.16", + "resolved": "https://registry.npmjs.org/tsc-alias/-/tsc-alias-1.8.16.tgz", + "integrity": "sha512-QjCyu55NFyRSBAl6+MTFwplpFcnm2Pq01rR/uxfqJoLMm6X3O14KEGtaSDZpJYaE1bJBGDjD0eSuiIWPe2T58g==", + "dev": true, + "license": "MIT", + "dependencies": { + "chokidar": "^3.5.3", + "commander": "^9.0.0", + "get-tsconfig": "^4.10.0", + "globby": "^11.0.4", + "mylas": "^2.1.9", + "normalize-path": "^3.0.0", + "plimit-lit": "^1.2.6" + }, + "bin": { + "tsc-alias": "dist/bin/index.js" + }, + "engines": { + "node": ">=16.20.2" + } + }, + "node_modules/tsc-alias/node_modules/commander": { + "version": "9.5.0", + "resolved": "https://registry.npmjs.org/commander/-/commander-9.5.0.tgz", + "integrity": "sha512-KRs7WVDKg86PWiuAqhDrAQnTXZKraVcCc6vFdL14qrZ/DcWwuRo7VoiYXalXO7S5GKpqYiVEwCbgFDfxNHKJBQ==", + "dev": true, + "license": "MIT", + "engines": { + "node": "^12.20.0 || >=14" + } + }, "node_modules/tslib": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", diff --git a/package.json b/package.json index 336c9cd..155a499 100644 --- a/package.json +++ b/package.json @@ -3,14 +3,22 @@ "version": "1.1.5", "description": "Intelligent document processing and extraction.", "scripts": { - "start": "npm run start --workspace=extractor", - "dev": "npm run dev --workspace=extractor" + 
"build": "npm run build:core && npm run build:extractor && npm run build:cli", + "build:core": "npm run build --workspace=core", + "build:extractor": "npm run build --workspace=extractor", + "build:cli": "npm run build --workspace=cli", + "prepublishOnly": "npm run build", + "start": "npm run start --workspace=cli", + "dev": "npm run dev --workspace=cli", + "cli": "npm run start --workspace=cli" }, "workspaces": [ "core", - "extractor" + "extractor", + "cli" ], - "main": "./extractor/src/index.js", + "main": "./extractor/dist/index.js", + "types": "./extractor/dist/index.d.ts", "keywords": [ "document", "extraction", @@ -25,7 +33,7 @@ }, "homepage": "https://documind.xyz", "dependencies": { - "axios": "^1.7.7", + "axios": "^1.12.2", "child_process": "^1.0.2", "core": "file:./core", "dotenv": "^16.4.5",