diff --git a/.changeset/column-count-strategy-rename.md b/.changeset/column-count-strategy-rename.md new file mode 100644 index 00000000..32057dec --- /dev/null +++ b/.changeset/column-count-strategy-rename.md @@ -0,0 +1,11 @@ +--- +"web-csv-toolbox": minor +--- + +**BREAKING CHANGE**: Restrict `columnCountStrategy` options for object output to `fill`/`strict` only. + +Object format now rejects `keep` and `truncate` strategies at runtime, as these strategies are incompatible with object output semantics. Users relying on `keep` or `truncate` with object format must either: +- Switch to `outputFormat: 'array'` to use these strategies, or +- Use `fill` (default) or `strict` for object output + +This change improves API clarity by aligning strategy availability with format capabilities and documenting the purpose-driven strategy matrix (including sparse/header requirements). diff --git a/.changeset/lexer-api-changes.md b/.changeset/lexer-api-changes.md new file mode 100644 index 00000000..63c9186a --- /dev/null +++ b/.changeset/lexer-api-changes.md @@ -0,0 +1,19 @@ +--- +"web-csv-toolbox": minor +--- + +## Lexer API Changes + +This release includes low-level Lexer API changes for performance optimization. + +### Breaking Changes (Low-level API only) + +These changes only affect users of the low-level Lexer API. **High-level APIs (`parseString`, `parseBinary`, etc.) are unchanged.** + +1. **Token type constants**: Changed from `Symbol` to numeric constants +2. **Location tracking**: Now disabled by default. Add `trackLocation: true` to Lexer options if you need token location information. Note: Error messages still include position information even when `trackLocation: false` (computed lazily only when errors occur). +3. **Struct of token objects**: Changed to improve performance. Token properties changed and reduce tokens by combining delimiter and newline information into a field. + +### Who is affected? + +**Most users are NOT affected.** Only users who directly use `FlexibleStringCSVLexer` and rely on `token.location` or `Symbol`-based token type comparison need to update their code. diff --git a/.changeset/performance-improvements.md b/.changeset/performance-improvements.md new file mode 100644 index 00000000..630f5db8 --- /dev/null +++ b/.changeset/performance-improvements.md @@ -0,0 +1,47 @@ +--- +"web-csv-toolbox": patch +--- + +## JavaScript Parser Performance Improvements + +This release includes significant internal optimizations that improve JavaScript-based CSV parsing performance. + +### Before / After Comparison + +| Metric | Before (v0.14) | After | Improvement | +|--------|----------------|-------|-------------| +| 1,000 rows parsing | 3.57 ms | 1.42 ms | **60% faster** | +| 5,000 rows parsing | 19.47 ms | 7.03 ms | **64% faster** | +| Throughput (1,000 rows) | 24.3 MB/s | 61.2 MB/s | **2.51x** | +| Throughput (5,000 rows) | 24.5 MB/s | 67.9 MB/s | **2.77x** | + +### Optimization Summary + +| Optimization | Target | Improvement | +|--------------|--------|-------------| +| Array copy method improvement | Assembler | -8.7% | +| Quoted field parsing optimization | Lexer | Overhead eliminated | +| Object assembler loop optimization | Assembler | -5.4% | +| Regex removal for unquoted fields | Lexer | -14.8% | +| String comparison optimization | Lexer | ~10% | +| Object creation optimization | Lexer | ~20% | +| Non-destructive buffer reading | GC | -46% | +| Token type numeric conversion | Lexer/GC | -7% / -13% | +| Location tracking made optional | Lexer | -19% to -31% | +| Object.create(null) for records | Assembler | -31% | +| Empty-row template cache | Assembler | ~4% faster on sparse CSV | +| Row buffer reuse (no per-record slice) | Assembler | ~6% faster array format | +| Header-length builder preallocation | Assembler | Capacity stays steady on wide CSV | +| Object assembler row buffer pooling | Assembler | Lower GC spikes on object output | +| Lexer segment-buffer pooling | Lexer | Smoother GC for quoted-heavy input | + +### Final Performance Results (Pure JavaScript) + +| Format | Throughput | +|--------|------------| +| Object format (1,000 rows) | **61.2 MB/s** | +| Array format (1,000 rows) | **87.6 MB/s** | +| Object format (5,000 rows) | **67.9 MB/s** | +| Array format (5,000 rows) | **86.4 MB/s** | + +Array format is approximately 43% faster (1.43× throughput) than Object format for the same data. diff --git a/benchmark/package.json b/benchmark/package.json index dc524077..a1bb9b92 100644 --- a/benchmark/package.json +++ b/benchmark/package.json @@ -4,8 +4,13 @@ "private": true, "type": "module", "scripts": { - "start": "tsx main.ts", - "queuing-strategy": "tsx queuing-strategy.bench.ts" + "start": "node --import tsx main.ts", + "queuing-strategy": "node --import tsx queuing-strategy.bench.ts", + "quick": "node --import tsx scripts/quick-bench.mts", + "unified": "node --import tsx scripts/unified-token-bench.mts", + "profile:cpu": "node --cpu-prof --cpu-prof-dir=./profiles --import tsx scripts/profile-cpu.mts", + "profile:memory": "node --heap-prof --heap-prof-dir=./profiles --import tsx scripts/profile-memory.mts", + "profile:memory:gc": "node --heap-prof --heap-prof-dir=./profiles --expose-gc --import tsx scripts/profile-memory.mts" }, "license": "MIT", "dependencies": { @@ -14,4 +19,4 @@ "tsx": "catalog:", "web-csv-toolbox": "workspace:*" } -} \ No newline at end of file +} diff --git a/config/vitest.setup.ts b/config/vitest.setup.ts index 2871b7e8..29e5701d 100644 --- a/config/vitest.setup.ts +++ b/config/vitest.setup.ts @@ -1,5 +1,7 @@ import fc from "fast-check"; fc.configureGlobal({ - // This is the default value, but we set it here to be explicit. + // Set to true to stop property tests on first failure (default is false). + // This speeds up test runs by avoiding unnecessary iterations after a counterexample is found. + endOnFailure: true, }); diff --git a/docs/reference/column-count-strategy-guide.md b/docs/reference/column-count-strategy-guide.md new file mode 100644 index 00000000..42c99464 --- /dev/null +++ b/docs/reference/column-count-strategy-guide.md @@ -0,0 +1,46 @@ +# ColumnCountStrategy Guide + +`columnCountStrategy` controls how the parser handles rows whose column counts differ from the header. The available strategies depend on the output format and whether a header is known in advance. + +## Compatibility Matrix + +| Strategy | Short rows | Long rows | Object | Array (explicit header) | Array (header inferred) | Headerless (`header: []`) | +|------------|------------------------------------|------------------------------|--------|-------------------------|-------------------------|----------------------------| +| `fill` | Pad with `""` | Trim excess columns | ✅ | ✅ | ✅ | ❌ | +| `strict` | Throw error | Throw error | ✅ | ✅ | ✅ | ❌ | +| `keep` | Keep as-is (ragged rows) | Keep as-is | ❌ | ✅ | ✅ | ✅ (mandatory) | +| `truncate` | Keep as-is | Trim to header length | ❌ | ✅ | ❌ (requires header) | ❌ | +| `sparse` | Pad with `undefined` | Trim excess columns | ❌ | ✅ | ❌ (requires header) | ❌ | + +## Strategy Details + +### `fill` (default) +- Guarantees fixed-length records matching the header. +- Object: missing values become `""`, enabling consistent string-based models. +- Array output: missing values also become empty strings. + +### `strict` +- Treats any column-count mismatch as a fatal error, useful for schema validation. +- Requires a header (explicit or inferred). + +### `keep` +- Leaves each row untouched. Arrays can vary in length, making it ideal for ragged data or headerless CSVs. +- Headerless mode (`header: []`) enforces `keep`. + +### `truncate` +- Drops trailing columns that exceed the header length while leaving short rows untouched. +- Only available when a header is provided (array output). + +### `sparse` +- Similar to `fill`, but pads missing entries with `undefined`. This is useful when you want to distinguish between missing and empty values. +- Requires an explicit header to determine the target length. + +## Choosing a Strategy + +1. **Need strict schema enforcement?** Use `strict`. +2. **Need consistent string values?** Use `fill` (object default). +3. **Need ragged rows / headerless CSV?** Use `keep` (array output). +4. **Need to ignore trailing columns?** Use `truncate` (array output with header). +5. **Need optional columns?** Use `sparse` (array output with header). + +Pair this guide with the [Output Format Guide](./output-format-guide.md) to decide which combination best fits your workload. diff --git a/docs/reference/output-format-guide.md b/docs/reference/output-format-guide.md new file mode 100644 index 00000000..dc231b9f --- /dev/null +++ b/docs/reference/output-format-guide.md @@ -0,0 +1,47 @@ +# Output Format Guide + +Many APIs (e.g. `parseString`, `createCSVRecordAssembler`, stream transformers) expose an `outputFormat` option so you can choose the most suitable record representation for your workload. This guide summarizes each format's behavior, strengths, and constraints. + +## Quick Comparison + +| Format | Representation | Best for | ColumnCountStrategy support | Headerless (`header: []`) | `includeHeader` | Notes | +|----------|-------------------------------------|-----------------------------------------|-----------------------------|---------------------------|-----------------|-------| +| `object` | Plain object `{ headerKey: value }` | JSON interoperability, downstream libs | `fill`, `strict` | ❌ | ❌ | Default output. Values are always strings. | +| `array` | Readonly array / named tuple | Maximum throughput, flexible schemas | All strategies (`fill`, `keep`, `truncate`, `sparse`, `strict`) | ✅ (with `keep`) | ✅ | Headerless mode requires `outputFormat: "array"` + `columnCountStrategy: "keep"`. | + +## Object Format (`"object"`) +- Produces pure objects keyed by header names. +- Missing columns are padded with empty strings in `fill` mode, or rejected in `strict`. +- Recommended when you plan to serialize to JSON, access fields by name exclusively, or hand records to other libraries. + +```ts +const assembler = createCSVRecordAssembler({ + header: ["name", "age"] as const, + // outputFormat defaults to "object" +}); +for (const record of assembler.assemble(tokens)) { + record.name; // string +} +``` + +## Array Format (`"array"`) +- Emits header-ordered arrays (typed as named tuples when a header is provided). +- Supports every columnCountStrategy, including `keep` for ragged rows and `sparse` for optional columns. +- Only format that supports headerless mode. + +```ts +const assembler = createCSVRecordAssembler({ + header: ["name", "age"] as const, + outputFormat: "array", + columnCountStrategy: "truncate", +}); +const [row] = assembler.assemble(tokens); +row[0]; // "Alice" +``` + +## Choosing the Right Format + +1. **Need plain JS objects / JSON serialization?** Use `object`. +2. **Need the fastest throughput or ragged rows?** Use `array` with the appropriate `columnCountStrategy`. + +For more details on column-count handling, see the [ColumnCountStrategy guide](./column-count-strategy-guide.md). diff --git a/package.json b/package.json index d0247760..aecd7312 100644 --- a/package.json +++ b/package.json @@ -218,6 +218,7 @@ "@types/node": "^24.10.1", "@vitest/browser-webdriverio": "^4.0.3", "@vitest/coverage-istanbul": "4.0.3", + "@vitest/coverage-v8": "4.0.3", "@wasm-tool/rollup-plugin-rust": "^3.0.5", "changesets-github-release": "^0.1.0", "fast-check": "^4.1.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 41f790aa..1395d46f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -99,6 +99,9 @@ importers: '@vitest/coverage-istanbul': specifier: 4.0.3 version: 4.0.3(vitest@4.0.3) + '@vitest/coverage-v8': + specifier: 4.0.3 + version: 4.0.3(vitest@4.0.3) '@wasm-tool/rollup-plugin-rust': specifier: ^3.0.5 version: 3.0.5(binaryen@121.0.0)(rollup@4.53.1) @@ -417,6 +420,10 @@ packages: resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==} engines: {node: '>=6.9.0'} + '@bcoe/v8-coverage@1.0.2': + resolution: {integrity: sha512-6zABk/ECA/QYSCQ1NGiVwwbQerUCZ+TQbp64Q3AgmfNvurHH0j8TtXa1qbShXA6qqkpAj4V5W8pP6mLe1mcMqA==} + engines: {node: '>=18'} + '@biomejs/biome@2.3.4': resolution: {integrity: sha512-TU08LXjBHdy0mEY9APtEtZdNQQijXUDSXR7IK1i45wgoPD5R0muK7s61QcFir6FpOj/RP1+YkPx5QJlycXUU3w==} engines: {node: '>=14.21.3'} @@ -1263,6 +1270,15 @@ packages: peerDependencies: vitest: 4.0.3 + '@vitest/coverage-v8@4.0.3': + resolution: {integrity: sha512-I+MlLwyJRBjmJr1kFYSxoseINbIdpxIAeK10jmXgB0FUtIfdYsvM3lGAvBu5yk8WPyhefzdmbCHCc1idFbNRcg==} + peerDependencies: + '@vitest/browser': 4.0.3 + vitest: 4.0.3 + peerDependenciesMeta: + '@vitest/browser': + optional: true + '@vitest/expect@4.0.3': resolution: {integrity: sha512-v3eSDx/bF25pzar6aEJrrdTXJduEBU3uSGXHslIdGIpJVP8tQQHV6x1ZfzbFQ/bLIomLSbR/2ZCfnaEGkWkiVQ==} @@ -1600,6 +1616,9 @@ packages: resolution: {integrity: sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==} engines: {node: '>=4'} + ast-v8-to-istanbul@0.3.8: + resolution: {integrity: sha512-szgSZqUxI5T8mLKvS7WTjF9is+MVbOeLADU73IseOcrqhxr/VAvy6wfoVE39KnKzA7JRhjF5eUagNlHwvZPlKQ==} + async@3.2.6: resolution: {integrity: sha512-htCUDlxyyCLMgaM3xXg0C0LW2xqfuQ6p05pCEIsXuyQ+a1koYKTuBMzRNwmybfLgvJDMd0r1LTn4+E0Ti6C2AA==} @@ -2888,6 +2907,9 @@ packages: js-tokens@4.0.0: resolution: {integrity: sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==} + js-tokens@9.0.1: + resolution: {integrity: sha512-mxa9E9ITFOt0ban3j6L5MpjwegGz6lBQmM1IJkWeBZGcMxto50+eWdjC/52xDbS2vy0k7vIMK0Fe2wfL9OQSpQ==} + js-yaml@3.14.1: resolution: {integrity: sha512-okMH7OXXJ7YrN9Ok3/SXrnu4iX9yOk+25nqX4imS2npuvTYDmo/QEZoqwZkYaIDk3jVvBOTOIEgEhaLOynBS9g==} hasBin: true @@ -4613,6 +4635,8 @@ snapshots: '@babel/helper-string-parser': 7.27.1 '@babel/helper-validator-identifier': 7.28.5 + '@bcoe/v8-coverage@1.0.2': {} + '@biomejs/biome@2.3.4': optionalDependencies: '@biomejs/cli-darwin-arm64': 2.3.4 @@ -5552,6 +5576,23 @@ snapshots: transitivePeerDependencies: - supports-color + '@vitest/coverage-v8@4.0.3(vitest@4.0.3)': + dependencies: + '@bcoe/v8-coverage': 1.0.2 + '@vitest/utils': 4.0.3 + ast-v8-to-istanbul: 0.3.8 + debug: 4.4.3 + istanbul-lib-coverage: 3.2.2 + istanbul-lib-report: 3.0.1 + istanbul-lib-source-maps: 5.0.6 + istanbul-reports: 3.2.0 + magicast: 0.3.5 + std-env: 3.10.0 + tinyrainbow: 3.0.3 + vitest: 4.0.3(@types/node@24.10.1)(@vitest/browser-webdriverio@4.0.8(vite@7.2.2(@types/node@24.10.1)(jiti@2.6.1)(terser@5.44.1)(tsx@4.20.6)(yaml@2.8.1))(vitest@4.0.3)(webdriverio@9.20.0))(jiti@2.6.1)(terser@5.44.1)(tsx@4.20.6)(yaml@2.8.1) + transitivePeerDependencies: + - supports-color + '@vitest/expect@4.0.3': dependencies: '@standard-schema/spec': 1.0.0 @@ -5965,6 +6006,12 @@ snapshots: dependencies: tslib: 2.8.1 + ast-v8-to-istanbul@0.3.8: + dependencies: + '@jridgewell/trace-mapping': 0.3.31 + estree-walker: 3.0.3 + js-tokens: 9.0.1 + async@3.2.6: {} asynckit@0.4.0: {} @@ -7320,6 +7367,8 @@ snapshots: js-tokens@4.0.0: {} + js-tokens@9.0.1: {} + js-yaml@3.14.1: dependencies: argparse: 1.0.10 diff --git a/src/core/constants.ts b/src/core/constants.ts index b812bd25..fe8db7dd 100644 --- a/src/core/constants.ts +++ b/src/core/constants.ts @@ -100,17 +100,18 @@ export const DEFAULT_STREAM_BACKPRESSURE_CHECK_INTERVAL = 100; export const DEFAULT_ASSEMBLER_BACKPRESSURE_CHECK_INTERVAL = 10; /** - * FiledDelimiter is a symbol for field delimiter of CSV. - * @category Constants - */ -export const FieldDelimiter = Symbol.for("web-csv-toolbox.FieldDelimiter"); -/** - * RecordDelimiter is a symbol for record delimiter of CSV. - * @category Constants - */ -export const RecordDelimiter = Symbol.for("web-csv-toolbox.RecordDelimiter"); -/** - * Field is a symbol for field of CSV. + * Delimiter type enumeration for unified token format. + * + * Used in the new FieldToken format to indicate what follows the field value. + * This enables a more efficient token format where only field tokens are emitted. + * * @category Constants */ -export const Field = Symbol.for("web-csv-toolbox.Field"); +export enum Delimiter { + /** Next token is a field (followed by field delimiter like comma) */ + Field = 0, + /** Next token is a record delimiter (newline) */ + Record = 1, + // /** End of file/stream */ + // EOF = 2, +} diff --git a/src/core/types.test-d.ts b/src/core/types.test-d.ts index 05336ed3..f6e4c5a2 100644 --- a/src/core/types.test-d.ts +++ b/src/core/types.test-d.ts @@ -264,7 +264,7 @@ describe("CSVRecordAssemblerOptions", () => { >(); }); - it("Normal mode allows all columnCountStrategy options", () => { + it("Array format accepts all columnCountStrategy options", () => { const opts1: CSVRecordAssemblerOptions = { header: ["a", "b"] as const, outputFormat: "array", @@ -274,43 +274,65 @@ describe("CSVRecordAssemblerOptions", () => { const opts2: CSVRecordAssemblerOptions = { header: ["a", "b"] as const, outputFormat: "array", - columnCountStrategy: "pad", + columnCountStrategy: "sparse", }; const opts3: CSVRecordAssemblerOptions = { header: ["a", "b"] as const, - outputFormat: "object", - columnCountStrategy: "strict", + outputFormat: "array", + columnCountStrategy: "truncate", }; expectTypeOf(opts1.columnCountStrategy).toEqualTypeOf< - "keep" | "pad" | "strict" | "truncate" | undefined + "fill" | "sparse" | "keep" | "strict" | "truncate" | undefined >(); expectTypeOf(opts2.columnCountStrategy).toEqualTypeOf< - "keep" | "pad" | "strict" | "truncate" | undefined + "fill" | "sparse" | "keep" | "strict" | "truncate" | undefined >(); expectTypeOf(opts3.columnCountStrategy).toEqualTypeOf< - "keep" | "pad" | "strict" | "truncate" | undefined + "fill" | "sparse" | "keep" | "strict" | "truncate" | undefined >(); }); - it("Normal mode allows both array and object output formats", () => { - const opts1: CSVRecordAssemblerOptions = { + it("Object format restricts columnCountStrategy to fill | strict", () => { + const opts: CSVRecordAssemblerOptions = { header: ["a", "b"] as const, - outputFormat: "array", + outputFormat: "object", + columnCountStrategy: "strict", }; - const opts2: CSVRecordAssemblerOptions = { + expectTypeOf(opts.columnCountStrategy).toEqualTypeOf< + "fill" | "strict" | undefined + >(); + + // @ts-expect-error keep is not allowed for object format + const _invalidKeep: CSVRecordAssemblerOptions = { header: ["a", "b"] as const, outputFormat: "object", + columnCountStrategy: "keep", }; - expectTypeOf(opts1.outputFormat).toEqualTypeOf< - "object" | "array" | undefined - >(); - expectTypeOf(opts2.outputFormat).toEqualTypeOf< - "object" | "array" | undefined - >(); + // @ts-expect-error truncate is not allowed for object format + const _invalidTruncate: CSVRecordAssemblerOptions = { + header: ["a", "b"] as const, + outputFormat: "object", + columnCountStrategy: "truncate", + }; + }); + + it("Normal mode allows array and object output formats", () => { + const opts1 = { + header: ["a", "b"] as const, + outputFormat: "array" as const, + }; + + const opts2 = { + header: ["a", "b"] as const, + outputFormat: "object" as const, + }; + + expectTypeOf(opts1.outputFormat).toEqualTypeOf<"array">(); + expectTypeOf(opts2.outputFormat).toEqualTypeOf<"object">(); }); }); }); diff --git a/src/core/types.ts b/src/core/types.ts index 0e756d00..d46a1aec 100644 --- a/src/core/types.ts +++ b/src/core/types.ts @@ -1,10 +1,8 @@ import type { DEFAULT_DELIMITER, DEFAULT_QUOTATION, - Field, - FieldDelimiter, + Delimiter, Newline, - RecordDelimiter, } from "@/core/constants.ts"; /** @@ -89,41 +87,82 @@ export interface TokenLocation { } /** - * Field token type. + * Base token properties shared by all token types. + * + * This is the common structure for unified field tokens. + * The `delimiter` property indicates what delimiter follows this field. + * * @category Types */ -export interface FieldToken { - type: typeof Field; +export interface BaseToken { + /** The field value */ value: string; - location: TokenLocation; + /** What delimiter follows this field */ + delimiter: Delimiter; + /** Length of the delimiter in characters (1 for comma/LF/CR, 2 for CRLF, 0 for EOF) */ + delimiterLength: number; } /** - * Field delimiter token type. + * Token without location tracking. + * + * This is the optimized token format where only field tokens are emitted. + * The `next` property indicates what delimiter follows this field. + * * @category Types */ -export interface FieldDelimiterToken { - type: typeof FieldDelimiter; - value: string; - location: TokenLocation; -} +export interface TokenNoLocation extends BaseToken {} /** - * Record delimiter token type. + * Token with location tracking. + * + * This is the optimized token format where only field tokens are emitted, + * with location information included. + * * @category Types */ -export interface RecordDelimiterToken { - type: typeof RecordDelimiter; - value: string; +export interface TokenWithLocation extends BaseToken { + /** Location information for error reporting */ location: TokenLocation; } /** - * Token is a atomic unit of a CSV file. - * It can be a field, field delimiter, or record delimiter. + * Token type. + * + * This is the optimized token format that reduces token count by 50%. + * Instead of emitting separate Field, FieldDelimiter, and RecordDelimiter tokens, + * only unified field tokens are emitted with the `delimiter` property indicating + * what delimiter follows. + * + * @category Types + * @template TrackLocation - Whether to include location information (default: false) + * + * @example Without location tracking (default, fastest) + * ```ts + * // CSV: "a,b,c\n" + * // Tokens: + * // { value: "a", delimiter: Delimiter.Field, delimiterLength: 1 } + * // { value: "b", delimiter: Delimiter.Field, delimiterLength: 1 } + * // { value: "c", delimiter: Delimiter.Record, delimiterLength: 1 } + * ``` + * + * @example With CRLF + * ```ts + * // CSV: "a,b\r\n" + * // Tokens: + * // { value: "a", delimiter: Delimiter.Field, delimiterLength: 1 } + * // { value: "b", delimiter: Delimiter.Record, delimiterLength: 2 } // CRLF = 2 + * ``` + */ +export type Token = + TrackLocation extends true ? TokenWithLocation : TokenNoLocation; + +/** + * Any token type (with or without location). + * Used for APIs that accept tokens regardless of location tracking. * @category Types */ -export type Token = FieldToken | FieldDelimiterToken | RecordDelimiterToken; +export type AnyToken = Token | Token; /** * AbortSignal Options. @@ -438,6 +477,63 @@ export interface BinaryOptions { allowNonStandardCharsets?: boolean | undefined; } +/** + * Options for enabling location tracking in lexer output. + * @category Types + */ +export interface TrackLocationOption { + /** + * Enable location tracking for tokens. + * + * @remarks + * When enabled, tokens include `location` with `start`, `end` Position objects + * and `rowNumber`. This is useful for error reporting but adds overhead. + * + * **Performance impact**: + * - `false` (default): No location tracking, maximum performance + * - `true`: Full location tracking with Position objects + * + * **When to enable**: + * - Custom error handling that needs line/column information + * - Building source maps or editors + * - Debugging CSV parsing issues + * + * **Note**: High-level APIs (parseString, etc.) always use `trackLocation: false` + * for performance. This option is only available in low-level Lexer APIs. + * + * @default false + * + * @example + * ```ts + * // No location tracking (default, fastest) + * const lexer = new FlexibleStringCSVLexer(); + * for (const token of lexer.lex(csv)) { + * console.log(token); // { type: Field, value: 'foo' } + * } + * + * // With location tracking + * const lexer = new FlexibleStringCSVLexer({ trackLocation: true }); + * for (const token of lexer.lex(csv)) { + * console.log(token); + * // { type: Field, value: 'foo', location: { start: {...}, end: {...}, rowNumber: 1 } } + * } + * ``` + */ + trackLocation?: TrackLocation; +} + +/** + * CSV Lexer Transformer Options. + * @category Types + */ +export interface CSVLexerTransformerOptions< + Delimiter extends string = DEFAULT_DELIMITER, + Quotation extends string = DEFAULT_QUOTATION, + TrackLocation extends boolean = false, +> extends CommonOptions, + TrackLocationOption, + AbortSignalOptions {} + /** * String CSV Lexer Transformer Stream Options. * Options for StringCSVLexerTransformer stream behavior. @@ -451,6 +547,18 @@ export interface StringCSVLexerTransformerStreamOptions backpressureCheckInterval?: number; } +/** + * CSV Lexer Transformer Stream Options. + * Options for CSVLexerTransformer stream behavior. + * @category Types + */ +export interface CSVLexerTransformerStreamOptions extends BackpressureOptions { + /** + * @default 100 + */ + backpressureCheckInterval?: number; +} + /** * Options for creating a StringCSVLexer. * @@ -459,7 +567,9 @@ export interface StringCSVLexerTransformerStreamOptions export interface StringCSVLexerOptions< Delimiter extends string = DEFAULT_DELIMITER, Quotation extends string = DEFAULT_QUOTATION, + TrackLocation extends boolean = false, > extends CommonOptions, + TrackLocationOption, AbortSignalOptions, EngineOptions {} @@ -531,7 +641,7 @@ export interface CSVRecordAssemblerFactoryOptions< * **Normal Mode (header inferred or explicit)**: * - `header: undefined` → infer from first row * - `header: ['col1', ...]` → explicit header - * - Allows any `outputFormat` and `columnCountStrategy` + * - Array output can use any {@link ColumnCountStrategy}; object output supports only `'fill'` or `'strict'` * * @example Type-safe headerless mode * ```ts @@ -588,120 +698,88 @@ export type CSVRecordAssemblerOptions
> = */ includeHeader?: boolean; } - : // Normal mode: flexible configuration - CSVRecordAssemblerBaseOptions & { - /** - * CSV header specification. - * - * @remarks - * **Behavior by value**: - * - `undefined` (default): First row is automatically inferred as the header - * - `['col1', 'col2', ...]`: Explicit header, first row is treated as data - * - * @default undefined (infer from first row) - * - * @example - * ```ts - * // Infer header from first row - * const records = parseStringToArraySync('name,age\nAlice,30', { - * // header: undefined (default) - * }); - * // => [{ name: 'Alice', age: '30' }] - * - * // Explicit header - * const records = parseStringToArraySync('Alice,30\nBob,25', { - * header: ['name', 'age'] - * }); - * // => [{ name: 'Alice', age: '30' }, { name: 'Bob', age: '25' }] - * ``` - */ - header?: Header; - - /** - * Output format for CSV records. - * - * @remarks - * - `'object'` (default): Records are returned as objects with header keys - * - `'array'`: Records are returned as readonly arrays (named tuples when header is provided) - * - * @default 'object' - * - * @example - * ```ts - * // With 'object' format (default) - * { name: 'Alice', age: '30' } - * - * // With 'array' format - * ['Alice', '30'] // Type: readonly [name: string, age: string] - * ``` - */ - outputFormat?: CSVOutputFormat; - - /** - * Include header row as the first element in array output. - * - * @remarks - * Only valid when `outputFormat` is `'array'`. - * When true, the header array will be yielded as the first record. - * - * @default false - * - * @throws {Error} If used with `outputFormat: 'object'` - * - * @example - * ```ts - * // With includeHeader: true and header: ['name', 'age'] - * // First record: ['name', 'age'] - * // Second record: ['Alice', '30'] - * ``` - */ - includeHeader?: boolean; - - /** - * Strategy for handling column count mismatches between header and data rows. - * - * @remarks - * Controls how to handle rows with column counts different from the header. - * See {@link ColumnCountStrategy} for detailed strategy descriptions. - * - * **Strategy overview**: - * - `'keep'`: Keep rows as-is (array format varies length; object format acts like `'pad'`) - * - `'pad'`: Pad short rows with undefined, truncate long rows - * - `'strict'`: Throw error on length mismatch - * - `'truncate'`: Truncate long rows, keep short rows as-is (object format: all keys present) - * - * **Headerless CSV**: - * When `header` is undefined or `[]`, this option is accepted but behaves as `'keep'`. - * For explicit headerless mode (`header: []`), only `'keep'` is allowed at runtime. - * - * @default 'keep' for array format, 'pad' for object format - * - * @example Array format examples - * ```ts - * // Header: ['name', 'age', 'city'] - * // Row: 'Alice,30' - * // outputFormat: 'array' - * - * // columnCountStrategy: 'keep' → ['Alice', '30'] (short row kept) - * // columnCountStrategy: 'pad' → ['Alice', '30', undefined] (padded) - * // columnCountStrategy: 'strict' → Error thrown - * // columnCountStrategy: 'truncate' → ['Alice', '30'] (short row kept) - * ``` - * - * @example Object format examples - * ```ts - * // Header: ['name', 'age', 'city'] - * // Row: 'Alice,30' - * // outputFormat: 'object' - * - * // columnCountStrategy: 'keep' → { name: 'Alice', age: '30', city: undefined } (treated as 'pad') - * // columnCountStrategy: 'pad' → { name: 'Alice', age: '30', city: undefined } (all keys) - * // columnCountStrategy: 'strict' → Error thrown - * // columnCountStrategy: 'truncate' → { name: 'Alice', age: '30', city: undefined } (all keys) - * ``` - */ - columnCountStrategy?: ColumnCountStrategy; - }; + : // Normal mode: flexible configuration (object vs array branches) + | (Omit & { + /** + * CSV header specification. + * + * @remarks + * **Behavior by value**: + * - `undefined` (default): First row is automatically inferred as the header + * - `['col1', 'col2', ...]`: Explicit header, first row is treated as data + * + * @default undefined (infer from first row) + */ + header?: Header; + + /** + * Output format for CSV records. + * + * @remarks + * - `'object'` (default): Records are returned as objects with header keys + * - `'array'`: Records are returned as readonly arrays (named tuples when header is provided) + * + * @default 'object' + * + * @example + * ```ts + * // With 'object' format (default) + * { name: 'Alice', age: '30' } + * + * // With 'array' format + * ['Alice', '30'] // Type: readonly [name: string, age: string] + * ``` + */ + outputFormat?: CSVOutputFormat; + + /** + * Column-count strategy for object output. + * + * @remarks + * - `'fill'` (default): Always emit every header key, padding missing values with empty string. + * - `'strict'`: Enforce exact column counts and throw on mismatch. + */ + columnCountStrategy?: ObjectFormatColumnCountStrategy; + + /** + * `includeHeader` is not supported for object output. + */ + includeHeader?: never; + }) + | (Omit & { + /** + * CSV header specification (required for strategies other than 'fill'/'keep'). + */ + header?: Header; + + /** + * Output format for CSV records. + * + * @remarks + * `'array'` returns records as readonly tuples. Enables `includeHeader`. + */ + outputFormat: "array"; + + /** + * Include header row as the first element in array output. + * + * @default false + */ + includeHeader?: boolean; + + /** + * Column-count strategy for array output. + * + * @remarks + * Choose according to purpose: + * - `'fill'`: Pad with `""` and trim excess columns. + * - `'keep'`: Preserve ragged rows (also required for `header: []`). + * - `'truncate'`: Drop extra columns but leave short rows untouched. + * - `'sparse'`: Pad with `undefined` (requires an explicit header). + * - `'strict'`: Throw on any mismatch. + */ + columnCountStrategy?: ColumnCountStrategy; + }); /** * CSV Record Assembler Transformer Stream Options. @@ -843,7 +921,7 @@ export interface QueuingStrategyConfig { * * @default `{ highWaterMark: 1024 }` (1024 tokens) */ - lexerReadable?: QueuingStrategy; + lexerReadable?: QueuingStrategy; /** * Queuing strategy for the assembler's writable side (token input). @@ -853,7 +931,7 @@ export interface QueuingStrategyConfig { * * @default `{ highWaterMark: 1024 }` (1024 tokens) */ - assemblerWritable?: QueuingStrategy; + assemblerWritable?: QueuingStrategy; /** * Queuing strategy for the assembler's readable side (record output). @@ -1242,15 +1320,15 @@ export type InferFormat = Options extends { outputFormat: infer F } * @category Types * * @remarks - * This type extracts the columnCountStrategy from options and defaults to 'keep' if not specified. + * This type extracts the columnCountStrategy from options and defaults to 'fill' if not specified. */ export type InferStrategy = Options extends { columnCountStrategy: infer S; } ? S extends ColumnCountStrategy ? S - : "keep" - : "keep"; + : "fill" + : "fill"; /** * Helper type to get the CSV record type based on header and options. @@ -1259,7 +1337,7 @@ export type InferStrategy = Options extends { * * @remarks * This type determines the CSVRecord type based on the header, output format, and columnCountStrategy in options. - * For array format with 'pad' strategy, fields are typed as `string | undefined`. + * For array format with 'sparse' strategy, fields are typed as `string | undefined`. */ export type InferCSVRecord< Header extends ReadonlyArray, @@ -1450,28 +1528,25 @@ export interface ParseBinaryOptions< * @category Types * * @remarks - * **Available strategies**: - * - `'keep'`: Output rows as-is with their actual length - * - Array format: Row length varies, no padding or truncation - * - Object format: Treated as `'pad'` (all header keys present, missing values = undefined) - * - `'pad'`: Ensure all rows match header length - * - Array format: Pad short rows with undefined, truncate long rows - * - Object format: All header keys present, missing values = undefined, extra values ignored - * - `'strict'`: Enforce exact header length match - * - Both formats: Throws error if row length ≠ header length - * - `'truncate'`: Handle long rows only, keep short rows as-is - * - Array format: Truncate long rows to header length, keep short rows unchanged - * - Object format: All header keys present (like `'pad'`), missing values = undefined - * - * **Default values**: - * - Array format: `'keep'` - * - Object format: `'pad'` - * - * **Headerless CSV behavior**: - * When no header is provided (`header: undefined` or `header: []`): - * - The strategy option is accepted but effectively behaves as `'keep'` - * - All rows maintain their actual length with no validation - * - For explicit headerless mode (`header: []`), only `'keep'` is allowed at runtime + * **Choose by goal:** + * - `'fill'` (default): Keep a consistent shape by padding missing columns with empty string and trimming extra columns (arrays & objects). + * - `'strict'`: Enforce schema correctness by throwing whenever a row length differs from the header (arrays & objects). + * - `'keep'`: Preserve ragged rows exactly as parsed (array format only; required when `header: []`). + * - `'truncate'`: Drop trailing columns from long rows while leaving short rows untouched (array format only). + * - `'sparse'`: Allow optional columns by padding with `undefined` (array format only, explicit header required so the target width is known). + * + * **Format-specific availability**: + * - *Object output*: only `'fill'` and `'strict'` are valid. Selecting `'keep'`, `'truncate'`, or `'sparse'` results in a runtime/type error. + * - *Array output*: all strategies are available. + * + * **Header requirements**: + * - Headerless mode (`header: []`) mandates `'keep'`. + * - Inferred headers (`header` omitted) permit `'fill'` (default) or `'keep'`; other strategies need a declared header so the target column count is known. + * - `'sparse'`, `'strict'`, and `'truncate'` all require an explicit header. + * + * **Defaults:** + * - Array format → `'fill'` + * - Object format → `'fill'` * * @example Array format examples * @@ -1480,8 +1555,9 @@ export interface ParseBinaryOptions< * // Input row: 'Alice,30' * // outputFormat: 'array' * + * // fill → ['Alice', '30', ''] (padded with empty string) + * // sparse → ['Alice', '30', undefined] (padded with undefined) * // keep → ['Alice', '30'] (short row kept as-is) - * // pad → ['Alice', '30', undefined] (padded to match header) * // strict → Error thrown (length mismatch) * // truncate → ['Alice', '30'] (short row kept as-is, only truncates long rows) * ``` @@ -1493,13 +1569,40 @@ export interface ParseBinaryOptions< * // Input row: 'Alice,30' * // outputFormat: 'object' * - * // keep → { name: 'Alice', age: '30', city: undefined } (treated as 'pad') - * // pad → { name: 'Alice', age: '30', city: undefined } (all keys present) + * // fill → { name: 'Alice', age: '30', city: '' } (all keys present with empty string) * // strict → Error thrown (length mismatch) - * // truncate → { name: 'Alice', age: '30', city: undefined } (all keys present) + * // keep → Error (object format requires 'fill' or 'strict') + * // truncate → Error (object format requires 'fill' or 'strict') + * // sparse → Error (not supported for object format) * ``` */ -export type ColumnCountStrategy = "keep" | "pad" | "strict" | "truncate"; +export type ColumnCountStrategy = + | "fill" + | "sparse" + | "keep" + | "strict" + | "truncate"; + +/** + * Column count strategies allowed for object format. + * + * @category Types + * + * @remarks + * Object format does not support 'sparse' strategy because objects cannot + * have undefined values in a type-safe manner. Likewise, 'keep' and 'truncate' + * would drop keys or change row shape, so object output only allows 'fill' + * (default) or 'strict'. + * + * - `'fill'`: Fill missing fields with empty string (default) + * - `'keep'`: Not allowed (throws). Use array output if you need ragged rows. + * - `'strict'`: Throw error if column count doesn't match header + * - `'truncate'`: Not allowed (throws). Use array output to drop extra fields. + */ +export type ObjectFormatColumnCountStrategy = Extract< + ColumnCountStrategy, + "fill" | "strict" +>; /** * CSV output format type. @@ -1536,11 +1639,17 @@ export type CSVOutputFormat = "object" | "array"; * * @category Types * @template Header Header of the CSV. + * @template Strategy Column count strategy (must not be 'sparse', default: 'fill') * * @remarks * This type represents a single CSV record as an object, * where each key corresponds to a header field and the value is the field's string content. * + * **Important**: Object format does NOT support 'sparse' strategy. + * Using 'sparse' with object format will result in a type error at compile time + * and a runtime error if attempted. + * All strategies for object format produce `string` values (never `undefined`). + * * @example * * ```ts @@ -1551,18 +1660,28 @@ export type CSVOutputFormat = "object" | "array"; * * const record: Record = { foo: "1", bar: "2" }; * ``` + * + * @example Type error for sparse strategy + * + * ```ts + * // This will cause a type error because 'sparse' is not allowed + * type InvalidRecord = CSVObjectRecord<["a", "b"], "sparse">; + * // Error: Type '"sparse"' does not satisfy the constraint 'ObjectFormatColumnCountStrategy' + * ``` */ -export type CSVObjectRecord
> = Record< - Header[number], - string ->; +export type CSVObjectRecord< + Header extends ReadonlyArray, + Strategy extends ObjectFormatColumnCountStrategy = "fill", +> = Strategy extends "sparse" + ? never // This branch is unreachable due to constraint, but provides safety + : Record; /** * CSV record as an array (named tuple format). * * @category Types * @template Header Header of the CSV. - * @template Strategy Column count strategy that affects field types (default: 'keep') + * @template Strategy Column count strategy that affects field types (default: 'fill') * * @remarks * This type represents a single CSV record as a readonly array. @@ -1570,8 +1689,8 @@ export type CSVObjectRecord
> = Record< * Without a header, it's a variable-length readonly string array. * * **Type safety with columnCountStrategy**: - * - `'keep'`, `'strict'`, `'truncate'`: Fields are typed as `string` - * - `'pad'`: Fields are typed as `string | undefined` (missing fields padded with undefined) + * - `'fill'`, `'keep'`, `'strict'`, `'truncate'`: Fields are typed as `string` + * - `'sparse'`: Fields are typed as `string | undefined` (missing fields filled with undefined) * * @example With header (named tuple) * @@ -1586,12 +1705,12 @@ export type CSVObjectRecord
> = Record< * row.length; // 3 (compile-time constant) * ``` * - * @example With pad strategy (allows undefined) + * @example With sparse strategy (allows undefined) * * ```ts * const header = ["name", "age", "city"]; * - * type Row = CSVArrayRecord; + * type Row = CSVArrayRecord; * // readonly [name: string | undefined, age: string | undefined, city: string | undefined] * * const row: Row = ["Alice", "30", undefined]; // Type-safe! @@ -1611,10 +1730,10 @@ export type CSVObjectRecord
> = Record< */ export type CSVArrayRecord< Header extends ReadonlyArray, - Strategy extends ColumnCountStrategy = "keep", + Strategy extends ColumnCountStrategy = "fill", > = Header extends readonly [] ? readonly string[] - : Strategy extends "pad" + : Strategy extends "sparse" ? { readonly [K in keyof Header]: string | undefined } : { readonly [K in keyof Header]: string }; @@ -1624,7 +1743,7 @@ export type CSVArrayRecord< * @category Types * @template Header Header of the CSV. * @template Format Output format: 'object' or 'array' (default: 'object') - * @template Strategy Column count strategy for array format (default: 'keep') + * @template Strategy Column count strategy for array format (default: 'fill') * * @remarks * This type represents a single CSV record, which can be either an object or an array @@ -1634,7 +1753,7 @@ export type CSVArrayRecord< * - When `Format` is `'array'`: Returns {@link CSVArrayRecord} (named tuple) * * For array format, the `Strategy` parameter affects field types: - * - `'pad'`: Fields are typed as `string | undefined` + * - `'sparse'`: Fields are typed as `string | undefined` * - Other strategies: Fields are typed as `string` * * @example Object format (default) @@ -1652,10 +1771,10 @@ export type CSVArrayRecord< * // Type: readonly [foo: string, bar: string] * ``` * - * @example Array format with pad strategy + * @example Array format with sparse strategy * ```ts * const header = ["foo", "bar"]; - * const record: CSVRecord = ["1", undefined]; + * const record: CSVRecord = ["1", undefined]; * // Type: readonly [foo: string | undefined, bar: string | undefined] * ``` */ @@ -1665,7 +1784,12 @@ export type CSVRecord< Strategy extends ColumnCountStrategy = "keep", > = Format extends "array" ? CSVArrayRecord - : CSVObjectRecord
; + : Strategy extends "sparse" + ? never // sparse is not allowed for object format + : CSVObjectRecord< + Header, + Strategy extends ObjectFormatColumnCountStrategy ? Strategy : "fill" + >; /** * Join CSV field array into a CSV-formatted string with proper escaping. @@ -1952,23 +2076,30 @@ export interface CSVLexerLexOptions { * String CSV Lexer interface * * StringCSVLexer tokenizes string CSV data into fields and records. + * + * @template TrackLocation - Whether to include location information in tokens (default: false) */ -export interface StringCSVLexer { +export interface StringCSVLexer { /** * Lexes the given chunk of CSV string data. * @param chunk - The chunk of CSV string data to be lexed. Omit to flush remaining data. * @param options - Lexer options. * @returns An iterable iterator of tokens. */ - lex(chunk?: string, options?: CSVLexerLexOptions): IterableIterator; + lex( + chunk?: string, + options?: CSVLexerLexOptions, + ): IterableIterator>; } /** * Binary CSV Lexer interface * * BinaryCSVLexer tokenizes binary CSV data (Uint8Array) into fields and records. + * + * @template TrackLocation - Whether to include location information in tokens (default: false) */ -export interface BinaryCSVLexer { +export interface BinaryCSVLexer { /** * Lexes the given chunk of CSV binary data. * @param chunk - The chunk of CSV binary data (Uint8Array) to be lexed. Omit to flush remaining data. @@ -1978,7 +2109,7 @@ export interface BinaryCSVLexer { lex( chunk?: Uint8Array, options?: CSVLexerLexOptions, - ): IterableIterator; + ): IterableIterator>; } /** @@ -1991,7 +2122,6 @@ export interface CSVRecordAssemblerAssembleOptions { */ stream?: boolean; } - /** * CSV Object Record Assembler interface. * @@ -2020,7 +2150,7 @@ export interface CSVObjectRecordAssembler< * @returns An iterable iterator of CSV records as objects. */ assemble( - input?: Token | Iterable, + input?: AnyToken | Iterable, options?: CSVRecordAssemblerAssembleOptions, ): IterableIterator>; } @@ -2051,7 +2181,7 @@ export interface CSVArrayRecordAssembler
> { * @returns An iterable iterator of CSV records as arrays/tuples. */ assemble( - input?: Token | Iterable, + input?: AnyToken | Iterable, options?: CSVRecordAssemblerAssembleOptions, ): IterableIterator>; } diff --git a/src/parser/api/binary/parseBinary.browser.spec.ts b/src/parser/api/binary/parseBinary.browser.spec.ts index c5fbfbbf..bd561dad 100644 --- a/src/parser/api/binary/parseBinary.browser.spec.ts +++ b/src/parser/api/binary/parseBinary.browser.spec.ts @@ -63,7 +63,7 @@ describe("parseBinary with execution strategies", () => { async ({ data, csv }) => { let i = 0; for await (const row of parseBinary(csv, { engine })) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/binary/parseBinary.spec.ts b/src/parser/api/binary/parseBinary.spec.ts index c312eb01..60a824d4 100644 --- a/src/parser/api/binary/parseBinary.spec.ts +++ b/src/parser/api/binary/parseBinary.spec.ts @@ -54,7 +54,7 @@ describe("parseBinary function", () => { async ({ data, csv }) => { let i = 0; for await (const row of parseBinary(csv)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/binary/parseBinaryStream.browser.spec.ts b/src/parser/api/binary/parseBinaryStream.browser.spec.ts index d676707d..0b1b3862 100644 --- a/src/parser/api/binary/parseBinaryStream.browser.spec.ts +++ b/src/parser/api/binary/parseBinaryStream.browser.spec.ts @@ -57,7 +57,7 @@ describe("parseBinaryStream with execution strategies", () => { }, }).pipeThrough(new TextEncoderStream()); for await (const row of parseBinaryStream(stream, { engine })) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/binary/parseBinaryStream.spec.ts b/src/parser/api/binary/parseBinaryStream.spec.ts index 237535e7..f50dccba 100644 --- a/src/parser/api/binary/parseBinaryStream.spec.ts +++ b/src/parser/api/binary/parseBinaryStream.spec.ts @@ -53,7 +53,7 @@ describe("parseBinaryStream function", () => { async ({ data, csv }) => { let i = 0; for await (const row of parseBinaryStream(csv)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), @@ -69,7 +69,7 @@ describe("parseBinaryStream function", () => { const expected = [{ a: "1", b: "2", c: "3" }]; let i = 0; for await (const row of parseBinaryStream(csv)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -83,7 +83,7 @@ describe("parseBinaryStream function", () => { const expected = [{ a: "1", b: "2", c: "3" }]; let i = 0; for await (const row of parseBinaryStream(csv)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -148,7 +148,7 @@ describe("parseBinaryStream function", () => { for await (const row of parseBinaryStream(csv, { decompression: decompression, })) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), @@ -168,6 +168,6 @@ test("throws an error if the CSV is invalid", async () => { // Do nothing } }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/binary/parseBinaryStreamToStream.spec.ts b/src/parser/api/binary/parseBinaryStreamToStream.spec.ts index 9860eee2..801c74b8 100644 --- a/src/parser/api/binary/parseBinaryStreamToStream.spec.ts +++ b/src/parser/api/binary/parseBinaryStreamToStream.spec.ts @@ -41,6 +41,6 @@ test("throws an error if the CSV is invalid", async () => { }), ).pipeTo(new WritableStream({ write() {} })); }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/binary/parseBinaryToIterableIterator.test.ts b/src/parser/api/binary/parseBinaryToIterableIterator.test.ts index 948b37aa..b9b9e88c 100644 --- a/src/parser/api/binary/parseBinaryToIterableIterator.test.ts +++ b/src/parser/api/binary/parseBinaryToIterableIterator.test.ts @@ -142,7 +142,7 @@ describe("Integration with FlexibleBinaryObjectCSVParser", () => { expect(results).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Missing field remains undefined + { name: "Bob", age: "" }, // Missing field returns empty string ]); }); }); diff --git a/src/parser/api/file/parseBlob.spec.ts b/src/parser/api/file/parseBlob.spec.ts index d38aa971..0db52f5d 100644 --- a/src/parser/api/file/parseBlob.spec.ts +++ b/src/parser/api/file/parseBlob.spec.ts @@ -54,7 +54,7 @@ describe("parseBlob function", () => { async ({ data, blob }) => { let i = 0; for await (const row of parseBlob(blob)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), @@ -70,7 +70,7 @@ describe("parseBlob function", () => { let i = 0; for await (const row of parseBlob(blob)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -84,7 +84,7 @@ describe("parseBlob function", () => { let i = 0; for await (const row of parseBlob(blob)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -98,7 +98,7 @@ describe("parseBlob function", () => { let i = 0; for await (const row of parseBlob(file)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -111,7 +111,7 @@ describe("parseBlob function", () => { ]; const records = await parseBlob.toArray(blob); - expect(records).toStrictEqual(expected); + expect(records).toEqual(expected); }); describe("source handling", () => { diff --git a/src/parser/api/file/parseFile.spec.ts b/src/parser/api/file/parseFile.spec.ts index a3be8736..544f14a9 100644 --- a/src/parser/api/file/parseFile.spec.ts +++ b/src/parser/api/file/parseFile.spec.ts @@ -12,7 +12,7 @@ describe("parseFile function", () => { let i = 0; for await (const row of parseFile(file)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -25,7 +25,7 @@ describe("parseFile function", () => { ]; const records = await parseFile.toArray(file); - expect(records).toStrictEqual(expected); + expect(records).toEqual(expected); }); describe("automatic source tracking", () => { diff --git a/src/parser/api/file/parseFileToArray.spec.ts b/src/parser/api/file/parseFileToArray.spec.ts index fd5d4ac4..0c5a4f82 100644 --- a/src/parser/api/file/parseFileToArray.spec.ts +++ b/src/parser/api/file/parseFileToArray.spec.ts @@ -8,7 +8,7 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file); - expect(records).toStrictEqual([ + expect(records).toEqual([ { name: "Alice", age: "42" }, { name: "Bob", age: "69" }, ]); @@ -19,7 +19,7 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file); - expect(records).toStrictEqual([]); + expect(records).toEqual([]); }); it("should handle CSV with only headers", async () => { @@ -28,7 +28,7 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file); - expect(records).toStrictEqual([]); + expect(records).toEqual([]); }); it("should respect parsing options", async () => { @@ -39,7 +39,7 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file, { delimiter: "\t" } as any); - expect(records).toStrictEqual([ + expect(records).toEqual([ { name: "Alice", age: "42" }, { name: "Bob", age: "69" }, ]); @@ -88,8 +88,8 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file); expect(records).toHaveLength(1000); - expect(records[0]).toStrictEqual({ name: "User0", age: "20" }); - expect(records[999]).toStrictEqual({ name: "User999", age: "69" }); // 20 + (999 % 50) = 20 + 49 = 69 + expect(records[0]).toEqual({ name: "User0", age: "20" }); + expect(records[999]).toEqual({ name: "User999", age: "69" }); // 20 + (999 % 50) = 20 + 49 = 69 }); it("should handle CSV with quoted fields", async () => { @@ -98,7 +98,7 @@ describe("parseFileToArray", () => { const records = await parseFileToArray(file); - expect(records).toStrictEqual([ + expect(records).toEqual([ { name: "Alice", message: "Hello, World" }, { name: "Bob", message: "Line1\\nLine2" }, ]); diff --git a/src/parser/api/file/parseFileToStream.spec.ts b/src/parser/api/file/parseFileToStream.spec.ts index d1c9ff88..d31dc947 100644 --- a/src/parser/api/file/parseFileToStream.spec.ts +++ b/src/parser/api/file/parseFileToStream.spec.ts @@ -16,7 +16,7 @@ describe("parseFileToStream", () => { result = await reader.read(); } - expect(records).toStrictEqual([ + expect(records).toEqual([ { name: "Alice", age: "42" }, { name: "Bob", age: "69" }, ]); @@ -53,7 +53,7 @@ describe("parseFileToStream", () => { const reader = stream.getReader(); const result = await reader.read(); - expect(result.value).toStrictEqual({ name: "Alice", age: "42" }); + expect(result.value).toEqual({ name: "Alice", age: "42" }); }); it("should support streaming large files", async () => { @@ -146,12 +146,12 @@ describe("parseFileToStream", () => { const reader = stream.getReader(); const record1 = await reader.read(); - expect(record1.value).toStrictEqual({ + expect(record1.value).toEqual({ name: "Alice", message: "Hello, World", }); const record2 = await reader.read(); - expect(record2.value).toStrictEqual({ name: "Bob", message: "Hi there" }); + expect(record2.value).toEqual({ name: "Bob", message: "Hi there" }); }); }); diff --git a/src/parser/api/model/createCSVRecordAssembler.test.ts b/src/parser/api/model/createCSVRecordAssembler.test.ts new file mode 100644 index 00000000..4c3d2568 --- /dev/null +++ b/src/parser/api/model/createCSVRecordAssembler.test.ts @@ -0,0 +1,85 @@ +import { describe, expect, test } from "vitest"; +import { Delimiter } from "@/core/constants.ts"; +import type { AnyToken } from "@/core/types.ts"; +import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; + +describe("createCSVRecordAssembler", () => { + const makeToken = (value: string): AnyToken => ({ + value, + delimiter: Delimiter.Record, + delimiterLength: 0, + }); + + test("returns object assembler by default", () => { + const assembler = createCSVRecordAssembler({ + header: ["name"] as const, + }); + expect(assembler).toHaveProperty("assemble"); + const records = [...assembler.assemble([makeToken("Alice")])]; + expect(records).toEqual([{ name: "Alice" }]); + }); + + test("returns array assembler when outputFormat is 'array'", () => { + const assembler = createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "keep", + }); + const records = [...assembler.assemble([makeToken("Alice")])]; + expect(records).toEqual([["Alice"]]); + }); + + test("throws when headerless mode is not array format", () => { + expect(() => { + createCSVRecordAssembler({ + header: [] as const, + }); + }).toThrow(/outputFormat: 'array'/); + }); + + test("throws when headerless mode uses non-keep strategy", () => { + expect(() => { + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "fill", + }); + }).toThrow(/only supports columnCountStrategy: 'keep'/); + }); + + test("throws when includeHeader is used with non-array format", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name"] as const, + includeHeader: true, + } as any); + }).toThrow(/includeHeader option is only valid for array format/); + }); + + test("throws when object uses sparse strategy", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name"] as const, + columnCountStrategy: "sparse", + }); + }).toThrow(/'sparse' is not allowed for object format/); + }); + + test("throws when object uses keep strategy", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name"] as const, + columnCountStrategy: "keep", + }); + }).toThrow(/'keep' is not allowed for object format/); + }); + + test("throws when object uses truncate strategy", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name"] as const, + columnCountStrategy: "truncate", + }); + }).toThrow(/'truncate' is not allowed for object format/); + }); +}); diff --git a/src/parser/api/model/createCSVRecordAssembler.ts b/src/parser/api/model/createCSVRecordAssembler.ts index d45fa7c5..21f95a14 100644 --- a/src/parser/api/model/createCSVRecordAssembler.ts +++ b/src/parser/api/model/createCSVRecordAssembler.ts @@ -129,6 +129,24 @@ export function createCSVRecordAssembler< throw new Error("includeHeader option is only valid for array format"); } + // Validate that 'sparse' strategy is not used with object format + if (format === "object" && options?.columnCountStrategy) { + const strategy = options.columnCountStrategy; + if (strategy === "sparse") { + throw new Error( + "columnCountStrategy 'sparse' is not allowed for object format. " + + "'sparse' fills missing fields with undefined, which is not compatible with object format. " + + "Use 'fill' (fills with empty string) or outputFormat: 'array' for sparse data.", + ); + } + if (strategy === "keep" || strategy === "truncate") { + throw new Error( + `columnCountStrategy '${strategy}' is not allowed for object format. ` + + "Use 'fill' (default) or 'strict' for object output.", + ); + } + } + if (format === "array") { return new FlexibleCSVArrayRecordAssembler
( (options as any) ?? {}, diff --git a/src/parser/api/model/createStringCSVLexer.ts b/src/parser/api/model/createStringCSVLexer.ts index 5c3a5554..8766a129 100644 --- a/src/parser/api/model/createStringCSVLexer.ts +++ b/src/parser/api/model/createStringCSVLexer.ts @@ -1,5 +1,5 @@ import type { DEFAULT_DELIMITER, DEFAULT_QUOTATION } from "@/core/constants.ts"; -import type { StringCSVLexerOptions } from "@/core/types.ts"; +import type { StringCSVLexer, StringCSVLexerOptions } from "@/core/types.ts"; import { FlexibleStringCSVLexer } from "@/parser/models/FlexibleStringCSVLexer.ts"; // Re-export the lexer class @@ -37,8 +37,11 @@ export { FlexibleStringCSVLexer } from "@/parser/models/FlexibleStringCSVLexer.t export function createStringCSVLexer< Delimiter extends string = DEFAULT_DELIMITER, Quotation extends string = DEFAULT_QUOTATION, + TrackLocation extends boolean = false, >( - options?: StringCSVLexerOptions, -): FlexibleStringCSVLexer { - return new FlexibleStringCSVLexer(options ?? {}); + options?: StringCSVLexerOptions, +): StringCSVLexer { + return new FlexibleStringCSVLexer( + options ?? {}, + ); } diff --git a/src/parser/api/network/parseRequest.spec.ts b/src/parser/api/network/parseRequest.spec.ts index 2522919b..5e86415b 100644 --- a/src/parser/api/network/parseRequest.spec.ts +++ b/src/parser/api/network/parseRequest.spec.ts @@ -91,7 +91,7 @@ describe("parseRequest function", () => { async ({ data, request }) => { let i = 0; for await (const row of parseRequest(request)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), @@ -113,7 +113,7 @@ describe("parseRequest function", () => { let i = 0; for await (const row of parseRequest(request)) { - expect(row).toStrictEqual(expected[i++]); + expect(row).toEqual(expected[i++]); } }); @@ -132,6 +132,6 @@ describe("parseRequest function", () => { ]; const records = await parseRequest.toArray(request); - expect(records).toStrictEqual(expected); + expect(records).toEqual(expected); }); }); diff --git a/src/parser/api/network/parseResponse.browser.spec.ts b/src/parser/api/network/parseResponse.browser.spec.ts index d9a054dd..12ed0af5 100644 --- a/src/parser/api/network/parseResponse.browser.spec.ts +++ b/src/parser/api/network/parseResponse.browser.spec.ts @@ -71,7 +71,7 @@ describe("parseResponse with execution strategies", () => { let i = 0; // parseResponse returns AsyncIterableIterator directly, not Promise for await (const row of parseResponse(response, { engine })) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/network/parseResponse.spec.ts b/src/parser/api/network/parseResponse.spec.ts index 634526a8..637b2ce4 100644 --- a/src/parser/api/network/parseResponse.spec.ts +++ b/src/parser/api/network/parseResponse.spec.ts @@ -4,7 +4,7 @@ import { FC } from "@/__tests__/helper.ts"; import { parseResponse } from "@/parser/api/network/parseResponse.ts"; import { escapeField } from "@/utils/serialization/escapeField.ts"; -describe("parseRequest function", () => { +describe("parseResponse function", () => { it("should throw error if content-type header is not text/csv", async () => { const response = new Response("", { headers: { @@ -87,7 +87,7 @@ describe("parseRequest function", () => { async ({ data, response }) => { let i = 0; for await (const row of parseResponse(response)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/network/parseResponseToStream.spec.ts b/src/parser/api/network/parseResponseToStream.spec.ts index 222bb55b..b5e6a04e 100644 --- a/src/parser/api/network/parseResponseToStream.spec.ts +++ b/src/parser/api/network/parseResponseToStream.spec.ts @@ -97,7 +97,7 @@ describe("parseResponseToStream", () => { async ({ data, response }) => { let i = 0; for await (const row of parseResponse(response)) { - expect(data[i++]).toStrictEqual(row); + expect(data[i++]).toEqual(row); } }, ), diff --git a/src/parser/api/parse.spec.ts b/src/parser/api/parse.spec.ts index d8d9fbbd..e4400eba 100644 --- a/src/parser/api/parse.spec.ts +++ b/src/parser/api/parse.spec.ts @@ -62,6 +62,7 @@ describe("parse function", () => { { csv: "a,b,c\n\n1,2,3", data: [ + // Empty line only has 1 empty field, so b and c are missing (empty strings) { a: "", b: "", c: "" }, { a: "1", b: "2", c: "3" }, ], diff --git a/src/parser/api/stream/createCSVRecordAssemblerTransformer.spec.ts b/src/parser/api/stream/createCSVRecordAssemblerTransformer.spec.ts index 7cb69459..86ffd27b 100644 --- a/src/parser/api/stream/createCSVRecordAssemblerTransformer.spec.ts +++ b/src/parser/api/stream/createCSVRecordAssemblerTransformer.spec.ts @@ -1,8 +1,8 @@ import fc from "fast-check"; import { describe as describe_, expect, it as it_ } from "vitest"; import { FC, transform } from "@/__tests__/helper.ts"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; -import type { Token } from "@/core/types.ts"; +import { Delimiter } from "@/core/constants.ts"; +import type { AnyToken } from "@/core/types.ts"; import { createCSVRecordAssemblerTransformer } from "@/parser/api/stream/createCSVRecordAssemblerTransformer.ts"; import { CSVRecordAssemblerTransformer } from "@/parser/stream/CSVRecordAssemblerTransformer.ts"; @@ -33,15 +33,12 @@ describe("createCSVRecordAssemblerTransformer", () => { it("should create transformer with default options", async () => { const transformer = createCSVRecordAssemblerTransformer(); - const tokens: Token[] = [ - { type: Field, value: "name", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "age", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, - { type: Field, value: "Alice", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "20", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, + // New unified token format: each field token includes what follows (delimiter info) + const tokens: AnyToken[] = [ + { value: "name", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "age", delimiter: Delimiter.Record, delimiterLength: 1 }, + { value: "Alice", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "20", delimiter: Delimiter.Record, delimiterLength: 1 }, ]; const records = await transform(transformer, tokens); @@ -53,15 +50,11 @@ describe("createCSVRecordAssemblerTransformer", () => { header: ["name", "age"] as const, }); - const tokens: Token[] = [ - { type: Field, value: "Alice", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "20", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, - { type: Field, value: "Bob", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "25", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, + const tokens: AnyToken[] = [ + { value: "Alice", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "20", delimiter: Delimiter.Record, delimiterLength: 1 }, + { value: "Bob", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "25", delimiter: Delimiter.Record, delimiterLength: 1 }, ]; const records = await transform(transformer, tokens); @@ -76,15 +69,11 @@ describe("createCSVRecordAssemblerTransformer", () => { outputFormat: "array", }); - const tokens: Token[] = [ - { type: Field, value: "name", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "age", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, - { type: Field, value: "Alice", location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - { type: Field, value: "20", location: LOCATION_SHAPE }, - { type: RecordDelimiter, value: "\n", location: LOCATION_SHAPE }, + const tokens: AnyToken[] = [ + { value: "name", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "age", delimiter: Delimiter.Record, delimiterLength: 1 }, + { value: "Alice", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "20", delimiter: Delimiter.Record, delimiterLength: 1 }, ]; const records = await transform(transformer, tokens); @@ -118,38 +107,25 @@ describe("createCSVRecordAssemblerTransformer", () => { maxLength: header.length, }, }); - const tokens: Token[] = [ + // New unified token format + const tokens: AnyToken[] = [ // generate header tokens - ...header.flatMap((field, i) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - i === header.length - 1 - ? { - type: RecordDelimiter, - value: "\n", - location: LOCATION_SHAPE, - } - : { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), + ...header.map((field, i) => ({ + value: field, + delimiter: + i === header.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), // generate rows tokens ...rows.flatMap((row) => - row.flatMap((field, j) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - j === row.length - 1 - ? { - type: RecordDelimiter, - value: "\n", - location: LOCATION_SHAPE, - } - : { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), + row.map((field, j) => ({ + value: field, + delimiter: + j === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), ), ]; const expected = rows.map((row) => diff --git a/src/parser/api/stream/createStringCSVLexerTransformer.spec.ts b/src/parser/api/stream/createStringCSVLexerTransformer.spec.ts index b36c3433..006b676e 100644 --- a/src/parser/api/stream/createStringCSVLexerTransformer.spec.ts +++ b/src/parser/api/stream/createStringCSVLexerTransformer.spec.ts @@ -1,28 +1,11 @@ import fc from "fast-check"; -import { describe as describe_, expect, it as it_ } from "vitest"; +import { describe, expect, it } from "vitest"; import { autoChunk, FC, transform } from "@/__tests__/helper.ts"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; +import { Delimiter } from "@/core/constants.ts"; import { createStringCSVLexerTransformer } from "@/parser/api/stream/createStringCSVLexerTransformer.ts"; import { StringCSVLexerTransformer } from "@/parser/stream/StringCSVLexerTransformer.ts"; import { escapeField } from "@/utils/serialization/escapeField.ts"; -const describe = describe_.concurrent; -const it = it_.concurrent; - -const LOCATION_SHAPE = { - start: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - end: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - rowNumber: expect.any(Number), -}; - describe("createStringCSVLexerTransformer", () => { it("should return a StringCSVLexerTransformer instance", () => { const transformer = createStringCSVLexerTransformer(); @@ -35,12 +18,12 @@ describe("createStringCSVLexerTransformer", () => { const chunks = ["name,age\r\n", "Alice,20\r\n"]; const tokens = await transform(transformer, chunks); - const flat = tokens.flat(); - - // Should have fields and delimiters - expect(flat.some((t) => t.type === Field)).toBe(true); - expect(flat.some((t) => t.type === FieldDelimiter)).toBe(true); - expect(flat.some((t) => t.type === RecordDelimiter)).toBe(true); + expect(tokens).toEqual([ + { value: "name", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "age", delimiter: Delimiter.Record, delimiterLength: 2 }, + { value: "Alice", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "20", delimiter: Delimiter.Record, delimiterLength: 2 }, + ]); }); it("should create transformer with custom delimiter", async () => { @@ -48,12 +31,12 @@ describe("createStringCSVLexerTransformer", () => { const chunks = ["name\tage\r\n", "Alice\t20\r\n"]; const tokens = await transform(transformer, chunks); - const flat = tokens.flat(); - - // Field delimiter should be tab - const fieldDelimiters = flat.filter((t) => t.type === FieldDelimiter); - expect(fieldDelimiters.length).toBeGreaterThan(0); - expect(fieldDelimiters[0]?.value).toBe("\t"); + expect(tokens).toEqual([ + { value: "name", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "age", delimiter: Delimiter.Record, delimiterLength: 2 }, + { value: "Alice", delimiter: Delimiter.Field, delimiterLength: 1 }, + { value: "20", delimiter: Delimiter.Record, delimiterLength: 2 }, + ]); }); it("should create transformer with custom quotation", async () => { @@ -63,10 +46,10 @@ describe("createStringCSVLexerTransformer", () => { const tokens = await transform(transformer, chunks); const flat = tokens.flat(); - // Fields should be extracted correctly - const fields = flat.filter((t) => t.type === Field); - expect(fields.map((f) => f.value)).toContain("name"); - expect(fields.map((f) => f.value)).toContain("age"); + // Fields should be extracted correctly (value property) + const values = flat.map((f) => f.value); + expect(values).toContain("name"); + expect(values).toContain("age"); }); it("should separate fields by commas by default", async () => { @@ -75,34 +58,43 @@ describe("createStringCSVLexerTransformer", () => { fc.gen().map((g) => { const row = g(FC.row); const quote = g(FC.quote); - const chunks = autoChunk( - g, - row.map((v) => escapeField(v, { quote })).join(","), - ); - const expected = [ - ...row.flatMap((value, index) => [ - ...(quote || value - ? [{ type: Field, value, location: LOCATION_SHAPE }] - : []), - ...(index === row.length - 1 - ? [] - : [ - { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), - ]), - ]; + const csv = row.map((v) => escapeField(v, { quote })).join(","); + const chunks = csv.length === 0 ? [""] : autoChunk(g, csv); + const ambiguousSingleEmpty = + row.length === 1 && + row[0] === "" && + escapeField(row[0]!, { quote }) === row[0]; + // New unified format: each field token includes delimiter info + const expected = row.map((value, index) => ({ + value, + delimiter: + index === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); + if (ambiguousSingleEmpty) { + expected.pop(); + } return { row, chunks, expected }; }), async ({ chunks, expected }) => { const transformer = createStringCSVLexerTransformer(); - const actual = (await transform(transformer, chunks)).flat(); + const actual = await transform(transformer, chunks); expect(actual).toMatchObject(expected); }, ), + { + // examples: [ + // { + // row: ["name", "age"], + // chunks: ["name,age\r\n"], + // expected: [ + // { value: "name", delimiter: Delimiter.Field, delimiterLength: 1 }, + // { value: "age", delimiter: Delimiter.Record, delimiterLength: 2 }, + // ], + // }, + // { row: [""], chunks: [""], expected: [{ value: "", delimiter: 1, delimiterLength: 0 }] }, + // ], + }, ); }); @@ -128,31 +120,30 @@ describe("createStringCSVLexerTransformer", () => { ) .join(eol) + (EOF ? eol : ""); const chunks = autoChunk(g, csv); - const expected = [ - ...data.flatMap((row, i) => [ - ...row.flatMap((value, j) => [ - ...(quote || value !== "" ? [{ type: Field, value }] : []), - ...(row.length - 1 !== j - ? [ - { - type: FieldDelimiter, - value: options.delimiter, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ...(data.length - 1 !== i - ? [ - { - type: RecordDelimiter, - value: eol, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + // New unified format + const expected = data.flatMap((row, i) => + row.map((value, j) => { + const isLastField = j === row.length - 1; + const isLastRow = i === data.length - 1; + let delimiter: Delimiter; + let delimiterLength: number; + + if (isLastField) { + if (isLastRow && !EOF) { + delimiter = Delimiter.Record; + delimiterLength = 0; + } else { + delimiter = Delimiter.Record; + delimiterLength = eol.length; + } + } else { + delimiter = Delimiter.Field; + delimiterLength = options.delimiter.length; + } + + return { value, delimiter, delimiterLength }; + }), + ); return { options, chunks, expected }; }), async ({ options, chunks, expected }) => { diff --git a/src/parser/api/string/parseString.spec.ts b/src/parser/api/string/parseString.spec.ts index 72e683bf..3b4fc833 100644 --- a/src/parser/api/string/parseString.spec.ts +++ b/src/parser/api/string/parseString.spec.ts @@ -49,7 +49,7 @@ describe("parseString function", () => { // Do nothing. } }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/string/parseStringStream.spec.ts b/src/parser/api/string/parseStringStream.spec.ts index 8bf70762..23c04deb 100644 --- a/src/parser/api/string/parseStringStream.spec.ts +++ b/src/parser/api/string/parseStringStream.spec.ts @@ -65,6 +65,6 @@ test("throws an error if invalid input", async () => { // Do nothing } }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/string/parseStringStreamToStream.spec.ts b/src/parser/api/string/parseStringStreamToStream.spec.ts index 616258cc..a64b6d22 100644 --- a/src/parser/api/string/parseStringStreamToStream.spec.ts +++ b/src/parser/api/string/parseStringStreamToStream.spec.ts @@ -45,6 +45,6 @@ test("throws an error if the CSV is invalid", async () => { }), ); }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/string/parseStringToArraySync.spec.ts b/src/parser/api/string/parseStringToArraySync.spec.ts index 5ba9f903..3a1bc66c 100644 --- a/src/parser/api/string/parseStringToArraySync.spec.ts +++ b/src/parser/api/string/parseStringToArraySync.spec.ts @@ -18,6 +18,6 @@ test("throws an error if the CSV is invalid", () => { expect(() => parseStringToArraySync('a\n"'), ).toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/string/parseStringToIterableIterator.spec.ts b/src/parser/api/string/parseStringToIterableIterator.spec.ts index 68ce3d33..72d49394 100644 --- a/src/parser/api/string/parseStringToIterableIterator.spec.ts +++ b/src/parser/api/string/parseStringToIterableIterator.spec.ts @@ -23,6 +23,6 @@ test("throws an error if the CSV is invalid", () => { // Do nothing } }).toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/api/string/parseStringToStream.spec.ts b/src/parser/api/string/parseStringToStream.spec.ts index 0750e25b..4c4d4a4c 100644 --- a/src/parser/api/string/parseStringToStream.spec.ts +++ b/src/parser/api/string/parseStringToStream.spec.ts @@ -31,6 +31,6 @@ test("throws an error if the CSV is invalid", async () => { }), ); }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 2, column 1.]`, ); }); diff --git a/src/parser/models/FlexibleBinaryArrayCSVParser.ts b/src/parser/models/FlexibleBinaryArrayCSVParser.ts index c80b40bd..20b4eb2e 100644 --- a/src/parser/models/FlexibleBinaryArrayCSVParser.ts +++ b/src/parser/models/FlexibleBinaryArrayCSVParser.ts @@ -36,7 +36,7 @@ import { BaseBinaryCSVParser } from "@/parser/models/base/BaseBinaryCSVParser.ts * ``` */ export class FlexibleBinaryArrayCSVParser< - Header extends ReadonlyArray = readonly string[], + const Header extends ReadonlyArray = readonly string[], > extends BaseBinaryCSVParser implements BinaryArrayCSVParser
diff --git a/src/parser/models/FlexibleBinaryCSVParser.test.ts b/src/parser/models/FlexibleBinaryCSVParser.test.ts index 81c2dbc8..a7572cba 100644 --- a/src/parser/models/FlexibleBinaryCSVParser.test.ts +++ b/src/parser/models/FlexibleBinaryCSVParser.test.ts @@ -216,16 +216,16 @@ describe("FlexibleBinaryCSVParser (Object and Array)", () => { ]); }); - test("should preserve undefined for missing fields in array format (with pad strategy)", () => { - const parserWithPad = new FlexibleBinaryArrayCSVParser({ + test("should preserve undefined for missing fields in array format (with sparse strategy)", () => { + const parserWithSparse = new FlexibleBinaryArrayCSVParser({ header: ["name", "age", "city"] as const, - columnCountStrategy: "pad", + columnCountStrategy: "sparse", charset: "utf-8", }); const records = Array.from( - parserWithPad.parse(encoder.encode("Alice,30\nBob")), + parserWithSparse.parse(encoder.encode("Alice,30\nBob")), ); expect(records).toEqual([ @@ -268,7 +268,7 @@ describe("FlexibleBinaryCSVParser (Object and Array)", () => { const records = Array.from(parser.parse(encoder.encode("Alice,30\nBob"))); expect(records).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Missing field remains undefined + { name: "Bob", age: "" }, // Missing field filled with "" (fill strategy default) ]); }); @@ -396,10 +396,10 @@ describe("FlexibleBinaryCSVParser (Object and Array)", () => { }); describe("Column count strategy", () => { - test("should pad short rows with undefined in object format", () => { + test("should fill short rows with empty string in object format", () => { const parser = new FlexibleBinaryObjectCSVParser({ header: ["name", "age", "city"] as const, - columnCountStrategy: "pad", + columnCountStrategy: "fill", charset: "utf-8", }); @@ -407,7 +407,7 @@ describe("FlexibleBinaryCSVParser (Object and Array)", () => { parser.parse(encoder.encode("Alice,30\nBob,25,NYC")), ); expect(records).toEqual([ - { name: "Alice", age: "30", city: undefined }, + { name: "Alice", age: "30", city: "" }, { name: "Bob", age: "25", city: "NYC" }, ]); }); @@ -424,20 +424,16 @@ describe("FlexibleBinaryCSVParser (Object and Array)", () => { ).toThrow(); }); - test("should truncate long rows with 'truncate' strategy", () => { - const parser = new FlexibleBinaryObjectCSVParser({ - header: ["name", "age"] as const, - columnCountStrategy: "truncate", - charset: "utf-8", - }); - - const records = Array.from( - parser.parse(encoder.encode("Alice,30,extra\nBob,25")), + test("should reject 'truncate' strategy for object output", () => { + expect(() => { + new FlexibleBinaryObjectCSVParser({ + header: ["name", "age"] as const, + columnCountStrategy: "truncate", + charset: "utf-8", + }); + }).toThrow( + /columnCountStrategy 'truncate' is not allowed for object format/, ); - expect(records).toEqual([ - { name: "Alice", age: "30" }, - { name: "Bob", age: "25" }, - ]); }); }); diff --git a/src/parser/models/FlexibleBinaryObjectCSVParser.ts b/src/parser/models/FlexibleBinaryObjectCSVParser.ts index 8701b3f6..e0d81abc 100644 --- a/src/parser/models/FlexibleBinaryObjectCSVParser.ts +++ b/src/parser/models/FlexibleBinaryObjectCSVParser.ts @@ -36,7 +36,7 @@ import { BaseBinaryCSVParser } from "@/parser/models/base/BaseBinaryCSVParser.ts * ``` */ export class FlexibleBinaryObjectCSVParser< - Header extends ReadonlyArray = readonly string[], + const Header extends ReadonlyArray = readonly string[], > extends BaseBinaryCSVParser implements BinaryObjectCSVParser
diff --git a/src/parser/models/FlexibleCSVArrayRecordAssembler.ts b/src/parser/models/FlexibleCSVArrayRecordAssembler.ts index cef8fe12..b86cbfb9 100644 --- a/src/parser/models/FlexibleCSVArrayRecordAssembler.ts +++ b/src/parser/models/FlexibleCSVArrayRecordAssembler.ts @@ -1,38 +1,33 @@ import { DEFAULT_ASSEMBLER_MAX_FIELD_COUNT, - FieldDelimiter, - RecordDelimiter, + Delimiter, } from "@/core/constants.ts"; import { ParseError } from "@/core/errors.ts"; import type { + AnyToken, ColumnCountStrategy, CSVArrayRecord, - CSVArrayRecordAssembler, CSVRecordAssemblerAssembleOptions, CSVRecordAssemblerCommonOptions, - Token, } from "@/core/types.ts"; /** * Flexible CSV Array Record Assembler implementation. * - * A balanced implementation that assembles tokens into CSV records as arrays, - * optimizing for both performance and memory efficiency. + * An optimized assembler that works with unified field tokens. + * No switch statement needed - simply processes each field and checks + * the `delimiter` property to determine when a record is complete. * * @remarks - * This implementation is designed to handle various CSV formats flexibly - * while maintaining good performance characteristics. For specialized use cases, - * future implementations may provide optimizations for specific scenarios - * (e.g., speed-optimized, memory-optimized). + * This implementation provides better performance by eliminating + * the token type switch statement and reducing token iteration count by 50%. */ export class FlexibleCSVArrayRecordAssembler< - Header extends ReadonlyArray, -> implements CSVArrayRecordAssembler
-{ + const Header extends ReadonlyArray, +> { #fieldIndex = 0; #row: string[] = []; #header: Header | undefined; - #dirty = false; #signal?: AbortSignal | undefined; #maxFieldCount: number; #skipEmptyLines: boolean; @@ -40,19 +35,26 @@ export class FlexibleCSVArrayRecordAssembler< #source?: string | undefined; #includeHeader: boolean; #columnCountStrategy: ColumnCountStrategy; - #headerIncluded = false; // Track if header has been included in output + #headerIncluded = false; + + #assembleRecordFn: + | ((rowLength: number) => CSVArrayRecord
) + | undefined; + #headerLength = 0; + #hasContent = false; + #emptyRecordTemplate: string[] | undefined; + #rowLength = 0; constructor(options: CSVRecordAssemblerCommonOptions
= {}) { - // Validate includeHeader option this.#includeHeader = options.includeHeader ?? false; - // Validate headerless mode (header: []) - if ( + // Detect headerless mode (header: []) + const isHeaderlessMode = options.header !== undefined && Array.isArray(options.header) && - options.header.length === 0 - ) { - // Headerless mode: only 'keep' strategy is allowed + options.header.length === 0; + + if (isHeaderlessMode) { if ( options.columnCountStrategy !== undefined && options.columnCountStrategy !== "keep" @@ -65,17 +67,21 @@ export class FlexibleCSVArrayRecordAssembler< } } - // Validate and set columnCountStrategy - this.#columnCountStrategy = options.columnCountStrategy ?? "keep"; - if (this.#columnCountStrategy !== "keep" && options.header === undefined) { + // Default to "keep" for headerless mode, "fill" otherwise + this.#columnCountStrategy = + options.columnCountStrategy ?? (isHeaderlessMode ? "keep" : "fill"); + if ( + this.#columnCountStrategy !== "keep" && + this.#columnCountStrategy !== "fill" && + options.header === undefined + ) { throw new Error( `columnCountStrategy '${this.#columnCountStrategy}' requires header option. ` + - `Use 'keep' or omit columnCountStrategy for headerless CSV.`, + `Use 'keep', 'fill', or omit columnCountStrategy for headerless CSV.`, ); } const mfc = options.maxFieldCount ?? DEFAULT_ASSEMBLER_MAX_FIELD_COUNT; - // Validate maxFieldCount if ( !(Number.isFinite(mfc) || mfc === Number.POSITIVE_INFINITY) || (Number.isFinite(mfc) && (mfc < 1 || !Number.isInteger(mfc))) @@ -102,22 +108,19 @@ export class FlexibleCSVArrayRecordAssembler< * @returns An iterable iterator of CSV records. */ public *assemble( - input?: Token | Iterable, + input?: AnyToken | Iterable, options?: CSVRecordAssemblerAssembleOptions, ): IterableIterator> { const stream = options?.stream ?? false; - // Yield header if includeHeader is enabled (before processing any records) yield* this.#maybeYieldHeader(); if (input !== undefined) { - // Check if input is iterable (has Symbol.iterator) if (this.#isIterable(input)) { for (const token of input) { yield* this.#processToken(token); } } else { - // Single token yield* this.#processToken(input); } } @@ -127,80 +130,63 @@ export class FlexibleCSVArrayRecordAssembler< } } - /** - * Checks if a value is iterable. - */ - #isIterable(value: any): value is Iterable { + #isIterable(value: any): value is Iterable { return value != null && typeof value[Symbol.iterator] === "function"; } /** - * Processes a single token and yields a record if one is completed. + * Processes a single token. + * No switch needed - always a field, just check what follows. */ - *#processToken(token: Token): IterableIterator> { + *#processToken(token: AnyToken): IterableIterator> { this.#signal?.throwIfAborted(); - // Track the current record number for error reporting - if (token.location) { + // Track row number for error reporting + if ("location" in token && token.location) { this.#currentRowNumber = token.location.rowNumber; } - switch (token.type) { - case FieldDelimiter: - // Set empty string for empty fields - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } + // Store the field value and track if row has content + const value = token.value; + this.#row[this.#fieldIndex] = value; + if (value !== "") { + this.#hasContent = true; + } + this.#rowLength = Math.max(this.#rowLength, this.#fieldIndex + 1); + + // Check what follows this field + switch (token.delimiter) { + case Delimiter.Field: { this.#fieldIndex++; this.#checkFieldCount(); - this.#dirty = true; + this.#rowLength = Math.max(this.#rowLength, this.#fieldIndex + 1); break; - case RecordDelimiter: - // Set empty string for the last field if empty - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } + } + default: { + // End of record - yield assembled record + const rowLength = this.#rowLength || this.#fieldIndex + 1 || 0; if (this.#header === undefined) { - this.#setHeader(this.#row as unknown as Header); - // Yield header if includeHeader is enabled after header inference + const headerRow = this.#takeRow(rowLength); + this.#setHeader(headerRow as unknown as Header); yield* this.#maybeYieldHeader(); } else { - if (this.#dirty) { - yield this.#assembleRecord(); + // Check if row has any non-empty content (tracked incrementally) + if (this.#hasContent) { + yield this.#assembleRecord(rowLength); + } else if (!this.#skipEmptyLines) { + this.#resetRowBuilderState(); + yield this.#createEmptyRecord(); } else { - if (!this.#skipEmptyLines) { - // For empty lines, generate empty record - yield new Array(this.#header.length).fill( - "", - ) as unknown as CSVArrayRecord
; - } + this.#resetRowBuilderState(); } } - // Reset the row fields buffer. - this.#fieldIndex = 0; - this.#row = []; - this.#dirty = false; - break; - default: - this.#dirty = true; - this.#row[this.#fieldIndex] = token.value; break; + } } } - /** - * Flushes any remaining buffered data as a final record. - */ *#flush(): IterableIterator> { - if (this.#header !== undefined) { - if (this.#dirty) { - // Set empty string for the last field if empty - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } - yield this.#assembleRecord(); - } - } + // Nothing to flush - unified tokens always complete records } #checkFieldCount(): void { @@ -222,8 +208,6 @@ export class FlexibleCSVArrayRecordAssembler< ); } this.#header = header; - // Allow empty header for headerless mode (all rows are data) - // Only validate duplicates when header is non-empty if ( this.#header.length > 0 && new Set(this.#header).size !== this.#header.length @@ -232,76 +216,152 @@ export class FlexibleCSVArrayRecordAssembler< source: this.#source, }); } + + this.#headerLength = header.length; + this.#emptyRecordTemplate = undefined; + switch (this.#columnCountStrategy) { + case "fill": + this.#assembleRecordFn = this.#assembleRecordFill; + break; + case "sparse": + this.#assembleRecordFn = this.#assembleRecordSparse; + break; + case "keep": + this.#assembleRecordFn = this.#assembleRecordKeep; + break; + case "strict": + this.#assembleRecordFn = this.#assembleRecordStrict; + break; + case "truncate": + this.#assembleRecordFn = this.#assembleRecordTruncate; + break; + default: + this.#assembleRecordFn = this.#assembleRecordFill; + break; + } + this.#row = this.#allocateRowBuffer(this.#headerLength); + this.#rowLength = 0; } - /** - * Assembles a record in array format. - * Applies column count strategy if header is defined. - */ - #assembleRecord(): CSVArrayRecord
{ - if (!this.#header) { - // Headerless: return row as-is - return [...this.#row] as unknown as CSVArrayRecord
; + #assembleRecord(rowLength: number): CSVArrayRecord
{ + if (!this.#assembleRecordFn) { + return this.#takeRow(rowLength) as unknown as CSVArrayRecord
; } + return this.#assembleRecordFn(rowLength); + } - // Apply column count strategy - const headerLength = this.#header.length; - const rowLength = this.#row.length; + #assembleRecordKeep = (rowLength: number): CSVArrayRecord
=> { + const record = this.#takeRow(rowLength); + record.length = rowLength; + return record as unknown as CSVArrayRecord
; + }; - switch (this.#columnCountStrategy) { - case "keep": - // Return row as-is - return [...this.#row] as unknown as CSVArrayRecord
; - - case "pad": - // Pad short rows with undefined, truncate long rows - if (rowLength < headerLength) { - const padded = [...this.#row]; - while (padded.length < headerLength) { - padded.push(undefined as unknown as string); - } - return padded as unknown as CSVArrayRecord
; - } else if (rowLength > headerLength) { - return this.#row.slice( - 0, - headerLength, - ) as unknown as CSVArrayRecord
; - } - return [...this.#row] as unknown as CSVArrayRecord
; + #assembleRecordFill = (rowLength: number): CSVArrayRecord
=> { + const headerLength = this.#headerLength; + const record = this.#takeRow(rowLength); - case "strict": - // Throw error if length doesn't match - if (rowLength !== headerLength) { - throw new ParseError( - `Expected ${headerLength} columns, got ${rowLength}${ - this.#currentRowNumber ? ` at row ${this.#currentRowNumber}` : "" - }${this.#source ? ` in ${JSON.stringify(this.#source)}` : ""}`, - { - source: this.#source, - }, - ); - } - return [...this.#row] as unknown as CSVArrayRecord
; + if (rowLength < headerLength) { + for (let i = rowLength; i < headerLength; i++) { + record[i] = ""; + } + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + } else if (rowLength > headerLength) { + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + } + record.length = rowLength; + return record as unknown as CSVArrayRecord
; + }; - case "truncate": - // Truncate long rows, keep short rows as-is - if (rowLength > headerLength) { - return this.#row.slice( - 0, - headerLength, - ) as unknown as CSVArrayRecord
; - } - return [...this.#row] as unknown as CSVArrayRecord
; + #assembleRecordSparse = (rowLength: number): CSVArrayRecord
=> { + const headerLength = this.#headerLength; + const record = this.#takeRow(rowLength); - default: - // Should never reach here due to validation - return [...this.#row] as unknown as CSVArrayRecord
; + if (rowLength < headerLength) { + for (let i = rowLength; i < headerLength; i++) { + record[i] = undefined as unknown as string; + } + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + } else if (rowLength > headerLength) { + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + } + record.length = rowLength; + return record as unknown as CSVArrayRecord
; + }; + + #assembleRecordStrict = (rowLength: number): CSVArrayRecord
=> { + const headerLength = this.#headerLength; + + if (rowLength !== headerLength) { + this.#takeRow(rowLength); + throw new ParseError( + `Expected ${headerLength} columns, got ${rowLength}${ + this.#currentRowNumber ? ` at row ${this.#currentRowNumber}` : "" + }${this.#source ? ` in ${JSON.stringify(this.#source)}` : ""}`, + { + source: this.#source, + }, + ); } + const record = this.#takeRow(rowLength); + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + }; + + #assembleRecordTruncate = (rowLength: number): CSVArrayRecord
=> { + const headerLength = this.#headerLength; + const record = this.#takeRow(rowLength); + + if (rowLength > headerLength) { + record.length = headerLength; + return record as unknown as CSVArrayRecord
; + } + record.length = rowLength; + return record as unknown as CSVArrayRecord
; + }; + + #takeRow(rowLength: number): string[] { + const record = this.#row; + record.length = rowLength; + const capacityHint = + this.#headerLength > 0 ? this.#headerLength : Math.max(rowLength, 4); + this.#row = this.#allocateRowBuffer(capacityHint); + this.#resetRowBuilderState(true); + return record; + } + + #resetRowBuilderState(replacedBuffer = false): void { + this.#fieldIndex = 0; + this.#rowLength = 0; + this.#hasContent = false; + if (!replacedBuffer) { + this.#row.length = 0; + } + } + + #allocateRowBuffer(capacityHint: number): string[] { + if (capacityHint <= 0) { + return []; + } + return new Array(capacityHint); + } + + #createEmptyRecord(): CSVArrayRecord
{ + if (this.#headerLength === 0) { + return [] as unknown as CSVArrayRecord
; + } + if ( + this.#emptyRecordTemplate === undefined || + this.#emptyRecordTemplate.length !== this.#headerLength + ) { + this.#emptyRecordTemplate = new Array(this.#headerLength).fill(""); + } + return this.#emptyRecordTemplate.slice() as unknown as CSVArrayRecord
; } - /** - * Yields the header row if includeHeader is enabled and header hasn't been included yet. - */ *#maybeYieldHeader(): IterableIterator> { if ( this.#includeHeader && @@ -309,8 +369,9 @@ export class FlexibleCSVArrayRecordAssembler< !this.#headerIncluded ) { this.#headerIncluded = true; - // Yield header as array - yield [...this.#header] as unknown as CSVArrayRecord
; + yield ( + this.#header as unknown as string[] + ).slice() as unknown as CSVArrayRecord
; } } } diff --git a/src/parser/models/FlexibleCSVObjectRecordAssembler.ts b/src/parser/models/FlexibleCSVObjectRecordAssembler.ts index f8b7e80b..ff616aaa 100644 --- a/src/parser/models/FlexibleCSVObjectRecordAssembler.ts +++ b/src/parser/models/FlexibleCSVObjectRecordAssembler.ts @@ -1,60 +1,80 @@ import { DEFAULT_ASSEMBLER_MAX_FIELD_COUNT, - FieldDelimiter, - RecordDelimiter, + Delimiter, } from "@/core/constants.ts"; import { ParseError } from "@/core/errors.ts"; import type { + AnyToken, ColumnCountStrategy, CSVObjectRecord, - CSVObjectRecordAssembler, CSVRecordAssemblerAssembleOptions, CSVRecordAssemblerCommonOptions, - Token, } from "@/core/types.ts"; +import { ReusableArrayPool } from "@/utils/memory/ReusableArrayPool.ts"; /** * Flexible CSV Object Record Assembler implementation. * - * A balanced implementation that assembles tokens into CSV records as objects, - * optimizing for both performance and memory efficiency. + * An optimized assembler that works with unified field tokens. + * No switch statement needed - simply processes each field and checks + * the `delimiter` property to determine when a record is complete. * * @remarks - * This implementation is designed to handle various CSV formats flexibly - * while maintaining good performance characteristics. For specialized use cases, - * future implementations may provide optimizations for specific scenarios - * (e.g., speed-optimized, memory-optimized). + * This implementation provides better performance by eliminating + * the token type switch statement and reducing token iteration count by 50%. */ export class FlexibleCSVObjectRecordAssembler< - Header extends ReadonlyArray, -> implements CSVObjectRecordAssembler
-{ + const Header extends ReadonlyArray, +> { #fieldIndex = 0; #row: string[] = []; #header: Header | undefined; - #dirty = false; #signal?: AbortSignal | undefined; #maxFieldCount: number; #skipEmptyLines: boolean; #currentRowNumber?: number | undefined; #source?: string | undefined; #columnCountStrategy: ColumnCountStrategy; + #rowLength = 0; + #rowPool = new ReusableArrayPool(); + + // Optimization: Pre-bound strategy function (avoids switch per record) + #assembleRecordFn: + | ((row: string[], rowLength: number) => CSVObjectRecord
) + | undefined; + // Optimization: Pre-computed valid header indices (avoids if check per field) + #validHeaderIndices: number[] = []; + // Optimization: Pre-created header keys (avoids header lookup per record) + #headerKeys: string[] = []; + // Optimization: Track if row has content (avoids some() call per record) + #hasContent = false; constructor(options: CSVRecordAssemblerCommonOptions
= {}) { // Validate and set columnCountStrategy - this.#columnCountStrategy = options.columnCountStrategy ?? "pad"; - if (this.#columnCountStrategy === "keep") { - console.warn( - "columnCountStrategy 'keep' has no effect in object format. " + - "Object format always maps to header keys. " + - "Falling back to 'pad' strategy.", + this.#columnCountStrategy = options.columnCountStrategy ?? "fill"; + + // 'sparse' is not allowed in object format because object format requires all keys to have string values + if (this.#columnCountStrategy === "sparse") { + throw new Error( + "columnCountStrategy 'sparse' is not allowed for object format. " + + "'sparse' fills missing fields with undefined, which is not compatible with object format. " + + "Use 'fill' (fills with empty string) or outputFormat: 'array' for sparse data.", ); - this.#columnCountStrategy = "pad"; } - if (this.#columnCountStrategy !== "pad" && options.header === undefined) { + + if ( + this.#columnCountStrategy === "keep" || + this.#columnCountStrategy === "truncate" + ) { + throw new Error( + `columnCountStrategy '${this.#columnCountStrategy}' is not allowed for object format. ` + + "Use 'fill' (default) or 'strict' for object output.", + ); + } + if (this.#columnCountStrategy !== "fill" && options.header === undefined) { throw new Error( `columnCountStrategy '${this.#columnCountStrategy}' requires header option. ` + - `Use 'pad' or omit columnCountStrategy for headerless CSV.`, + `Use 'fill' or omit columnCountStrategy for headerless CSV.`, ); } @@ -86,19 +106,17 @@ export class FlexibleCSVObjectRecordAssembler< * @returns An iterable iterator of CSV records. */ public *assemble( - input?: Token | Iterable, + input?: AnyToken | Iterable, options?: CSVRecordAssemblerAssembleOptions, ): IterableIterator> { const stream = options?.stream ?? false; if (input !== undefined) { - // Check if input is iterable (has Symbol.iterator) if (this.#isIterable(input)) { for (const token of input) { yield* this.#processToken(token); } } else { - // Single token yield* this.#processToken(input); } } @@ -108,96 +126,60 @@ export class FlexibleCSVObjectRecordAssembler< } } - /** - * Checks if a value is iterable. - */ - #isIterable(value: any): value is Iterable { + #isIterable(value: any): value is Iterable { return value != null && typeof value[Symbol.iterator] === "function"; } /** - * Processes a single token and yields a record if one is completed. + * Processes a single token. + * No switch needed - always a field, just check what follows. */ - *#processToken(token: Token): IterableIterator> { + *#processToken(token: AnyToken): IterableIterator> { this.#signal?.throwIfAborted(); - // Track the current record number for error reporting - if (token.location) { + // Track row number for error reporting + if ("location" in token && token.location) { this.#currentRowNumber = token.location.rowNumber; } - switch (token.type) { - case FieldDelimiter: - // Set empty string for empty fields - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } - this.#fieldIndex++; - this.#checkFieldCount(); - this.#dirty = true; - break; - case RecordDelimiter: - // Set empty string for the last field if empty - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } - if (this.#header === undefined) { - this.#setHeader(this.#row as unknown as Header); - } else { - if (this.#dirty) { - yield this.#assembleRecord(); - } else { - if (!this.#skipEmptyLines) { - // For empty lines, generate empty record - // SAFETY: Object.fromEntries() is safe from prototype pollution. - // See CSVRecordAssembler.prototype-safety.test.ts for details. - yield Object.fromEntries( - this.#header - .filter((header) => header) - .map((header) => [header, ""]), - ) as CSVObjectRecord
; - } - } - } - // Reset the row fields buffer. - this.#fieldIndex = 0; - this.#row = []; - this.#dirty = false; - break; - default: - this.#dirty = true; - this.#row[this.#fieldIndex] = token.value; - break; + // Store the field value and track if row has content + const value = token.value; + this.#row[this.#fieldIndex] = value; + if (value !== "") { + this.#hasContent = true; + } + this.#rowLength = Math.max(this.#rowLength, this.#fieldIndex + 1); + + if (token.delimiter === Delimiter.Field) { + this.#fieldIndex++; + this.#checkFieldCount(); + this.#rowLength = Math.max(this.#rowLength, this.#fieldIndex + 1); + return; + } + + const rowLength = this.#rowLength || this.#fieldIndex + 1 || 0; + const hadContent = this.#hasContent; + const completedRow = this.#takeRow(rowLength); + + if (this.#header === undefined) { + this.#setHeader(completedRow as unknown as Header); + // Header takes ownership of completedRow; do not release. + return; + } + + if (hadContent) { + yield this.#assembleRecord(completedRow, rowLength); + } else if (!this.#skipEmptyLines) { + this.#releaseRow(completedRow); + yield this.#createEmptyRecord(); + } else { + this.#releaseRow(completedRow); } + this.#hasContent = false; } - /** - * Flushes any remaining buffered data as a final record. - * - * @remarks - * Prototype Pollution Safety: - * This method uses Object.fromEntries() to create record objects from CSV data. - * Object.fromEntries() is safe from prototype pollution because it creates - * own properties (not prototype properties) even when keys like "__proto__", - * "constructor", or "prototype" are used. - * - * For example, Object.fromEntries([["__proto__", "value"]]) creates an object - * with an own property "__proto__" set to "value", which does NOT pollute - * Object.prototype and does NOT affect other objects. - * - * This safety is verified by regression tests in: - * CSVRecordAssembler.prototype-safety.test.ts - */ *#flush(): IterableIterator> { - if (this.#header !== undefined) { - if (this.#dirty) { - // Set empty string for the last field if empty - if (this.#row[this.#fieldIndex] === undefined) { - this.#row[this.#fieldIndex] = ""; - } - yield this.#assembleRecord(); - } - } + // Nothing to flush - unified tokens always complete records } #checkFieldCount(): void { @@ -234,72 +216,174 @@ export class FlexibleCSVObjectRecordAssembler< source: this.#source, }); } + + // Optimization: Pre-compute valid header indices (non-empty headers) + this.#validHeaderIndices = []; + this.#headerKeys = []; + for (let i = 0; i < header.length; i++) { + const key = header[i]; + if (key) { + this.#validHeaderIndices.push(i); + this.#headerKeys.push(key); + } + } + + // Optimization: Pre-bind strategy function based on columnCountStrategy + switch (this.#columnCountStrategy) { + case "strict": + this.#assembleRecordFn = this.#assembleRecordStrict; + break; + case "truncate": + this.#assembleRecordFn = this.#assembleRecordTruncate; + break; + default: + this.#assembleRecordFn = this.#assembleRecordFill; + break; + } } /** * Assembles a record in object format. - * Applies column count strategy if header is defined. + * Uses pre-bound strategy function for optimal performance. * * @remarks - * SAFETY: Object.fromEntries() is safe from prototype pollution. + * SAFETY: Object.create(null) creates a prototype-less object, which: + * - Is safe from prototype pollution attacks + * - Correctly stores all header names including "__proto__" as regular properties + * - Is faster than Object.fromEntries() (~3.6x speedup) * See CSVRecordAssembler.prototype-safety.test.ts for details. */ - #assembleRecord(): CSVObjectRecord
{ - if (!this.#header) { + #assembleRecord(row: string[], rowLength: number): CSVObjectRecord
{ + if (!this.#header || !this.#assembleRecordFn) { + this.#releaseRow(row); // Headerless: return empty object (shouldn't happen in normal flow) - return {} as CSVObjectRecord
; + return Object.create(null) as CSVObjectRecord
; } + const record = this.#assembleRecordFn(row, rowLength); + this.#releaseRow(row); + return record; + } - // Apply column count strategy - const headerLength = this.#header.length; - const rowLength = this.#row.length; + /** + * Optimized "fill" strategy: map all header keys, fill missing values with empty string. + * Uses Object.create(null) for faster object creation and proper "__proto__" handling. + */ + #assembleRecordFill = ( + row: string[], + _rowLength: number, + ): CSVObjectRecord
=> { + const indices = this.#validHeaderIndices; + const len = indices.length; + const keys = this.#headerKeys; - switch (this.#columnCountStrategy) { - case "pad": - // Default behavior: map all header keys, keep missing values as undefined - return Object.fromEntries( - this.#header - .map((header, index) => [header, index] as const) - .filter(([header]) => header) - .map(([header, index]) => [header, this.#row.at(index)]), - ) as unknown as CSVObjectRecord
; + // Object.create(null) is ~3.6x faster than Object.fromEntries + // and correctly handles "__proto__" as a regular property + const obj = Object.create(null) as CSVObjectRecord
; + for (let j = 0; j < len; j++) { + (obj as Record)[keys[j]!] = row[indices[j]!] ?? ""; + } - case "strict": - // Throw error if length doesn't match - if (rowLength !== headerLength) { - throw new ParseError( - `Expected ${headerLength} columns, got ${rowLength}${ - this.#currentRowNumber ? ` at row ${this.#currentRowNumber}` : "" - }${this.#source ? ` in ${JSON.stringify(this.#source)}` : ""}`, - { - source: this.#source, - }, - ); - } - return Object.fromEntries( - this.#header - .map((header, index) => [header, index] as const) - .filter(([header]) => header) - .map(([header, index]) => [header, this.#row[index]]), - ) as unknown as CSVObjectRecord
; + return obj; + }; - case "truncate": - // Only include fields up to header length, ignore extras, keep missing values as undefined - return Object.fromEntries( - this.#header - .map((header, index) => [header, index] as const) - .filter(([header]) => header) - .map(([header, index]) => [header, this.#row[index]]), - ) as unknown as CSVObjectRecord
; + /** + * Optimized "strict" strategy: throw error if column count doesn't match. + * Uses Object.create(null) for faster object creation and proper "__proto__" handling. + */ + #assembleRecordStrict = ( + row: string[], + rowLength: number, + ): CSVObjectRecord
=> { + const headerLength = this.#header!.length; - default: - // Fallback to pad behavior - return Object.fromEntries( - this.#header - .map((header, index) => [header, index] as const) - .filter(([header]) => header) - .map(([header, index]) => [header, this.#row.at(index)]), - ) as unknown as CSVObjectRecord
; + if (rowLength !== headerLength) { + this.#releaseRow(row); + throw new ParseError( + `Expected ${headerLength} columns, got ${rowLength}${ + this.#currentRowNumber ? ` at row ${this.#currentRowNumber}` : "" + }${this.#source ? ` in ${JSON.stringify(this.#source)}` : ""}`, + { + source: this.#source, + }, + ); + } + + const indices = this.#validHeaderIndices; + const len = indices.length; + const keys = this.#headerKeys; + + const obj = Object.create(null) as CSVObjectRecord
; + for (let j = 0; j < len; j++) { + (obj as Record)[keys[j]!] = row[indices[j]!] ?? ""; + } + + return obj; + }; + + /** + * Optimized "truncate" strategy: only include fields up to header length. + * Uses Object.create(null) for faster object creation and proper "__proto__" handling. + */ + #assembleRecordTruncate = ( + row: string[], + _rowLength: number, + ): CSVObjectRecord
=> { + const indices = this.#validHeaderIndices; + const len = indices.length; + const keys = this.#headerKeys; + + const obj = Object.create(null) as CSVObjectRecord
; + for (let j = 0; j < len; j++) { + (obj as Record)[keys[j]!] = row[indices[j]!] ?? ""; + } + + return obj; + }; + + #takeRow(rowLength: number): string[] { + const record = this.#row; + record.length = rowLength; + const capacityHint = Math.max(rowLength, this.#header?.length ?? 4); + this.#row = this.#allocateRowBuffer(capacityHint); + this.#resetRowBuilderState(true); + return record; + } + + #releaseRow(row: string[]): void { + row.length = 0; + this.#rowPool.release(row); + } + + #resetRowBuilderState(replacedBuffer = false): void { + this.#fieldIndex = 0; + this.#rowLength = 0; + this.#hasContent = false; + if (!replacedBuffer) { + this.#row.length = 0; + } + } + + #allocateRowBuffer(capacityHint: number): string[] { + const ensure = Math.max(capacityHint, 0); + const buffer = this.#rowPool.take(() => + ensure > 0 ? new Array(ensure) : [], + ); + if (ensure > 0 && buffer.length < ensure) { + buffer.length = ensure; + } + buffer.length = 0; + return buffer; + } + + #createEmptyRecord(): CSVObjectRecord
{ + const header = this.#header; + const obj = Object.create(null) as CSVObjectRecord
; + if (!header) { + return obj; + } + for (let i = 0; i < header.length; i++) { + (obj as Record)[header[i]!] = ""; } + return obj; } } diff --git a/src/parser/models/FlexibleCSVRecordAssembler.array-output.test.ts b/src/parser/models/FlexibleCSVRecordAssembler.array-output.test.ts deleted file mode 100644 index c0437443..00000000 --- a/src/parser/models/FlexibleCSVRecordAssembler.array-output.test.ts +++ /dev/null @@ -1,457 +0,0 @@ -import { describe, expect, test } from "vitest"; -import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; -import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; - -describe("CSVRecordAssembler - Array Output Format", () => { - describe("outputFormat option", () => { - test("should output array format when outputFormat is 'array'", () => { - const csv = `name,age,city -Alice,30,NY -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - expect(records[1]).toEqual(["Bob", "25", "LA"]); - }); - - test("should output object format by default", () => { - const csv = `name,age -Alice,30`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler(); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ name: "Alice", age: "30" }); - }); - - test("should support named tuple with header", () => { - const csv = `Alice,30,NY -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - expect(records[1]).toEqual(["Bob", "25", "LA"]); - // Type should be: readonly [name: string, age: string, city: string] - if (Array.isArray(records[0])) { - expect(records[0].length).toBe(3); - } - }); - }); - - describe("includeHeader option", () => { - test("should include header row when includeHeader is true", () => { - const csv = `Alice,30,NY -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - includeHeader: true, - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(3); - expect(records[0]).toEqual(["name", "age", "city"]); // Header row - expect(records[1]).toEqual(["Alice", "30", "NY"]); - expect(records[2]).toEqual(["Bob", "25", "LA"]); - }); - - test("should not include header row by default", () => { - const csv = `Alice,30,NY`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - }); - - test("should throw error if includeHeader is used with object format", () => { - expect(() => { - createCSVRecordAssembler({ - header: ["name", "age"] as const, - includeHeader: true, - outputFormat: "object", - }); - }).toThrow("includeHeader option is only valid for array format"); - }); - }); - - describe("columnCountStrategy option", () => { - describe("keep strategy (default)", () => { - test("should keep rows as-is with their actual length", () => { - const csv = `Alice,30 -Bob,25,LA -Charlie,35,SF,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "keep", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(3); - expect(records[0]).toEqual(["Alice", "30"]); // Short row - expect(records[1]).toEqual(["Bob", "25", "LA"]); // Exact match - expect(records[2]).toEqual(["Charlie", "35", "SF", "Extra"]); // Long row - }); - }); - - describe("pad strategy", () => { - test("should pad short rows with undefined", () => { - const csv = `Alice,30 -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Alice", "30", undefined]); // Padded - expect(records[1]).toEqual(["Bob", "25", "LA"]); // Exact match - }); - - test("should pad second row with undefined (regression test)", () => { - const csv = `Alice,30,NY -Bob,25`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Alice", "30", "NY"]); // Exact match - expect(records[1]).toEqual(["Bob", "25", undefined]); // Padded - }); - - test("should truncate long rows to match header length", () => { - const csv = `Alice,30,NY,Extra1,Extra2`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual(["Alice", "30", "NY"]); // Truncated - }); - }); - - describe("strict strategy", () => { - test("should throw error if row length doesn't match header length", () => { - const csv = `Alice,30 -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "strict", - }); - - expect(() => { - [...assembler.assemble(tokens)]; - }).toThrow("Expected 3 columns, got 2"); - }); - - test("should throw error if second row is short (regression test)", () => { - const csv = `Alice,30,NY -Bob,25`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "strict", - }); - - expect(() => { - [...assembler.assemble(tokens)]; - }).toThrow("Expected 3 columns, got 2"); - }); - - test("should not throw error if all rows match header length", () => { - const csv = `Alice,30,NY -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "strict", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - expect(records[1]).toEqual(["Bob", "25", "LA"]); - }); - }); - - describe("truncate strategy", () => { - test("should truncate long rows to match header length", () => { - const csv = `Alice,30,NY,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "truncate", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual(["Alice", "30", "NY"]); // Truncated - }); - - test("should keep short rows as-is", () => { - const csv = `Alice,30`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "array", - columnCountStrategy: "truncate", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual(["Alice", "30"]); // Not padded - }); - }); - - test("should throw error if columnCountStrategy is used without header", () => { - expect(() => { - createCSVRecordAssembler({ - outputFormat: "array", - columnCountStrategy: "pad", - }); - }).toThrow("columnCountStrategy 'pad' requires header option"); - }); - }); - - describe("variable-length CSV (headerless)", () => { - test("should handle variable-length rows without header", () => { - const csv = `Alice,30 -Bob,25,LA -Charlie,35,SF,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - // First row becomes header - expect(records).toHaveLength(2); - expect(records[0]).toEqual(["Bob", "25", "LA"]); - expect(records[1]).toEqual(["Charlie", "35", "SF", "Extra"]); - }); - }); - - describe("Headerless mode (header: [])", () => { - describe("Valid configurations", () => { - test("should treat all rows as data when header is empty array", () => { - const csv = `Alice,30,NY -Bob,25,LA -Charlie,35,SF`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - // All three rows should be treated as data (no header inference) - expect(records).toHaveLength(3); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - expect(records[1]).toEqual(["Bob", "25", "LA"]); - expect(records[2]).toEqual(["Charlie", "35", "SF"]); - }); - - test("should work with single row CSV in headerless mode", () => { - const csv = `Alice,30,NY`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual(["Alice", "30", "NY"]); - }); - - test("should work with empty CSV in headerless mode", () => { - const csv = ``; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(0); - }); - - test("should support varying column counts in headerless mode with columnCountStrategy: keep", () => { - const csv = `Alice,30 -Bob,25,LA -Charlie,35,SF,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - columnCountStrategy: "keep", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(3); - expect(records[0]).toEqual(["Alice", "30"]); - expect(records[1]).toEqual(["Bob", "25", "LA"]); - expect(records[2]).toEqual(["Charlie", "35", "SF", "Extra"]); - }); - }); - - describe("Runtime validation errors", () => { - test("should throw error when header: [] with columnCountStrategy: 'pad'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - columnCountStrategy: "pad", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, - ); - }); - - test("should throw error when header: [] with columnCountStrategy: 'strict'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - columnCountStrategy: "strict", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, - ); - }); - - test("should throw error when header: [] with columnCountStrategy: 'truncate'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "array", - columnCountStrategy: "truncate", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, - ); - }); - - test("should throw error when header: [] with outputFormat: 'object'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "object", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, - ); - }); - }); - }); -}); diff --git a/src/parser/models/FlexibleCSVRecordAssembler.field-count-limit.test.ts b/src/parser/models/FlexibleCSVRecordAssembler.field-count-limit.test.ts deleted file mode 100644 index b7f5ce1f..00000000 --- a/src/parser/models/FlexibleCSVRecordAssembler.field-count-limit.test.ts +++ /dev/null @@ -1,570 +0,0 @@ -import { beforeEach, describe, expect, test } from "vitest"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; -import type { CSVRecordAssembler, Token } from "@/core/types.ts"; -import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; - -describe("CSVRecordAssembler - Field Count Limit Protection", () => { - describe("with default field count limit (100000)", () => { - let assembler: CSVRecordAssembler; - beforeEach(() => { - assembler = createCSVRecordAssembler(); - }); - - test("should not throw error for normal field counts", () => { - const tokens: Token[] = [ - { - type: Field, - value: "a", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }, - { - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 2, offset: 1 }, - end: { line: 1, column: 3, offset: 2 }, - rowNumber: 1, - }, - }, - { - type: Field, - value: "b", - location: { - start: { line: 1, column: 3, offset: 2 }, - end: { line: 1, column: 4, offset: 3 }, - rowNumber: 1, - }, - }, - { - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 4, offset: 3 }, - end: { line: 2, column: 1, offset: 4 }, - rowNumber: 1, - }, - }, - ]; - - expect(() => [...assembler.assemble(tokens)]).not.toThrow(); - }); - - test("should throw RangeError when field count exceeds limit during header parsing", () => { - const tokens: Token[] = []; - const maxFields = 100001; - - // Create header with excessive fields - for (let i = 0; i < maxFields; i++) { - tokens.push({ - type: Field, - value: `field${i}`, - location: { - start: { line: 1, column: i * 2 + 1, offset: i * 2 }, - end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - rowNumber: 1, - }, - }); - if (i < maxFields - 1) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - end: { line: 1, column: i * 2 + 3, offset: i * 2 + 2 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: maxFields * 2, offset: maxFields * 2 - 1 }, - end: { line: 2, column: 1, offset: maxFields * 2 }, - rowNumber: 1, - }, - }); - - expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); - }); - - test("should throw RangeError with proper error details", () => { - const tokens: Token[] = []; - const maxFields = 100001; - - for (let i = 0; i < maxFields; i++) { - tokens.push({ - type: Field, - value: `f${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < maxFields - 1) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 2, column: 1, offset: 1 }, - rowNumber: 1, - }, - }); - - try { - [...assembler.assemble(tokens)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - expect((error as RangeError).message).toContain("Field count"); - expect((error as RangeError).message).toContain( - "exceeded maximum allowed count", - ); - } - }); - }); - - describe("with custom field count limit", () => { - test("should allow exactly N fields when limit is N", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 10 }); - const tokens: Token[] = []; - - // Create exactly 10 fields (at the limit, should succeed) - for (let i = 0; i < 10; i++) { - tokens.push({ - type: Field, - value: `field${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 9) { - // 9 delimiters between 10 fields - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 2, column: 1, offset: 1 }, - rowNumber: 1, - }, - }); - - // Should not throw - exactly at the limit - expect(() => [...assembler.assemble(tokens)]).not.toThrow(); - - // Verify the record was correctly assembled - const records = [...assembler.assemble(tokens)]; - expect(records).toHaveLength(1); - expect(Object.keys(records[0] as object)).toHaveLength(10); - }); - - test("should respect custom maxFieldCount option", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 10 }); - const tokens: Token[] = []; - - // Create 11 fields (exceeds limit of 10) - for (let i = 0; i < 11; i++) { - tokens.push({ - type: Field, - value: `f${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 10) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 2, column: 1, offset: 1 }, - rowNumber: 1, - }, - }); - - expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); - }); - - test("should allow Number.POSITIVE_INFINITY as maxFieldCount to disable limit", () => { - const assembler = createCSVRecordAssembler({ - maxFieldCount: Number.POSITIVE_INFINITY, - }); - const tokens: Token[] = []; - - // Create 200000 fields (would exceed default limit) - for (let i = 0; i < 200000; i++) { - tokens.push({ - type: Field, - value: `f${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 199999) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 2, column: 1, offset: 1 }, - rowNumber: 1, - }, - }); - - // This should not throw, but will take time and memory - expect(() => [...assembler.assemble(tokens)]).not.toThrow(RangeError); - }); - }); - - describe("header validation with field count limit", () => { - test("should throw RangeError when provided header exceeds limit", () => { - const largeHeader = Array.from({ length: 100001 }, (_, i) => `field${i}`); - - expect(() => createCSVRecordAssembler({ header: largeHeader })).toThrow( - RangeError, - ); - }); - - test("should accept header within limit", () => { - const normalHeader = ["field1", "field2", "field3"]; - - expect(() => - createCSVRecordAssembler({ header: normalHeader }), - ).not.toThrow(); - }); - }); - - describe("realistic attack scenarios", () => { - test("should prevent DoS via CSV with excessive columns", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 1000 }); - const tokens: Token[] = []; - - // Simulate attack with 2000 columns - for (let i = 0; i < 2000; i++) { - tokens.push({ - type: Field, - value: "x", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 1999) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - - expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); - }); - - test("should properly handle CSV within field count limits", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 100 }); - const tokens: Token[] = []; - - // Create 50 fields (within limit) - for (let i = 0; i < 50; i++) { - tokens.push({ - type: Field, - value: `field${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 49) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 2, column: 1, offset: 1 }, - rowNumber: 1, - }, - }); - - // Add data row with same field count - for (let i = 0; i < 50; i++) { - tokens.push({ - type: Field, - value: `data${i}`, - location: { - start: { line: 2, column: 1, offset: 0 }, - end: { line: 2, column: 2, offset: 1 }, - rowNumber: 2, - }, - }); - if (i < 49) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 2, column: 1, offset: 0 }, - end: { line: 2, column: 2, offset: 1 }, - rowNumber: 2, - }, - }); - } - } - tokens.push({ - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 2, column: 1, offset: 0 }, - end: { line: 3, column: 1, offset: 1 }, - rowNumber: 2, - }, - }); - - const records = [...assembler.assemble(tokens)]; - expect(records).toHaveLength(1); - expect(Object.keys(records[0] as object)).toHaveLength(50); - }); - }); - - describe("error message details", () => { - test("should include row number in error message", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 5 }); - const tokens: Token[] = []; - - // Create 6 fields (exceeds limit of 5) - for (let i = 0; i < 6; i++) { - tokens.push({ - type: Field, - value: `field${i}`, - location: { - start: { line: 1, column: i * 2 + 1, offset: i * 2 }, - end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - rowNumber: 3, - }, - }); - if (i < 5) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - end: { line: 1, column: i * 2 + 3, offset: i * 2 + 2 }, - rowNumber: 3, - }, - }); - } - } - - try { - [...assembler.assemble(tokens)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - expect((error as RangeError).message).toContain("at row 3"); - } - }); - - test("should include source in error message when provided", () => { - const assembler = createCSVRecordAssembler({ - maxFieldCount: 5, - source: "data.csv", - }); - const tokens: Token[] = []; - - // Create 6 fields (exceeds limit of 5) - for (let i = 0; i < 6; i++) { - tokens.push({ - type: Field, - value: `field${i}`, - location: { - start: { line: 1, column: i * 2 + 1, offset: i * 2 }, - end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - rowNumber: 2, - }, - }); - if (i < 5) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, - end: { line: 1, column: i * 2 + 3, offset: i * 2 + 2 }, - rowNumber: 2, - }, - }); - } - } - - try { - [...assembler.assemble(tokens)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - expect((error as RangeError).message).toContain('in "data.csv"'); - } - }); - - test("should include both row number and source in error message", () => { - const assembler = createCSVRecordAssembler({ - maxFieldCount: 3, - source: "users.csv", - }); - const tokens: Token[] = []; - - // Create 4 fields (exceeds limit of 3) - for (let i = 0; i < 4; i++) { - tokens.push({ - type: Field, - value: `col${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 10, - }, - }); - if (i < 3) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 10, - }, - }); - } - } - - try { - [...assembler.assemble(tokens)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - const message = (error as RangeError).message; - expect(message).toContain("at row 10"); - expect(message).toContain('in "users.csv"'); - expect(message).toContain( - "Field count (4) exceeded maximum allowed count of 3", - ); - } - }); - - test("should only include field count info when source is not provided", () => { - const assembler = createCSVRecordAssembler({ maxFieldCount: 2 }); - const tokens: Token[] = []; - - // Create 3 fields (exceeds limit of 2) - for (let i = 0; i < 3; i++) { - tokens.push({ - type: Field, - value: `f${i}`, - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - if (i < 2) { - tokens.push({ - type: FieldDelimiter, - value: ",", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 2, offset: 1 }, - rowNumber: 1, - }, - }); - } - } - - try { - [...assembler.assemble(tokens)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - const message = (error as RangeError).message; - // Should not include source when not provided - expect(message).not.toContain('in "'); - // Should include row number - expect(message).toContain("at row 1"); - expect(message).toContain( - "Field count (3) exceeded maximum allowed count of 2", - ); - } - }); - }); -}); diff --git a/src/parser/models/FlexibleCSVRecordAssembler.object-output.test.ts b/src/parser/models/FlexibleCSVRecordAssembler.object-output.test.ts deleted file mode 100644 index cec59c80..00000000 --- a/src/parser/models/FlexibleCSVRecordAssembler.object-output.test.ts +++ /dev/null @@ -1,298 +0,0 @@ -import { describe, expect, test, vi } from "vitest"; -import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; -import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; - -describe("CSVRecordAssembler - Object Output", () => { - describe("columnCountStrategy option", () => { - describe("pad strategy (default)", () => { - test("should pad short rows with undefined", () => { - const csv = `Alice,30 -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual({ - name: "Alice", - age: "30", - city: undefined, - }); // Missing field filled with undefined (pad behavior) - expect(records[1]).toEqual({ name: "Bob", age: "25", city: "LA" }); // Exact match - }); - - test("should pad second row with undefined (regression test)", () => { - const csv = `Alice,30,NY -Bob,25`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); // Exact match - expect(records[1]).toEqual({ name: "Bob", age: "25", city: undefined }); // Missing field filled with undefined - }); - - test("should ignore extra fields in long rows", () => { - const csv = `Alice,30,NY,Extra1,Extra2`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); // Extra fields ignored - }); - }); - - describe("strict strategy", () => { - test("should throw error if row length doesn't match header length", () => { - const csv = `Alice,30 -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "strict", - }); - - expect(() => { - [...assembler.assemble(tokens)]; - }).toThrow("Expected 3 columns, got 2"); - }); - - test("should throw error if second row is short (regression test)", () => { - const csv = `Alice,30,NY -Bob,25`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "strict", - }); - - expect(() => { - [...assembler.assemble(tokens)]; - }).toThrow("Expected 3 columns, got 2"); - }); - - test("should throw error if row is too long", () => { - const csv = `Alice,30,NY,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "strict", - }); - - expect(() => { - [...assembler.assemble(tokens)]; - }).toThrow("Expected 3 columns, got 4"); - }); - - test("should not throw error if all rows match header length", () => { - const csv = `Alice,30,NY -Bob,25,LA`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "strict", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(2); - expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); - expect(records[1]).toEqual({ name: "Bob", age: "25", city: "LA" }); - }); - }); - - describe("truncate strategy", () => { - test("should truncate long rows to match header length", () => { - const csv = `Alice,30,NY,Extra`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "truncate", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); // Truncated - }); - - test("should keep short rows as-is without padding", () => { - const csv = `Alice,30`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "truncate", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ - name: "Alice", - age: "30", - city: undefined, - }); // Missing field remains undefined with truncate strategy - }); - }); - - describe("keep strategy", () => { - test("should warn and fallback to pad strategy", () => { - const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {}); - - const csv = `Alice,30`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["name", "age", "city"] as const, - outputFormat: "object", - columnCountStrategy: "keep", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(warnSpy).toHaveBeenCalledWith( - expect.stringContaining( - "columnCountStrategy 'keep' has no effect in object format", - ), - ); - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ - name: "Alice", - age: "30", - city: undefined, - }); // Behaves like pad (fills with undefined) - - warnSpy.mockRestore(); - }); - }); - - describe("empty fields vs missing fields", () => { - test("should use empty string for empty fields", () => { - const csv = `,x,`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["a", "b", "c"] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ a: "", b: "x", c: "" }); // Empty fields → "" - }); - - test("should use undefined for missing fields in short rows (object format)", () => { - const csv = `x`; - - const lexer = new FlexibleStringCSVLexer(); - const tokens = lexer.lex(csv); - - const assembler = createCSVRecordAssembler({ - header: ["a", "b", "c"] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }); - - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]).toEqual({ a: "x", b: undefined, c: undefined }); // Missing fields → undefined - }); - }); - }); - - describe("Headerless mode (header: []) - Runtime Validation Errors", () => { - test("should throw error when header: [] with outputFormat: 'object'", () => { - // Error should be thrown when creating the assembler (not during assembly) - // because header is explicitly provided in options - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "object", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, - ); - }); - - test("should throw error when header: [] with object format and columnCountStrategy: 'pad'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "object", - columnCountStrategy: "pad", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, - ); - }); - - test("should throw error when header: [] with object format and columnCountStrategy: 'strict'", () => { - expect(() => - createCSVRecordAssembler({ - header: [] as const, - outputFormat: "object", - columnCountStrategy: "strict", - }), - ).toThrow( - /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, - ); - }); - }); -}); diff --git a/src/parser/models/FlexibleCSVRecordAssembler.prototype-safety.test.ts b/src/parser/models/FlexibleCSVRecordAssembler.prototype-safety.test.ts deleted file mode 100644 index 933ad7e4..00000000 --- a/src/parser/models/FlexibleCSVRecordAssembler.prototype-safety.test.ts +++ /dev/null @@ -1,214 +0,0 @@ -import { describe, expect, test } from "vitest"; -import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; -import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; - -/** - * Regression tests to ensure that CSVRecordAssembler does not cause prototype pollution. - * - * These tests verify that using dangerous property names like __proto__, constructor, - * or prototype as CSV headers does NOT pollute Object.prototype or affect other objects. - * - * Context: Object.fromEntries() creates own properties, not prototype properties, - * so it is safe from prototype pollution attacks. - */ -describe("CSVRecordAssembler - Prototype Pollution Safety (Regression)", () => { - test("should not pollute Object.prototype when __proto__ is used as CSV header", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - // CSV with __proto__ as a header - const csv = "__proto__,name,age\r\nmalicious_value,Alice,30"; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - // Verify the record has __proto__ as its own property - expect(records).toHaveLength(1); - expect(records[0]!).toHaveProperty("__proto__"); - expect(records[0]!.__proto__).toBe("malicious_value"); - expect(records[0]!.name).toBe("Alice"); - expect(records[0]!.age).toBe("30"); - - // CRITICAL: Verify that Object.prototype was NOT polluted - // If prototype pollution occurred, all new objects would have this property - const testObject = {}; - expect(testObject).not.toHaveProperty("malicious_value"); - expect((testObject as any).malicious_value).toBeUndefined(); - - // Verify __proto__ is an own property of the record, not inherited - expect(Object.hasOwn(records[0]!, "__proto__")).toBe(true); - }); - - test("should not pollute when constructor is used as CSV header", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - const csv = "constructor,name\r\nmalicious_value,Alice"; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]!.constructor).toBe("malicious_value"); - expect(records[0]!.name).toBe("Alice"); - - // Verify the constructor property is a string (own property), not the Function constructor - expect(typeof records[0]!.constructor).toBe("string"); - - // Verify constructor is an own property - expect(Object.hasOwn(records[0]!, "constructor")).toBe(true); - - // Verify Object.constructor is not affected - const testObject = {}; - expect(typeof testObject.constructor).toBe("function"); - }); - - test("should not pollute when prototype is used as CSV header", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - const csv = "prototype,name\r\nmalicious_value,Alice"; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]!.prototype).toBe("malicious_value"); - expect(records[0]!.name).toBe("Alice"); - - // Verify prototype is an own property - expect(Object.hasOwn(records[0]!, "prototype")).toBe(true); - }); - - test("should handle multiple dangerous property names together", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - // Multiple potentially dangerous headers in one CSV - const csv = - "__proto__,constructor,prototype,toString,valueOf,hasOwnProperty\r\nv1,v2,v3,v4,v5,v6"; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - const record = records[0]!; - - // All values should be strings (own properties) - expect(record.__proto__).toBe("v1"); - expect(record.constructor).toBe("v2"); - expect(record.prototype).toBe("v3"); - expect(record.toString).toBe("v4"); - expect(record.valueOf).toBe("v5"); - expect(record.hasOwnProperty).toBe("v6"); - - expect(typeof record.__proto__).toBe("string"); - expect(typeof record.constructor).toBe("string"); - expect(typeof record.prototype).toBe("string"); - expect(typeof record.toString).toBe("string"); - expect(typeof record.valueOf).toBe("string"); - expect(typeof record.hasOwnProperty).toBe("string"); - - // Verify no prototype pollution occurred - const testObject = {}; - expect((testObject as any).v1).toBeUndefined(); - expect((testObject as any).v2).toBeUndefined(); - expect((testObject as any).v3).toBeUndefined(); - expect((testObject as any).v4).not.toBe("v4"); // Should be the native function - expect((testObject as any).v5).not.toBe("v5"); // Should be the native function - expect((testObject as any).v6).not.toBe("v6"); // Should be the native function - }); - - test("should handle multiple records with __proto__ header without pollution", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - const csv = - "__proto__,name\r\nvalue1,Alice\r\nvalue2,Bob\r\nvalue3,Charlie"; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(3); - - // Each record should have its own __proto__ value - expect(records[0]!.__proto__).toBe("value1"); - expect(records[1]!.__proto__).toBe("value2"); - expect(records[2]!.__proto__).toBe("value3"); - - // Verify no global pollution after processing multiple records - const testObject = {}; - expect((testObject as any).value1).toBeUndefined(); - expect((testObject as any).value2).toBeUndefined(); - expect((testObject as any).value3).toBeUndefined(); - }); - - test("should verify Object.fromEntries behavior is safe (baseline test)", () => { - // This test documents the safe behavior of Object.fromEntries() - // which is used internally by CSVRecordAssembler - - const dangerousEntries: Array<[string, string]> = [ - ["__proto__", "polluted"], - ["constructor", "malicious"], - ["name", "test"], - ]; - - const obj = Object.fromEntries(dangerousEntries); - - // Verify properties are set as own properties - expect(Object.hasOwn(obj, "__proto__")).toBe(true); - expect(Object.hasOwn(obj, "constructor")).toBe(true); - expect(obj.__proto__!).toBe("polluted"); - expect(obj.constructor!).toBe("malicious"); - - // CRITICAL: Verify no prototype pollution occurred - const testObject = {}; - expect((testObject as any).__proto__).not.toBe("polluted"); - expect((testObject as any).polluted).toBeUndefined(); - expect(typeof testObject.constructor).toBe("function"); // Should be the native Function constructor - }); - - test("should handle edge case with object-like notation in quoted values", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - // Object-like syntax must be quoted to be treated as a single field - const csv = '__proto__,name\r\n"{""polluted"":true}",Alice'; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - // The value should be treated as a plain string - expect(records[0]!.__proto__).toBe('{"polluted":true}'); - expect(records[0]!.name).toBe("Alice"); - - // Verify no pollution - const testObject = {}; - expect((testObject as any).polluted).toBeUndefined(); - }); - - test("should maintain safety with quoted fields containing dangerous names", () => { - const lexer = new FlexibleStringCSVLexer(); - const assembler = createCSVRecordAssembler(); - - // Using quoted fields with dangerous property names - const csv = '"__proto__","constructor"\r\n"evil1","evil2"'; - - const tokens = lexer.lex(csv); - const records = [...assembler.assemble(tokens)]; - - expect(records).toHaveLength(1); - expect(records[0]!.__proto__).toBe("evil1"); - expect(records[0]!.constructor).toBe("evil2"); - - // Verify both are strings (own properties) - expect(typeof records[0]!.__proto__).toBe("string"); - expect(typeof records[0]!.constructor).toBe("string"); - - // Verify no pollution - const testObject = {}; - expect((testObject as any).evil1).toBeUndefined(); - expect((testObject as any).evil2).toBeUndefined(); - }); -}); diff --git a/src/parser/models/FlexibleCSVRecordAssembler.spec.ts b/src/parser/models/FlexibleCSVRecordAssembler.spec.ts index 6aa4e8dd..645bdf9c 100644 --- a/src/parser/models/FlexibleCSVRecordAssembler.spec.ts +++ b/src/parser/models/FlexibleCSVRecordAssembler.spec.ts @@ -1,13 +1,8 @@ import fc from "fast-check"; import { describe, expect, it } from "vitest"; import { FC } from "@/__tests__/helper.ts"; -import { - Field, - FieldDelimiter, - LF, - RecordDelimiter, -} from "@/core/constants.ts"; -import type { Token } from "@/core/types.ts"; +import { Delimiter } from "@/core/constants.ts"; +import type { AnyToken } from "@/core/types.ts"; import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; const LOCATION_SHAPE = { @@ -45,7 +40,6 @@ describe("class RecordAssembler", () => { fc.assert( fc.asyncProperty( fc.gen().map((g) => { - const EOL = g(FC.eol); const header = g(FC.header); const rows = g(FC.csvData, { columnsConstraints: { @@ -53,38 +47,25 @@ describe("class RecordAssembler", () => { maxLength: header.length, }, }); - const tokens = [ + // In unified token format, each token represents a field with its following delimiter + const tokens: AnyToken[] = [ // generate header tokens - ...header.flatMap((field, i) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - i === header.length - 1 - ? { - type: RecordDelimiter, - value: EOL, - location: LOCATION_SHAPE, - } - : { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), + ...header.map((field, i) => ({ + value: field, + delimiter: + i === header.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), // generate rows tokens - ...rows.flatMap((row) => - // generate row tokens - row.flatMap((field, j) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - // generate record delimiter token - ...((j === row.length - 1 - ? [ - { - type: RecordDelimiter, - value: LF, - }, - ] - : []) as Token[]), - ]), + ...rows.flatMap((row) => + row.map((field, j) => ({ + value: field, + delimiter: + j === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), ), ]; const expected = rows.map((row) => @@ -112,22 +93,16 @@ describe("class RecordAssembler", () => { maxLength: header.length, }, }); - const tokens: Token[] = [ - ...rows.flatMap((row) => - row.flatMap((field, j) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - { type: FieldDelimiter, value: ",", location: LOCATION_SHAPE }, - ...((j === row.length - 1 - ? [ - { - type: RecordDelimiter, - value: LF, - }, - ] - : []) as Token[]), - ]), - ), - ]; + // In unified token format, each token represents a field with its following delimiter + const tokens: AnyToken[] = rows.flatMap((row) => + row.map((field, j) => ({ + value: field, + delimiter: + j === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), + ); const expected = rows.map((row) => Object.fromEntries(row.map((field, i) => [header[i], field])), ); diff --git a/src/parser/models/FlexibleCSVRecordAssembler.test.ts b/src/parser/models/FlexibleCSVRecordAssembler.test.ts index 9d4897f8..a243d1c9 100644 --- a/src/parser/models/FlexibleCSVRecordAssembler.test.ts +++ b/src/parser/models/FlexibleCSVRecordAssembler.test.ts @@ -1,6 +1,6 @@ import { assert, beforeEach, describe, expect, test } from "vitest"; -import { Field } from "@/core/constants"; -import type { CSVRecordAssembler } from "@/core/types.ts"; +import { Delimiter } from "@/core/constants.ts"; +import type { AnyToken, CSVRecordAssembler } from "@/core/types.ts"; import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; @@ -51,20 +51,20 @@ describe("CSVRecordAssembler", () => { }); test("should throw DOMException named AbortError if the signal is aborted", () => { controller.abort(); + const tokens: AnyToken[] = [ + { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 1, offset: 0 }, + rowNumber: 1, + }, + }, + ]; try { - [ - ...assembler.assemble([ - { - type: Field, - value: "", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 1, offset: 0 }, - rowNumber: 1, - }, - }, - ]), - ]; + [...assembler.assemble(tokens)]; expect.unreachable(); } catch (error) { assert(error instanceof DOMException); @@ -82,20 +82,20 @@ describe("CSVRecordAssembler", () => { controller.abort(new MyCustomError("Custom reason")); + const tokens: AnyToken[] = [ + { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 1, offset: 0 }, + rowNumber: 1, + }, + }, + ]; expect(() => { - [ - ...assembler.assemble([ - { - type: Field, - value: "", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 1, offset: 0 }, - rowNumber: 1, - }, - }, - ]), - ]; + [...assembler.assemble(tokens)]; }).toThrowErrorMatchingInlineSnapshot(`[MyCustomError: Custom reason]`); }); }); @@ -111,20 +111,20 @@ describe("CSVRecordAssembler", () => { const signal = AbortSignal.timeout(0); await waitAbort(signal); const assembler = createCSVRecordAssembler({ signal }); + const tokens: AnyToken[] = [ + { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 1, offset: 0 }, + rowNumber: 1, + }, + }, + ]; try { - [ - ...assembler.assemble([ - { - type: Field, - value: "", - location: { - start: { line: 1, column: 1, offset: 0 }, - end: { line: 1, column: 1, offset: 0 }, - rowNumber: 1, - }, - }, - ]), - ]; + [...assembler.assemble(tokens)]; expect.unreachable(); } catch (error) { assert(error instanceof DOMException); @@ -177,4 +177,1378 @@ describe("CSVRecordAssembler", () => { expect(records[1]).toEqual({ name: "Bob", age: "25" }); }); }); + + describe("Array Output Format", () => { + describe("outputFormat option", () => { + test("should output array format when outputFormat is 'array'", () => { + const csv = `name,age,city +Alice,30,NY +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + expect(records[1]).toEqual(["Bob", "25", "LA"]); + }); + + test("should output object format by default", () => { + const csv = `name,age +Alice,30`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler(); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual({ name: "Alice", age: "30" }); + }); + + test("should support named tuple with header", () => { + const csv = `Alice,30,NY +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + expect(records[1]).toEqual(["Bob", "25", "LA"]); + // Type should be: readonly [name: string, age: string, city: string] + if (Array.isArray(records[0])) { + expect(records[0].length).toBe(3); + } + }); + }); + + describe("includeHeader option", () => { + test("should include header row when includeHeader is true", () => { + const csv = `Alice,30,NY +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + includeHeader: true, + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(3); + expect(records[0]).toEqual(["name", "age", "city"]); // Header row + expect(records[1]).toEqual(["Alice", "30", "NY"]); + expect(records[2]).toEqual(["Bob", "25", "LA"]); + }); + + test("should not include header row by default", () => { + const csv = `Alice,30,NY`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + }); + + test("should throw error if includeHeader is used with object format", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name", "age"] as const, + includeHeader: true, + outputFormat: "object", + }); + }).toThrow("includeHeader option is only valid for array format"); + }); + }); + + describe("columnCountStrategy for array output", () => { + describe("keep strategy", () => { + test("should keep rows as-is with their actual length", () => { + const csv = `Alice,30 +Bob,25,LA +Charlie,35,SF,Extra`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "keep", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(3); + expect(records[0]).toEqual(["Alice", "30"]); // Short row + expect(records[1]).toEqual(["Bob", "25", "LA"]); // Exact match + expect(records[2]).toEqual(["Charlie", "35", "SF", "Extra"]); // Long row + }); + }); + + describe("fill strategy", () => { + test("should fill short rows with empty string", () => { + const csv = `Alice,30 +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", ""]); // Filled with empty string + expect(records[1]).toEqual(["Bob", "25", "LA"]); // Exact match + }); + + test("should fill second row with empty string (regression test)", () => { + const csv = `Alice,30,NY +Bob,25`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", "NY"]); // Exact match + expect(records[1]).toEqual(["Bob", "25", ""]); // Filled with empty string + }); + + test("should truncate long rows to match header length", () => { + const csv = `Alice,30,NY,Extra1,Extra2`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30", "NY"]); // Truncated + }); + }); + + describe("sparse strategy", () => { + test("should fill short rows with undefined", () => { + const csv = `Alice,30 +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "sparse", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", undefined]); // Sparse - undefined + expect(records[1]).toEqual(["Bob", "25", "LA"]); // Exact match + }); + + test("should fill second row with undefined (regression test)", () => { + const csv = `Alice,30,NY +Bob,25`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "sparse", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", "NY"]); // Exact match + expect(records[1]).toEqual(["Bob", "25", undefined]); // Sparse - undefined + }); + + test("should truncate long rows to match header length", () => { + const csv = `Alice,30,NY,Extra1,Extra2`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "sparse", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30", "NY"]); // Truncated + }); + }); + + describe("strict strategy", () => { + test("should throw error if row length doesn't match header length", () => { + const csv = `Alice,30 +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "strict", + }); + + expect(() => { + [...assembler.assemble(tokens)]; + }).toThrow("Expected 3 columns, got 2"); + }); + + test("should throw error if second row is short (regression test)", () => { + const csv = `Alice,30,NY +Bob,25`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "strict", + }); + + expect(() => { + [...assembler.assemble(tokens)]; + }).toThrow("Expected 3 columns, got 2"); + }); + + test("should not throw error if all rows match header length", () => { + const csv = `Alice,30,NY +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "strict", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + expect(records[1]).toEqual(["Bob", "25", "LA"]); + }); + }); + + describe("truncate strategy", () => { + test("should truncate long rows to match header length", () => { + const csv = `Alice,30,NY,Extra`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "truncate", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30", "NY"]); // Truncated + }); + + test("should keep short rows as-is", () => { + const csv = `Alice,30`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "array", + columnCountStrategy: "truncate", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30"]); // Not padded + }); + }); + + test("should throw error if columnCountStrategy is used without header", () => { + expect(() => { + createCSVRecordAssembler({ + outputFormat: "array", + columnCountStrategy: "sparse", + }); + }).toThrow("columnCountStrategy 'sparse' requires header option"); + }); + }); + + describe("variable-length CSV (with keep strategy)", () => { + test("should handle variable-length rows with keep strategy", () => { + const csv = `Alice,30 +Bob,25,LA +Charlie,35,SF,Extra`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + outputFormat: "array", + columnCountStrategy: "keep", // Required for variable-length output + }); + + const records = [...assembler.assemble(tokens)]; + + // First row becomes header, subsequent rows keep their variable length + expect(records).toHaveLength(2); + expect(records[0]).toEqual(["Bob", "25", "LA"]); + expect(records[1]).toEqual(["Charlie", "35", "SF", "Extra"]); + }); + }); + + describe("Headerless mode (header: [])", () => { + describe("Valid configurations", () => { + test("should treat all rows as data when header is empty array", () => { + const csv = `Alice,30,NY +Bob,25,LA +Charlie,35,SF`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + // All three rows should be treated as data (no header inference) + expect(records).toHaveLength(3); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + expect(records[1]).toEqual(["Bob", "25", "LA"]); + expect(records[2]).toEqual(["Charlie", "35", "SF"]); + }); + + test("should work with single row CSV in headerless mode", () => { + const csv = `Alice,30,NY`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual(["Alice", "30", "NY"]); + }); + + test("should work with empty CSV in headerless mode", () => { + const csv = ``; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(0); + }); + + test("should support varying column counts in headerless mode with columnCountStrategy: keep", () => { + const csv = `Alice,30 +Bob,25,LA +Charlie,35,SF,Extra`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "keep", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(3); + expect(records[0]).toEqual(["Alice", "30"]); + expect(records[1]).toEqual(["Bob", "25", "LA"]); + expect(records[2]).toEqual(["Charlie", "35", "SF", "Extra"]); + }); + }); + + describe("Runtime validation errors", () => { + test("should throw error when header: [] with columnCountStrategy: 'fill'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "fill", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, + ); + }); + + test("should throw error when header: [] with columnCountStrategy: 'sparse'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "sparse", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, + ); + }); + + test("should throw error when header: [] with columnCountStrategy: 'strict'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "strict", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, + ); + }); + + test("should throw error when header: [] with columnCountStrategy: 'truncate'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "array", + columnCountStrategy: "truncate", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) only supports columnCountStrategy: 'keep'/, + ); + }); + + test("should throw error when header: [] with outputFormat: 'object'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "object", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, + ); + }); + }); + }); + }); + + describe("Object Output Format", () => { + describe("columnCountStrategy for object output", () => { + describe("fill strategy (default)", () => { + test("should fill short rows with empty string", () => { + const csv = `Alice,30 +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual({ + name: "Alice", + age: "30", + city: "", + }); // Missing field filled with empty string (fill behavior) + expect(records[1]).toEqual({ name: "Bob", age: "25", city: "LA" }); // Exact match + }); + + test("should fill second row with empty string (regression test)", () => { + const csv = `Alice,30,NY +Bob,25`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); // Exact match + expect(records[1]).toEqual({ name: "Bob", age: "25", city: "" }); // Missing field filled with empty string + }); + + test("should ignore extra fields in long rows", () => { + const csv = `Alice,30,NY,Extra1,Extra2`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); // Extra fields ignored + }); + }); + + describe("sparse strategy", () => { + test("should throw error because sparse is not allowed for object format", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "sparse", + }); + }).toThrow( + "columnCountStrategy 'sparse' is not allowed for object format", + ); + }); + }); + + describe("strict strategy", () => { + test("should throw error if row length doesn't match header length", () => { + const csv = `Alice,30 +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "strict", + }); + + expect(() => { + [...assembler.assemble(tokens)]; + }).toThrow("Expected 3 columns, got 2"); + }); + + test("should throw error if second row is short (regression test)", () => { + const csv = `Alice,30,NY +Bob,25`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "strict", + }); + + expect(() => { + [...assembler.assemble(tokens)]; + }).toThrow("Expected 3 columns, got 2"); + }); + + test("should throw error if row is too long", () => { + const csv = `Alice,30,NY,Extra`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "strict", + }); + + expect(() => { + [...assembler.assemble(tokens)]; + }).toThrow("Expected 3 columns, got 4"); + }); + + test("should not throw error if all rows match header length", () => { + const csv = `Alice,30,NY +Bob,25,LA`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "strict", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(2); + expect(records[0]).toEqual({ name: "Alice", age: "30", city: "NY" }); + expect(records[1]).toEqual({ name: "Bob", age: "25", city: "LA" }); + }); + }); + + describe("truncate strategy", () => { + test("should throw error because truncate is not allowed for object format", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "truncate", + }); + }).toThrowError( + /columnCountStrategy 'truncate' is not allowed for object format/, + ); + }); + }); + + describe("keep strategy", () => { + test("should throw error because keep is not allowed for object format", () => { + expect(() => { + createCSVRecordAssembler({ + header: ["name", "age", "city"] as const, + outputFormat: "object", + columnCountStrategy: "keep", + }); + }).toThrowError( + /columnCountStrategy 'keep' is not allowed for object format/, + ); + }); + }); + + describe("empty fields vs missing fields", () => { + test("should use empty string for empty fields", () => { + const csv = `,x,`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["a", "b", "c"] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual({ a: "", b: "x", c: "" }); // Empty fields → "" + }); + + test("should use empty string for missing fields in short rows (object format)", () => { + const csv = `x`; + + const lexer = new FlexibleStringCSVLexer(); + const tokens = lexer.lex(csv); + + const assembler = createCSVRecordAssembler({ + header: ["a", "b", "c"] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }); + + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]).toEqual({ a: "x", b: "", c: "" }); // Missing fields → "" (fill strategy) + }); + }); + }); + + describe("Headerless mode (header: []) - Runtime Validation Errors", () => { + test("should throw error when header: [] with outputFormat: 'object'", () => { + // Error should be thrown when creating the assembler (not during assembly) + // because header is explicitly provided in options + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "object", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, + ); + }); + + test("should throw error when header: [] with object format and columnCountStrategy: 'fill'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "object", + columnCountStrategy: "fill", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, + ); + }); + + test("should throw error when header: [] with object format and columnCountStrategy: 'sparse'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "object", + columnCountStrategy: "sparse", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, + ); + }); + + test("should throw error when header: [] with object format and columnCountStrategy: 'strict'", () => { + expect(() => + createCSVRecordAssembler({ + header: [] as const, + outputFormat: "object", + columnCountStrategy: "strict", + }), + ).toThrow( + /Headerless mode \(header: \[\]\) is not supported for outputFormat: 'object'/, + ); + }); + }); + }); + + describe("Prototype Pollution Safety (Regression)", () => { + test("should not pollute Object.prototype when __proto__ is used as CSV header", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + // CSV with __proto__ as a header + const csv = "__proto__,name,age\r\nmalicious_value,Alice,30"; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + // Verify the record has __proto__ as its own property + expect(records).toHaveLength(1); + expect(records[0]!).toHaveProperty("__proto__"); + expect(records[0]!.__proto__).toBe("malicious_value"); + expect(records[0]!.name).toBe("Alice"); + expect(records[0]!.age).toBe("30"); + + // CRITICAL: Verify that Object.prototype was NOT polluted + // If prototype pollution occurred, all new objects would have this property + const testObject = {}; + expect(testObject).not.toHaveProperty("malicious_value"); + expect((testObject as any).malicious_value).toBeUndefined(); + + // Verify __proto__ is an own property of the record, not inherited + expect(Object.hasOwn(records[0]!, "__proto__")).toBe(true); + }); + + test("should not pollute when constructor is used as CSV header", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + const csv = "constructor,name\r\nmalicious_value,Alice"; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]!.constructor).toBe("malicious_value"); + expect(records[0]!.name).toBe("Alice"); + + // Verify the constructor property is a string (own property), not the Function constructor + expect(typeof records[0]!.constructor).toBe("string"); + + // Verify constructor is an own property + expect(Object.hasOwn(records[0]!, "constructor")).toBe(true); + + // Verify Object.constructor is not affected + const testObject = {}; + expect(typeof testObject.constructor).toBe("function"); + }); + + test("should not pollute when prototype is used as CSV header", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + const csv = "prototype,name\r\nmalicious_value,Alice"; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]!.prototype).toBe("malicious_value"); + expect(records[0]!.name).toBe("Alice"); + + // Verify prototype is an own property + expect(Object.hasOwn(records[0]!, "prototype")).toBe(true); + }); + + test("should handle multiple dangerous property names together", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + // Multiple potentially dangerous headers in one CSV + const csv = + "__proto__,constructor,prototype,toString,valueOf,hasOwnProperty\r\nv1,v2,v3,v4,v5,v6"; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + const record = records[0]!; + + // All values should be strings (own properties) + expect(record.__proto__).toBe("v1"); + expect(record.constructor).toBe("v2"); + expect(record.prototype).toBe("v3"); + expect(record.toString).toBe("v4"); + expect(record.valueOf).toBe("v5"); + expect(record.hasOwnProperty).toBe("v6"); + + expect(typeof record.__proto__).toBe("string"); + expect(typeof record.constructor).toBe("string"); + expect(typeof record.prototype).toBe("string"); + expect(typeof record.toString).toBe("string"); + expect(typeof record.valueOf).toBe("string"); + expect(typeof record.hasOwnProperty).toBe("string"); + + // Verify no prototype pollution occurred + const testObject = {}; + expect((testObject as any).v1).toBeUndefined(); + expect((testObject as any).v2).toBeUndefined(); + expect((testObject as any).v3).toBeUndefined(); + expect((testObject as any).v4).not.toBe("v4"); // Should be the native function + expect((testObject as any).v5).not.toBe("v5"); // Should be the native function + expect((testObject as any).v6).not.toBe("v6"); // Should be the native function + }); + + test("should handle multiple records with __proto__ header without pollution", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + const csv = + "__proto__,name\r\nvalue1,Alice\r\nvalue2,Bob\r\nvalue3,Charlie"; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(3); + + // Each record should have its own __proto__ value + expect(records[0]!.__proto__).toBe("value1"); + expect(records[1]!.__proto__).toBe("value2"); + expect(records[2]!.__proto__).toBe("value3"); + + // Verify no global pollution after processing multiple records + const testObject = {}; + expect((testObject as any).value1).toBeUndefined(); + expect((testObject as any).value2).toBeUndefined(); + expect((testObject as any).value3).toBeUndefined(); + }); + + test("should verify Object.fromEntries behavior is safe (baseline test)", () => { + // This test documents the safe behavior of Object.fromEntries() + // which is used internally by CSVRecordAssembler + + const dangerousEntries: Array<[string, string]> = [ + ["__proto__", "polluted"], + ["constructor", "malicious"], + ["name", "test"], + ]; + + const obj = Object.fromEntries(dangerousEntries); + + // Verify properties are set as own properties + expect(Object.hasOwn(obj, "__proto__")).toBe(true); + expect(Object.hasOwn(obj, "constructor")).toBe(true); + expect(obj.__proto__!).toBe("polluted"); + expect(obj.constructor!).toBe("malicious"); + + // CRITICAL: Verify no prototype pollution occurred + const testObject = {}; + expect((testObject as any).__proto__).not.toBe("polluted"); + expect((testObject as any).polluted).toBeUndefined(); + expect(typeof testObject.constructor).toBe("function"); // Should be the native Function constructor + }); + + test("should handle edge case with object-like notation in quoted values", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + // Object-like syntax must be quoted to be treated as a single field + const csv = '__proto__,name\r\n"{""polluted"":true}",Alice'; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + // The value should be treated as a plain string + expect(records[0]!.__proto__).toBe('{"polluted":true}'); + expect(records[0]!.name).toBe("Alice"); + + // Verify no pollution + const testObject = {}; + expect((testObject as any).polluted).toBeUndefined(); + }); + + test("should maintain safety with quoted fields containing dangerous names", () => { + const lexer = new FlexibleStringCSVLexer(); + const assembler = createCSVRecordAssembler(); + + // Using quoted fields with dangerous property names + const csv = '"__proto__","constructor"\r\n"evil1","evil2"'; + + const tokens = lexer.lex(csv); + const records = [...assembler.assemble(tokens)]; + + expect(records).toHaveLength(1); + expect(records[0]!.__proto__).toBe("evil1"); + expect(records[0]!.constructor).toBe("evil2"); + + // Verify both are strings (own properties) + expect(typeof records[0]!.__proto__).toBe("string"); + expect(typeof records[0]!.constructor).toBe("string"); + + // Verify no pollution + const testObject = {}; + expect((testObject as any).evil1).toBeUndefined(); + expect((testObject as any).evil2).toBeUndefined(); + }); + }); + + describe("Field Count Limit Protection", () => { + describe("with default field count limit (100000)", () => { + let assembler: CSVRecordAssembler; + beforeEach(() => { + assembler = createCSVRecordAssembler(); + }); + + test("should not throw error for normal field counts", () => { + // In the unified token format, each token represents a field with `delimiter` (Delimiter.Field or Delimiter.Record) and `delimiterLength` + const tokens: AnyToken[] = [ + { + value: "a", + delimiter: Delimiter.Field, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }, + { + value: "b", + delimiter: Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 3, offset: 2 }, + end: { line: 1, column: 4, offset: 3 }, + rowNumber: 1, + }, + }, + ]; + + expect(() => [...assembler.assemble(tokens)]).not.toThrow(); + }); + + test("should throw RangeError when field count exceeds limit during header parsing", () => { + const tokens: AnyToken[] = []; + const maxFields = 100001; + + // Create header with excessive fields using unified token format + for (let i = 0; i < maxFields; i++) { + tokens.push({ + value: `field${i}`, + delimiter: i < maxFields - 1 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: i * 2 + 1, offset: i * 2 }, + end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, + rowNumber: 1, + }, + }); + } + + expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); + }); + + test("should throw RangeError with proper error details", () => { + const tokens: AnyToken[] = []; + const maxFields = 100001; + + for (let i = 0; i < maxFields; i++) { + tokens.push({ + value: `f${i}`, + delimiter: i < maxFields - 1 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + try { + [...assembler.assemble(tokens)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + expect((error as RangeError).message).toContain("Field count"); + expect((error as RangeError).message).toContain( + "exceeded maximum allowed count", + ); + } + }); + }); + + describe("with custom field count limit", () => { + test("should allow exactly N fields when limit is N", () => { + const header = Array.from({ length: 10 }, (_, i) => `col${i}`); + const assembler = createCSVRecordAssembler({ + maxFieldCount: 10, + header, + }); + const tokens: AnyToken[] = []; + + // Create exactly 10 fields (at the limit, should succeed) + for (let i = 0; i < 10; i++) { + tokens.push({ + value: `field${i}`, + delimiter: i < 9 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + // Verify the record was correctly assembled (exactly at the limit) + const records = [...assembler.assemble(tokens)]; + expect(records).toHaveLength(1); + expect(Object.keys(records[0] as object)).toHaveLength(10); + }); + + test("should respect custom maxFieldCount option", () => { + const assembler = createCSVRecordAssembler({ maxFieldCount: 10 }); + const tokens: AnyToken[] = []; + + // Create 11 fields (exceeds limit of 10) + for (let i = 0; i < 11; i++) { + tokens.push({ + value: `f${i}`, + delimiter: i < 10 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); + }); + + test("should allow Number.POSITIVE_INFINITY as maxFieldCount to disable limit", () => { + const assembler = createCSVRecordAssembler({ + maxFieldCount: Number.POSITIVE_INFINITY, + }); + const tokens: AnyToken[] = []; + + // Create 200000 fields (would exceed default limit) + for (let i = 0; i < 200000; i++) { + tokens.push({ + value: `f${i}`, + delimiter: i < 199999 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + // This should not throw, but will take time and memory + expect(() => [...assembler.assemble(tokens)]).not.toThrow(); + }); + }); + + describe("header validation with field count limit", () => { + test("should throw RangeError when provided header exceeds limit", () => { + const largeHeader = Array.from( + { length: 100001 }, + (_, i) => `field${i}`, + ); + + expect(() => createCSVRecordAssembler({ header: largeHeader })).toThrow( + RangeError, + ); + }); + + test("should accept header within limit", () => { + const normalHeader = ["field1", "field2", "field3"]; + + expect(() => + createCSVRecordAssembler({ header: normalHeader }), + ).not.toThrow(); + }); + }); + + describe("realistic attack scenarios", () => { + test("should prevent DoS via CSV with excessive columns", () => { + const assembler = createCSVRecordAssembler({ maxFieldCount: 1000 }); + const tokens: AnyToken[] = []; + + // Simulate attack with 2000 columns + for (let i = 0; i < 2000; i++) { + tokens.push({ + value: "x", + delimiter: i < 1999 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + expect(() => [...assembler.assemble(tokens)]).toThrow(RangeError); + }); + + test("should properly handle CSV within field count limits", () => { + const assembler = createCSVRecordAssembler({ maxFieldCount: 100 }); + const tokens: AnyToken[] = []; + + // Create 50 fields (within limit) - header row + for (let i = 0; i < 50; i++) { + tokens.push({ + value: `field${i}`, + delimiter: i < 49 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + // Add data row with same field count + for (let i = 0; i < 50; i++) { + tokens.push({ + value: `data${i}`, + delimiter: i < 49 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 2, column: 1, offset: 0 }, + end: { line: 2, column: 2, offset: 1 }, + rowNumber: 2, + }, + }); + } + + const records = [...assembler.assemble(tokens)]; + expect(records).toHaveLength(1); + expect(Object.keys(records[0] as object)).toHaveLength(50); + }); + }); + + describe("error message details", () => { + test("should include row number in error message", () => { + const assembler = createCSVRecordAssembler({ maxFieldCount: 5 }); + const tokens: AnyToken[] = []; + + // Create 6 fields (exceeds limit of 5) + for (let i = 0; i < 6; i++) { + tokens.push({ + value: `field${i}`, + delimiter: i < 5 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: i * 2 + 1, offset: i * 2 }, + end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, + rowNumber: 3, + }, + }); + } + + try { + [...assembler.assemble(tokens)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + expect((error as RangeError).message).toContain("at row 3"); + } + }); + + test("should include source in error message when provided", () => { + const assembler = createCSVRecordAssembler({ + maxFieldCount: 5, + source: "data.csv", + }); + const tokens: AnyToken[] = []; + + // Create 6 fields (exceeds limit of 5) + for (let i = 0; i < 6; i++) { + tokens.push({ + value: `field${i}`, + delimiter: i < 5 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: i * 2 + 1, offset: i * 2 }, + end: { line: 1, column: i * 2 + 2, offset: i * 2 + 1 }, + rowNumber: 2, + }, + }); + } + + try { + [...assembler.assemble(tokens)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + expect((error as RangeError).message).toContain('in "data.csv"'); + } + }); + + test("should include both row number and source in error message", () => { + const assembler = createCSVRecordAssembler({ + maxFieldCount: 3, + source: "users.csv", + }); + const tokens: AnyToken[] = []; + + // Create 4 fields (exceeds limit of 3) + for (let i = 0; i < 4; i++) { + tokens.push({ + value: `col${i}`, + delimiter: i < 3 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 10, + }, + }); + } + + try { + [...assembler.assemble(tokens)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + const message = (error as RangeError).message; + expect(message).toContain("at row 10"); + expect(message).toContain('in "users.csv"'); + expect(message).toContain( + "Field count (4) exceeded maximum allowed count of 3", + ); + } + }); + + test("should only include field count info when source is not provided", () => { + const assembler = createCSVRecordAssembler({ maxFieldCount: 2 }); + const tokens: AnyToken[] = []; + + // Create 3 fields (exceeds limit of 2) + for (let i = 0; i < 3; i++) { + tokens.push({ + value: `f${i}`, + delimiter: i < 2 ? Delimiter.Field : Delimiter.Record, + delimiterLength: 1, + location: { + start: { line: 1, column: 1, offset: 0 }, + end: { line: 1, column: 2, offset: 1 }, + rowNumber: 1, + }, + }); + } + + try { + [...assembler.assemble(tokens)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + const message = (error as RangeError).message; + // Should not include source when not provided + expect(message).not.toContain('in "'); + // Should include row number + expect(message).toContain("at row 1"); + expect(message).toContain( + "Field count (3) exceeded maximum allowed count of 2", + ); + } + }); + }); + }); }); diff --git a/src/parser/models/FlexibleStringArrayCSVParser.ts b/src/parser/models/FlexibleStringArrayCSVParser.ts index 4ce9c85a..6f83562d 100644 --- a/src/parser/models/FlexibleStringArrayCSVParser.ts +++ b/src/parser/models/FlexibleStringArrayCSVParser.ts @@ -32,7 +32,7 @@ import { BaseStringCSVParser } from "@/parser/models/base/BaseStringCSVParser.ts * ``` */ export class FlexibleStringArrayCSVParser< - Header extends ReadonlyArray = readonly string[], + const Header extends ReadonlyArray = readonly string[], > extends BaseStringCSVParser implements StringArrayCSVParser
diff --git a/src/parser/models/FlexibleStringCSVLexer.buffer-overflow.test.ts b/src/parser/models/FlexibleStringCSVLexer.buffer-overflow.test.ts deleted file mode 100644 index 86ca5081..00000000 --- a/src/parser/models/FlexibleStringCSVLexer.buffer-overflow.test.ts +++ /dev/null @@ -1,167 +0,0 @@ -import { beforeEach, describe, expect, test } from "vitest"; -import { Field } from "@/core/constants.ts"; -import type { StringCSVLexer } from "@/core/types.ts"; -import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; - -describe("CSVLexer - Buffer Overflow Protection", () => { - describe("with default buffer size (10M characters)", () => { - let lexer: StringCSVLexer; - beforeEach(() => { - lexer = new FlexibleStringCSVLexer(); - }); - - test("should not throw error for normal-sized input", () => { - const data = "a,b,c\n".repeat(1000); - expect(() => [...lexer.lex(data)]).not.toThrow(); - }); - - test("should throw RangeError when buffer exceeds 10M characters", () => { - // Create a large chunk that exceeds 10M characters - const largeChunk = "a".repeat(11 * 1024 * 1024); // 11M characters - - expect(() => [...lexer.lex(largeChunk)]).toThrow(RangeError); - }); - - test("should throw RangeError with proper error details", () => { - const largeChunk = "a".repeat(11 * 1024 * 1024); // 11M characters - - try { - [...lexer.lex(largeChunk)]; - expect.fail("Should have thrown RangeError"); - } catch (error) { - expect(error).toBeInstanceOf(RangeError); - expect((error as RangeError).message).toContain("Buffer size"); - expect((error as RangeError).message).toContain("characters"); - expect((error as RangeError).message).toContain( - "exceeded maximum allowed size", - ); - } - }); - - test("should throw RangeError on incremental buffering attack", () => { - // Simulate streaming attack with many small chunks - const smallChunk = "a".repeat(1024 * 1024); // 1M characters per chunk - - expect(() => { - for (let i = 0; i < 12; i++) { - [...lexer.lex(smallChunk, { stream: true })]; // buffering = true - } - }).toThrow(RangeError); - }); - - test("should throw RangeError on unclosed quoted field", () => { - // Attack vector: unclosed quoted field that accumulates in buffer - const unclosedQuote = `"${"a".repeat(11 * 1024 * 1024)}`; - - expect(() => [...lexer.lex(unclosedQuote, { stream: true })]).toThrow( - RangeError, - ); - }); - }); - - describe("with custom buffer size", () => { - test("should respect custom maxBufferSize option", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 1024 }); // 1K characters limit - const largeChunk = "a".repeat(2048); // 2K characters - - expect(() => [...lexer.lex(largeChunk)]).toThrow(RangeError); - }); - - test("should allow Infinity as maxBufferSize to disable limit", () => { - const lexer = new FlexibleStringCSVLexer({ - maxBufferSize: Number.POSITIVE_INFINITY, - }); - const largeChunk = "a".repeat(20 * 1024 * 1024); // 20M characters - - // This should not throw, but may take some time and memory - // We'll just verify it doesn't throw immediately - expect(() => [...lexer.lex(largeChunk)]).not.toThrow(RangeError); - }); - }); - - describe("buffer size check timing", () => { - test("should check buffer size after each chunk addition", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 100 }); - - // First chunk is within limit - expect(() => [ - ...lexer.lex("a".repeat(50), { stream: true }), - ]).not.toThrow(); - - // Second chunk exceeds limit - expect(() => [...lexer.lex("a".repeat(60), { stream: true })]).toThrow( - RangeError, - ); - }); - - test("should not check buffer size when chunk is empty", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 10 }); - // Pre-fill buffer to near limit - [...lexer.lex("a".repeat(8), { stream: true })]; - - // Empty chunk should not trigger check - expect(() => [...lexer.lex("", { stream: true })]).not.toThrow(); - - // Null chunk should not trigger check - expect(() => [...lexer.lex(undefined, { stream: true })]).not.toThrow(); - }); - }); - - describe("realistic attack scenarios", () => { - test("should prevent DoS via malformed CSV without delimiters", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 1024 * 1024 }); // 1M characters limit - // Malformed CSV that doesn't match any token pattern - const malformedData = "x".repeat(2 * 1024 * 1024); // 2M characters of invalid data - - expect(() => [...lexer.lex(malformedData)]).toThrow(RangeError); - }); - - test("should prevent DoS via streaming incomplete quoted fields", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 512 * 1024 }); // 512K characters limit - - expect(() => { - // Stream chunks of quoted field without closing quote - for (let i = 0; i < 10; i++) { - const chunk = - i === 0 ? `"${"data".repeat(1024 * 30)}` : "data".repeat(1024 * 30); - [...lexer.lex(chunk, { stream: true })]; - } - }).toThrow(RangeError); - }); - - test("should prevent infinite loop with escaped quotes in long field", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 256 * 1024 }); // 256K characters limit - - expect(() => { - // Attack: Field with many escaped quotes that doesn't close - // This simulates the do-while loop scenario mentioned in the security report - const chunk = `"${'""'.repeat(150 * 1024)}`; - [...lexer.lex(chunk, { stream: true })]; - }).toThrow(RangeError); - }); - - test("should handle streaming with escaped quotes that eventually exceeds buffer", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 128 * 1024 }); // 128K characters limit - - expect(() => { - // Stream multiple chunks with escaped quotes - for (let i = 0; i < 5; i++) { - const chunk = - i === 0 ? `"${'""'.repeat(30 * 1024)}` : '""'.repeat(30 * 1024); - [...lexer.lex(chunk, { stream: true })]; - } - }).toThrow(RangeError); - }); - - test("should properly parse valid quoted field with many escaped quotes within limit", () => { - const lexer = new FlexibleStringCSVLexer({ maxBufferSize: 1024 * 1024 }); // 1M characters limit - // Valid field with escaped quotes that closes properly - const validData = `"${'""'.repeat(1000)}"`; - - const tokens = [...lexer.lex(validData)]; - expect(tokens).toHaveLength(1); - expect(tokens[0]?.type).toBe(Field); - expect(tokens[0]?.value).toBe('"'.repeat(1000)); - }); - }); -}); diff --git a/src/parser/models/FlexibleStringCSVLexer.spec.ts b/src/parser/models/FlexibleStringCSVLexer.spec.ts index c6f63724..2ce40237 100644 --- a/src/parser/models/FlexibleStringCSVLexer.spec.ts +++ b/src/parser/models/FlexibleStringCSVLexer.spec.ts @@ -1,29 +1,12 @@ import fc from "fast-check"; import { describe, expect, it } from "vitest"; import { autoChunk, FC } from "@/__tests__/helper.ts"; -import { - COMMA, - DOUBLE_QUOTE, - Field, - FieldDelimiter, - RecordDelimiter, -} from "@/core/constants.ts"; +import { COMMA, Delimiter, DOUBLE_QUOTE } from "@/core/constants.ts"; import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; import { escapeField } from "@/utils/serialization/escapeField.ts"; -const LOCATION_SHAPE = { - start: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - end: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - rowNumber: expect.any(Number), -}; +// Note: We don't use trackLocation in spec tests for performance +// The unit tests verify location tracking works correctly describe("class Lexer", () => { it("should lex with comma as a default field delimiter", () => { @@ -32,24 +15,21 @@ describe("class Lexer", () => { fc.gen().map((g) => { const row = g(FC.row); const csv = row.map((field) => escapeField(field)).join(","); - const expected = [ - ...row.flatMap((field, i) => [ - // if field is empty, it should be ignored - ...(field !== "" - ? [{ type: Field, value: field, location: LOCATION_SHAPE }] - : []), - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== i - ? [ - { - type: FieldDelimiter, - value: COMMA, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + const ambiguousSingleEmpty = + row.length === 1 && + row[0] === "" && + escapeField(row[0]!) === row[0]; + // In unified token format, each token represents a field with its following delimiter + const expected = row.map((field, i) => ({ + value: field, + delimiter: + i === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); + if (ambiguousSingleEmpty) { + // CSV "" (no newline, no quotes) cannot represent a concrete field + expected.pop(); + } return { csv, expected }; }), ({ csv, expected }) => { @@ -70,23 +50,13 @@ describe("class Lexer", () => { // field should be escaped with double quote .map((field) => escapeField(field, { quote: true, quotation: '"' })) .join(","); - const expected = [ - ...row.flatMap((field, i) => [ - // field should be escaped with double quote, so empty field should be - // escaped with double quote - { type: Field, value: field, location: LOCATION_SHAPE }, - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== i - ? [ - { - type: FieldDelimiter, - value: COMMA, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + // In unified token format, each token represents a field with its following delimiter + const expected = row.map((field, i) => ({ + value: field, + delimiter: + i === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); return { csv, expected }; }), ({ csv, expected }) => { @@ -110,24 +80,20 @@ describe("class Lexer", () => { const csv = row .map((field) => escapeField(field, { delimiter })) .join(delimiter); - const expected = [ - ...row.flatMap((field, i) => [ - // if field is empty, it should be ignored - ...(field !== "" || escapeField(field, { delimiter }) !== field - ? [{ type: Field, value: field, location: LOCATION_SHAPE }] - : []), - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== i - ? [ - { - type: FieldDelimiter, - value: delimiter, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + const ambiguousSingleEmpty = + row.length === 1 && + row[0] === "" && + escapeField(row[0]!, { delimiter }) === row[0]; + // In unified token format, each token represents a field with its following delimiter + const expected = row.map((field, i) => ({ + value: field, + delimiter: + i === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); + if (ambiguousSingleEmpty) { + expected.pop(); + } return { delimiter, csv, expected }; }), ({ delimiter, csv, expected }) => { @@ -148,24 +114,20 @@ describe("class Lexer", () => { const csv = row .map((field) => escapeField(field, { quotation })) .join(","); - const expected = [ - ...row.flatMap((field, i) => [ - // if field is empty, it should be ignored - ...(field !== "" - ? [{ type: Field, value: field, location: LOCATION_SHAPE }] - : []), - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== i - ? [ - { - type: FieldDelimiter, - value: COMMA, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + const ambiguousSingleEmpty = + row.length === 1 && + row[0] === "" && + escapeField(row[0]!, { quotation }) === row[0]; + // In unified token format, each token represents a field with its following delimiter + const expected = row.map((field, i) => ({ + value: field, + delimiter: + i === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); + if (ambiguousSingleEmpty) { + expected.pop(); + } return { quotation, csv, expected }; }), ({ quotation, csv, expected }) => { @@ -186,24 +148,20 @@ describe("class Lexer", () => { const csv = row .map((field) => escapeField(field, options)) .join(options.delimiter); - const expected = [ - ...row.flatMap((field, i) => [ - // if field is empty or field is escaped, it should be escaped. - ...(field !== "" || escapeField(field, options) !== field - ? [{ type: Field, value: field, location: LOCATION_SHAPE }] - : []), - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== i - ? [ - { - type: FieldDelimiter, - value: options.delimiter, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + const ambiguousSingleEmpty = + row.length === 1 && + row[0] === "" && + escapeField(row[0]!, options) === row[0]; + // In unified token format, each token represents a field with its following delimiter + const expected = row.map((field, i) => ({ + value: field, + delimiter: + i === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); + if (ambiguousSingleEmpty) { + expected.pop(); + } return { options, row, csv, expected }; }), ({ options, csv, expected }) => { @@ -236,36 +194,26 @@ describe("class Lexer", () => { .join(options.delimiter), ) .join(eol) + (EOF ? eol : ""); - const expected = [ - ...data.flatMap((row, i) => [ - ...row.flatMap((field, j) => [ - // if quote is false and field is empty, it should be ignored - ...(quote || field !== "" - ? [{ type: Field, value: field }] - : []), - // if field is not last field, it should be followed by a field delimiter - ...(row.length - 1 !== j - ? [ - { - type: FieldDelimiter, - value: options.delimiter, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - // if row is not last row, it should be followed by a record delimiter. - ...(data.length - 1 !== i - ? [ - { - type: RecordDelimiter, - value: eol, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + // In unified token format, each token represents a field with its following delimiter + const expected: { value: string; delimiter?: Delimiter }[] = []; + for (let i = 0; i < data.length; i++) { + const row = data[i]!; + for (let j = 0; j < row.length; j++) { + const field = row[j]!; + const isLastFieldInRow = j === row.length - 1; + const _isLastRow = i === data.length - 1; + + // Only add token if field is non-empty or quoted + if (quote || field !== "") { + expected.push({ + value: field, + delimiter: isLastFieldInRow + ? Delimiter.Record + : Delimiter.Field, + }); + } + } + } return { csv, data, options, expected }; }), ({ options, csv, expected }) => { diff --git a/src/parser/models/FlexibleStringCSVLexer.test.ts b/src/parser/models/FlexibleStringCSVLexer.test.ts index 5d2d8b02..16d9a955 100644 --- a/src/parser/models/FlexibleStringCSVLexer.test.ts +++ b/src/parser/models/FlexibleStringCSVLexer.test.ts @@ -1,20 +1,22 @@ import { assert, beforeEach, describe, expect, test } from "vitest"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; -import type { StringCSVLexer } from "@/core/types.ts"; +import { Delimiter } from "@/core/constants.ts"; +import { ParseError } from "@/core/errors.ts"; import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; describe("CSVLexer", () => { - let lexer: StringCSVLexer; + // Use trackLocation: true for tests that verify location tracking + let lexer: FlexibleStringCSVLexer<",", '"', true>; beforeEach(() => { - lexer = new FlexibleStringCSVLexer(); + lexer = new FlexibleStringCSVLexer({ trackLocation: true }); }); test("should parse a field with not escaped", () => { const tokens = lexer.lex("field"); expect([...tokens]).toStrictEqual([ { - type: Field, value: "field", + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 1, column: 6, offset: 5 }, @@ -28,8 +30,9 @@ describe("CSVLexer", () => { const tokens = lexer.lex('"field"'); expect([...tokens]).toStrictEqual([ { - type: Field, value: "field", + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 1, column: 8, offset: 7 }, @@ -40,11 +43,12 @@ describe("CSVLexer", () => { }); test("should parse a field with escaped and delimiter", () => { - const tokens = lexer.lex('"field",'); + const tokens = lexer.lex('"field",next'); expect([...tokens]).toStrictEqual([ { - type: Field, value: "field", + delimiter: Delimiter.Field, + delimiterLength: 1, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 1, column: 8, offset: 7 }, @@ -52,11 +56,12 @@ describe("CSVLexer", () => { }, }, { - type: FieldDelimiter, - value: ",", + value: "next", + delimiter: Delimiter.Record, + delimiterLength: 0, location: { - start: { line: 1, column: 8, offset: 7 }, - end: { line: 1, column: 9, offset: 8 }, + start: { line: 1, column: 9, offset: 8 }, + end: { line: 1, column: 13, offset: 12 }, rowNumber: 1, }, }, @@ -67,8 +72,9 @@ describe("CSVLexer", () => { const tokens = lexer.lex('"fie\nld"\n"Hello\nWorld"'); expect([...tokens]).toStrictEqual([ { - type: Field, value: "fie\nld", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 2, column: 4, offset: 8 }, @@ -76,17 +82,9 @@ describe("CSVLexer", () => { }, }, { - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 2, column: 4, offset: 8 }, - end: { line: 3, column: 1, offset: 9 }, - rowNumber: 1, - }, - }, - { - type: Field, value: "Hello\nWorld", + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start: { line: 3, column: 1, offset: 9 }, end: { line: 4, column: 7, offset: 22 }, @@ -97,11 +95,13 @@ describe("CSVLexer", () => { }); test("should parse a field with escaped and delimiter and record delimiter and EOF(LF)", () => { + // Trailing newline should not create an extra empty record const tokens = lexer.lex('"fie\nld"\nHello World\n'); expect([...tokens]).toStrictEqual([ { - type: Field, value: "fie\nld", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 2, column: 4, offset: 8 }, @@ -109,17 +109,9 @@ describe("CSVLexer", () => { }, }, { - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 2, column: 4, offset: 8 }, - end: { line: 3, column: 1, offset: 9 }, - rowNumber: 1, - }, - }, - { - type: Field, value: "Hello World", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start: { line: 3, column: 1, offset: 9 }, end: { line: 3, column: 12, offset: 20 }, @@ -130,11 +122,13 @@ describe("CSVLexer", () => { }); test("should parse a field with escaped and delimiter and record delimiter and EOF(RCLF)", () => { + // Trailing newline should not create an extra empty record const tokens = lexer.lex('"fie\r\nld"\r\nHello World\r\n'); expect([...tokens]).toStrictEqual([ { - type: Field, value: "fie\r\nld", + delimiter: Delimiter.Record, + delimiterLength: 2, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 2, column: 4, offset: 9 }, @@ -142,17 +136,9 @@ describe("CSVLexer", () => { }, }, { - type: RecordDelimiter, - value: "\r\n", - location: { - start: { line: 2, column: 4, offset: 9 }, - end: { line: 3, column: 1, offset: 11 }, - rowNumber: 1, - }, - }, - { - type: Field, value: "Hello World", + delimiter: Delimiter.Record, + delimiterLength: 2, location: { start: { line: 3, column: 1, offset: 11 }, end: { line: 3, column: 12, offset: 22 }, @@ -166,30 +152,23 @@ describe("CSVLexer", () => { let tokens = lexer.lex("Hello World\nHello ", { stream: true }); expect([...tokens]).toStrictEqual([ { - type: Field, value: "Hello World", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 1, column: 12, offset: 11 }, rowNumber: 1, }, }, - { - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 12, offset: 11 }, - end: { line: 2, column: 1, offset: 12 }, - rowNumber: 1, - }, - }, ]); tokens = lexer.lex("World"); expect([...tokens]).toStrictEqual([ { - type: Field, value: "Hello World", + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start: { line: 2, column: 1, offset: 12 }, end: { line: 2, column: 12, offset: 23 }, @@ -203,30 +182,23 @@ describe("CSVLexer", () => { let tokens = lexer.lex('"Hello World"\n"Hello"', { stream: true }); expect([...tokens]).toStrictEqual([ { - type: Field, value: "Hello World", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start: { line: 1, column: 1, offset: 0 }, end: { line: 1, column: 14, offset: 13 }, rowNumber: 1, }, }, - { - type: RecordDelimiter, - value: "\n", - location: { - start: { line: 1, column: 14, offset: 13 }, - end: { line: 2, column: 1, offset: 14 }, - rowNumber: 1, - }, - }, ]); tokens = lexer.lex('"World"'); expect([...tokens]).toStrictEqual([ { - type: Field, value: 'Hello"World', + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start: { line: 2, column: 1, offset: 14 }, end: { line: 2, column: 15, offset: 28 }, @@ -248,6 +220,7 @@ describe("CSVLexer", () => { controller = new AbortController(); lexer = new FlexibleStringCSVLexer({ signal: controller.signal, + trackLocation: true, }); }); @@ -289,7 +262,7 @@ describe("CSVLexer", () => { const signal = AbortSignal.timeout(0); await waitAbort(signal); - lexer = new FlexibleStringCSVLexer({ signal }); + lexer = new FlexibleStringCSVLexer({ signal, trackLocation: true }); try { [...lexer.lex('"Hello"')]; expect.unreachable(); @@ -317,7 +290,10 @@ describe("CSVLexer", () => { }); test("should include row number in ParseError", () => { - const lexerWithSource = new FlexibleStringCSVLexer(); + // trackLocation: true is required to get rowNumber in errors + const lexerWithSource = new FlexibleStringCSVLexer({ + trackLocation: true, + }); try { // Invalid CSV: unclosed quoted field (missing closing quote before EOF) @@ -332,8 +308,10 @@ describe("CSVLexer", () => { }); test("should include both source and row number in ParseError", () => { + // trackLocation: true is required to get rowNumber in errors const lexerWithSource = new FlexibleStringCSVLexer({ source: "data.csv", + trackLocation: true, }); try { @@ -361,4 +339,233 @@ describe("CSVLexer", () => { } }); }); + + describe("Undefined checks", () => { + test("should handle empty buffer during quoted field parsing with flush", () => { + const lexerWithoutLocation = new FlexibleStringCSVLexer(); + + // Start a quoted field but don't complete it + // This should trigger the undefined check when flush is called + expect(() => { + const gen = lexerWithoutLocation.lex('"incomplete'); + Array.from(gen); + }).toThrow(ParseError); + }); + + test("should parse complete quoted field correctly", () => { + const lexerWithoutLocation = new FlexibleStringCSVLexer(); + + // Process a complete quoted field + const gen = lexerWithoutLocation.lex('"field"'); + const tokens = Array.from(gen); + + // Should successfully parse the complete field + expect(tokens).toHaveLength(1); + expect(tokens[0]?.value).toBe("field"); + }); + + test("should parse a single unquoted field", () => { + const lexerWithoutLocation = new FlexibleStringCSVLexer(); + + // Normal field parsing should work correctly + const gen = lexerWithoutLocation.lex("field"); + const tokens = Array.from(gen); + + expect(tokens).toHaveLength(1); + expect(tokens[0]?.value).toBe("field"); + }); + }); + + describe("Buffer Overflow Protection", () => { + describe("with default buffer size (10M characters)", () => { + let lexerWithoutLocation: FlexibleStringCSVLexer; + beforeEach(() => { + lexerWithoutLocation = new FlexibleStringCSVLexer(); + }); + + test("should not throw error for normal-sized input", () => { + const data = "a,b,c\n".repeat(1000); + expect(() => [...lexerWithoutLocation.lex(data)]).not.toThrow(); + }); + + test("should throw RangeError when buffer exceeds 10M characters", () => { + // Create a large chunk that exceeds 10M characters + const largeChunk = "a".repeat(11 * 1024 * 1024); // 11M characters + + expect(() => [...lexerWithoutLocation.lex(largeChunk)]).toThrow( + RangeError, + ); + }); + + test("should throw RangeError with proper error details", () => { + const largeChunk = "a".repeat(11 * 1024 * 1024); // 11M characters + + try { + [...lexerWithoutLocation.lex(largeChunk)]; + expect.fail("Should have thrown RangeError"); + } catch (error) { + expect(error).toBeInstanceOf(RangeError); + expect((error as RangeError).message).toContain("Buffer size"); + expect((error as RangeError).message).toContain("characters"); + expect((error as RangeError).message).toContain( + "exceeded maximum allowed size", + ); + } + }); + + test("should throw RangeError on incremental buffering attack", () => { + // Simulate streaming attack with many small chunks + const smallChunk = "a".repeat(1024 * 1024); // 1M characters per chunk + + expect(() => { + for (let i = 0; i < 12; i++) { + [...lexerWithoutLocation.lex(smallChunk, { stream: true })]; // buffering = true + } + }).toThrow(RangeError); + }); + + test("should throw RangeError on unclosed quoted field", () => { + // Attack vector: unclosed quoted field that accumulates in buffer + const unclosedQuote = `"${"a".repeat(11 * 1024 * 1024)}`; + + expect(() => [ + ...lexerWithoutLocation.lex(unclosedQuote, { stream: true }), + ]).toThrow(RangeError); + }); + }); + + describe("with custom buffer size", () => { + test("should respect custom maxBufferSize option", () => { + const lexerWithCustomBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 1024, + }); // 1K characters limit + const largeChunk = "a".repeat(2048); // 2K characters + + expect(() => [...lexerWithCustomBuffer.lex(largeChunk)]).toThrow( + RangeError, + ); + }); + + test("should allow Infinity as maxBufferSize to disable limit", () => { + const lexerWithInfiniteBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: Number.POSITIVE_INFINITY, + }); + const largeChunk = "a".repeat(20 * 1024 * 1024); // 20M characters + + // This should not throw, but may take some time and memory + // We'll just verify it doesn't throw immediately + expect(() => [...lexerWithInfiniteBuffer.lex(largeChunk)]).not.toThrow( + RangeError, + ); + }); + }); + + describe("buffer size check timing", () => { + test("should check buffer size after each chunk addition", () => { + const lexerWithSmallBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 100, + }); + + // First chunk is within limit + expect(() => [ + ...lexerWithSmallBuffer.lex("a".repeat(50), { stream: true }), + ]).not.toThrow(); + + // Second chunk exceeds limit + expect(() => [ + ...lexerWithSmallBuffer.lex("a".repeat(60), { stream: true }), + ]).toThrow(RangeError); + }); + + test("should not check buffer size when chunk is empty", () => { + const lexerWithTinyBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 10, + }); + // Pre-fill buffer to near limit + [...lexerWithTinyBuffer.lex("a".repeat(8), { stream: true })]; + + // Empty chunk should not trigger check + expect(() => [ + ...lexerWithTinyBuffer.lex("", { stream: true }), + ]).not.toThrow(); + + // Null chunk should not trigger check + expect(() => [ + ...lexerWithTinyBuffer.lex(undefined, { stream: true }), + ]).not.toThrow(); + }); + }); + + describe("realistic attack scenarios", () => { + test("should prevent DoS via malformed CSV without delimiters", () => { + const lexerWithMediumBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 1024 * 1024, + }); // 1M characters limit + // Malformed CSV that doesn't match any token pattern + const malformedData = "x".repeat(2 * 1024 * 1024); // 2M characters of invalid data + + expect(() => [...lexerWithMediumBuffer.lex(malformedData)]).toThrow( + RangeError, + ); + }); + + test("should prevent DoS via streaming incomplete quoted fields", () => { + const lexerWithMediumBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 512 * 1024, + }); // 512K characters limit + + expect(() => { + // Stream chunks of quoted field without closing quote + for (let i = 0; i < 10; i++) { + const chunk = + i === 0 + ? `"${"data".repeat(1024 * 30)}` + : "data".repeat(1024 * 30); + [...lexerWithMediumBuffer.lex(chunk, { stream: true })]; + } + }).toThrow(RangeError); + }); + + test("should prevent infinite loop with escaped quotes in long field", () => { + const lexerWithMediumBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 256 * 1024, + }); // 256K characters limit + + expect(() => { + // Attack: Field with many escaped quotes that doesn't close + // This simulates the do-while loop scenario mentioned in the security report + const chunk = `"${'""'.repeat(150 * 1024)}`; + [...lexerWithMediumBuffer.lex(chunk, { stream: true })]; + }).toThrow(RangeError); + }); + + test("should handle streaming with escaped quotes that eventually exceeds buffer", () => { + const lexerWithMediumBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 128 * 1024, + }); // 128K characters limit + + expect(() => { + // Stream multiple chunks with escaped quotes + for (let i = 0; i < 5; i++) { + const chunk = + i === 0 ? `"${'""'.repeat(30 * 1024)}` : '""'.repeat(30 * 1024); + [...lexerWithMediumBuffer.lex(chunk, { stream: true })]; + } + }).toThrow(RangeError); + }); + + test("should properly parse valid quoted field with many escaped quotes within limit", () => { + const lexerWithLargeBuffer = new FlexibleStringCSVLexer({ + maxBufferSize: 1024 * 1024, + }); // 1M characters limit + // Valid field with escaped quotes that closes properly + const validData = `"${'""'.repeat(1000)}"`; + + const tokens = [...lexerWithLargeBuffer.lex(validData)]; + expect(tokens).toHaveLength(1); + expect(tokens[0]?.delimiter).toBe(Delimiter.Record); + expect(tokens[0]?.value).toBe('"'.repeat(1000)); + }); + }); + }); }); diff --git a/src/parser/models/FlexibleStringCSVLexer.ts b/src/parser/models/FlexibleStringCSVLexer.ts index dca2f24e..c8998e8f 100644 --- a/src/parser/models/FlexibleStringCSVLexer.ts +++ b/src/parser/models/FlexibleStringCSVLexer.ts @@ -1,51 +1,69 @@ import { - CRLF, DEFAULT_DELIMITER, DEFAULT_LEXER_MAX_BUFFER_SIZE, DEFAULT_QUOTATION, - Field, - FieldDelimiter, + Delimiter, LF, - RecordDelimiter, } from "@/core/constants.ts"; import { ParseError } from "@/core/errors.ts"; import type { - AbortSignalOptions, - CommonOptions, CSVLexerLexOptions, Position, - RecordDelimiterToken, StringCSVLexer, + StringCSVLexerOptions, Token, } from "@/core/types.ts"; -import { escapeRegExp } from "@/helpers/string/escapeRegExp.ts"; +import { ReusableArrayPool } from "@/utils/memory/ReusableArrayPool.ts"; import { assertCommonOptions } from "@/utils/validation/assertCommonOptions.ts"; /** * Flexible String CSV Lexer implementation. * - * A balanced implementation that tokenizes CSV data into fields and records, - * optimizing for both performance and memory efficiency. + * An optimized lexer that emits unified field tokens, reducing token count by 50%. + * Instead of separate Field, FieldDelimiter, and RecordDelimiter tokens, + * only field tokens are emitted with the `delimiter` property indicating what follows. * * @remarks - * This implementation is designed to handle various CSV formats flexibly - * while maintaining good performance characteristics. For specialized use cases, - * future implementations may provide optimizations for specific scenarios - * (e.g., speed-optimized, memory-optimized). + * This implementation provides better performance by reducing object allocation + * and simplifying the token stream. + * + * @template Delimiter - The field delimiter character (default: ',') + * @template Quotation - The quotation character (default: '"') + * @template TrackLocation - Whether to include location in tokens (default: false) */ +// Character codes for fast comparison +const CR = 13; // '\r' +const LF_CODE = 10; // '\n' + export class FlexibleStringCSVLexer< - Delimiter extends string = DEFAULT_DELIMITER, + DelimiterType extends string = DEFAULT_DELIMITER, Quotation extends string = DEFAULT_QUOTATION, -> implements StringCSVLexer + TrackLocation extends boolean = false, +> implements StringCSVLexer { #delimiter: string; #quotation: string; #buffer = ""; + #bufferOffset = 0; #flush = false; - #matcher: RegExp; #fieldDelimiterLength: number; #maxBufferSize: number; + #trackLocation: boolean; + + // Pre-computed character codes for fast comparison + #delimiterCode: number; + #quotationCode: number; + + // Track whether we need to emit an empty EOF token after trailing field delimiter + #pendingTrailingFieldEOF = false; + + // For lazy position tracking in no-location mode (streaming support) + // Tracks cumulative line number at the start of current buffer + #baseLineNumber = 1; + // Tracks column position at the start of current buffer (when truncation is mid-line) + #baseColumn = 1; + // Only used when trackLocation is true #cursor: Position = { line: 1, column: 1, @@ -55,18 +73,21 @@ export class FlexibleStringCSVLexer< #signal?: AbortSignal | undefined; #source?: string | undefined; + #segmentPool: ReusableArrayPool; + static readonly #SEGMENT_POOL_LIMIT = 32; - /** - * Constructs a new CSVLexer instance. - * @param options - The common options for the lexer. - */ constructor( - options: CommonOptions & AbortSignalOptions = {}, + options: StringCSVLexerOptions< + DelimiterType, + Quotation, + TrackLocation + > = {}, ) { const { delimiter = DEFAULT_DELIMITER, quotation = DEFAULT_QUOTATION, maxBufferSize = DEFAULT_LEXER_MAX_BUFFER_SIZE, + trackLocation = false as TrackLocation, signal, source, } = options; @@ -75,13 +96,66 @@ export class FlexibleStringCSVLexer< this.#quotation = quotation; this.#fieldDelimiterLength = delimiter.length; this.#maxBufferSize = maxBufferSize; + this.#trackLocation = trackLocation; this.#source = source; this.#signal = signal; - const d = escapeRegExp(delimiter); - const q = escapeRegExp(quotation); - this.#matcher = new RegExp( - `^(?:(?!${q})(?!${d})(?![\\r\\n]))([\\S\\s\\uFEFF\\xA0]+?)(?=${q}|${d}|\\r|\\n|$)`, + this.#segmentPool = new ReusableArrayPool( + FlexibleStringCSVLexer.#SEGMENT_POOL_LIMIT, ); + + // Pre-compute character codes + this.#delimiterCode = delimiter.charCodeAt(0); + this.#quotationCode = quotation.charCodeAt(0); + } + + // ==================== Common Helper Methods ==================== + + /** + * Computes line and column position by scanning the buffer from start to current offset. + * Used for lazy position tracking - only called when an error occurs. + * This is O(n) but errors are rare, so the cost is acceptable. + */ + #computePositionFromBuffer(): { line: number; column: number } { + // Start with cumulative line/column from previously truncated buffer portions + let line = this.#baseLineNumber; + let column = this.#baseColumn; + const end = this.#bufferOffset; + for (let i = 0; i < end; i++) { + if (this.#buffer.charCodeAt(i) === LF_CODE) { + line++; + column = 1; + } else { + column++; + } + } + return { line, column }; + } + + /** + * Throws a ParseError for unexpected EOF while parsing quoted field. + * Uses lazy position tracking - computes position only when error occurs. + */ + #throwUnexpectedEOF(): never { + const pos = this.#computePositionFromBuffer(); + throw new ParseError( + `Unexpected EOF while parsing quoted field at line ${pos.line}, column ${pos.column}.`, + { + position: { ...pos, offset: this.#bufferOffset }, + source: this.#source, + }, + ); + } + + /** + * Throws a ParseError for unexpected EOF while parsing quoted field (with location). + * Used by location-tracking parsing paths. + */ + #throwUnexpectedEOFWithLocation(): never { + throw new ParseError("Unexpected EOF while parsing quoted field.", { + position: { ...this.#cursor }, + rowNumber: this.#rowNumber, + source: this.#source, + }); } /** @@ -93,13 +167,33 @@ export class FlexibleStringCSVLexer< public lex( chunk?: string, options?: CSVLexerLexOptions, - ): IterableIterator { + ): IterableIterator> { const stream = options?.stream ?? false; if (!stream) { this.#flush = true; } if (chunk !== undefined && chunk.length !== 0) { + // Clear pending trailing flag since we're adding more data + this.#pendingTrailingFieldEOF = false; + if (this.#bufferOffset > 0) { + // Update base position for lazy position tracking before truncating + // Track both line and column through the truncated portion + if (!this.#trackLocation) { + let column = this.#baseColumn; + for (let i = 0; i < this.#bufferOffset; i++) { + if (this.#buffer.charCodeAt(i) === LF_CODE) { + this.#baseLineNumber++; + column = 1; + } else { + column++; + } + } + this.#baseColumn = column; + } + this.#buffer = this.#buffer.slice(this.#bufferOffset); + this.#bufferOffset = 0; + } this.#buffer += chunk; this.#checkBufferSize(); } @@ -107,29 +201,13 @@ export class FlexibleStringCSVLexer< return this.#tokens(); } - /** - * Generates tokens from the buffered CSV data. - * @yields Tokens from the buffered CSV data. - */ - *#tokens(): Generator { - if (this.#flush) { - // Trim the last CRLF or LF - if (this.#buffer.endsWith(CRLF)) { - this.#buffer = this.#buffer.slice(0, -2 /* -CRLF.length */); - } else if (this.#buffer.endsWith(LF)) { - this.#buffer = this.#buffer.slice(0, -1 /* -LF.length */); - } - } - let token: Token | null; - while ((token = this.#nextToken())) { + *#tokens(): Generator> { + let token: Token | null; + while ((token = this.#nextField())) { yield token; } } - /** - * Checks if the buffer size exceeds the maximum allowed size. - * @throws {RangeError} If the buffer size exceeds the maximum. - */ #checkBufferSize(): void { if (this.#buffer.length > this.#maxBufferSize) { throw new RangeError( @@ -138,152 +216,630 @@ export class FlexibleStringCSVLexer< } } - /** - * Retrieves the next token from the buffered CSV data. - * @returns The next token or null if there are no more tokens. - */ - #nextToken(): Token | null { + #nextField(): Token | null { this.#signal?.throwIfAborted(); - if (this.#buffer.length === 0) { + const remainingLen = this.#buffer.length - this.#bufferOffset; + if (remainingLen === 0) { + // Emit empty token when flushing + if (this.#flush) { + // Emit empty token after trailing field delimiter (e.g., ",x," -> 3 fields) + if (this.#pendingTrailingFieldEOF) { + this.#pendingTrailingFieldEOF = false; + if (this.#trackLocation) { + return { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start: { ...this.#cursor }, + end: { ...this.#cursor }, + rowNumber: this.#rowNumber, + }, + } as Token; + } + return { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 0, + } as Token; + } + } return null; } - // Buffer is Record Delimiter, defer to the next iteration. + + if (!this.#trackLocation) { + return this.#nextFieldNoLocation() as Token; + } + return this.#nextFieldWithLocation() as Token; + } + + // ==================== No-Location Parsing Methods ==================== + + /** + * Fast path: Parse next field without location tracking. + */ + #nextFieldNoLocation(): Token | null { + const code = this.#buffer.charCodeAt(this.#bufferOffset); + + // Empty field at start of record or between delimiters + if (code === this.#delimiterCode) { + this.#bufferOffset += this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = this.#bufferOffset >= this.#buffer.length; + return { + value: "", + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + }; + } + + // Empty field at end of record (CRLF) if ( - this.#flush === false && - (this.#buffer === CRLF || this.#buffer === LF) + code === CR && + this.#buffer.charCodeAt(this.#bufferOffset + 1) === LF_CODE ) { + this.#bufferOffset += 2; + return { value: "", delimiter: Delimiter.Record, delimiterLength: 2 }; + } + + // Empty field at end of record (LF) + if (code === LF_CODE) { + this.#bufferOffset += 1; + return { value: "", delimiter: Delimiter.Record, delimiterLength: 1 }; + } + + // Parse quoted field + if (code === this.#quotationCode) { + return this.#parseQuotedFieldNoLocation(); + } + + // Parse unquoted field + return this.#parseUnquotedFieldNoLocation(); + } + + #parseQuotedFieldNoLocation(): Token | null { + const segments = this.#borrowSegments(); + try { + const baseOffset = this.#bufferOffset; + const buf = this.#buffer; + const quotCode = this.#quotationCode; + let localOffset = 1; + let segmentStart = localOffset; + + let curCode = buf.charCodeAt(baseOffset + localOffset); + if (Number.isNaN(curCode)) { + if (!this.#flush) { + return null; + } + this.#throwUnexpectedEOF(); + } + + let nextCode = buf.charCodeAt(baseOffset + localOffset + 1); + do { + if (curCode === quotCode) { + if (nextCode === quotCode) { + // Escaped quote + segments.push( + buf.slice( + baseOffset + segmentStart, + baseOffset + localOffset + 1, + ), + ); + localOffset += 2; + segmentStart = localOffset; + curCode = buf.charCodeAt(baseOffset + localOffset); + nextCode = buf.charCodeAt(baseOffset + localOffset + 1); + continue; + } + + if (Number.isNaN(nextCode) && !this.#flush) { + return null; + } + + // End of quoted field - collect value + if (localOffset > segmentStart) { + segments.push( + buf.slice(baseOffset + segmentStart, baseOffset + localOffset), + ); + } + + const value = + segments.length === 1 ? segments[0]! : segments.join(""); + localOffset++; // skip closing quote + + // Inline delimiter determination using charCodeAt + const delimPos = baseOffset + localOffset; + const delimCode = buf.charCodeAt(delimPos); + + if (Number.isNaN(delimCode)) { + if (!this.#flush) { + return null; + } + this.#bufferOffset = delimPos; + return { value, delimiter: Delimiter.Record, delimiterLength: 0 }; + } + + // Field delimiter + if (delimCode === this.#delimiterCode) { + this.#bufferOffset = delimPos + this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = + this.#bufferOffset >= this.#buffer.length; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + }; + } + + // Record delimiter (CRLF) + if (delimCode === CR) { + const nextCode = buf.charCodeAt(delimPos + 1); + if (Number.isNaN(nextCode)) { + if (!this.#flush) { + return null; + } + } else if (nextCode === LF_CODE) { + this.#bufferOffset = delimPos + 2; + return { value, delimiter: Delimiter.Record, delimiterLength: 2 }; + } + } + + // Record delimiter (LF) + if (delimCode === LF_CODE) { + this.#bufferOffset = delimPos + 1; + return { value, delimiter: Delimiter.Record, delimiterLength: 1 }; + } + + // EOF + this.#bufferOffset = delimPos; + return { value, delimiter: Delimiter.Record, delimiterLength: 0 }; + } + + localOffset++; + curCode = nextCode; + nextCode = buf.charCodeAt(baseOffset + localOffset + 1); + } while (!Number.isNaN(curCode)); + + if (this.#flush) { + this.#throwUnexpectedEOF(); + } + return null; + } finally { + this.#releaseSegments(segments); + } + } + + #parseUnquotedFieldNoLocation(): Token | null { + const startOffset = this.#bufferOffset; + const bufLen = this.#buffer.length; + const buf = this.#buffer; + const delimCode = this.#delimiterCode; + const quotCode = this.#quotationCode; + let localEnd = 0; + + while (startOffset + localEnd < bufLen) { + const code = buf.charCodeAt(startOffset + localEnd); + + // Field delimiter - inline determination + if (code === delimCode) { + const value = buf.slice(startOffset, startOffset + localEnd); + this.#bufferOffset = + startOffset + localEnd + this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = this.#bufferOffset >= bufLen; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + }; + } + + // Record delimiter (CRLF) + if ( + code === CR && + buf.charCodeAt(startOffset + localEnd + 1) === LF_CODE + ) { + const value = buf.slice(startOffset, startOffset + localEnd); + this.#bufferOffset = startOffset + localEnd + 2; + return { value, delimiter: Delimiter.Record, delimiterLength: 2 }; + } + + // Record delimiter (LF) + if (code === LF_CODE) { + const value = buf.slice(startOffset, startOffset + localEnd); + this.#bufferOffset = startOffset + localEnd + 1; + return { value, delimiter: Delimiter.Record, delimiterLength: 1 }; + } + + // Quotation in middle of unquoted field - RFC 4180 violation but we handle it + // by throwing an error since quotes shouldn't appear in unquoted fields + if (code === quotCode) { + // Continue scanning until we find a delimiter or EOL, then throw error + // This will be caught when parsePartialQuotedFieldNoLocation finds no closing quote + return this.#parsePartialQuotedFieldNoLocation(startOffset, localEnd); + } + + localEnd++; + } + + // End of buffer + if (!this.#flush) { + return null; + } + + // EOF + const value = buf.slice(startOffset, startOffset + localEnd); + this.#bufferOffset = startOffset + localEnd; + return { value, delimiter: Delimiter.Record, delimiterLength: 0 }; + } + + /** + * Parse a field that starts unquoted but contains a quoted section. + * E.g., `a"quoted"b` or `a"unclosed` (which throws error) + */ + #parsePartialQuotedFieldNoLocation( + startOffset: number, + prefixLen: number, + ): Token | null { + const buf = this.#buffer; + const quotCode = this.#quotationCode; + const segments = this.#borrowSegments(); + try { + // Add prefix (unquoted part before the quote) + if (prefixLen > 0) { + segments.push(buf.slice(startOffset, startOffset + prefixLen)); + } + + let localOffset = prefixLen + 1; // Skip opening quote + let segmentStart = localOffset; + + let curCode = buf.charCodeAt(startOffset + localOffset); + if (Number.isNaN(curCode)) { + if (!this.#flush) { + return null; + } + this.#throwUnexpectedEOF(); + } + + let nextCode = buf.charCodeAt(startOffset + localOffset + 1); + + while (!Number.isNaN(curCode)) { + if (curCode === quotCode) { + if (nextCode === quotCode) { + // Escaped quote + segments.push( + buf.slice( + startOffset + segmentStart, + startOffset + localOffset + 1, + ), + ); + localOffset += 2; + segmentStart = localOffset; + curCode = buf.charCodeAt(startOffset + localOffset); + nextCode = buf.charCodeAt(startOffset + localOffset + 1); + continue; + } + + if (Number.isNaN(nextCode) && !this.#flush) { + return null; + } + + // End of quoted section - collect value + if (localOffset > segmentStart) { + segments.push( + buf.slice(startOffset + segmentStart, startOffset + localOffset), + ); + } + + localOffset++; // skip closing quote + + // Check what follows the quoted section + const afterCode = buf.charCodeAt(startOffset + localOffset); + const value = segments.join(""); + + if (Number.isNaN(afterCode)) { + if (!this.#flush) { + return null; + } + this.#bufferOffset = startOffset + localOffset; + return { value, delimiter: Delimiter.Record, delimiterLength: 0 }; + } + + // Continue parsing if more unquoted content follows + if ( + !Number.isNaN(afterCode) && + afterCode !== this.#delimiterCode && + afterCode !== CR && + afterCode !== LF_CODE + ) { + // Recursively handle the rest (could have more quoted sections) + this.#bufferOffset = startOffset + localOffset; + const rest = this.#parseUnquotedFieldNoLocation(); + if (rest === null) { + return null; + } + segments.push(rest.value); + return { + value: segments.join(""), + delimiter: rest.delimiter, + delimiterLength: rest.delimiterLength, + }; + } + + // Field delimiter + if (afterCode === this.#delimiterCode) { + this.#bufferOffset = + startOffset + localOffset + this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = + this.#bufferOffset >= this.#buffer.length; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + }; + } + + // Record delimiter (CRLF) + if (afterCode === CR) { + const nextAfter = buf.charCodeAt(startOffset + localOffset + 1); + if (Number.isNaN(nextAfter)) { + if (!this.#flush) { + return null; + } + } else if (nextAfter === LF_CODE) { + this.#bufferOffset = startOffset + localOffset + 2; + return { value, delimiter: Delimiter.Record, delimiterLength: 2 }; + } + } + + // Record delimiter (LF) + if (afterCode === LF_CODE) { + this.#bufferOffset = startOffset + localOffset + 1; + return { value, delimiter: Delimiter.Record, delimiterLength: 1 }; + } + + // EOF + this.#bufferOffset = startOffset + localOffset; + return { value, delimiter: Delimiter.Record, delimiterLength: 0 }; + } + + localOffset++; + curCode = nextCode; + nextCode = buf.charCodeAt(startOffset + localOffset + 1); + } + + if (this.#flush) { + this.#throwUnexpectedEOF(); + } return null; + } finally { + this.#releaseSegments(segments); } + } + + // ==================== Location-Tracking Parsing Methods ==================== + + /** + * Full path: Parse next field with location tracking. + */ + #nextFieldWithLocation(): Token | null { + const firstChar = this.#buffer[this.#bufferOffset]; - // Check for CRLF - if (this.#buffer.startsWith(CRLF)) { - this.#buffer = this.#buffer.slice(2); + // Empty field at start of record or between delimiters + if (firstChar === this.#delimiter) { const start: Position = { ...this.#cursor }; - this.#cursor.line++; - this.#cursor.column = 1; - this.#cursor.offset += 2; // CRLF.length - const token: RecordDelimiterToken = { - type: RecordDelimiter, - value: CRLF, + this.#bufferOffset += this.#fieldDelimiterLength; + this.#cursor.column += this.#fieldDelimiterLength; + this.#cursor.offset += this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = this.#bufferOffset >= this.#buffer.length; + return { + value: "", + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, location: { start, end: { ...this.#cursor }, - rowNumber: this.#rowNumber++, + rowNumber: this.#rowNumber, }, }; - return token; } - // Check for LF - if (this.#buffer.startsWith(LF)) { - this.#buffer = this.#buffer.slice(1); + // Empty field at end of record (CRLF) + if (firstChar === "\r" && this.#buffer[this.#bufferOffset + 1] === "\n") { const start: Position = { ...this.#cursor }; + this.#bufferOffset += 2; this.#cursor.line++; this.#cursor.column = 1; - this.#cursor.offset += 1; // LF.length - const token: RecordDelimiterToken = { - type: RecordDelimiter, - value: LF, + this.#cursor.offset += 2; + const rowNum = this.#rowNumber++; + return { + value: "", + delimiter: Delimiter.Record, + delimiterLength: 2, location: { start, end: { ...this.#cursor }, - rowNumber: this.#rowNumber++, + rowNumber: rowNum, }, }; - return token; } - // Check for Delimiter - if (this.#buffer.startsWith(this.#delimiter)) { - this.#buffer = this.#buffer.slice(1); + // Empty field at end of record (LF) + if (firstChar === "\n") { const start: Position = { ...this.#cursor }; - this.#cursor.column += this.#fieldDelimiterLength; - this.#cursor.offset += this.#fieldDelimiterLength; + this.#bufferOffset += 1; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 1; + const rowNum = this.#rowNumber++; return { - type: FieldDelimiter, - value: this.#delimiter, + value: "", + delimiter: Delimiter.Record, + delimiterLength: 1, location: { start, end: { ...this.#cursor }, - rowNumber: this.#rowNumber, + rowNumber: rowNum, }, }; } - // Check for Quoted String - if (this.#buffer.startsWith(this.#quotation)) { - /** - * Extract Quoted field. - * - * The following code is equivalent to the following: - * - * If the next character is a quote: - * - If the character after that is a quote, then append a quote to the value and skip two characters. - * - Otherwise, return the quoted string. - * Otherwise, append the character to the value and skip one character. - * - * ```plaintext - * | `i` | `i + 1` | `i + 2` | - * |------------|------------|----------| - * | cur | next | | => Variable names - * | #quotation | #quotation | | => Escaped quote - * | #quotation | (EOF) | | => Closing quote - * | #quotation | undefined | | => End of buffer - * | undefined | | | => End of buffer - * ``` - */ - let value = ""; - let offset = 1; // Skip the opening quote - let column = 2; // Skip the opening quote - let line = 0; + // Parse quoted field + if (firstChar === this.#quotation) { + return this.#parseQuotedFieldWithLocation(); + } + + // Parse unquoted field + return this.#parseUnquotedFieldWithLocation(); + } + + #parseQuotedFieldWithLocation(): Token | null { + const start: Position = { ...this.#cursor }; + const baseOffset = this.#bufferOffset; + let localOffset = 1; + let column = 2; + let line = 0; - // Define variables - let cur: string | undefined = this.#buffer[offset]; + const segments = this.#borrowSegments(); + try { + let segmentStart = localOffset; + + let cur: string | undefined = this.#buffer[baseOffset + localOffset]; if (cur === undefined) { - if (this.#flush === false) { + if (!this.#flush) { return null; } - throw new ParseError("Unexpected EOF while parsing quoted field.", { - position: { ...this.#cursor }, - rowNumber: this.#rowNumber, - source: this.#source, - }); + this.#throwUnexpectedEOFWithLocation(); } - let next: string | undefined = this.#buffer[offset + 1]; + + let next: string | undefined = this.#buffer[baseOffset + localOffset + 1]; + do { - // If the current character is a quote, check the next characters for closing quotes. if (cur === this.#quotation) { - // If the cur character is a quote and the next character is a quote, - // then append a quote to the value and skip two characters. if (next === this.#quotation) { - // Append a quote to the value and skip two characters. - value += this.#quotation; - offset += 2; - cur = this.#buffer[offset]; - next = this.#buffer[offset + 1]; - - // Update the diff + segments.push( + this.#buffer.slice( + baseOffset + segmentStart, + baseOffset + localOffset + 1, + ), + ); + localOffset += 2; + segmentStart = localOffset; + cur = this.#buffer[baseOffset + localOffset]; + next = this.#buffer[baseOffset + localOffset + 1]; column += 2; continue; } - // If the cur character is a quote and the next character is undefined, - // then return null. - if (next === undefined && this.#flush === false) { + if (next === undefined && !this.#flush) { + return null; + } + + // End of quoted field - collect value + if (localOffset > segmentStart) { + segments.push( + this.#buffer.slice( + baseOffset + segmentStart, + baseOffset + localOffset, + ), + ); + } + + const value = + segments.length === 1 ? segments[0]! : segments.join(""); + localOffset++; // skip closing quote + const nextChar = this.#buffer[baseOffset + localOffset]; + + if (nextChar === undefined) { + if (!this.#flush) { + return null; + } + this.#cursor.column += column; + this.#cursor.offset += localOffset; + this.#cursor.line += line; + this.#bufferOffset = baseOffset + localOffset; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start, + end: { ...this.#cursor }, + rowNumber: this.#rowNumber, + }, + }; + } + + if ( + nextChar === "\r" && + this.#buffer[baseOffset + localOffset + 1] === undefined && + !this.#flush + ) { return null; } - // Otherwise, return the quoted string. - // Update the buffer and return the token - offset++; - this.#buffer = this.#buffer.slice(offset); - const start: Position = { ...this.#cursor }; this.#cursor.column += column; - this.#cursor.offset += offset; + this.#cursor.offset += localOffset; this.#cursor.line += line; + + // Inline delimiter determination + + // Field delimiter + if (nextChar === this.#delimiter) { + const end: Position = { ...this.#cursor }; + this.#bufferOffset = + baseOffset + localOffset + this.#fieldDelimiterLength; + this.#cursor.column += this.#fieldDelimiterLength; + this.#cursor.offset += this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = + this.#bufferOffset >= this.#buffer.length; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + location: { start, end, rowNumber: this.#rowNumber }, + }; + } + + // Record delimiter (CRLF) + if ( + nextChar === "\r" && + this.#buffer[baseOffset + localOffset + 1] === "\n" + ) { + const end: Position = { ...this.#cursor }; + this.#bufferOffset = baseOffset + localOffset + 2; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 2; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 2, + location: { start, end, rowNumber: rowNum }, + }; + } + + // Record delimiter (LF) + if (nextChar === "\n") { + const end: Position = { ...this.#cursor }; + this.#bufferOffset = baseOffset + localOffset + 1; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 1; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 1, + location: { start, end, rowNumber: rowNum }, + }; + } + + // EOF + this.#bufferOffset = baseOffset + localOffset; return { - type: Field, value, + delimiter: Delimiter.Record, + delimiterLength: 0, location: { start, end: { ...this.#cursor }, @@ -292,63 +848,349 @@ export class FlexibleStringCSVLexer< }; } - // Append the character to the value. - value += cur; - - // Prepare for the next iteration if (cur === LF) { - // If the current character is a LF, - // then increment the line number and reset the column number. line++; column = 1; } else { - // Otherwise, increment the column number and offset. column++; } - offset++; + localOffset++; cur = next; - next = this.#buffer[offset + 1]; + next = this.#buffer[baseOffset + localOffset + 1]; } while (cur !== undefined); if (this.#flush) { - throw new ParseError("Unexpected EOF while parsing quoted field.", { - position: { ...this.#cursor }, - rowNumber: this.#rowNumber, - source: this.#source, - }); + this.#throwUnexpectedEOFWithLocation(); } return null; + } finally { + this.#releaseSegments(segments); } + } + + #parseUnquotedFieldWithLocation(): Token | null { + const start: Position = { ...this.#cursor }; + const startOffset = this.#bufferOffset; + const bufLen = this.#buffer.length; + let localEnd = 0; + + while (startOffset + localEnd < bufLen) { + const ch = this.#buffer[startOffset + localEnd]; - // Check for Unquoted String - const match = this.#matcher.exec(this.#buffer); - if (match) { - // If we're flushing and the match doesn't consume the entire buffer, - // then return null - if (this.#flush === false && match[0].length === this.#buffer.length) { - return null; + // Field delimiter - inline determination + if (ch === this.#delimiter) { + const value = this.#buffer.slice(startOffset, startOffset + localEnd); + this.#cursor.column += localEnd; + this.#cursor.offset += localEnd; + const end: Position = { ...this.#cursor }; + this.#bufferOffset = + startOffset + localEnd + this.#fieldDelimiterLength; + this.#cursor.column += this.#fieldDelimiterLength; + this.#cursor.offset += this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = this.#bufferOffset >= bufLen; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + location: { start, end, rowNumber: this.#rowNumber }, + }; } - const value = match[1]; - if (value === undefined) { - return null; + + // Record delimiter (CRLF) + if (ch === "\r" && this.#buffer[startOffset + localEnd + 1] === "\n") { + const value = this.#buffer.slice(startOffset, startOffset + localEnd); + this.#cursor.column += localEnd; + this.#cursor.offset += localEnd; + const end: Position = { ...this.#cursor }; + this.#bufferOffset = startOffset + localEnd + 2; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 2; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 2, + location: { start, end, rowNumber: rowNum }, + }; } - this.#buffer = this.#buffer.slice(value.length); - const start: Position = { ...this.#cursor }; - this.#cursor.column += value.length; - this.#cursor.offset += value.length; - return { - type: Field, - value, - location: { + + // Record delimiter (LF) + if (ch === "\n") { + const value = this.#buffer.slice(startOffset, startOffset + localEnd); + this.#cursor.column += localEnd; + this.#cursor.offset += localEnd; + const end: Position = { ...this.#cursor }; + this.#bufferOffset = startOffset + localEnd + 1; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 1; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 1, + location: { start, end, rowNumber: rowNum }, + }; + } + + // Quotation in middle of unquoted field - parse as quoted section + if (ch === this.#quotation) { + return this.#parsePartialQuotedFieldWithLocation( start, - end: { ...this.#cursor }, - rowNumber: this.#rowNumber, - }, - }; + startOffset, + localEnd, + ); + } + + localEnd++; + } + + // End of buffer + if (!this.#flush) { + return null; + } + + // EOF + const value = this.#buffer.slice(startOffset, startOffset + localEnd); + this.#bufferOffset = startOffset + localEnd; + this.#cursor.column += localEnd; + this.#cursor.offset += localEnd; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start, + end: { ...this.#cursor }, + rowNumber: this.#rowNumber, + }, + }; + } + + /** + * Parse a field that starts unquoted but contains a quoted section (with location tracking). + * E.g., `a"quoted"b` or `a"unclosed` (which throws error) + */ + #parsePartialQuotedFieldWithLocation( + start: Position, + startOffset: number, + prefixLen: number, + ): Token | null { + const buf = this.#buffer; + const quotation = this.#quotation; + const segments = this.#borrowSegments(); + try { + // Add prefix (unquoted part before the quote) + if (prefixLen > 0) { + segments.push(buf.slice(startOffset, startOffset + prefixLen)); + } + + // Update cursor for the prefix and opening quote + this.#cursor.column += prefixLen + 1; + this.#cursor.offset += prefixLen + 1; + + let localOffset = prefixLen + 1; // Skip opening quote + let segmentStart = localOffset; + let line = 0; + let column = 0; + + let cur: string | undefined = buf[startOffset + localOffset]; + if (cur === undefined) { + if (!this.#flush) { + return null; + } + this.#throwUnexpectedEOFWithLocation(); + } + + let next: string | undefined = buf[startOffset + localOffset + 1]; + + while (cur !== undefined) { + if (cur === quotation) { + if (next === quotation) { + // Escaped quote + segments.push( + buf.slice( + startOffset + segmentStart, + startOffset + localOffset + 1, + ), + ); + localOffset += 2; + segmentStart = localOffset; + cur = buf[startOffset + localOffset]; + next = buf[startOffset + localOffset + 1]; + column += 2; + continue; + } + + if (next === undefined && !this.#flush) { + return null; + } + + // End of quoted section - collect value + if (localOffset > segmentStart) { + segments.push( + buf.slice(startOffset + segmentStart, startOffset + localOffset), + ); + } + + localOffset++; // skip closing quote + + // Check what follows the quoted section + const afterChar = buf[startOffset + localOffset]; + const value = segments.join(""); + + if (afterChar === undefined) { + if (!this.#flush) { + return null; + } + this.#cursor.column += column + 1; + this.#cursor.offset += localOffset - prefixLen - 1; + this.#cursor.line += line; + this.#bufferOffset = startOffset + localOffset; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { + start, + end: { ...this.#cursor }, + rowNumber: this.#rowNumber, + }, + }; + } + + if ( + afterChar === "\r" && + buf[startOffset + localOffset + 1] === undefined && + !this.#flush + ) { + return null; + } + + this.#cursor.column += column + 1; + this.#cursor.offset += localOffset - prefixLen - 1; + this.#cursor.line += line; + + // Continue parsing if more unquoted content follows + if ( + afterChar !== undefined && + afterChar !== this.#delimiter && + afterChar !== "\r" && + afterChar !== "\n" + ) { + // Recursively handle the rest (could have more quoted sections) + this.#bufferOffset = startOffset + localOffset; + const rest = this.#parseUnquotedFieldWithLocation(); + if (rest === null) { + return null; + } + segments.push(rest.value); + return { + value: segments.join(""), + delimiter: rest.delimiter, + delimiterLength: rest.delimiterLength, + location: { + start, + end: rest.location.end, + rowNumber: rest.location.rowNumber, + }, + }; + } + + const end: Position = { ...this.#cursor }; + + // Field delimiter + if (afterChar === this.#delimiter) { + this.#bufferOffset = + startOffset + localOffset + this.#fieldDelimiterLength; + this.#cursor.column += this.#fieldDelimiterLength; + this.#cursor.offset += this.#fieldDelimiterLength; + this.#pendingTrailingFieldEOF = + this.#bufferOffset >= this.#buffer.length; + return { + value, + delimiter: Delimiter.Field, + delimiterLength: this.#fieldDelimiterLength, + location: { start, end, rowNumber: this.#rowNumber }, + }; + } + + // Record delimiter (CRLF) + if (afterChar === "\r") { + const nextAfter = buf[startOffset + localOffset + 1]; + if (nextAfter === undefined) { + if (!this.#flush) { + return null; + } + } else if (nextAfter === "\n") { + this.#bufferOffset = startOffset + localOffset + 2; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 2; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 2, + location: { start, end, rowNumber: rowNum }, + }; + } + } + + // Record delimiter (LF) + if (afterChar === "\n") { + this.#bufferOffset = startOffset + localOffset + 1; + this.#cursor.line++; + this.#cursor.column = 1; + this.#cursor.offset += 1; + const rowNum = this.#rowNumber++; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 1, + location: { start, end, rowNumber: rowNum }, + }; + } + + // EOF + this.#bufferOffset = startOffset + localOffset; + return { + value, + delimiter: Delimiter.Record, + delimiterLength: 0, + location: { start, end, rowNumber: this.#rowNumber }, + }; + } + + if (cur === LF) { + line++; + column = 0; + } else { + column++; + } + + localOffset++; + cur = next; + next = buf[startOffset + localOffset + 1]; + } + + if (this.#flush) { + this.#throwUnexpectedEOFWithLocation(); + } + return null; + } finally { + this.#releaseSegments(segments); } + } + + #borrowSegments(): string[] { + return this.#segmentPool.take(() => []); + } - // Otherwise, return null - return null; + #releaseSegments(segments: string[]): void { + segments.length = 0; + this.#segmentPool.release(segments); } } diff --git a/src/parser/models/FlexibleStringCSVLexer.undefined-check.test.ts b/src/parser/models/FlexibleStringCSVLexer.undefined-check.test.ts deleted file mode 100644 index 40a857c8..00000000 --- a/src/parser/models/FlexibleStringCSVLexer.undefined-check.test.ts +++ /dev/null @@ -1,42 +0,0 @@ -import { describe, expect, it } from "vitest"; -import { ParseError } from "@/core/errors.ts"; -import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; - -/** - * Tests for undefined checks added for TypeScript 5.9 strict type checking - */ -describe("CSVLexer undefined checks", () => { - it("should handle empty buffer during quoted field parsing with flush", () => { - const lexer = new FlexibleStringCSVLexer(); - - // Start a quoted field but don't complete it - // This should trigger the undefined check when flush is called - expect(() => { - const gen = lexer.lex('"incomplete'); - Array.from(gen); - }).toThrow(ParseError); - }); - - it("should parse complete quoted field correctly", () => { - const lexer = new FlexibleStringCSVLexer(); - - // Process a complete quoted field - const gen = lexer.lex('"field"'); - const tokens = Array.from(gen); - - // Should successfully parse the complete field - expect(tokens).toHaveLength(1); - expect(tokens[0]?.value).toBe("field"); - }); - - it("should parse a single unquoted field", () => { - const lexer = new FlexibleStringCSVLexer(); - - // Normal field parsing should work correctly - const gen = lexer.lex("field"); - const tokens = Array.from(gen); - - expect(tokens).toHaveLength(1); - expect(tokens[0]?.value).toBe("field"); - }); -}); diff --git a/src/parser/models/FlexibleStringCSVParser.test.ts b/src/parser/models/FlexibleStringCSVParser.test.ts index 6edb9e12..0a850f67 100644 --- a/src/parser/models/FlexibleStringCSVParser.test.ts +++ b/src/parser/models/FlexibleStringCSVParser.test.ts @@ -95,39 +95,39 @@ describe("FlexibleStringCSVParser (Object and Array)", () => { ]); }); - test("should preserve undefined for missing fields in array format (with pad strategy)", () => { - // In array format with 'pad' strategy, missing fields are filled with undefined - const parserWithPad = new FlexibleStringArrayCSVParser({ + test("should preserve undefined for missing fields in array format (with sparse strategy)", () => { + // In array format with 'sparse' strategy, missing fields are filled with undefined + const parserWithSparse = new FlexibleStringArrayCSVParser({ header: ["name", "age", "city"] as const, - columnCountStrategy: "pad", + columnCountStrategy: "sparse", }); - const records = Array.from(parserWithPad.parse("Alice,30\nBob")); + const records = Array.from(parserWithSparse.parse("Alice,30\nBob")); expect(records).toEqual([ ["Alice", "30", undefined], - ["Bob", undefined, undefined], // Missing fields → undefined (array format behavior) + ["Bob", undefined, undefined], // Missing fields → undefined (sparse strategy) ]); }); - test("should distinguish empty vs missing in array format", () => { - const parserWithPad = new FlexibleStringArrayCSVParser({ + test("should distinguish empty vs missing in array format (sparse strategy)", () => { + const parserWithSparse = new FlexibleStringArrayCSVParser({ header: ["name", "age"] as const, - columnCountStrategy: "pad", + columnCountStrategy: "sparse", }); // "Bob," has an empty age field → "" // "Charlie" has a missing age field → undefined const records = Array.from( - parserWithPad.parse("Alice,30\nBob,\nCharlie"), + parserWithSparse.parse("Alice,30\nBob,\nCharlie"), ); expect(records).toEqual([ ["Alice", "30"], ["Bob", ""], // empty field → "" - ["Charlie", undefined], // missing field → undefined (array format preserves undefined) + ["Charlie", undefined], // missing field → undefined (sparse strategy preserves undefined) ]); }); }); @@ -160,19 +160,19 @@ describe("FlexibleStringCSVParser (Object and Array)", () => { const records = Array.from(parser.parse("Alice,30\nBob")); expect(records).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Missing field remains undefined + { name: "Bob", age: "" }, // Missing field filled with "" (fill strategy default) ]); }); - test("should distinguish empty field from missing field in object format", () => { - // In object format, empty fields stay "", missing fields remain undefined + test("should fill missing field with empty string in object format", () => { + // In object format with fill strategy (default), both empty and missing fields are "" // "Bob," has an empty age field (present but empty) → "" - // "Charlie" has a missing age field (row too short) → undefined + // "Charlie" has a missing age field (row too short) → "" (fill strategy) const records = Array.from(parser.parse("Alice,30\nBob,\nCharlie")); expect(records).toEqual([ { name: "Alice", age: "30" }, { name: "Bob", age: "" }, // empty field → "" - { name: "Charlie", age: undefined }, // missing field → undefined + { name: "Charlie", age: "" }, // missing field → "" (fill strategy) ]); }); }); @@ -219,15 +219,15 @@ describe("FlexibleStringCSVParser (Object and Array)", () => { }); describe("Column count strategy", () => { - test("should pad short rows with undefined in object format", () => { + test("should fill short rows with empty string in object format", () => { const parser = new FlexibleStringObjectCSVParser({ header: ["name", "age", "city"] as const, - columnCountStrategy: "pad", + columnCountStrategy: "fill", }); const records = Array.from(parser.parse("Alice,30\nBob,25,NYC")); expect(records).toEqual([ - { name: "Alice", age: "30", city: undefined }, // Missing field filled with undefined + { name: "Alice", age: "30", city: "" }, // Missing field filled with empty string (fill strategy) { name: "Bob", age: "25", city: "NYC" }, ]); }); @@ -241,17 +241,15 @@ describe("FlexibleStringCSVParser (Object and Array)", () => { expect(() => Array.from(parser.parse("Alice,30,extra"))).toThrow(); }); - test("should truncate long rows with 'truncate' strategy", () => { - const parser = new FlexibleStringObjectCSVParser({ - header: ["name", "age"] as const, - columnCountStrategy: "truncate", - }); - - const records = Array.from(parser.parse("Alice,30,extra\nBob,25")); - expect(records).toEqual([ - { name: "Alice", age: "30" }, - { name: "Bob", age: "25" }, - ]); + test("should reject 'truncate' strategy for object output", () => { + expect(() => { + new FlexibleStringObjectCSVParser({ + header: ["name", "age"] as const, + columnCountStrategy: "truncate", + }); + }).toThrow( + /columnCountStrategy 'truncate' is not allowed for object format/, + ); }); }); diff --git a/src/parser/models/FlexibleStringObjectCSVParser.ts b/src/parser/models/FlexibleStringObjectCSVParser.ts index 7cdf5711..1a566528 100644 --- a/src/parser/models/FlexibleStringObjectCSVParser.ts +++ b/src/parser/models/FlexibleStringObjectCSVParser.ts @@ -32,7 +32,7 @@ import { BaseStringCSVParser } from "@/parser/models/base/BaseStringCSVParser.ts * ``` */ export class FlexibleStringObjectCSVParser< - Header extends ReadonlyArray = readonly string[], + const Header extends ReadonlyArray = readonly string[], > extends BaseStringCSVParser implements StringObjectCSVParser
diff --git a/src/parser/stream/BinaryCSVParserStream.test.ts b/src/parser/stream/BinaryCSVParserStream.test.ts index a63d278c..482f782b 100644 --- a/src/parser/stream/BinaryCSVParserStream.test.ts +++ b/src/parser/stream/BinaryCSVParserStream.test.ts @@ -1,19 +1,9 @@ -import { - beforeEach, - describe as describe_, - expect, - it as it_, - test, - vi, -} from "vitest"; +import { beforeEach, describe, expect, it, test, vi } from "vitest"; import { transform } from "@/__tests__/helper.ts"; import { FlexibleBinaryArrayCSVParser } from "@/parser/models/FlexibleBinaryArrayCSVParser.ts"; import { FlexibleBinaryObjectCSVParser } from "@/parser/models/FlexibleBinaryObjectCSVParser.ts"; import { BinaryCSVParserStream } from "@/parser/stream/BinaryCSVParserStream.ts"; -const describe = describe_.concurrent; -const it = it_.concurrent; - describe("BinaryCSVParserStream", () => { const encoder = new TextEncoder(); @@ -74,7 +64,7 @@ describe("BinaryCSVParserStream", () => { expect(records).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Incomplete record leaves missing field undefined + { name: "Bob", age: "" }, // Incomplete record leaves missing field as empty string ]); }); @@ -92,7 +82,7 @@ describe("BinaryCSVParserStream", () => { expect(records).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Missing field remains undefined + { name: "Bob", age: "" }, // Missing field returns empty string ]); }); }); diff --git a/src/parser/stream/CSVRecordAssemblerTransformer.spec.ts b/src/parser/stream/CSVRecordAssemblerTransformer.spec.ts index 848a06f5..3b8d935b 100644 --- a/src/parser/stream/CSVRecordAssemblerTransformer.spec.ts +++ b/src/parser/stream/CSVRecordAssemblerTransformer.spec.ts @@ -1,8 +1,8 @@ import fc from "fast-check"; import { describe as describe_, expect, it as it_, vi } from "vitest"; import { FC, transform } from "@/__tests__/helper.ts"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; -import type { Token } from "@/core/types.ts"; +import { Delimiter } from "@/core/constants.ts"; +import type { AnyToken } from "@/core/types.ts"; import { createCSVRecordAssembler } from "@/parser/api/model/createCSVRecordAssembler.ts"; import { CSVRecordAssemblerTransformer } from "@/parser/stream/CSVRecordAssemblerTransformer.ts"; @@ -51,42 +51,25 @@ describe("CSVRecordAssemblerTransformer", () => { maxLength: header.length, }, }); - const tokens: Token[] = [ + // In unified token format, each token represents a field with its following delimiter + const tokens: AnyToken[] = [ // generate header tokens - ...header.flatMap((field, i) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - i === header.length - 1 - ? { - type: RecordDelimiter, - value: "\n", - location: LOCATION_SHAPE, - } - : { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), + ...header.map((field, i) => ({ + value: field, + delimiter: + i === header.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), // generate rows tokens ...rows.flatMap((row) => - // generate row tokens - row.flatMap((field, j) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - // generate record delimiter token - ...((j === row.length - 1 - ? [ - { - type: RecordDelimiter, - value: "\n", - }, - ] - : []) as Token[]), - ]), + row.map((field, j) => ({ + value: field, + delimiter: + j === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), ), ]; const expected = rows.map((row) => @@ -116,26 +99,16 @@ describe("CSVRecordAssemblerTransformer", () => { maxLength: header.length, }, }); - const tokens = [ - ...rows.flatMap((row) => - row.flatMap((field, j) => [ - { type: Field, value: field, location: LOCATION_SHAPE }, - { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ...((j === row.length - 1 - ? [ - { - type: RecordDelimiter, - value: "\n", - }, - ] - : []) as Token[]), - ]), - ), - ]; + // In unified token format, each token represents a field with its following delimiter + const tokens: AnyToken[] = rows.flatMap((row) => + row.map((field, j) => ({ + value: field, + delimiter: + j === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: 1, + location: LOCATION_SHAPE, + })), + ); const expected = rows.map((row) => Object.fromEntries(row.map((field, i) => [header[i], field])), ); @@ -159,10 +132,16 @@ describe("CSVRecordAssemblerTransformer", () => { throw new Error("test"); }, ); + const tokens: AnyToken[] = [ + { + value: "test", + delimiter: Delimiter.Record, + delimiterLength: 0, + location: LOCATION_SHAPE, + }, + ]; await expect(async () => { - await transform(transformer, [ - { type: Field, value: "test", location: LOCATION_SHAPE }, - ]); + await transform(transformer, tokens); }).rejects.toThrowErrorMatchingInlineSnapshot(`[Error: test]`); }); diff --git a/src/parser/stream/StringCSVLexerTransformer.spec.ts b/src/parser/stream/StringCSVLexerTransformer.spec.ts index b3071848..3ab0249e 100644 --- a/src/parser/stream/StringCSVLexerTransformer.spec.ts +++ b/src/parser/stream/StringCSVLexerTransformer.spec.ts @@ -1,27 +1,14 @@ import fc from "fast-check"; import { describe as describe_, expect, it as it_ } from "vitest"; import { autoChunk, FC, transform } from "@/__tests__/helper.ts"; -import { Field, FieldDelimiter, RecordDelimiter } from "@/core/constants.ts"; +import type { TokenNoLocation } from "@/common"; +import { Delimiter } from "@/core/constants.ts"; import { FlexibleStringCSVLexer } from "@/parser/api/model/createStringCSVLexer.ts"; import { StringCSVLexerTransformer } from "@/parser/stream/StringCSVLexerTransformer.ts"; import { escapeField } from "@/utils/serialization/escapeField.ts"; -const describe = describe_.concurrent; -const it = it_.concurrent; - -const LOCATION_SHAPE = { - start: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - end: { - line: expect.any(Number), - column: expect.any(Number), - offset: expect.any(Number), - }, - rowNumber: expect.any(Number), -}; +const describe = describe_; +const it = it_; describe("StringCSVLexerTransformer", () => { it("should be a TransformStream", () => { @@ -37,28 +24,21 @@ describe("StringCSVLexerTransformer", () => { fc.gen().map((g) => { const row = g(FC.row); const quote = g(FC.quote); - const chunks = autoChunk( - g, - row.map((v) => escapeField(v, { quote })).join(","), - ); - const expected = [ - ...row.flatMap((value, index) => [ - // If the field is empty or quote is true, add a field. - ...(quote || value - ? [{ type: Field, value, location: LOCATION_SHAPE }] - : []), - // If the field is not the last field, add a field delimiter. - ...(index === row.length - 1 - ? [] - : [ - { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), - ]), - ]; + const csv = row.map((v) => escapeField(v, { quote })).join(","); + const chunks = csv.length === 0 ? [""] : autoChunk(g, csv); + // In unified token format, each token represents a field with its following delimiter + // Note: If CSV is empty string, no tokens are expected regardless of the row content + const expected: TokenNoLocation[] = + csv.length === 0 + ? [] + : row.map((value, index) => ({ + value, + delimiter: + index === row.length - 1 + ? Delimiter.Record + : Delimiter.Field, + delimiterLength: expect.any(Number), + })); return { row, chunks, expected }; }), async ({ chunks, expected }) => { @@ -68,6 +48,7 @@ describe("StringCSVLexerTransformer", () => { expect(actual).toMatchObject(expected); }, ), + { numRuns: 10 }, // Reduce runs to debug ); }); @@ -80,20 +61,13 @@ describe("StringCSVLexerTransformer", () => { g, row.map((v) => escapeField(v, { quote: true })).join(","), ); - const expected = [ - ...row.flatMap((value, index) => [ - { type: Field, value, location: LOCATION_SHAPE }, - ...(index === row.length - 1 - ? [] - : [ - { - type: FieldDelimiter, - value: ",", - location: LOCATION_SHAPE, - }, - ]), - ]), - ]; + // In unified token format, each token represents a field with its following delimiter + const expected: TokenNoLocation[] = row.map((value, index) => ({ + value, + delimiter: + index === row.length - 1 ? Delimiter.Record : Delimiter.Field, + delimiterLength: expect.any(Number), + })); return { expected, chunks }; }), async ({ expected, chunks }) => { @@ -103,6 +77,7 @@ describe("StringCSVLexerTransformer", () => { expect(actual).toMatchObject(expected); }, ), + { numRuns: 10 }, ); }); @@ -128,56 +103,33 @@ describe("StringCSVLexerTransformer", () => { ) .join(eol) + (EOF ? eol : ""); const chunks = autoChunk(g, csv); - const expected = [ - ...data.flatMap((row, i) => [ - // If row is empty, add a record delimiter. - ...row.flatMap((value, j) => [ - // If the field is empty or quote is true, add a field. - ...(quote || value !== "" ? [{ type: Field, value }] : []), - // If the field is not the last field, add a field delimiter. - ...(row.length - 1 !== j - ? [ - { - type: FieldDelimiter, - value: options.delimiter, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - // If the field is the last field, add a record delimiter. - ...(data.length - 1 !== i - ? [ - { - type: RecordDelimiter, - value: eol, - location: LOCATION_SHAPE, - }, - ] - : []), - ]), - ]; + // In unified token format, each token represents a field with its following delimiter + const expected: TokenNoLocation[] = []; + for (let i = 0; i < data.length; i++) { + const row = data[i]!; + for (let j = 0; j < row.length; j++) { + const value = row[j]!; + const isLastFieldInRow = j === row.length - 1; + + // Always add token for every field (including empty ones) + expected.push({ + value, + delimiter: isLastFieldInRow + ? Delimiter.Record + : Delimiter.Field, + delimiterLength: expect.any(Number), + }); + } + } return { options, chunks, expected }; }), async ({ options, chunks, expected }) => { const lexer = new FlexibleStringCSVLexer(options); const transformer = new StringCSVLexerTransformer(lexer); - const actual = (await transform(transformer, chunks)).flat(); + const actual = await transform(transformer, chunks); expect(actual).toMatchObject(expected); }, ), - { - examples: [ - [ - // only EOL is ignored - { - options: { delimiter: ",", quotation: '"' } as any, - chunks: ["\n"], - expected: [], - }, - ], - ], - }, ); }); }); diff --git a/src/parser/stream/StringCSVLexerTransformer.test.ts b/src/parser/stream/StringCSVLexerTransformer.test.ts index 70397848..1aa8246e 100644 --- a/src/parser/stream/StringCSVLexerTransformer.test.ts +++ b/src/parser/stream/StringCSVLexerTransformer.test.ts @@ -45,7 +45,7 @@ describe("StringCSVLexerTransformer", () => { await expect(async () => { await transform(transformer, ['"']); }).rejects.toThrowErrorMatchingInlineSnapshot( - `[ParseError: Unexpected EOF while parsing quoted field.]`, + `[ParseError: Unexpected EOF while parsing quoted field at line 1, column 1.]`, ); }); diff --git a/src/parser/stream/StringCSVLexerTransformer.ts b/src/parser/stream/StringCSVLexerTransformer.ts index 719f85d1..e215bd0b 100644 --- a/src/parser/stream/StringCSVLexerTransformer.ts +++ b/src/parser/stream/StringCSVLexerTransformer.ts @@ -132,6 +132,7 @@ export class StringCSVLexerTransformer< } }, }, + // undefined, // writableStrategy writableStrategy, readableStrategy, ); diff --git a/src/parser/stream/StringCSVParserStream.test.ts b/src/parser/stream/StringCSVParserStream.test.ts index 7afb8ad4..01e0a0df 100644 --- a/src/parser/stream/StringCSVParserStream.test.ts +++ b/src/parser/stream/StringCSVParserStream.test.ts @@ -1,19 +1,9 @@ -import { - beforeEach, - describe as describe_, - expect, - it as it_, - test, - vi, -} from "vitest"; +import { beforeEach, describe, expect, it, test, vi } from "vitest"; import { transform } from "@/__tests__/helper.ts"; import { FlexibleStringArrayCSVParser } from "@/parser/models/FlexibleStringArrayCSVParser.ts"; import { FlexibleStringObjectCSVParser } from "@/parser/models/FlexibleStringObjectCSVParser.ts"; import { StringCSVParserStream } from "@/parser/stream/StringCSVParserStream.ts"; -const describe = describe_.concurrent; -const it = it_.concurrent; - describe("StringCSVParserStream", () => { beforeEach(() => { vi.resetAllMocks(); @@ -73,7 +63,7 @@ describe("StringCSVParserStream", () => { expect(records).toEqual([ { name: "Alice", age: "30" }, - { name: "Bob", age: undefined }, // Missing field remains undefined + { name: "Bob", age: "" }, // Missing field returns empty string ]); }); }); diff --git a/src/utils/memory/ReusableArrayPool.ts b/src/utils/memory/ReusableArrayPool.ts new file mode 100644 index 00000000..ff436069 --- /dev/null +++ b/src/utils/memory/ReusableArrayPool.ts @@ -0,0 +1,24 @@ +/** + * Small helper to recycle array instances in hot paths. + * + * @internal + */ +export class ReusableArrayPool { + readonly #pool: T[] = []; + readonly #maxSize: number; + + constructor(maxSize: number = Number.POSITIVE_INFINITY) { + this.#maxSize = maxSize; + } + + take(factory: () => T): T { + return this.#pool.pop() ?? factory(); + } + + release(value: T, reset?: (value: T) => void): void { + reset?.(value); + if (this.#pool.length < this.#maxSize) { + this.#pool.push(value); + } + } +} diff --git a/vite.config.ts b/vite.config.ts index dd8aad96..f0af1665 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -220,6 +220,16 @@ export default bytes.buffer || bytes; coverage: { provider: "istanbul", include: ["src/**/*.ts"], + exclude: [ + "**/*.{test,spec,test-d}.ts", + "**/*.browser.{test,spec}.ts", + "**/*.node.{test,spec}.ts", + "**/test/**", + "**/tests/**", + "**/__tests__/**", + ], + reporter: ["text", "json", "html", "clover"], + reportsDirectory: "./coverage", }, projects: [ {