Merged
Commits
19 commits
0574ce1
perf: JavaScript parser performance improvements
kamiazya Nov 30, 2025
163f558
fix: update new factory function tests to use unified token format
kamiazya Nov 30, 2025
5dc9bbe
style: format code
kamiazya Nov 30, 2025
09bfc64
fix: update test expectations for new parser behavior
kamiazya Nov 30, 2025
a2c5520
fix: change toStrictEqual to toEqual in browser tests
kamiazya Nov 30, 2025
5cb1821
fix: remove concurrent test execution in stream tests
kamiazya Nov 30, 2025
f8f60ef
refactor: use direct vitest imports in stream tests
kamiazya Nov 30, 2025
5e6e690
style: format test files
kamiazya Nov 30, 2025
58e5d70
feat(parser): enhance FlexibleStringCSVLexer with reusable array pooling
kamiazya Dec 1, 2025
abb96da
Pad record-view fill rows in fill strategy
kamiazya Dec 1, 2025
125c345
Add record-view assembler and tighten column strategies
kamiazya Dec 1, 2025
05ac2ee
revert: remove CSVRecordView feature
kamiazya Dec 2, 2025
f49e20c
merge: integrate const type parameters from main
kamiazya Dec 2, 2025
e300438
feat: apply const type parameters consistently across all parser classes
kamiazya Dec 2, 2025
48f5d4f
fix: address code review comments
kamiazya Dec 2, 2025
89278d2
test: improve StringCSVLexerTransformer test quality and update bench…
kamiazya Dec 2, 2025
19c01bd
style: improve formatting of expected tokens in StringCSVLexerTransfo…
kamiazya Dec 3, 2025
4395ef1
test: consolidate test files and improve coverage config
kamiazya Dec 3, 2025
6926119
refactor: enhance comments and improve header handling in FlexibleCSV…
kamiazya Dec 3, 2025
11 changes: 11 additions & 0 deletions .changeset/column-count-strategy-rename.md
@@ -0,0 +1,11 @@
---
"web-csv-toolbox": minor
---

**BREAKING CHANGE**: Restrict `columnCountStrategy` options for object output to `fill`/`strict` only.

Object format now rejects `keep` and `truncate` strategies at runtime, as these strategies are incompatible with object output semantics. Users relying on `keep` or `truncate` with object format must either:
- Switch to `outputFormat: 'array'` to use these strategies, or
- Use `fill` (default) or `strict` for object output

This change improves API clarity by aligning strategy availability with format capabilities and documenting the purpose-driven strategy matrix (including sparse/header requirements).
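
As a migration sketch (the options follow the `createCSVRecordAssembler` signature documented in the output-format guide added in this PR; the import path is assumed), switching to array output keeps the `keep` strategy available:

```ts
import { createCSVRecordAssembler } from "web-csv-toolbox"; // import path assumed

// Before this change, `keep` was accepted with the default object output.
// It now throws at runtime there, so switch to array output to retain it.
const assembler = createCSVRecordAssembler({
  header: ["name", "age"] as const,
  outputFormat: "array",
  columnCountStrategy: "keep", // ragged rows are passed through unchanged
});
```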
19 changes: 19 additions & 0 deletions .changeset/lexer-api-changes.md
@@ -0,0 +1,19 @@
---
"web-csv-toolbox": minor
---

## Lexer API Changes

This release includes low-level Lexer API changes for performance optimization.

### Breaking Changes (Low-level API only)

These changes only affect users of the low-level Lexer API. **High-level APIs (`parseString`, `parseBinary`, etc.) are unchanged.**

1. **Token type constants**: Changed from `Symbol` to numeric constants
2. **Location tracking**: Now disabled by default. Add `trackLocation: true` to Lexer options if you need token location information. Note: Error messages still include position information even when `trackLocation: false` (computed lazily only when errors occur).
3. **Token object structure**: Changed to improve performance. Token properties were reorganized, and the number of tokens emitted was reduced by folding delimiter and newline information into a field on each field token.

### Who is affected?

**Most users are NOT affected.** Only users who directly use `FlexibleStringCSVLexer` and rely on `token.location` or `Symbol`-based token type comparison need to update their code.
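
As a rough migration sketch for low-level users: `trackLocation` and the numeric `Delimiter` constants come from this release, but the `lex()` method, the options-object constructor, and the `delimiter` property on tokens are hypothetical names used only for illustration.

```ts
import { Delimiter, FlexibleStringCSVLexer } from "web-csv-toolbox"; // import path assumed

// Location tracking is now opt-in; enable it only if you read token.location.
const lexer = new FlexibleStringCSVLexer({ trackLocation: true });

for (const token of lexer.lex("a,b\r\nc,d\r\n")) { // hypothetical iteration API
  // Token types are numeric constants now, so compare numerically instead of
  // against the removed Symbol values (Field, FieldDelimiter, RecordDelimiter).
  if (token.delimiter === Delimiter.Record) {
    // this field is the last one in its record
  }
}
```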
47 changes: 47 additions & 0 deletions .changeset/performance-improvements.md
@@ -0,0 +1,47 @@
---
"web-csv-toolbox": patch
---

## JavaScript Parser Performance Improvements

This release includes significant internal optimizations that improve JavaScript-based CSV parsing performance.

### Before / After Comparison

| Metric | Before (v0.14) | After | Improvement |
|--------|----------------|-------|-------------|
| 1,000 rows parsing | 3.57 ms | 1.42 ms | **60% faster** |
| 5,000 rows parsing | 19.47 ms | 7.03 ms | **64% faster** |
| Throughput (1,000 rows) | 24.3 MB/s | 61.2 MB/s | **2.51x** |
| Throughput (5,000 rows) | 24.5 MB/s | 67.9 MB/s | **2.77x** |

### Optimization Summary

| Optimization | Target | Improvement |
|--------------|--------|-------------|
| Array copy method improvement | Assembler | -8.7% |
| Quoted field parsing optimization | Lexer | Overhead eliminated |
| Object assembler loop optimization | Assembler | -5.4% |
| Regex removal for unquoted fields | Lexer | -14.8% |
| String comparison optimization | Lexer | ~10% |
| Object creation optimization | Lexer | ~20% |
| Non-destructive buffer reading | GC | -46% |
| Token type numeric conversion | Lexer/GC | -7% / -13% |
| Location tracking made optional | Lexer | -19% to -31% |
| Object.create(null) for records | Assembler | -31% |
| Empty-row template cache | Assembler | ~4% faster on sparse CSV |
| Row buffer reuse (no per-record slice) | Assembler | ~6% faster array format |
| Header-length builder preallocation | Assembler | Capacity stays steady on wide CSV |
| Object assembler row buffer pooling | Assembler | Lower GC spikes on object output |
| Lexer segment-buffer pooling | Lexer | Smoother GC for quoted-heavy input |

### Final Performance Results (Pure JavaScript)

| Format | Throughput |
|--------|------------|
| Object format (1,000 rows) | **61.2 MB/s** |
| Array format (1,000 rows) | **87.6 MB/s** |
| Object format (5,000 rows) | **67.9 MB/s** |
| Array format (5,000 rows) | **86.4 MB/s** |

Array format is approximately 43% faster (1.43× throughput) than Object format for the same data.
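
For reference, the "Object.create(null) for records" row above refers to building records on a null prototype. A minimal illustration of the idea (not the library's actual assembler implementation):

```ts
// Illustrative only; not the library's actual assembler code.
// Null-prototype records avoid prototype-chain lookups and collisions with
// Object.prototype keys (e.g. a CSV column literally named "toString").
function buildRecord(
  header: readonly string[],
  row: readonly string[],
): Record<string, string> {
  const record: Record<string, string> = Object.create(null);
  for (let i = 0; i < header.length; i++) {
    record[header[i]] = row[i] ?? ""; // pad missing values, as in `fill`
  }
  return record;
}
```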
11 changes: 8 additions & 3 deletions benchmark/package.json
@@ -4,8 +4,13 @@
"private": true,
"type": "module",
"scripts": {
"start": "tsx main.ts",
"queuing-strategy": "tsx queuing-strategy.bench.ts"
"start": "node --import tsx main.ts",
"queuing-strategy": "node --import tsx queuing-strategy.bench.ts",
"quick": "node --import tsx scripts/quick-bench.mts",
"unified": "node --import tsx scripts/unified-token-bench.mts",
"profile:cpu": "node --cpu-prof --cpu-prof-dir=./profiles --import tsx scripts/profile-cpu.mts",
"profile:memory": "node --heap-prof --heap-prof-dir=./profiles --import tsx scripts/profile-memory.mts",
"profile:memory:gc": "node --heap-prof --heap-prof-dir=./profiles --expose-gc --import tsx scripts/profile-memory.mts"
},
"license": "MIT",
"dependencies": {
@@ -14,4 +19,4 @@
"tsx": "catalog:",
"web-csv-toolbox": "workspace:*"
}
}
}
4 changes: 3 additions & 1 deletion config/vitest.setup.ts
@@ -1,5 +1,7 @@
import fc from "fast-check";

fc.configureGlobal({
// This is the default value, but we set it here to be explicit.
// Set to true to stop property tests on first failure (default is false).
// This speeds up test runs by avoiding unnecessary iterations after a counterexample is found.
endOnFailure: true,
});
46 changes: 46 additions & 0 deletions docs/reference/column-count-strategy-guide.md
@@ -0,0 +1,46 @@
# ColumnCountStrategy Guide

`columnCountStrategy` controls how the parser handles rows whose column counts differ from the header. The available strategies depend on the output format and whether a header is known in advance.

## Compatibility Matrix

| Strategy | Short rows | Long rows | Object | Array (explicit header) | Array (header inferred) | Headerless (`header: []`) |
|------------|------------------------------------|------------------------------|--------|-------------------------|-------------------------|----------------------------|
| `fill` | Pad with `""` | Trim excess columns | ✅ | ✅ | ✅ | ❌ |
| `strict` | Throw error | Throw error | ✅ | ✅ | ✅ | ❌ |
| `keep` | Keep as-is (ragged rows) | Keep as-is | ❌ | ✅ | ✅ | ✅ (mandatory) |
| `truncate` | Keep as-is | Trim to header length | ❌ | ✅ | ❌ (requires header) | ❌ |
| `sparse` | Pad with `undefined` | Trim excess columns | ❌ | ✅ | ❌ (requires header) | ❌ |

## Strategy Details

### `fill` (default)
- Guarantees fixed-length records matching the header.
- Object: missing values become `""`, enabling consistent string-based models.
- Array output: missing values also become empty strings.

### `strict`
- Treats any column-count mismatch as a fatal error, useful for schema validation.
- Requires a header (explicit or inferred).

### `keep`
- Leaves each row untouched. Arrays can vary in length, making it ideal for ragged data or headerless CSVs.
- Headerless mode (`header: []`) enforces `keep`.

### `truncate`
- Drops trailing columns that exceed the header length while leaving short rows untouched.
- Only available when a header is provided (array output).

### `sparse`
- Similar to `fill`, but pads missing entries with `undefined`. This is useful when you want to distinguish between missing and empty values.
- Requires an explicit header to determine the target length.
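
For example (a sketch; the options follow the `createCSVRecordAssembler` signature shown in the [Output Format Guide](./output-format-guide.md)):

```ts
const assembler = createCSVRecordAssembler({
  header: ["name", "age", "email"] as const,
  outputFormat: "array",
  columnCountStrategy: "sparse",
});
// A short row such as `Alice,30` yields ["Alice", "30", undefined],
// while `Alice,30,` (trailing comma) yields ["Alice", "30", ""].
```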

## Choosing a Strategy

1. **Need strict schema enforcement?** Use `strict`.
2. **Need consistent string values?** Use `fill` (object default).
3. **Need ragged rows / headerless CSV?** Use `keep` (array output).
4. **Need to ignore trailing columns?** Use `truncate` (array output with header).
5. **Need optional columns?** Use `sparse` (array output with header).

Pair this guide with the [Output Format Guide](./output-format-guide.md) to decide which combination best fits your workload.
47 changes: 47 additions & 0 deletions docs/reference/output-format-guide.md
@@ -0,0 +1,47 @@
# Output Format Guide

Many APIs (e.g. `parseString`, `createCSVRecordAssembler`, stream transformers) expose an `outputFormat` option so you can choose the most suitable record representation for your workload. This guide summarizes each format's behavior, strengths, and constraints.

## Quick Comparison

| Format | Representation | Best for | ColumnCountStrategy support | Headerless (`header: []`) | `includeHeader` | Notes |
|----------|-------------------------------------|-----------------------------------------|-----------------------------|---------------------------|-----------------|-------|
| `object` | Plain object `{ headerKey: value }` | JSON interoperability, downstream libs | `fill`, `strict` | ❌ | ❌ | Default output. Values are always strings. |
| `array` | Readonly array / named tuple | Maximum throughput, flexible schemas | All strategies (`fill`, `keep`, `truncate`, `sparse`, `strict`) | ✅ (with `keep`) | ✅ | Headerless mode requires `outputFormat: "array"` + `columnCountStrategy: "keep"`. |

## Object Format (`"object"`)
- Produces pure objects keyed by header names.
- Missing columns are padded with empty strings in `fill` mode, or rejected in `strict`.
- Recommended when you plan to serialize to JSON, access fields by name exclusively, or hand records to other libraries.

```ts
const assembler = createCSVRecordAssembler({
header: ["name", "age"] as const,
// outputFormat defaults to "object"
});
for (const record of assembler.assemble(tokens)) {
record.name; // string
}
```

## Array Format (`"array"`)
- Emits header-ordered arrays (typed as named tuples when a header is provided).
- Supports every columnCountStrategy, including `keep` for ragged rows and `sparse` for optional columns.
- Only format that supports headerless mode.

```ts
const assembler = createCSVRecordAssembler({
header: ["name", "age"] as const,
outputFormat: "array",
columnCountStrategy: "truncate",
});
const [row] = assembler.assemble(tokens);
row[0]; // "Alice"
```

## Choosing the Right Format

1. **Need plain JS objects / JSON serialization?** Use `object`.
2. **Need the fastest throughput or ragged rows?** Use `array` with the appropriate `columnCountStrategy`.

For more details on column-count handling, see the [ColumnCountStrategy guide](./column-count-strategy-guide.md).
1 change: 1 addition & 0 deletions package.json
@@ -218,6 +218,7 @@
"@types/node": "^24.10.1",
"@vitest/browser-webdriverio": "^4.0.3",
"@vitest/coverage-istanbul": "4.0.3",
"@vitest/coverage-v8": "4.0.3",
"@wasm-tool/rollup-plugin-rust": "^3.0.5",
"changesets-github-release": "^0.1.0",
"fast-check": "^4.1.1",
49 changes: 49 additions & 0 deletions pnpm-lock.yaml


25 changes: 13 additions & 12 deletions src/core/constants.ts
@@ -100,17 +100,18 @@ export const DEFAULT_STREAM_BACKPRESSURE_CHECK_INTERVAL = 100;
export const DEFAULT_ASSEMBLER_BACKPRESSURE_CHECK_INTERVAL = 10;

/**
* FiledDelimiter is a symbol for field delimiter of CSV.
* @category Constants
*/
export const FieldDelimiter = Symbol.for("web-csv-toolbox.FieldDelimiter");
/**
* RecordDelimiter is a symbol for record delimiter of CSV.
* @category Constants
*/
export const RecordDelimiter = Symbol.for("web-csv-toolbox.RecordDelimiter");
/**
* Field is a symbol for field of CSV.
* Delimiter type enumeration for unified token format.
*
* Used in the new FieldToken format to indicate what follows the field value.
* This enables a more efficient token format where only field tokens are emitted.
*
* @category Constants
*/
export const Field = Symbol.for("web-csv-toolbox.Field");
export enum Delimiter {
/** Next token is a field (followed by field delimiter like comma) */
Field = 0,
/** Next token is a record delimiter (newline) */
Record = 1,
// /** End of file/stream */
// EOF = 2,
}
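
For context, a hedged sketch of how a consumer of the unified token format might use this enum: only `Delimiter` comes from this diff; the `FieldToken` shape, its property names, and the `toRows` helper are hypothetical.

```ts
import { Delimiter } from "web-csv-toolbox"; // export path assumed

// Hypothetical token shape for illustration; the lexer's real internal
// property names may differ.
interface FieldToken {
  value: string;
  delimiter: Delimiter; // what followed this field in the source text
}

function toRows(tokens: Iterable<FieldToken>): string[][] {
  const rows: string[][] = [];
  let row: string[] = [];
  for (const token of tokens) {
    row.push(token.value);
    if (token.delimiter === Delimiter.Record) {
      rows.push(row); // a record delimiter (newline) followed this field
      row = [];
    }
  }
  if (row.length > 0) rows.push(row); // flush a trailing record without a newline
  return rows;
}
```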