From 449ebeccad0cdd790df60e4a69b175ce978b9ef4 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Tue, 4 Nov 2025 11:10:58 -0600 Subject: [PATCH] refactor(web): renames PendingTokenization as TokenizationPath, adds doc-comments Now that the SearchSpace rework is stabilizing, I've finally landed on a better name for the type. PendingTokenization _directly_ correlates to `SearchPath` entries once processed, so the name `TokenizationPath` will reflect this nicely. New doc-comments are also added documenting the relationship of `TokenizationPath` and `TokenizationSubset` to `SearchPath` and `SearchCluster`. Admittedly, it may be wise to additionally rename `TokenizationSubset` to `TokenizationCluster` (and rename the source file) to further mirror the relationship of these types... but that can always be done in a follow-up. Build-bot: skip build:web Test-bot: skip --- .../src/main/correction/context-state.ts | 2 +- .../main/correction/context-tokenization.ts | 22 +++++----- .../src/main/correction/search-space.ts | 2 +- .../main/correction/tokenization-subsets.ts | 36 ++++++++++++---- .../context/context-tokenization.tests.ts | 6 +-- .../context/tokenization-subsets.tests.ts | 42 +++++++++---------- 6 files changed, 64 insertions(+), 46 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts index 0b1798298e6..27bb0ae848a 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-state.ts @@ -243,7 +243,7 @@ export class ContextState { const baseTokenization = startTokenizationsAfterSlide[0]; // For multiple tokenizations, we'd retrieve each, use the "most likely" one as base, // and then fold all resulting search spaces (on the final token) into one. - const tokenizationAnalysis = trueInputSubset.pendingSet.get(baseTokenization); + const tokenizationAnalysis = trueInputSubset.transitionPaths.get(baseTokenization); // Determine the best probability from among ALL available inputs, before they're split // into subsets. diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts index b89b05203af..9bdd67bd228 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/context-tokenization.ts @@ -15,7 +15,7 @@ import TransformUtils from '../transformUtils.js'; import { computeDistance, EditOperation, EditTuple } from './classical-calculation.js'; import { determineModelTokenizer } from '../model-helpers.js'; import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js'; -import { PendingTokenization } from './tokenization-subsets.js'; +import { TokenizationPath } from './tokenization-subsets.js'; import LexicalModel = LexicalModelTypes.LexicalModel; import Transform = LexicalModelTypes.Transform; @@ -108,7 +108,7 @@ export class ContextTokenization { * The tokenization-transition metadata relating this instance to the most likely * tokenization from a prior state. 
*/ - readonly transitionEdits?: PendingTokenization; + readonly transitionEdits?: TokenizationPath; /** * The portion of edits from the true input keystroke that are not part of the @@ -125,10 +125,10 @@ export class ContextTokenization { constructor(priorToClone: ContextTokenization); constructor(tokens: ContextToken[]); - constructor(tokens: ContextToken[], alignment: PendingTokenization, taillessTrueKeystroke: Transform); + constructor(tokens: ContextToken[], alignment: TokenizationPath, taillessTrueKeystroke: Transform); constructor( param1: ContextToken[] | ContextTokenization, - alignment?: PendingTokenization, + alignment?: TokenizationPath, taillessTrueKeystroke?: Transform ) { if(!(param1 instanceof ContextTokenization)) { @@ -490,7 +490,7 @@ export class ContextTokenization { * Given results from `precomputeTokenizationAfterInput`, this method will * evaluate the pending transition in tokenization for all associated inputs * while reusing as many correction-search intermediate results as possible. - * @param pendingTokenization Batched results from one or more + * @param tokenizationPath Batched results from one or more * `precomputeTokenizationAfterInput` calls on this instance, all with the * same alignment values. * @param lexicalModel The active lexical model @@ -499,16 +499,16 @@ export class ContextTokenization { * @param bestProbFromSet The probability of the single most likely input * transform in the overall transformDistribution associated with the * keystroke triggering theh transition. It need not be represented by the - * pendingTokenization to be built. + * tokenizationPath to be built. * @returns */ evaluateTransition( - pendingTokenization: PendingTokenization, + tokenizationPath: TokenizationPath, lexicalModel: LexicalModel, sourceInput: Transform, bestProbFromSet: number ): ContextTokenization { - const { alignment: alignment, inputs } = pendingTokenization; + const { alignment: alignment, inputs } = tokenizationPath; const sliceIndex = alignment.edgeWindow.sliceIndex; const baseTokenization = this.tokens.slice(sliceIndex); let affectedToken: ContextToken; @@ -592,7 +592,7 @@ export class ContextTokenization { start: appliedLength }, bestProbFromSet: bestProbFromSet, - subsetId: pendingTokenization.inputSubsetId + subsetId: tokenizationPath.inputSubsetId }, distribution); appliedLength += KMWString.length(distribution[0].sample.insert); @@ -605,7 +605,7 @@ export class ContextTokenization { return new ContextTokenization( this.tokens.slice(0, sliceIndex).concat(tokenization), null /* tokenMapping */, - determineTaillessTrueKeystroke(pendingTokenization) + determineTaillessTrueKeystroke(tokenizationPath) ); } } @@ -1122,7 +1122,7 @@ export function assembleTransforms(stackedInserts: string[], stackedDeletes: num * @param tokenizationAnalysis * @returns */ -export function determineTaillessTrueKeystroke(tokenizationAnalysis: PendingTokenization) { +export function determineTaillessTrueKeystroke(tokenizationAnalysis: TokenizationPath) { // undefined by default; we haven't yet determined if we're still affecting // the same token that was the tail in the previous tokenization state. 
  let taillessTrueKeystroke: Transform;
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
index aca5c9743fa..0afbc5d46e6 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/search-space.ts
@@ -85,7 +85,7 @@ export interface PathInputProperties {
    * This tends to serve as an identifying factor for tokenized input distributions,
    * indicating the distributions were all sourced from the same original input event.
    *
-   * @see PendingTokenization.inputSubsetId
+   * @see TokenizationPath.inputSubsetId
    */
   subsetId: number;
 }
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
index 27c505f3539..294ce1af594 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-subsets.ts
@@ -13,21 +13,29 @@ export function generateSubsetId() {
   return SUBSET_ID_SEED++;
 }
 
-export interface PendingTokenization {
+/**
+ * Tracks metadata about the "path" for transitioning from one source
+ * ContextTokenization to a potentially-common destination ContextTokenization.
+ *
+ * Once evaluated, each entry within its `.inputs` field should have a
+ * one-to-one relationship with instances of the `SearchPath` class.
+ */
+export interface TokenizationPath {
   /**
-   * The edge window corresponding to the common tokenization for the subset's inputs
+   * The edge window corresponding to the common ContextTokenization context
+   * to which this path's inputs will be applied.
    */
   alignment: TokenizationEdgeAlignment,
 
   /**
    * A set of incoming keystrokes with compatible effects when applied.
    *
    * If passed to `subsetByInterval`, the transforms should result in a single subset.
    */
   inputs: Distribution<Map<number, Transform>>
 
   /**
-   * A unique identifier associated with this PendingTokenization and its
+   * A unique identifier associated with this TokenizationPath and its
    * transforms within `SearchSpace`s. This ID assists with detecting when
    * split transforms are re-merged during SearchSpace merges. Only
    * input-sources with matching subset ID come from the same subset, and thus
@@ -42,7 +50,17 @@
 }
 
 /**
- * Defines a subset of pending tokenization transitions based on potential inputs.
+ * Defines a subset of pending tokenization transitions based on potential
+ * inputs.
+ *
+ * If more than one `transitionPaths` entry exists, this should directly
+ * correspond to a unique instance of `SearchCluster` (per affected
+ * `ContextToken`) once fully processed, each comprised of the corresponding
+ * `SearchPath` entries constructed from each `transitionPaths` entry.
+ *
+ * If only one `transitionPaths` entry exists, it should correspond to
+ * `SearchPath` instances instead; there is no need for `SearchCluster` overhead
+ * in such cases.
  */
 export interface TokenizationSubset {
   /**
@@ -55,7 +73,7 @@ export interface TokenizationSubset {
    * them, yielding compatible search paths and tokenization effects after their
    * application.
   */
-  readonly pendingSet: Map<ContextTokenization, PendingTokenization>;
+  readonly transitionPaths: Map<ContextTokenization, TokenizationPath>;
 }
 
 export function editKeyer(precomputation: TokenizationTransitionEdits): string[] {
@@ -213,13 +231,13 @@ export class TokenizationSubsetBuilder {
     // Maps any number of Tokenizations and their incoming alignment data to a common key
     // for final tokenization forms.
     const entry: TokenizationSubset = this._subsets.get(key) ?? {
-      pendingSet: new Map(),
+      transitionPaths: new Map(),
       key: key
     }
 
     // Finds any previously-accumulated data corresponding to both the incoming and
     // target final tokenization form, creating an empty entry if none yet exists.
-    const forTokenization: PendingTokenization = entry.pendingSet.get(tokenization) ?? {
+    const forTokenization: TokenizationPath = entry.transitionPaths.get(tokenization) ?? {
       alignment: precomputation.alignment,
       inputs: [],
       inputSubsetId: generateSubsetId()
@@ -228,7 +246,7 @@ export class TokenizationSubsetBuilder {
     // Adds the incoming tokenized transform data for the pairing...
     forTokenization.inputs.push({sample: precomputation.tokenizedTransform, p});
     // and ensures that the pairing's data-accumulator is in the map.
-    entry.pendingSet.set(tokenization, forTokenization);
+    entry.transitionPaths.set(tokenization, forTokenization);
 
     // Also ensures that the target tokenization's data (accumulating the pairings)
     // is made available within the top-level map.
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
index 58de2ae590f..5e50b41f50b 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/context-tokenization.tests.ts
@@ -15,7 +15,7 @@ import { jsonFixture } from '@keymanapp/common-test-resources/model-helpers.mjs'
 import { LexicalModelTypes } from '@keymanapp/common-types';
 import { KMWString } from '@keymanapp/web-utils';
 
-import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, generateSubsetId, models, PendingTokenization, SearchPath, traceInsertEdits } from '@keymanapp/lm-worker/test-index';
+import { analyzePathMergesAndSplits, assembleTransforms, buildEdgeWindow, ContextToken, ContextTokenization, EditOperation, EditTuple, ExtendedEditOperation, generateSubsetId, models, TokenizationPath, SearchPath, traceInsertEdits } from '@keymanapp/lm-worker/test-index';
 
 import Transform = LexicalModelTypes.Transform;
 import TrieModel = models.TrieModel;
@@ -96,7 +96,7 @@ describe('ContextTokenization', function() {
     // We _could_ flesh this out a bit more... but it's not really needed for this test.
     const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec);
 
-    let transitionEdits: PendingTokenization = {
+    let transitionEdits: TokenizationPath = {
       alignment: {
         merges: [],
         splits: [],
@@ -129,7 +129,7 @@ describe('ContextTokenization', function() {
     // We _could_ flesh this out a bit more... but it's not really needed for this test.
const edgeWindow = buildEdgeWindow(tokens, emptyTransform, false, testEdgeWindowSpec); - let transitionEdits: PendingTokenization = { + let transitionEdits: TokenizationPath = { alignment: { merges: [], splits: [], diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts index 35c55dec97c..9c4cb9b7d16 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/context/tokenization-subsets.tests.ts @@ -618,8 +618,8 @@ describe('TokenizationSubsetBuilder', function() { assert.equal(subsetBuilder.subsets.size, 1); // All transforms have similar impacts. const subset = [...subsetBuilder.subsets.values()][0]; - assert.equal(subset.pendingSet.size, 1); // Built from only one tokenization - assert.deepEqual(subset.pendingSet.get(baseTokenization).inputs, + assert.equal(subset.transitionPaths.size, 1); // Built from only one tokenization + assert.deepEqual(subset.transitionPaths.get(baseTokenization).inputs, inputDistribution.map((sample) => { const map = new Map(); map.set(0, sample.sample); @@ -646,18 +646,18 @@ describe('TokenizationSubsetBuilder', function() { assert.equal(subsetBuilder.subsets.size, 2); // All transforms have similar impacts. const subsets = [...subsetBuilder.subsets.values()]; - subsets.forEach((subset) => assert.equal(subset.pendingSet.size, 1)); // Built from only one tokenization + subsets.forEach((subset) => assert.equal(subset.transitionPaths.size, 1)); // Built from only one tokenization const distributionWithoutWhitespace = inputDistribution.slice(0, inputDistribution.length-1); - const extendingSubset = subsets.find((subset) => subset.pendingSet.get(baseTokenization).inputs.length > 1); - assert.deepEqual(extendingSubset.pendingSet.get(baseTokenization).inputs, + const extendingSubset = subsets.find((subset) => subset.transitionPaths.get(baseTokenization).inputs.length > 1); + assert.deepEqual(extendingSubset.transitionPaths.get(baseTokenization).inputs, distributionWithoutWhitespace.map((sample) => { const map = new Map(); map.set(0, sample.sample); return { sample: map, p: sample.p }; })); - const whitespaceSubset = subsets.find((subset) => subset.pendingSet.get(baseTokenization).inputs.length == 1); + const whitespaceSubset = subsets.find((subset) => subset.transitionPaths.get(baseTokenization).inputs.length == 1); const whitespaceSample = inputDistribution[inputDistribution.length - 1]; const expectedWhitespaceTransformTokenization = { sample: (() => { @@ -670,7 +670,7 @@ describe('TokenizationSubsetBuilder', function() { })(), p: whitespaceSample.p }; - assert.deepEqual(whitespaceSubset.pendingSet.get(baseTokenization).inputs, [expectedWhitespaceTransformTokenization]); + assert.deepEqual(whitespaceSubset.transitionPaths.get(baseTokenization).inputs, [expectedWhitespaceTransformTokenization]); }); it("builds different subsets for transforms resulting in different total lengths and token count", () => { @@ -698,54 +698,54 @@ describe('TokenizationSubsetBuilder', function() { const subsets = [...subsetBuilder.subsets.values()]; const sameTokenLen4Subset = subsets.find((subset) => { - const dataForSet = subset.pendingSet.get(baseTokenization); + const dataForSet = subset.transitionPaths.get(baseTokenization); const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 
0); // Thanks, floating-point precision. // Should land both the 'é' (delete 1) and empty-string transform (that lacks deletes) return Math.abs(totalMass - .45) < 1e-8; }); assert.isOk(sameTokenLen4Subset); - assert.equal(sameTokenLen4Subset.pendingSet.get(baseTokenization).inputs.length, 2); + assert.equal(sameTokenLen4Subset.transitionPaths.get(baseTokenization).inputs.length, 2); const sameTokenLen5Subset = subsets.find((subset) => { - const dataForSet = subset.pendingSet.get(baseTokenization); + const dataForSet = subset.transitionPaths.get(baseTokenization); const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0); // Thanks, floating-point precision. // Should land both the 't' and 's' transforms: adds 1 char, deletes none return Math.abs(totalMass - .35) < 1e-8; }); assert.isOk(sameTokenLen5Subset); - assert.equal(sameTokenLen5Subset.pendingSet.get(baseTokenization).inputs.length, 2); + assert.equal(sameTokenLen5Subset.transitionPaths.get(baseTokenization).inputs.length, 2); const sameTokenLen3Subset = subsets.find((subset) => { - const dataForSet = subset.pendingSet.get(baseTokenization); + const dataForSet = subset.transitionPaths.get(baseTokenization); const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0); // Thanks, floating-point precision. // Should land the backspace transform. return Math.abs(totalMass - .1) < 1e-8; }); assert.isOk(sameTokenLen3Subset); - assert.equal(sameTokenLen3Subset.pendingSet.get(baseTokenization).inputs.length, 1); + assert.equal(sameTokenLen3Subset.transitionPaths.get(baseTokenization).inputs.length, 1); const plusOneTokenSubset = subsets.find((subset) => { - const dataForSet = subset.pendingSet.get(baseTokenization); + const dataForSet = subset.transitionPaths.get(baseTokenization); const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0); // Thanks, floating-point precision. // Should land the backspace transform. return Math.abs(totalMass - .08) < 1e-8; }); assert.isOk(plusOneTokenSubset); - assert.equal(plusOneTokenSubset.pendingSet.get(baseTokenization).inputs.length, 1); + assert.equal(plusOneTokenSubset.transitionPaths.get(baseTokenization).inputs.length, 1); const plusTwoTokensSubset = subsets.find((subset) => { - const dataForSet = subset.pendingSet.get(baseTokenization); + const dataForSet = subset.transitionPaths.get(baseTokenization); const totalMass = dataForSet.inputs.reduce((accum, curr) => accum + curr.p, 0); // Thanks, floating-point precision. // Should land the backspace transform. return Math.abs(totalMass - .12) < 1e-8; }); assert.isOk(plusTwoTokensSubset); - assert.equal(plusTwoTokensSubset.pendingSet.get(baseTokenization).inputs.length, 1); + assert.equal(plusTwoTokensSubset.transitionPaths.get(baseTokenization).inputs.length, 1); }); it("places compatible results from separate tokenizations in the same subset after whitespace", () => { @@ -801,7 +801,7 @@ describe('TokenizationSubsetBuilder', function() { // consider their paths separately after the transition. assert.equal(subsetBuilder.subsets.size, 1); // Has entries from two different base tokenizations. 
- assert.equal([...subsetBuilder.subsets.values()][0].pendingSet.size, 2); + assert.equal([...subsetBuilder.subsets.values()][0].transitionPaths.size, 2); }); it("places compatible results from separate tokenizations in the same subset (mid-token)", () => { @@ -865,9 +865,9 @@ describe('TokenizationSubsetBuilder', function() { // sé + an, sea + n: both result in a four-char long token starting at the same point. // Same total amount of .deleteLeft is supported for both variations. - const mergedSubset = subsets.find((subset) => subset.pendingSet.size); + const mergedSubset = subsets.find((subset) => subset.transitionPaths.size); assert.isOk(mergedSubset); - assert.isTrue(mergedSubset.pendingSet.has(twoCharTokenization)); - assert.isTrue(mergedSubset.pendingSet.has(threeCharTokenization)); + assert.isTrue(mergedSubset.transitionPaths.has(twoCharTokenization)); + assert.isTrue(mergedSubset.transitionPaths.has(threeCharTokenization)); }); }); \ No newline at end of file
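
Reviewer note (not part of the patch): for those less familiar with the SearchSpace rework, the following minimal TypeScript sketch illustrates the correspondence the new doc-comments describe. The `*Like` type names and the `toSearchStructure` helper are hypothetical stand-ins for illustration only, not the engine's actual signatures; only the transitionPaths / SearchPath / SearchCluster relationship is taken from the doc-comments above.

// Illustrative stand-ins only; the real types live in tokenization-subsets.ts and search-space.ts.
interface TokenizationPathLike {
  inputSubsetId: number;
}

interface TokenizationSubsetLike {
  key: string;
  // Keyed by source tokenization; each value describes the "path" from that
  // source to the subset's shared destination tokenization.
  transitionPaths: Map<object, TokenizationPathLike>;
}

interface SearchPathLike {
  subsetId: number;
}

interface SearchClusterLike {
  paths: SearchPathLike[];
}

// Each TokenizationPath yields corresponding SearchPath entries (simplified to
// one per path here). When a subset holds paths from more than one source
// tokenization, those SearchPaths are grouped into a SearchCluster; with a
// single source tokenization, plain SearchPaths suffice and the cluster
// overhead is skipped.
function toSearchStructure(subset: TokenizationSubsetLike): SearchPathLike[] | SearchClusterLike {
  const paths = Array.from(subset.transitionPaths.values())
    .map((path) => ({ subsetId: path.inputSubsetId }));
  return paths.length > 1 ? { paths } : paths;
}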