File: context-state.ts
@@ -14,10 +14,12 @@ import { applyTransform } from '@keymanapp/models-templates';
import { KMWString } from '@keymanapp/web-utils';

import { ContextToken } from './context-token.js';
import { ContextTokenization } from './context-tokenization.js';
import { ContextTokenization, determineTaillessTrueKeystroke } from './context-tokenization.js';
import { ContextTransition } from './context-transition.js';
import { determineModelTokenizer } from '../model-helpers.js';
import { legacySubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js';
import { SearchCluster } from './search-cluster.js';
import { SearchPath } from './search-path.js';
import { precomputationSubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js';
import TransformUtils from '../transformUtils.js';

import Context = LexicalModelTypes.Context;
@@ -45,7 +47,7 @@ export class ContextState {
/**
* Denotes the most likely tokenization for the represented Context.
*/
tokenization: ContextTokenization;
tokenizations: ContextTokenization[];

/**
* Denotes the keystroke-sourced Transform that was last applied to a
@@ -118,13 +120,13 @@ export class ContextState {
* @param tokenization Precomputed tokenization for the context, leveraging previous
* correction-search progress and results
*/
constructor(context: Context, model: LexicalModel, tokenization?: ContextTokenization);
constructor(param1: Context | ContextState, model?: LexicalModel, tokenization?: ContextTokenization) {
constructor(context: Context, model: LexicalModel, tokenizations?: ContextTokenization[]);
constructor(param1: Context | ContextState, model?: LexicalModel, tokenizations?: ContextTokenization[]) {
if(!(param1 instanceof ContextState)) {
this.context = param1;
this.model = model;
if(tokenization) {
this.tokenization = tokenization;
if(tokenizations) {
this.tokenizations = tokenizations;
} else {
this.initFromReset();
}
@@ -133,7 +135,7 @@

Object.assign(this, stateToClone);
this.inputTransforms = new Map(stateToClone.inputTransforms);
this.tokenization = new ContextTokenization(stateToClone.tokenization);
this.tokenizations = stateToClone.tokenizations.map(t => new ContextTokenization(t));

// A shallow copy of the array is fine, but we'd be best off
// not aliasing the array itself.
@@ -164,7 +166,7 @@
if(baseTokens.length == 0) {
baseTokens.push(new ContextToken(this.model));
}
this.tokenization = new ContextTokenization(baseTokens);
this.tokenizations = [new ContextTokenization(baseTokens)];
this.inputTransforms = new Map();
}

@@ -198,19 +200,25 @@
appliedSuggestionId?: number
): ContextTransition {
const lexicalModel = this.model;

const trueInput = transformDistribution[0].sample;

// Determine the best probability from among ALL available inputs, before they're split
// into subsets.
const bestProb = transformDistribution.reduce((best, cur) => best < cur.p ? cur.p : best, 0);
const transition = new ContextTransition(this, this.appliedInput?.id);

// From here on, we work toward the common-case - re-using old info when
// context (and its tokenization) is changed by an input Transform.

let trueInputSubsetKey: string;
const slideUpdateTransform = determineContextSlideTransform(this.context, context);

// Goal: allow multiple base tokenizations.
const startTokenizations = [this.tokenization];
const startTokenizationsAfterSlide = startTokenizations.map(t => t.applyContextSlide(lexicalModel, slideUpdateTransform));
const startTokenizations: Set<ContextTokenization> = new Set();
const keyedTokenizations: Map<string, ContextTokenization> = new Map();
this.tokenizations.forEach(t => {
const slidTokenization = t.applyContextSlide(lexicalModel, slideUpdateTransform);
startTokenizations.add(slidTokenization);
keyedTokenizations.set(t.clusteringKey, slidTokenization);
});
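The `keyedTokenizations` map indexes each slid tokenization by the `clusteringKey` getter added to `ContextTokenization` later in this diff; keys collide exactly when two tokenizations agree on every token's source range and codepoint length. A minimal sketch of key-based collapsing under that assumption (hypothetical helper, not part of this diff):

```ts
// Hypothetical helper: collapse candidate tokenizations that share a
// clusteringKey, keeping one representative per key.
function dedupeByClusteringKey(candidates: ContextTokenization[]): ContextTokenization[] {
  const byKey = new Map<string, ContextTokenization>();
  for(const t of candidates) {
    if(!byKey.has(t.clusteringKey)) {
      byKey.set(t.clusteringKey, t);
    }
  }
  return [...byKey.values()];
}
```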

// Easy case - no net change to the tokenizations whatsoever; the actual request
// aims to save-state the most recent results.
@@ -220,38 +228,92 @@
// If the tokenizations match, clone the ContextState; we want to preserve a post-application
// context separately from pre-application contexts for predictions based on empty roots.
const state = new ContextState(this);
state.tokenization = startTokenizationsAfterSlide[0];
state.tokenizations = [...startTokenizations.values()];
transition.finalize(state, transformDistribution);
return transition;
}

const subsetBuilder = new TokenizationSubsetBuilder(legacySubsetKeyer);
for(let baseTokenization of startTokenizationsAfterSlide) {

const subsetBuilder = new TokenizationSubsetBuilder(precomputationSubsetKeyer);
for(let baseTokenization of startTokenizations.values()) {
for(let mass of transformDistribution) {
// Handle the splits and merges early, here.
const tokenizationAnalysis = baseTokenization.mapWhitespacedTokenization(lexicalModel, mass.sample);
subsetBuilder.addPrecomputation(baseTokenization, tokenizationAnalysis, mass.p);
const alignment = tokenizationAnalysis.alignment;

// Pre-process any splits and merges; the resulting tokenization may then
// match other base tokenizations within the same subset, if compatible.
const needsRealignment = (alignment.merges.length > 0 || alignment.splits.length > 0 || alignment.unmappedEdits.length > 0);
const sourceTokenization = needsRealignment ? baseTokenization.realign(alignment) : baseTokenization;

if(mass.sample == trueInput) {
trueInputSubsetKey = subsetBuilder.keyer(tokenizationAnalysis);
}
subsetBuilder.addPrecomputation(sourceTokenization, tokenizationAnalysis, mass.p);
}
}
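Each pass above analyzes one (base tokenization, candidate transform) pair and hands the result to `addPrecomputation`, which buckets entries by the subset keyer so that transforms landing on the same target tokenization share a single transition subset. A sketch of that grouping: `TokenizationPath` and the `transitionPaths` map are real names from this diff, while the subset's internal shape and the `totalProb` accumulator are assumptions:

```ts
// Sketch of the grouping TokenizationSubsetBuilder.addPrecomputation performs.
interface SubsetSketch {
  totalProb: number;  // hypothetical accumulator for the subset's mass
  transitionPaths: Map<ContextTokenization, TokenizationPath>;
}

function addPrecomputationSketch(
  subsets: Map<string, SubsetSketch>,
  keyer: (analysis: TokenizationPath) => string,
  source: ContextTokenization,
  analysis: TokenizationPath,
  p: number
) {
  const key = keyer(analysis);
  let subset = subsets.get(key);
  if(!subset) {
    subset = { totalProb: 0, transitionPaths: new Map() };
    subsets.set(key, subset);
  }
  subset.totalProb += p;
  // One transition path per source tokenization within each subset.
  subset.transitionPaths.set(source, analysis);
}
```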

// And now to (partly) detransform from a multiple-tokenization paradigm.
const trueInputSubset = subsetBuilder.subsets.get(trueInputSubsetKey);
// Right now, we only have one base tokenization, so we just fetch it.
const baseTokenization = startTokenizationsAfterSlide[0];
// For multiple tokenizations, we'd retrieve each, use the "most likely" one as base,
// and then fold all resulting search spaces (on the final token) into one.
const tokenizationAnalysis = trueInputSubset.transitionPaths.get(baseTokenization);
// For all target tokenizations - each transition subset...
const finalTokenizations = [...subsetBuilder.subsets.values()].map((subset) => {
// Iterate over all _source_ tokenizations and the changes used to transition them
// to that target tokenization.
const transitionSets = [...subset.transitionPaths.entries()];
const isolatedSubsetResults = transitionSets.map((precomp) => {
const rootTokenization = precomp[0];

// Determine the best probability from among ALL available inputs, before they're split
// into subsets.
const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0);
// Should gain one per subsetBuilder.subsets entry.
const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment);
const resultTokenization = realignedTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId);
return rootTokenization.evaluateTransition(precomp[1], trueInput.id, bestProb, appliedSuggestionId);
});

// Super-easy case: there's only the one tokenization anyway.
if(isolatedSubsetResults.length == 1) {
return isolatedSubsetResults[0];
}

// Assumption: all produced "isolatedSubsetResults" should essentially be
// the same tokenization. That said, tail entries will likely not be
// perfect matches; we need to splice them together, without duplicates.
// We also cannot rely on tokens before the standard tail index remaining
// unmodified; merges and splits may have been applied earlier in the
// sequence.

const tokenCount = isolatedSubsetResults[0].tokens.length;
if(isolatedSubsetResults.some(sr => sr.tokens.length != tokenCount)) {
throw new Error("Assumption invalidated: incoming tokenization paths do not converge");
}

const finalizedTokenization: ContextToken[] = [];
for(let i = 0; i < tokenCount; i++) {
const spaceSet: Set<SearchPath> = new Set();
let isWhitespace = true;
let isPartial = false;

isolatedSubsetResults.map((sr) => sr.tokens[i]).forEach((token) => {
const searchSpace = token.searchSpace;
isWhitespace &&= token.isWhitespace;
isPartial ||= token.isPartial;

if(searchSpace instanceof SearchPath) {
spaceSet.add(searchSpace);
} else if(searchSpace instanceof SearchCluster) {
searchSpace.parents.forEach(p => spaceSet.add(p));
} else {
throw new Error("Cannot handle unknown SearchSpace type");
}
});

const setVals = [...spaceSet.values()];
const finalizedSpace = setVals.length > 1 ? new SearchCluster(setVals) : setVals[0];

const token = new ContextToken(finalizedSpace);
token.isWhitespace = isWhitespace;
token.isPartial = isPartial;

finalizedTokenization.push(token);
}

return new ContextTokenization(
finalizedTokenization,
transitionSets[0][1],
determineTaillessTrueKeystroke(transitionSets[0][1])
);
});
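The splice above merges the parallel subset results token-by-token: for each slot, distinct `SearchPath`s are unioned under a `SearchCluster`, and pre-existing clusters are flattened through their `parents` so clusters never nest. A distilled restatement of that per-slot union (hypothetical helper assembled from the same operations used above):

```ts
// Sketch: union per-token search spaces taken from parallel subset results.
function mergeSearchSpaces(spaces: SearchSpace[]): SearchSpace {
  const paths = new Set<SearchPath>();
  for(const space of spaces) {
    if(space instanceof SearchPath) {
      paths.add(space);
    } else if(space instanceof SearchCluster) {
      // Flatten: absorb an existing cluster's parents directly.
      space.parents.forEach(p => paths.add(p));
    } else {
      throw new Error("Cannot handle unknown SearchSpace type");
    }
  }
  const unique = [...paths.values()];
  return unique.length > 1 ? new SearchCluster(unique) : unique[0];
}
```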

// ------------

@@ -261,17 +323,26 @@ export class ContextState {
// epic/dict-breaker: if ANY decently-likely tokenization satisfies this, we still
// have a reasonable candidate for display of a delayed reversion. (Not 'all' -
// 'any'.)
const tokens = resultTokenization.tokens;
const lastIndex = tokens.length - 1;
// Ignore a context-final empty '' token; the interesting one is what comes before.
const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
const appliedSuggestionTransitionId = nonEmptyTail?.appliedTransitionId;

const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
// Set tokenizations from above.
// TODO:
// - sort by most .tail.searchSpace.bestExample.p?
// - threshold to the N most likely tokenizations?
state.tokenizations = finalTokenizations;
state.appliedInput = transformDistribution?.[0].sample;
transition.finalize(state, transformDistribution);
transition.revertableTransitionId = appliedSuggestionTransitionId;

// Maybe sort the tokenizations in some manner, first?
transition.revertableTransitionId = state.tokenizations.map((tokenization) => {
const tokens = tokenization.tokens;
const lastIndex = tokens.length - 1;
// Ignore a context-final empty '' token; the interesting one is what comes before.
const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
return nonEmptyTail?.appliedTransitionId;
}).find((transitionId) => {
return transitionId !== undefined;
});
return transition;
}
}
File: context-token.ts
@@ -10,7 +10,7 @@
import { LexicalModelTypes } from '@keymanapp/common-types';

import { SearchPath } from "./search-path.js";
import { SearchSpace, PathInputProperties } from "./search-space.js";
import { isSearchSpace, SearchSpace } from "./search-space.js";
import { TokenSplitMap } from "./context-tokenization.js";
import { generateSubsetId } from './tokenization-subsets.js';

@@ -69,7 +69,7 @@ export class ContextToken {
* Constructs a new, empty instance for use with the specified LexicalModel.
* @param model
*/
constructor(model: LexicalModel);
constructor(model: SearchSpace | LexicalModel);
/**
* Constructs a new instance with pre-existing text for use with the specified LexicalModel.
* @param model
@@ -81,7 +81,7 @@
* @param baseToken
*/
constructor(baseToken: ContextToken);
constructor(param: ContextToken | LexicalModel, rawText?: string, isPartial?: boolean) {
constructor(param: ContextToken | SearchSpace | LexicalModel, rawText?: string, isPartial?: boolean) {
if(param instanceof ContextToken) {
const priorToken = param;
Object.assign(this, priorToken);
@@ -92,7 +92,7 @@
// we need to ensure that only fully-utilized keystrokes are considered.
this._searchSpace = priorToken.searchSpace;
} else {
const model = param;
const baseSpace = isSearchSpace(param) ? param as SearchSpace : new SearchPath(param as LexicalModel);

// May be altered outside of the constructor.
this.isWhitespace = false;
@@ -105,7 +105,7 @@
return [{sample: transform, p: 1.0}];
});

let searchSpace = new SearchPath(model);
let searchSpace = baseSpace;

rawTransformDistributions.forEach((entry) => {
searchSpace = new SearchPath(searchSpace, entry, {
@@ -122,14 +122,6 @@
}
}

/**
* Call this to record the original keystroke Transforms for the context range
* corresponding to this token.
*/
addInput(inputSource: PathInputProperties, distribution: Distribution<Transform>) {
this._searchSpace = new SearchPath(this._searchSpace, distribution, inputSource);
}

get inputCount() {
return this._searchSpace.inputCount;
}
File: context-tokenization.ts
@@ -9,6 +9,7 @@

import { LexicalModelTypes } from '@keymanapp/common-types';
import { KMWString } from '@keymanapp/web-utils';
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

import { ContextToken } from './context-token.js';
import TransformUtils from '../transformUtils.js';
@@ -17,6 +18,7 @@ import { determineModelTokenizer } from '../model-helpers.js';
import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js';
import { TokenizationPath } from './tokenization-subsets.js';
import { PathInputProperties } from './search-space.js';
import { SearchPath } from './search-path.js';

import LexicalModel = LexicalModelTypes.LexicalModel;
import Transform = LexicalModelTypes.Transform;
@@ -130,7 +132,7 @@ export class ContextTokenization {

constructor(priorToClone: ContextTokenization);
constructor(tokens: ContextToken[]);
constructor(tokens: ContextToken[], alignment: TokenizationPath, taillessTrueKeystroke: Transform);
constructor(tokens: ContextToken[], tokenizationPath: TokenizationPath, taillessTrueKeystroke: Transform);
constructor(
param1: ContextToken[] | ContextTokenization,
tokenizationPath?: TokenizationPath,
@@ -599,11 +601,6 @@
}

affectedToken.isPartial = true;
if(appliedSuggestionId !== undefined) {
affectedToken.appliedTransitionId = appliedSuggestionId;
} else {
delete affectedToken.appliedTransitionId;
}

// If we are completely replacing a token via delete left, erase the deleteLeft;
// that part applied to a _previous_ token that no longer exists.
@@ -625,11 +622,18 @@
inputSource.segment.end = appliedLength;
}

affectedToken = new ContextToken(affectedToken);
affectedToken.addInput(inputSource, distribution);
const searchPath = new SearchPath(affectedToken.searchSpace, distribution, inputSource); // the token generally holds the current SearchSpace... at present.
affectedToken = new ContextToken(searchPath);

if(appliedSuggestionId !== undefined) {
affectedToken.appliedTransitionId = appliedSuggestionId;
} else {
delete affectedToken.appliedTransitionId;
}

const tokenize = determineModelTokenizer(lexicalModel);
affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false;

// Do not re-use the previous token; the mutation may have unexpected
// results (say, in unit-testing)
tailTokenization[tokenIndex] = affectedToken;
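With `ContextToken.addInput` removed (see context-token.ts above), recording an input now means layering a fresh `SearchPath` over the token's current search space and wrapping the result in a brand-new `ContextToken`, rather than mutating the existing instance. A minimal sketch of that pattern; `withInput` is hypothetical and the flag propagation is illustrative:

```ts
// Sketch: extend a token by one input without mutating the original token.
function withInput(
  token: ContextToken,
  distribution: Distribution<Transform>,
  inputSource: PathInputProperties
): ContextToken {
  const extendedSpace = new SearchPath(token.searchSpace, distribution, inputSource);
  const next = new ContextToken(extendedSpace);
  next.isWhitespace = token.isWhitespace;  // flags may be recomputed by callers
  next.isPartial = token.isPartial;
  return next;
}
```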
@@ -643,6 +647,11 @@
determineTaillessTrueKeystroke(tokenizationPath)
);
}

get clusteringKey(): string {
// Note: SENTINEL_CODE_UNIT is not leveraged by SearchPath.sourceRangeKey.
return this.tokens.map(t => `${t.sourceRangeKey}L${t.searchSpace.codepointLength}`).join(SENTINEL_CODE_UNIT);
}
}

const appendText = (full: string, current: string) => full + current;
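The `clusteringKey` getter digests each token as `sourceRangeKey`, a literal `L`, and the token's codepoint length, then joins the digests with `SENTINEL_CODE_UNIT`; since that code unit never appears inside a range key, equal composite keys imply token-for-token agreement. An illustration of the key's shape, with invented range-key values:

```ts
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

// Hypothetical digests for a three-token context such as ["hello", " ", "wor"];
// real sourceRangeKey values are internal to SearchPath, so only the
// structure shown here matters.
const tokenDigests = ['0-4L5', '5-5L1', '6-8L3'];
const exampleKey = tokenDigests.join(SENTINEL_CODE_UNIT);
// Two tokenizations may be clustered together exactly when their keys match.
```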
File: context-transition.ts
@@ -139,6 +139,12 @@ export class ContextTransition {
transform: Transform,
appliedTransitionId: number
) => {
// TODO: add NEW tokenization based on base tokenization + suggestion.
// Ensure it's the "most likely" in some sense.
//
// Issue: suggestions do not currently track their base spaceId - their source.
// Cannot reference-equality check due to inter-thread communication.
// How can we best remember the suggestion's original source tokenization?
const state = baseState.analyzeTransition(
baseState.context,
[{sample: transform, p: 1}],