File: context-state.ts
@@ -14,10 +14,12 @@ import { applyTransform } from '@keymanapp/models-templates';
import { KMWString } from '@keymanapp/web-utils';

import { ContextToken } from './context-token.js';
import { ContextTokenization } from './context-tokenization.js';
import { ContextTokenization, determineTaillessTrueKeystroke } from './context-tokenization.js';
import { ContextTransition } from './context-transition.js';
import { determineModelTokenizer } from '../model-helpers.js';
import { legacySubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js';
import { SearchCluster } from './search-cluster.js';
import { SearchPath } from './search-path.js';
import { precomputationSubsetKeyer, TokenizationSubsetBuilder } from './tokenization-subsets.js';
import TransformUtils from '../transformUtils.js';

import Context = LexicalModelTypes.Context;
@@ -45,7 +47,7 @@ export class ContextState {
/**
* Denotes the most likely tokenization for the represented Context.
*/
tokenization: ContextTokenization;
tokenizations: ContextTokenization[];

/**
* Denotes the keystroke-sourced Transform that was last applied to a
@@ -118,13 +120,13 @@ export class ContextState {
* @param tokenization Precomputed tokenization for the context, leveraging previous
* correction-search progress and results
*/
constructor(context: Context, model: LexicalModel, tokenization?: ContextTokenization);
constructor(param1: Context | ContextState, model?: LexicalModel, tokenization?: ContextTokenization) {
constructor(context: Context, model: LexicalModel, tokenizations?: ContextTokenization[]);
constructor(param1: Context | ContextState, model?: LexicalModel, tokenizations?: ContextTokenization[]) {
if(!(param1 instanceof ContextState)) {
this.context = param1;
this.model = model;
if(tokenization) {
this.tokenization = tokenization;
if(tokenizations) {
this.tokenizations = tokenizations;
} else {
this.initFromReset();
}
@@ -133,7 +135,7 @@

Object.assign(this, stateToClone);
this.inputTransforms = new Map(stateToClone.inputTransforms);
this.tokenization = new ContextTokenization(stateToClone.tokenization);
this.tokenizations = stateToClone.tokenizations.map(t => new ContextTokenization(t));

// A shallow copy of the array is fine, but we'd be best off
// not aliasing the array itself.
@@ -164,7 +166,7 @@
if(baseTokens.length == 0) {
baseTokens.push(new ContextToken(this.model));
}
this.tokenization = new ContextTokenization(baseTokens);
this.tokenizations = [new ContextTokenization(baseTokens)];
this.inputTransforms = new Map();
}

@@ -198,19 +200,25 @@
appliedSuggestionId?: number
): ContextTransition {
const lexicalModel = this.model;

const trueInput = transformDistribution[0].sample;

// Determine the best probability from among ALL available inputs, before they're split
// into subsets.
const bestProb = transformDistribution.reduce((best, cur) => best < cur.p ? cur.p : best, 0);
const transition = new ContextTransition(this, this.appliedInput?.id);

// From here on, we work toward the common-case - re-using old info when
// context (and its tokenization) is changed by an input Transform.

let trueInputSubsetKey: string;
const slideUpdateTransform = determineContextSlideTransform(this.context, context);

// Goal: allow multiple base tokenizations.
const startTokenizations = [this.tokenization];
const startTokenizationsAfterSlide = startTokenizations.map(t => t.applyContextSlide(lexicalModel, slideUpdateTransform));
const startTokenizations: Set<ContextTokenization> = new Set();
const keyedTokenizations: Map<string, ContextTokenization> = new Map();
this.tokenizations.forEach(t => {
const slidTokenization = t.applyContextSlide(lexicalModel, slideUpdateTransform);
startTokenizations.add(slidTokenization);
keyedTokenizations.set(t.clusteringKey, slidTokenization);
});
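The `keyedTokenizations` map indexes each slid tokenization by the `clusteringKey` getter added to `ContextTokenization` later in this diff; keys collide exactly when two tokenizations agree on every token's source range and codepoint length. A minimal sketch of key-based collapsing under that assumption (hypothetical helper, not part of this diff):

```ts
// Hypothetical helper: collapse candidate tokenizations that share a
// clusteringKey, keeping one representative per key.
function dedupeByClusteringKey(candidates: ContextTokenization[]): ContextTokenization[] {
  const byKey = new Map<string, ContextTokenization>();
  for(const t of candidates) {
    if(!byKey.has(t.clusteringKey)) {
      byKey.set(t.clusteringKey, t);
    }
  }
  return [...byKey.values()];
}
```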

// Easy case - no net change to the tokenizations whatsoever; the actual request
// aims to save-state the most recent results.
@@ -220,38 +228,92 @@
// If the tokenizations match, clone the ContextState; we want to preserve a post-application
// context separately from pre-application contexts for predictions based on empty roots.
const state = new ContextState(this);
state.tokenization = startTokenizationsAfterSlide[0];
state.tokenizations = [...startTokenizations.values()];
transition.finalize(state, transformDistribution);
return transition;
}

const subsetBuilder = new TokenizationSubsetBuilder(legacySubsetKeyer);
for(let baseTokenization of startTokenizationsAfterSlide) {

const subsetBuilder = new TokenizationSubsetBuilder(precomputationSubsetKeyer);
for(let baseTokenization of startTokenizations.values()) {
for(let mass of transformDistribution) {
// Handle the splits and merges early, here.
const tokenizationAnalysis = baseTokenization.mapWhitespacedTokenization(lexicalModel, mass.sample);
subsetBuilder.addPrecomputation(baseTokenization, tokenizationAnalysis, mass.p);
const alignment = tokenizationAnalysis.alignment;

// Pre-process any splits and merges; the resulting tokenization may then
// match other base tokenizations within the same subset, if compatible.
const needsRealignment = (alignment.merges.length > 0 || alignment.splits.length > 0 || alignment.unmappedEdits.length > 0);
const sourceTokenization = needsRealignment ? baseTokenization.realign(alignment) : baseTokenization;

if(mass.sample == trueInput) {
trueInputSubsetKey = subsetBuilder.keyer(tokenizationAnalysis);
}
subsetBuilder.addPrecomputation(sourceTokenization, tokenizationAnalysis, mass.p);
}
}
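Each pass above analyzes one (base tokenization, candidate transform) pair and hands the result to `addPrecomputation`, which buckets entries by the subset keyer so that transforms landing on the same target tokenization share a single transition subset. A sketch of that grouping: `TokenizationPath` and the `transitionPaths` map are real names from this diff, while the subset's internal shape and the `totalProb` accumulator are assumptions:

```ts
// Sketch of the grouping TokenizationSubsetBuilder.addPrecomputation performs.
interface SubsetSketch {
  totalProb: number;  // hypothetical accumulator for the subset's mass
  transitionPaths: Map<ContextTokenization, TokenizationPath>;
}

function addPrecomputationSketch(
  subsets: Map<string, SubsetSketch>,
  keyer: (analysis: TokenizationPath) => string,
  source: ContextTokenization,
  analysis: TokenizationPath,
  p: number
) {
  const key = keyer(analysis);
  let subset = subsets.get(key);
  if(!subset) {
    subset = { totalProb: 0, transitionPaths: new Map() };
    subsets.set(key, subset);
  }
  subset.totalProb += p;
  // One transition path per source tokenization within each subset.
  subset.transitionPaths.set(source, analysis);
}
```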

// And now to (partly) detransform from a multiple-tokenization paradigm.
const trueInputSubset = subsetBuilder.subsets.get(trueInputSubsetKey);
// Right now, we only have one base tokenization, so we just fetch it.
const baseTokenization = startTokenizationsAfterSlide[0];
// For multiple tokenizations, we'd retrieve each, use the "most likely" one as base,
// and then fold all resulting search spaces (on the final token) into one.
const tokenizationAnalysis = trueInputSubset.transitionPaths.get(baseTokenization);
// For all target tokenizations - each transition subset...
const finalTokenizations = [...subsetBuilder.subsets.values()].map((subset) => {
// Iterate over all _source_ tokenizations and the changes used to transition them
// to that target tokenization.
const transitionSets = [...subset.transitionPaths.entries()];
const isolatedSubsetResults = transitionSets.map((precomp) => {
const rootTokenization = precomp[0];

// Determine the best probability from among ALL available inputs, before they're split
// into subsets.
const bestProb = transformDistribution.reduce((best, curr) => Math.max(best, curr.p), 0);
// Should gain one per subsetBuilder.subsets entry.
const realignedTokenization = baseTokenization.realign(tokenizationAnalysis.alignment);
const resultTokenization = realignedTokenization.evaluateTransition(tokenizationAnalysis, trueInput.id, bestProb, appliedSuggestionId);
return rootTokenization.evaluateTransition(precomp[1], trueInput.id, bestProb, appliedSuggestionId);
});

// Super-easy case: there's only the one tokenization anyway.
if(isolatedSubsetResults.length == 1) {
return isolatedSubsetResults[0];
}

// Assumption: all produced "isolatedSubsetResults" should essentially be
// the same tokenization. That said, tail entries will likely not be
// perfect matches; we need to splice them together, without duplicates.
// We also cannot rely on tokens before the standard tail index remaining
// unmodified; merges and splits may have been applied earlier in the
// sequence.

const tokenCount = isolatedSubsetResults[0].tokens.length;
if(isolatedSubsetResults.some(sr => sr.tokens.length != tokenCount)) {
throw new Error("Assumption invalidated: incoming tokenization paths do not converge");
}

const finalizedTokenization: ContextToken[] = [];
for(let i = 0; i < tokenCount; i++) {
const spaceSet: Set<SearchPath> = new Set();
let isWhitespace = true;
let isPartial = false;

isolatedSubsetResults.map((sr) => sr.tokens[i]).forEach((token) => {
const searchSpace = token.searchSpace;
isWhitespace &&= token.isWhitespace;
isPartial ||= token.isPartial;

if(searchSpace instanceof SearchPath) {
spaceSet.add(searchSpace);
} else if(searchSpace instanceof SearchCluster) {
searchSpace.parents.forEach(p => spaceSet.add(p));
} else {
throw new Error("Cannot handle unknown SearchSpace type");
}
});

const setVals = [...spaceSet.values()];
const finalizedSpace = setVals.length > 1 ? new SearchCluster(setVals) : setVals[0];

const token = new ContextToken(finalizedSpace);
token.isWhitespace = isWhitespace;
token.isPartial = isPartial;

finalizedTokenization.push(token);
}

return new ContextTokenization(
finalizedTokenization,
transitionSets[0][1],
determineTaillessTrueKeystroke(transitionSets[0][1])
);
});
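The splice above merges the parallel subset results token-by-token: for each slot, distinct `SearchPath`s are unioned under a `SearchCluster`, and pre-existing clusters are flattened through their `parents` so clusters never nest. A distilled restatement of that per-slot union (hypothetical helper assembled from the same operations used above):

```ts
// Sketch: union per-token search spaces taken from parallel subset results.
function mergeSearchSpaces(spaces: SearchSpace[]): SearchSpace {
  const paths = new Set<SearchPath>();
  for(const space of spaces) {
    if(space instanceof SearchPath) {
      paths.add(space);
    } else if(space instanceof SearchCluster) {
      // Flatten: absorb an existing cluster's parents directly.
      space.parents.forEach(p => paths.add(p));
    } else {
      throw new Error("Cannot handle unknown SearchSpace type");
    }
  }
  const unique = [...paths.values()];
  return unique.length > 1 ? new SearchCluster(unique) : unique[0];
}
```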

// ------------

@@ -261,17 +323,26 @@ export class ContextState {
// epic/dict-breaker: if ANY decently-likely tokenization satisfies this, we still
// have a reasonable candidate for display of a delayed reversion. (Not 'all' -
// 'any'.)
const tokens = resultTokenization.tokens;
const lastIndex = tokens.length - 1;
// Ignore a context-final empty '' token; the interesting one is what comes before.
const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
const appliedSuggestionTransitionId = nonEmptyTail?.appliedTransitionId;

const state = new ContextState(applyTransform(trueInput, context), lexicalModel);
state.tokenization = new ContextTokenization(resultTokenization.tokens, tokenizationAnalysis, resultTokenization.taillessTrueKeystroke);
// Set tokenizations from above.
// TODO:
// - sort by most .tail.searchSpace.bestExample.p?
// - threshold to the N most likely tokenizations?
state.tokenizations = finalTokenizations;
state.appliedInput = transformDistribution?.[0].sample;
transition.finalize(state, transformDistribution);
transition.revertableTransitionId = appliedSuggestionTransitionId;

// Maybe sort the tokenizations in some manner, first?
transition.revertableTransitionId = state.tokenizations.map((tokenization) => {
const tokens = tokenization.tokens;
const lastIndex = tokens.length - 1;
// Ignore a context-final empty '' token; the interesting one is what comes before.
const nonEmptyTail = !tokens[lastIndex].isEmptyToken ? tokens[lastIndex] : tokens[lastIndex - 1];
return nonEmptyTail?.appliedTransitionId;
}).find((transitionId) => {
return transitionId !== undefined;
});
return transition;
}
}
File: context-token.ts
@@ -10,7 +10,7 @@
import { LexicalModelTypes } from '@keymanapp/common-types';

import { SearchPath } from "./search-path.js";
import { SearchSpace, PathInputProperties } from "./search-space.js";
import { isSearchSpace, SearchSpace } from "./search-space.js";
import { TokenSplitMap } from "./context-tokenization.js";
import { generateSubsetId } from './tokenization-subsets.js';

@@ -69,7 +69,7 @@ export class ContextToken {
* Constructs a new, empty instance for use with the specified LexicalModel.
* @param model
*/
constructor(model: LexicalModel);
constructor(model: SearchSpace | LexicalModel);
/**
* Constructs a new instance with pre-existing text for use with the specified LexicalModel.
* @param model
@@ -81,7 +81,7 @@
* @param baseToken
*/
constructor(baseToken: ContextToken);
constructor(param: ContextToken | LexicalModel, rawText?: string, isPartial?: boolean) {
constructor(param: ContextToken | SearchSpace | LexicalModel, rawText?: string, isPartial?: boolean) {
if(param instanceof ContextToken) {
const priorToken = param;
Object.assign(this, priorToken);
@@ -92,7 +92,7 @@
// we need to ensure that only fully-utilized keystrokes are considered.
this._searchSpace = priorToken.searchSpace;
} else {
const model = param;
const baseSpace = isSearchSpace(param) ? param as SearchSpace : new SearchPath(param as LexicalModel);

// May be altered outside of the constructor.
this.isWhitespace = false;
@@ -105,7 +105,7 @@
return [{sample: transform, p: 1.0}];
});

let searchSpace = new SearchPath(model);
let searchSpace = baseSpace;

rawTransformDistributions.forEach((entry) => {
searchSpace = new SearchPath(searchSpace, entry, {
@@ -122,14 +122,6 @@
}
}

/**
* Call this to record the original keystroke Transforms for the context range
* corresponding to this token.
*/
addInput(inputSource: PathInputProperties, distribution: Distribution<Transform>) {
this._searchSpace = new SearchPath(this._searchSpace, distribution, inputSource);
}

get inputCount() {
return this._searchSpace.inputCount;
}
File: context-tokenization.ts
@@ -9,6 +9,7 @@

import { LexicalModelTypes } from '@keymanapp/common-types';
import { KMWString } from '@keymanapp/web-utils';
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

import { ContextToken } from './context-token.js';
import TransformUtils from '../transformUtils.js';
@@ -17,6 +18,7 @@ import { determineModelTokenizer } from '../model-helpers.js';
import { ExtendedEditOperation, SegmentableDistanceCalculation } from './segmentable-calculation.js';
import { TokenizationPath } from './tokenization-subsets.js';
import { PathInputProperties } from './search-space.js';
import { SearchPath } from './search-path.js';

import LexicalModel = LexicalModelTypes.LexicalModel;
import Transform = LexicalModelTypes.Transform;
@@ -130,7 +132,7 @@ export class ContextTokenization {

constructor(priorToClone: ContextTokenization);
constructor(tokens: ContextToken[]);
constructor(tokens: ContextToken[], alignment: TokenizationPath, taillessTrueKeystroke: Transform);
constructor(tokens: ContextToken[], tokenizationPath: TokenizationPath, taillessTrueKeystroke: Transform);
constructor(
param1: ContextToken[] | ContextTokenization,
tokenizationPath?: TokenizationPath,
@@ -599,11 +601,6 @@
}

affectedToken.isPartial = true;
if(appliedSuggestionId !== undefined) {
affectedToken.appliedTransitionId = appliedSuggestionId;
} else {
delete affectedToken.appliedTransitionId;
}

// If we are completely replacing a token via delete left, erase the deleteLeft;
// that part applied to a _previous_ token that no longer exists.
@@ -625,11 +622,18 @@
inputSource.segment.end = appliedLength;
}

affectedToken = new ContextToken(affectedToken);
affectedToken.addInput(inputSource, distribution);
const searchPath = new SearchPath(affectedToken.searchSpace, distribution, inputSource); // the token generally holds the current SearchSpace... at present.
affectedToken = new ContextToken(searchPath);

if(appliedSuggestionId !== undefined) {
affectedToken.appliedTransitionId = appliedSuggestionId;
} else {
delete affectedToken.appliedTransitionId;
}

const tokenize = determineModelTokenizer(lexicalModel);
affectedToken.isWhitespace = tokenize({left: affectedToken.exampleInput, startOfBuffer: false, endOfBuffer: false}).left[0]?.isWhitespace ?? false;

// Do not re-use the previous token; the mutation may have unexpected
// results (say, in unit-testing)
tailTokenization[tokenIndex] = affectedToken;
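With `ContextToken.addInput` removed (see context-token.ts above), recording an input now means layering a fresh `SearchPath` over the token's current search space and wrapping the result in a brand-new `ContextToken`, rather than mutating the existing instance. A minimal sketch of that pattern; `withInput` is hypothetical and the flag propagation is illustrative:

```ts
// Sketch: extend a token by one input without mutating the original token.
function withInput(
  token: ContextToken,
  distribution: Distribution<Transform>,
  inputSource: PathInputProperties
): ContextToken {
  const extendedSpace = new SearchPath(token.searchSpace, distribution, inputSource);
  const next = new ContextToken(extendedSpace);
  next.isWhitespace = token.isWhitespace;  // flags may be recomputed by callers
  next.isPartial = token.isPartial;
  return next;
}
```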
@@ -643,6 +647,11 @@
determineTaillessTrueKeystroke(tokenizationPath)
);
}

get clusteringKey(): string {
// Note: SENTINEL_CODE_UNIT is not leveraged by SearchPath.sourceRangeKey.
return this.tokens.map(t => `${t.sourceRangeKey}L${t.searchSpace.codepointLength}`).join(SENTINEL_CODE_UNIT);
}
}

const appendText = (full: string, current: string) => full + current;
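The `clusteringKey` getter digests each token as `sourceRangeKey`, a literal `L`, and the token's codepoint length, then joins the digests with `SENTINEL_CODE_UNIT`; since that code unit never appears inside a range key, equal composite keys imply token-for-token agreement. An illustration of the key's shape, with invented range-key values:

```ts
import { SENTINEL_CODE_UNIT } from '@keymanapp/models-templates';

// Hypothetical digests for a three-token context such as ["hello", " ", "wor"];
// real sourceRangeKey values are internal to SearchPath, so only the
// structure shown here matters.
const tokenDigests = ['0-4L5', '5-5L1', '6-8L3'];
const exampleKey = tokenDigests.join(SENTINEL_CODE_UNIT);
// Two tokenizations may be clustered together exactly when their keys match.
```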
File: context-transition.ts
@@ -139,6 +139,12 @@ export class ContextTransition {
transform: Transform,
appliedTransitionId: number
) => {
// TODO: add NEW tokenization based on base tokenization + suggestion.
// Ensure it's the "most likely" in some sense.
//
// Issue: suggestions do not currently track their base spaceId - their source.
// Cannot reference-equality check due to inter-thread communication.
// How can we best remember the suggestion's original source tokenization?
const state = baseState.analyzeTransition(
baseState.context,
[{sample: transform, p: 1}],