Skip to content

Commit ff4e1c5

Browse files
(tree) Update chunking algorithm to use polymorphic shape for incremental fields (#25400)
Currently, fields that are specified as incremental via the `shouldEncodeIncrementally` callback can be monomorphic causing them to be combined in a uniform chunk with its parent. This will make the field to not be incrementally summarized - When incremental summary builder calls `ChunkedForest::chunkField` for such fields, it will return a new chunk every time causing it to encode the chunk again and not re-using a summary handle for it. This PR changes the chunking algorithm so that it always uses polymorphic shape for nodes / fields that are specified as incremental. This is done by plumbing the `shouldEncodeIncrementally` function to the `tryShapeFromSchema` function which decides the shapes for fields and nodes. If `shouldEncodeIncrementally` returns true, a polymorphic shape is returned. [AB#41866](https://dev.azure.com/fluidframework/235294da-091d-4c29-84fc-cdfc3d90890b/_workitems/edit/41866)
1 parent b286e27 commit ff4e1c5

File tree

20 files changed

+431
-168
lines changed

20 files changed

+431
-168
lines changed

packages/dds/tree/src/feature-libraries/chunked-forest/chunkTree.ts

Lines changed: 112 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,8 @@ import {
1212
type ITreeCursorSynchronous,
1313
LeafNodeStoredSchema,
1414
ObjectNodeStoredSchema,
15-
type StoredSchemaCollection,
1615
type TreeFieldStoredSchema,
1716
type TreeNodeSchemaIdentifier,
18-
type TreeStoredSchema,
1917
type TreeStoredSchemaSubscription,
2018
type TreeValue,
2119
type Value,
@@ -24,6 +22,7 @@ import {
2422
ValueSchema,
2523
type TreeChunk,
2624
tryGetChunk,
25+
type SchemaAndPolicy,
2726
} from "../../core/index.js";
2827
import { getOrCreate } from "../../util/index.js";
2928
import type { FullSchemaPolicy } from "../modular-schema/index.js";
@@ -32,28 +31,38 @@ import { isStableNodeIdentifier } from "../node-identifier/index.js";
3231
import { BasicChunk } from "./basicChunk.js";
3332
import { SequenceChunk } from "./sequenceChunk.js";
3433
import { type FieldShape, TreeShape, UniformChunk } from "./uniformChunk.js";
34+
import type { IncrementalEncodingPolicy } from "./codec/index.js";
3535

3636
export interface Disposable {
3737
/**
3838
* Cleans up resources used by this, such as inbound event registrations.
3939
*/
4040
dispose(): void;
4141
}
42-
4342
/**
4443
* Creates a ChunkPolicy which responds to schema changes.
4544
*/
4645
export function makeTreeChunker(
4746
schema: TreeStoredSchemaSubscription,
4847
policy: FullSchemaPolicy,
48+
shouldEncodeIncrementally: IncrementalEncodingPolicy,
4949
): IChunker {
5050
return new Chunker(
5151
schema,
5252
policy,
5353
defaultChunkPolicy.sequenceChunkInlineThreshold,
5454
defaultChunkPolicy.sequenceChunkInlineThreshold,
5555
defaultChunkPolicy.uniformChunkNodeCount,
56-
tryShapeFromSchema,
56+
(type: TreeNodeSchemaIdentifier, shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo>) =>
57+
tryShapeFromNodeSchema(
58+
{
59+
schema,
60+
policy,
61+
shouldEncodeIncrementally,
62+
shapes,
63+
},
64+
type,
65+
),
5766
);
5867
}
5968

@@ -73,7 +82,7 @@ export interface IChunker extends ChunkPolicy, Disposable {
7382
*
7483
* @remarks
7584
* For example, a schema transitively containing a sequence field, optional field, or allowing multiple child types will be Polymorphic.
76-
* See `tryShapeFromSchema` for how to tell if a type is Polymorphic.
85+
* See `tryShapeFromNodeSchema` for how to tell if a type is Polymorphic.
7786
*
7887
* TODO: cache some of the possible shapes here.
7988
*/
@@ -109,9 +118,7 @@ export class Chunker implements IChunker {
109118
public readonly sequenceChunkInlineThreshold: number,
110119
public readonly uniformChunkNodeCount: number,
111120
// eslint-disable-next-line @typescript-eslint/no-shadow
112-
private readonly tryShapeFromSchema: (
113-
schema: TreeStoredSchema,
114-
policy: FullSchemaPolicy,
121+
private readonly tryShapeFromNodeSchema: (
115122
type: TreeNodeSchemaIdentifier,
116123
shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo>,
117124
) => ShapeInfo,
@@ -126,7 +133,7 @@ export class Chunker implements IChunker {
126133
this.sequenceChunkSplitThreshold,
127134
this.sequenceChunkInlineThreshold,
128135
this.uniformChunkNodeCount,
129-
this.tryShapeFromSchema,
136+
this.tryShapeFromNodeSchema,
130137
);
131138
}
132139

@@ -138,7 +145,7 @@ export class Chunker implements IChunker {
138145
this.unregisterSchemaCallback = this.schema.events.on("afterSchemaChange", () =>
139146
this.schemaChanged(),
140147
);
141-
return this.tryShapeFromSchema(this.schema, this.policy, schema, this.typeShapes);
148+
return this.tryShapeFromNodeSchema(schema, this.typeShapes);
142149
}
143150

144151
public dispose(): void {
@@ -226,75 +233,126 @@ export function makePolicy(policy?: Partial<ChunkPolicy>): ChunkPolicy {
226233
return withDefaults;
227234
}
228235

229-
export function shapesFromSchema(
230-
schema: StoredSchemaCollection,
231-
policy: FullSchemaPolicy,
232-
): Map<TreeNodeSchemaIdentifier, ShapeInfo> {
233-
const shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo> = new Map();
234-
for (const identifier of schema.nodeSchema.keys()) {
235-
tryShapeFromSchema(schema, policy, identifier, shapes);
236-
}
237-
return shapes;
236+
export interface ShapeFromSchemaParameters extends SchemaAndPolicy {
237+
/**
238+
* Policy function to determine if a field should be encoded incrementally.
239+
* Incremental encoding requires the subtree to not start in the middle of a larger uniform chunk.
240+
* Thus returning true from this callback indicates that shapes should not be produced which could
241+
* contain the incremental portion as part of a larger shape.
242+
*/
243+
readonly shouldEncodeIncrementally: IncrementalEncodingPolicy;
244+
/**
245+
* A cache for shapes which may be read and/or updated.
246+
* As the shape is a function of the other members of `ShapeFromSchemaParameters`,
247+
* it must be replaced or cleared if any of the properties other than this cache are modified.
248+
*/
249+
readonly shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo>;
250+
}
251+
252+
/**
253+
* A TreeFieldStoredSchema with some additional context about where it is in the tree.
254+
*/
255+
export interface FieldSchemaWithContext {
256+
/**
257+
* The identifier of the specific field schema to analyze for shape uniformity.
258+
*/
259+
readonly fieldSchema: TreeFieldStoredSchema;
260+
/**
261+
* The identifier of the parent node schema containing this field.
262+
* If undefined, this is a root field.
263+
*/
264+
readonly parentNodeSchema?: TreeNodeSchemaIdentifier;
265+
/**
266+
* The field key/name used to identify this field within the parent node.
267+
*/
268+
readonly key: FieldKey;
238269
}
239270

240271
/**
241-
* If `schema` has only one shape, return it.
272+
* Analyzes a tree node schema to determine if it has a single, uniform shape that can be optimized for chunking.
273+
* If the schema defines a tree structure with a deterministic, fixed shape (no optional fields, no sequences,
274+
* single child types), returns a TreeShape that can be used for efficient uniform chunking. Otherwise,
275+
* returns Polymorphic to indicate the shape varies and should use basic chunking.
276+
*
277+
* @param context - {@link ShapeFromSchemaParameters}.
278+
* @param nodeSchema - The identifier of the specific node schema to analyze for shape uniformity.
279+
* @returns TreeShape if the schema has a uniform shape, or Polymorphic if shape varies.
242280
*
243-
* Note that this does not tolerate optional or sequence fields, nor does it optimize for patterns of specific values.
281+
* @remarks
282+
* The determination here is conservative. `shouldEncodeIncrementally` is used to split up shapes so incrementally
283+
* encoded schema are not part of larger shapes. It also does not tolerate optional or sequence fields, nor does it
284+
* optimize for patterns of specific values.
244285
*/
245-
export function tryShapeFromSchema(
246-
schema: StoredSchemaCollection,
247-
policy: FullSchemaPolicy,
248-
type: TreeNodeSchemaIdentifier,
249-
shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo>,
286+
export function tryShapeFromNodeSchema(
287+
context: ShapeFromSchemaParameters,
288+
nodeSchema: TreeNodeSchemaIdentifier,
250289
): ShapeInfo {
251-
return getOrCreate(shapes, type, () => {
252-
const treeSchema = schema.nodeSchema.get(type) ?? fail(0xaf9 /* missing schema */);
290+
const { schema, shapes } = context;
291+
return getOrCreate(shapes, nodeSchema, () => {
292+
const treeSchema = schema.nodeSchema.get(nodeSchema) ?? fail(0xaf9 /* missing schema */);
253293
if (treeSchema instanceof LeafNodeStoredSchema) {
254294
// Allow all string values (but only string values) to be compressed by the id compressor.
255295
// This allows compressing all compressible identifiers without requiring additional context to know which values could be identifiers.
256296
// Attempting to compress other string shouldn't have significant overhead,
257297
// and if any of them do end up compressing, that's a benefit not a bug.
258298
return treeSchema.leafValue === ValueSchema.String
259-
? new TreeShape(type, true, [], true)
260-
: new TreeShape(type, true, [], false);
299+
? new TreeShape(nodeSchema, true, [], true)
300+
: new TreeShape(nodeSchema, true, [], false);
261301
}
262302
if (treeSchema instanceof ObjectNodeStoredSchema) {
263303
const fieldsArray: FieldShape[] = [];
264-
for (const [key, field] of treeSchema.objectNodeFields) {
265-
const fieldShape = tryShapeFromFieldSchema(schema, policy, field, key, shapes);
304+
for (const [key, fieldSchema] of treeSchema.objectNodeFields) {
305+
const fieldShape = tryShapeFromFieldSchema(context, {
306+
fieldSchema,
307+
parentNodeSchema: nodeSchema,
308+
key,
309+
});
266310
if (fieldShape === undefined) {
267311
return polymorphic;
268312
}
269313
fieldsArray.push(fieldShape);
270314
}
271-
return new TreeShape(type, false, fieldsArray);
315+
return new TreeShape(nodeSchema, false, fieldsArray);
272316
}
273317
return polymorphic;
274318
});
275319
}
276320

277321
/**
278-
* If `schema` has only one shape, return it.
322+
* Same as {@link tryShapeFromNodeSchema} but for fields with {@link FieldSchemaWithContext} instead of a nodeSchema.
279323
*
280-
* Note that this does not tolerate optional or sequence fields, nor does it optimize for patterns of specific values.
324+
* @param context - {@link ShapeFromSchemaParameters}.
325+
* @param fieldSchemaWithContext - {@link FieldSchemaWithContext}.
326+
* @returns FieldShape if the field has a uniform shape, or undefined if the field is polymorphic.
281327
*/
282328
export function tryShapeFromFieldSchema(
283-
schema: StoredSchemaCollection,
284-
policy: FullSchemaPolicy,
285-
type: TreeFieldStoredSchema,
286-
key: FieldKey,
287-
shapes: Map<TreeNodeSchemaIdentifier, ShapeInfo>,
329+
context: ShapeFromSchemaParameters,
330+
fieldSchemaWithContext: FieldSchemaWithContext,
288331
): FieldShape | undefined {
289-
const kind = policy.fieldKinds.get(type.kind) ?? fail(0xafa /* missing FieldKind */);
332+
const { schema, policy, shouldEncodeIncrementally, shapes } = context;
333+
const { fieldSchema, parentNodeSchema, key } = fieldSchemaWithContext;
334+
// If this field should be encoded incrementally, use polymorphic shape so that they
335+
// are chunked separately and can be re-used across encodings if they do not change.
336+
if (shouldEncodeIncrementally(parentNodeSchema, key)) {
337+
return undefined;
338+
}
339+
const kind = policy.fieldKinds.get(fieldSchema.kind) ?? fail(0xafa /* missing FieldKind */);
290340
if (kind.multiplicity !== Multiplicity.Single) {
291341
return undefined;
292342
}
293-
if (type.types?.size !== 1) {
343+
if (fieldSchema.types?.size !== 1) {
294344
return undefined;
295345
}
296-
const childType = [...type.types][0] ?? oob();
297-
const childShape = tryShapeFromSchema(schema, policy, childType, shapes);
346+
const childType = [...fieldSchema.types][0] ?? oob();
347+
const childShape = tryShapeFromNodeSchema(
348+
{
349+
schema,
350+
policy,
351+
shouldEncodeIncrementally,
352+
shapes,
353+
},
354+
childType,
355+
);
298356
if (childShape instanceof Polymorphic) {
299357
return undefined;
300358
}
@@ -490,7 +548,16 @@ export function chunkRange(
490548
return output;
491549
}
492550
/**
493-
* @param idCompressor - compressor used to encoded string values that are compressible by the idCompressor for in-memory representation.
551+
* Extracts values from the current cursor position according to the provided tree shape.
552+
*
553+
* Walks through the tree structure defined by the shape, extracting values from leaf nodes
554+
* and recursively processing child fields. If an ID compressor is provided, compressible
555+
* string values (stable node identifiers) will be recompressed for optimal storage.
556+
*
557+
* @param cursor - Tree cursor positioned at the node to extract values from
558+
* @param shape - The tree shape defining the structure to extract
559+
* @param values - Array to append the extracted values to
560+
* @param idCompressor - Optional compressor used to encode string values that are compressible by the idCompressor for in-memory representation.
494561
* If the idCompressor is not provided, the values will be the original uncompressed values.
495562
*/
496563
export function insertValues(

packages/dds/tree/src/feature-libraries/chunked-forest/codec/codecs.ts

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,9 @@ import {
1515
} from "../../../codec/index.js";
1616
import {
1717
CursorLocationType,
18-
type FieldKey,
1918
type ITreeCursorSynchronous,
2019
type SchemaAndPolicy,
2120
type TreeChunk,
22-
type TreeNodeSchemaIdentifier,
2321
} from "../../../core/index.js";
2422
import {
2523
brandedNumberType,
@@ -37,6 +35,7 @@ import type { FieldBatch } from "./fieldBatch.js";
3735
import { EncodedFieldBatch, validVersions, type FieldBatchFormatVersion } from "./format.js";
3836
import { schemaCompressedEncode } from "./schemaBasedEncode.js";
3937
import { uncompressedEncode } from "./uncompressedEncode.js";
38+
import type { IncrementalEncodingPolicy } from "./incrementalEncodingPolicy.js";
4039

4140
/**
4241
* Reference ID for a chunk that is incrementally encoded.
@@ -55,14 +54,10 @@ const ChunkReferenceId = brandedNumberType<ChunkReferenceId>({ multipleOf: 1, mi
5554
*/
5655
export interface IncrementalEncoder {
5756
/**
58-
* Returns whether a field should be incrementally encoded.
59-
* @param nodeIdentifier - The identifier of the node containing the field.
60-
* @param fieldKey - The key of the field to check.
57+
* Returns whether a node / field should be incrementally encoded.
58+
* @remarks See {@link IncrementalEncodingPolicy}.
6159
*/
62-
shouldEncodeFieldIncrementally(
63-
nodeIdentifier: TreeNodeSchemaIdentifier,
64-
fieldKey: FieldKey,
65-
): boolean;
60+
shouldEncodeIncrementally: IncrementalEncodingPolicy;
6661
/**
6762
* Called to encode an incremental field at the cursor.
6863
* The chunks for this field are encoded separately from the main buffer.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
/*!
2+
* Copyright (c) Microsoft Corporation and contributors. All rights reserved.
3+
* Licensed under the MIT License.
4+
*/
5+
6+
import type { FieldKey, TreeNodeSchemaIdentifier } from "../../../core/index.js";
7+
8+
/**
9+
* Policy to determine whether a node / field should be incrementally encoded.
10+
* @param nodeIdentifier - The identifier of the node containing the field.
11+
* If undefined, the field is a root field.
12+
* @param fieldKey - The key of the field to check.
13+
* @returns whether the node / field should be incrementally encoded.
14+
* @remarks
15+
* Incremental encoding has a significant size overhead,
16+
* but allows reuse of previously encoded unchanged subtrees.
17+
* Thus it should only be enabled for large subtrees which are modified infrequently.
18+
* TODO: AB#9068: Measure the actual overhead.
19+
*/
20+
export type IncrementalEncodingPolicy = (
21+
nodeIdentifier: TreeNodeSchemaIdentifier | undefined,
22+
fieldKey: FieldKey,
23+
) => boolean;
24+
25+
/**
26+
* Default policy for incremental encoding is to not encode incrementally.
27+
*/
28+
export const defaultIncrementalEncodingPolicy: IncrementalEncodingPolicy = (
29+
nodeIdentifier: TreeNodeSchemaIdentifier | undefined,
30+
fieldKey: FieldKey,
31+
): boolean => {
32+
return false;
33+
};

packages/dds/tree/src/feature-libraries/chunked-forest/codec/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,7 @@ export {
1616
type ChunkReferenceId,
1717
getCodecTreeForFieldBatchFormat,
1818
} from "./codecs.js";
19+
export {
20+
type IncrementalEncodingPolicy,
21+
defaultIncrementalEncodingPolicy,
22+
} from "./incrementalEncodingPolicy.js";

packages/dds/tree/src/feature-libraries/chunked-forest/codec/schemaBasedEncode.ts

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ import {
1616
ValueSchema,
1717
Multiplicity,
1818
identifierFieldKindIdentifier,
19-
type FieldKey,
2019
} from "../../../core/index.js";
2120
import type { FullSchemaPolicy } from "../../modular-schema/index.js";
2221

@@ -35,6 +34,7 @@ import type { FieldBatch } from "./fieldBatch.js";
3534
import { type EncodedFieldBatch, type EncodedValueShape, SpecialField } from "./format.js";
3635
import type { IncrementalEncoder } from "./codecs.js";
3736
import { NodeShapeBasedEncoder } from "./nodeEncoder.js";
37+
import { defaultIncrementalEncodingPolicy } from "./incrementalEncodingPolicy.js";
3838

3939
/**
4040
* Encode data from `fieldBatch` in into an `EncodedChunk`.
@@ -134,14 +134,11 @@ export function getNodeEncoder(
134134
// consider moving some optional and sequence fields to extra fields if they are commonly empty
135135
// to reduce encoded size.
136136

137-
const shouldEncodeFieldIncrementallyLocal = (
138-
nodeIdentifier: TreeNodeSchemaIdentifier,
139-
fieldKey: FieldKey,
140-
): boolean =>
141-
incrementalEncoder?.shouldEncodeFieldIncrementally(nodeIdentifier, fieldKey) ?? false;
137+
const shouldEncodeIncrementally =
138+
incrementalEncoder?.shouldEncodeIncrementally ?? defaultIncrementalEncodingPolicy;
142139
const objectNodeFields: KeyedFieldEncoder[] = [];
143140
for (const [key, field] of schema.objectNodeFields ?? []) {
144-
const fieldEncoder = shouldEncodeFieldIncrementallyLocal(schemaName, key)
141+
const fieldEncoder = shouldEncodeIncrementally(schemaName, key)
145142
? incrementalFieldEncoder
146143
: fieldBuilder.fieldEncoderFromSchema(field);
147144
objectNodeFields.push({

packages/dds/tree/src/feature-libraries/chunked-forest/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,7 @@ export {
2525
fluidVersionToFieldBatchCodecWriteVersion,
2626
type IncrementalEncoderDecoder,
2727
type ChunkReferenceId,
28+
type IncrementalEncodingPolicy,
29+
defaultIncrementalEncodingPolicy,
2830
} from "./codec/index.js";
2931
export { emptyChunk } from "./emptyChunk.js";

0 commit comments

Comments
 (0)