Skip to content

Commit 7d0d9da

Browse files
Shawnzheng011019 authored and zc277584121 committed
[Refactor]: Refactor the file synchronizer to use Merkle DAG
Signed-off-by: ShawnZheng <shawn.zheng@zilliz.com>
1 parent 51b7b7c commit 7d0d9da

File tree

6 files changed: +151 / -105 lines changed

6 files changed: +151 / -105 lines changed

packages/core/src/context.ts

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -351,7 +351,10 @@ export class CodeContext {
351351
updateIgnorePatterns(ignorePatterns: string[]): void {
352352
// Merge with default patterns, avoiding duplicates
353353
const mergedPatterns = [...DEFAULT_IGNORE_PATTERNS, ...ignorePatterns];
354-
this.ignorePatterns = [...new Set(mergedPatterns)]; // Remove duplicates
354+
const uniquePatterns: string[] = [];
355+
const patternSet = new Set(mergedPatterns);
356+
patternSet.forEach(pattern => uniquePatterns.push(pattern));
357+
this.ignorePatterns = uniquePatterns;
355358
console.log(`🚫 Updated ignore patterns: ${ignorePatterns.length} from .gitignore + ${DEFAULT_IGNORE_PATTERNS.length} default = ${this.ignorePatterns.length} total patterns`);
356359
}
357360

packages/core/src/splitter/ast-splitter.ts

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ export class AstCodeSplitter implements Splitter {
3131
if (chunkSize) this.chunkSize = chunkSize;
3232
if (chunkOverlap) this.chunkOverlap = chunkOverlap;
3333
this.parser = new Parser();
34-
34+
3535
// Initialize fallback splitter
3636
const { LangChainCodeSplitter } = require('./langchain-splitter');
3737
this.langchainFallback = new LangChainCodeSplitter(chunkSize, chunkOverlap);
@@ -47,7 +47,7 @@ export class AstCodeSplitter implements Splitter {
4747

4848
try {
4949
console.log(`🌳 Using AST splitter for ${language} file: ${filePath || 'unknown'}`);
50-
50+
5151
this.parser.setLanguage(langConfig.parser);
5252
const tree = this.parser.parse(code);
5353

@@ -58,7 +58,7 @@ export class AstCodeSplitter implements Splitter {
5858

5959
// Extract chunks based on AST nodes
6060
const chunks = this.extractChunks(tree.rootNode, code, langConfig.nodeTypes, language, filePath);
61-
61+
6262
// If chunks are too large, split them further
6363
const refinedChunks = await this.refineChunks(chunks, code);
6464

@@ -100,8 +100,8 @@ export class AstCodeSplitter implements Splitter {
100100
}
101101

102102
private extractChunks(
103-
node: Parser.SyntaxNode,
104-
code: string,
103+
node: Parser.SyntaxNode,
104+
code: string,
105105
splittableTypes: string[],
106106
language: string,
107107
filePath?: string
@@ -255,7 +255,7 @@ export class AstCodeSplitter implements Splitter {
255255
*/
256256
static isLanguageSupported(language: string): boolean {
257257
const supportedLanguages = [
258-
'javascript', 'js', 'typescript', 'ts', 'python', 'py',
258+
'javascript', 'js', 'typescript', 'ts', 'python', 'py',
259259
'java', 'cpp', 'c++', 'c', 'go', 'rust', 'rs'
260260
];
261261
return supportedLanguages.includes(language.toLowerCase());

packages/core/src/sync/merkle.ts

Lines changed: 70 additions & 64 deletions
Original file line number | Diff line number | Diff line change
@@ -1,93 +1,99 @@
11
import * as crypto from 'crypto';
22

3-
export class MerkleNode {
3+
export interface MerkleDAGNode {
4+
id: string;
45
hash: string;
5-
left: MerkleNode | null;
6-
right: MerkleNode | null;
7-
8-
constructor(hash: string, left: MerkleNode | null = null, right: MerkleNode | null = null) {
9-
this.hash = hash;
10-
this.left = left;
11-
this.right = right;
12-
}
13-
14-
static serializeNode(node: MerkleNode | null): any {
15-
if (!node) return null;
16-
return {
17-
hash: node.hash,
18-
left: MerkleNode.serializeNode(node.left),
19-
right: MerkleNode.serializeNode(node.right)
20-
};
21-
}
22-
23-
static deserializeNode(data: any): MerkleNode | null {
24-
if (!data) return null;
25-
return new MerkleNode(
26-
data.hash,
27-
MerkleNode.deserializeNode(data.left),
28-
MerkleNode.deserializeNode(data.right)
29-
);
30-
}
6+
data: string;
7+
parents: string[];
8+
children: string[];
319
}
3210

33-
export class MerkleTree {
34-
root: MerkleNode;
35-
leaves: MerkleNode[];
11+
export class MerkleDAG {
12+
nodes: Map<string, MerkleDAGNode>;
13+
rootIds: string[];
3614

37-
constructor(data: string[]) {
38-
const leaves = data.map(d => new MerkleNode(this.hash(d)));
39-
this.leaves = leaves;
40-
this.root = this.buildTree(leaves);
15+
constructor() {
16+
this.nodes = new Map();
17+
this.rootIds = [];
4118
}
4219

4320
private hash(data: string): string {
4421
return crypto.createHash('sha256').update(data).digest('hex');
4522
}
4623

47-
private buildTree(nodes: MerkleNode[]): MerkleNode {
48-
if (nodes.length === 0) {
49-
return new MerkleNode(this.hash(''));
50-
}
51-
if (nodes.length === 1) {
52-
return nodes[0];
53-
}
24+
public addNode(data: string, parentId?: string): string {
25+
const nodeId = this.hash(data);
26+
const node: MerkleDAGNode = {
27+
id: nodeId,
28+
hash: nodeId,
29+
data,
30+
parents: [],
31+
children: []
32+
};
5433

55-
const parents: MerkleNode[] = [];
56-
for (let i = 0; i < nodes.length; i += 2) {
57-
const left = nodes[i];
58-
const right = (i + 1 < nodes.length) ? nodes[i + 1] : left;
59-
const parentHash = this.hash(left.hash + right.hash);
60-
parents.push(new MerkleNode(parentHash, left, right));
34+
// If there's a parent, create the relationship
35+
if (parentId) {
36+
const parentNode = this.nodes.get(parentId);
37+
if (parentNode) {
38+
node.parents.push(parentId);
39+
parentNode.children.push(nodeId);
40+
this.nodes.set(parentId, parentNode);
41+
}
42+
} else {
43+
// If no parent, it's a root node
44+
this.rootIds.push(nodeId);
6145
}
6246

63-
return this.buildTree(parents);
47+
this.nodes.set(nodeId, node);
48+
return nodeId;
6449
}
6550

66-
public getRootHash(): string {
67-
return this.root.hash;
51+
public getNode(nodeId: string): MerkleDAGNode | undefined {
52+
return this.nodes.get(nodeId);
6853
}
6954

70-
public static compare(tree1: MerkleTree, tree2: MerkleTree): { added: string[], removed: string[], modified: string[] } {
71-
const C1 = new Map(tree1.leaves.map(l => [l.hash, l]));
72-
const C2 = new Map(tree2.leaves.map(l => [l.hash, l]));
55+
public getAllNodes(): MerkleDAGNode[] {
56+
return Array.from(this.nodes.values());
57+
}
7358

74-
const added = Array.from(C2.keys()).filter(k => !C1.has(k));
75-
const removed = Array.from(C1.keys()).filter(k => !C2.has(k));
76-
77-
return { added, removed, modified: [] };
59+
public getRootNodes(): MerkleDAGNode[] {
60+
return this.rootIds.map(id => this.nodes.get(id)!).filter(Boolean);
61+
}
62+
63+
public getLeafNodes(): MerkleDAGNode[] {
64+
return Array.from(this.nodes.values()).filter(node => node.children.length === 0);
7865
}
7966

8067
public serialize(): any {
8168
return {
82-
root: MerkleNode.serializeNode(this.root),
83-
leaves: this.leaves.map(l => MerkleNode.serializeNode(l))
69+
nodes: Array.from(this.nodes.entries()),
70+
rootIds: this.rootIds
8471
};
8572
}
8673

87-
static deserialize(data: any): MerkleTree {
88-
const tree = Object.create(MerkleTree.prototype);
89-
tree.root = MerkleNode.deserializeNode(data.root);
90-
tree.leaves = (data.leaves || []).map((l: any) => MerkleNode.deserializeNode(l));
91-
return tree;
74+
public static deserialize(data: any): MerkleDAG {
75+
const dag = new MerkleDAG();
76+
dag.nodes = new Map(data.nodes);
77+
dag.rootIds = data.rootIds;
78+
return dag;
79+
}
80+
81+
public static compare(dag1: MerkleDAG, dag2: MerkleDAG): { added: string[], removed: string[], modified: string[] } {
82+
const nodes1 = new Map(Array.from(dag1.getAllNodes()).map(n => [n.id, n]));
83+
const nodes2 = new Map(Array.from(dag2.getAllNodes()).map(n => [n.id, n]));
84+
85+
const added = Array.from(nodes2.keys()).filter(k => !nodes1.has(k));
86+
const removed = Array.from(nodes1.keys()).filter(k => !nodes2.has(k));
87+
88+
// For modified, we'll check if the data has changed for nodes that exist in both
89+
const modified: string[] = [];
90+
for (const [id, node1] of Array.from(nodes1.entries())) {
91+
const node2 = nodes2.get(id);
92+
if (node2 && node1.data !== node2.data) {
93+
modified.push(id);
94+
}
95+
}
96+
97+
return { added, removed, modified };
9298
}
9399
}

0 commit comments

Comments
 (0)