Skip to content

Commit 10e9ed8

Browse files
committed
add serializer, internal marking of chunkers, typing
1 parent 46bb88a commit 10e9ed8

File tree

7 files changed: 94 additions (+94) and 53 deletions (-53).

docling_core/transforms/chunker/__init__.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
"""Define the chunker types."""
77

88
from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
9-
from docling_core.transforms.chunker.base_code_chunker import CodeChunker
109
from docling_core.transforms.chunker.code_chunk_utils.chunk_utils import (
1110
ChunkBuilder,
1211
ChunkMetadataBuilder,
@@ -21,9 +20,9 @@
2120
NoOpCodeChunkingStrategy,
2221
)
2322
from docling_core.transforms.chunker.hierarchical_chunker import (
24-
ChunkType,
2523
CodeChunk,
2624
CodeChunkingStrategy,
25+
CodeChunkType,
2726
CodeDocMeta,
2827
DocChunk,
2928
DocMeta,

docling_core/transforms/chunker/base_code_chunker.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@
1919
from docling_core.types.doc.labels import DocItemLabel
2020

2121

22-
class CodeChunker(BaseChunker):
22+
class _CodeChunker(BaseChunker):
2323
"""Data model for code chunker."""
2424

2525
language: Language

docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,8 @@
44
from tree_sitter import Node
55

66
from docling_core.transforms.chunker.hierarchical_chunker import (
7-
ChunkType,
87
CodeChunk,
8+
CodeChunkType,
99
CodeDocMeta,
1010
)
1111
from docling_core.types.doc.document import DocumentOrigin
@@ -99,7 +99,7 @@ def build_function_metadata(
9999
end_line=end_line,
100100
end_line_signature=signature_end_line,
101101
origin=self.origin,
102-
chunk_type=ChunkType.FUNCTION,
102+
chunk_type=CodeChunkType.FUNCTION,
103103
)
104104

105105
def build_class_metadata(
@@ -119,7 +119,7 @@ def build_class_metadata(
119119
end_line=end_line,
120120
end_line_signature=end_line,
121121
origin=self.origin,
122-
chunk_type=ChunkType.CLASS,
122+
chunk_type=CodeChunkType.CLASS,
123123
)
124124

125125
def build_preamble_metadata(
@@ -131,7 +131,7 @@ def build_preamble_metadata(
131131
start_line=start_line,
132132
end_line=end_line,
133133
origin=self.origin,
134-
chunk_type=ChunkType.PREAMBLE,
134+
chunk_type=CodeChunkType.PREAMBLE,
135135
)
136136

137137
def calculate_line_numbers(

docling_core/transforms/chunker/code_chunking_strategy.py

Lines changed: 19 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,18 @@
11
from typing import Any, Dict, Iterator, Optional
22

3-
from docling_core.transforms.chunker.base_code_chunker import CodeChunker
3+
from docling_core.transforms.chunker.base_code_chunker import _CodeChunker
44
from docling_core.transforms.chunker.code_chunk_utils.utils import Language
55
from docling_core.transforms.chunker.hierarchical_chunker import (
6-
ChunkType,
76
CodeChunk,
7+
CodeChunkType,
88
CodeDocMeta,
99
)
1010
from docling_core.transforms.chunker.language_code_chunkers import (
11-
CFunctionChunker,
12-
JavaFunctionChunker,
13-
JavaScriptFunctionChunker,
14-
PythonFunctionChunker,
15-
TypeScriptFunctionChunker,
11+
_CFunctionChunker,
12+
_JavaFunctionChunker,
13+
_JavaScriptFunctionChunker,
14+
_PythonFunctionChunker,
15+
_TypeScriptFunctionChunker,
1616
)
1717
from docling_core.types.doc.base import Size
1818
from docling_core.types.doc.document import (
@@ -183,15 +183,15 @@ class CodeChunkingStrategyFactory:
183183
"""Factory for creating language-specific code chunking strategies."""
184184

185185
@staticmethod
186-
def create_chunker(language: Language, **kwargs: Any) -> CodeChunker:
186+
def create_chunker(language: Language, **kwargs: Any) -> _CodeChunker:
187187
"""Create a language-specific code chunker."""
188188

189189
chunker_map = {
190-
Language.PYTHON: PythonFunctionChunker,
191-
Language.TYPESCRIPT: TypeScriptFunctionChunker,
192-
Language.JAVASCRIPT: JavaScriptFunctionChunker,
193-
Language.C: CFunctionChunker,
194-
Language.JAVA: JavaFunctionChunker,
190+
Language.PYTHON: _PythonFunctionChunker,
191+
Language.TYPESCRIPT: _TypeScriptFunctionChunker,
192+
Language.JAVASCRIPT: _JavaScriptFunctionChunker,
193+
Language.C: _CFunctionChunker,
194+
Language.JAVA: _JavaFunctionChunker,
195195
}
196196

197197
chunker_class = chunker_map.get(language)
@@ -208,9 +208,9 @@ def __init__(self, **chunker_kwargs: Any):
208208
"""Initialize the strategy with optional chunker parameters."""
209209

210210
self.chunker_kwargs = chunker_kwargs
211-
self._chunker_cache: Dict[Language, CodeChunker] = {}
211+
self._chunker_cache: Dict[Language, _CodeChunker] = {}
212212

213-
def _get_chunker(self, language: Language) -> CodeChunker:
213+
def _get_chunker(self, language: Language) -> _CodeChunker:
214214
"""Get or create a chunker for the given language."""
215215

216216
if language not in self._chunker_cache:
@@ -238,10 +238,12 @@ def chunk_code_item(
238238
filename = original_doc.origin.filename or "code_chunk"
239239
mimetype = original_doc.origin.mimetype or "text/plain"
240240
binary_hash = _create_hash(code_text)
241+
uri = getattr(original_doc.origin, "uri", None)
241242
else:
242243
filename = "code_chunk"
243244
mimetype = "text/plain"
244245
binary_hash = _create_hash(code_text)
246+
uri = None
245247

246248
if original_item and hasattr(original_item, "self_ref"):
247249
self_ref = original_item.self_ref
@@ -255,7 +257,7 @@ def chunk_code_item(
255257
texts=[code_item],
256258
pages={0: PageItem(page_no=0, size=Size(width=612.0, height=792.0))},
257259
origin=DocumentOrigin(
258-
filename=filename, mimetype=mimetype, binary_hash=binary_hash
260+
filename=filename, mimetype=mimetype, binary_hash=binary_hash, uri=uri
259261
),
260262
)
261263

@@ -279,7 +281,7 @@ def chunk_code_item(
279281
return
280282

281283
meta = CodeDocMeta(
282-
chunk_type=ChunkType.CODE_BLOCK,
284+
chunk_type=CodeChunkType.CODE_BLOCK,
283285
start_line=1,
284286
end_line=len(code_text.splitlines()),
285287
)

docling_core/transforms/chunker/hierarchical_chunker.py

Lines changed: 45 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,16 @@
1010
import logging
1111
import re
1212
from enum import Enum
13-
from typing import Any, ClassVar, Final, Iterator, Literal, Optional, Protocol
13+
from typing import (
14+
TYPE_CHECKING,
15+
Any,
16+
ClassVar,
17+
Final,
18+
Iterator,
19+
Literal,
20+
Optional,
21+
Protocol,
22+
)
1423

1524
from pydantic import ConfigDict, Field, StringConstraints, field_validator
1625
from typing_extensions import Annotated, override
@@ -119,7 +128,7 @@ def check_version_is_compatible(cls, v: str) -> str:
119128

120129

121130
class CodeDocMeta(DocMeta):
122-
"""Data model for CodeChunker metadata."""
131+
"""Data model for code chunk metadata."""
123132

124133
doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS)
125134
part_name: Optional[str] = Field(default=None)
@@ -137,7 +146,7 @@ class CodeChunk(BaseChunk):
137146
meta: CodeDocMeta
138147

139148

140-
class ChunkType(str, Enum):
149+
class CodeChunkType(str, Enum):
141150
"""Chunk type"""
142151

143152
FUNCTION = "function"
@@ -157,6 +166,12 @@ def chunk_code_item(
157166
...
158167

159168

169+
if TYPE_CHECKING:
170+
CodeChunkingStrategyType = CodeChunkingStrategy
171+
else:
172+
CodeChunkingStrategyType = Any
173+
174+
160175
class DocChunk(BaseChunk):
161176
"""Data model for document chunks."""
162177

@@ -248,7 +263,7 @@ class HierarchicalChunker(BaseChunker):
248263
model_config = ConfigDict(arbitrary_types_allowed=True)
249264

250265
serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider()
251-
code_chunking_strategy: Optional[Any] = Field(default=None)
266+
code_chunking_strategy: Optional[CodeChunkingStrategyType] = Field(default=None)
252267

253268
# deprecated:
254269
merge_list_items: Annotated[bool, Field(deprecated=True)] = True
@@ -308,14 +323,21 @@ def chunk(
308323
)
309324

310325
if language:
311-
for code_chunk in self.code_chunking_strategy.chunk_code_item(
312-
item.text,
313-
language,
314-
original_doc=dl_doc,
315-
original_item=item,
316-
**kwargs,
317-
):
318-
yield code_chunk
326+
ser_res = my_doc_ser.serialize(item=item, visited=visited)
327+
if ser_res.text:
328+
code_text = self._strip_markdown_code_formatting(
329+
ser_res.text
330+
)
331+
for (
332+
code_chunk
333+
) in self.code_chunking_strategy.chunk_code_item(
334+
code_text,
335+
language,
336+
original_doc=dl_doc,
337+
original_item=item,
338+
**kwargs,
339+
):
340+
yield code_chunk
319341
continue
320342

321343
ser_res = my_doc_ser.serialize(item=item, visited=visited)
@@ -335,3 +357,14 @@ def chunk(
335357
),
336358
)
337359
yield c
360+
361+
def _strip_markdown_code_formatting(self, text: str) -> str:
    """Strip a surrounding markdown code fence from ``text``.

    The text is treated as fenced when, ignoring leading/trailing
    whitespace, its first line starts with three backticks (optionally
    followed by an info string such as a language name) and its last line
    is exactly three backticks.

    Args:
        text: Serialized text that may be wrapped in a fenced code block.

    Returns:
        The lines between the opening and closing fence joined with
        newlines (an empty string for an empty fenced block), or ``text``
        unchanged when it is not a complete multi-line fenced block.
    """
    # Tolerate whitespace/newlines around the fences; without this a
    # trailing newline after the closing fence would leave the fences in
    # the text handed to the code chunking strategy.
    lines = text.strip().split("\n")

    # An opening and a closing fence line are the minimum; a single line
    # (e.g. inline "```x```") is never treated as a fenced block.
    if len(lines) < 2:
        return text

    if not lines[0].startswith("```") or lines[-1] != "```":
        return text

    return "\n".join(lines[1:-1])

docling_core/transforms/chunker/language_code_chunkers.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
from tree_sitter import Node, Tree
55
from typing_extensions import override
66

7-
from docling_core.transforms.chunker.base_code_chunker import CodeChunker
7+
from docling_core.transforms.chunker.base_code_chunker import _CodeChunker
88
from docling_core.transforms.chunker.code_chunk_utils.utils import (
99
Language,
1010
_get_default_tokenizer,
@@ -16,7 +16,7 @@
1616
from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
1717

1818

19-
class PythonFunctionChunker(CodeChunker):
19+
class _PythonFunctionChunker(_CodeChunker):
2020

2121
language: Language = Language.PYTHON
2222
ts_language: Any = Field(default=None)
@@ -151,7 +151,7 @@ def _is_local_assignment(self, identifier_node: Node) -> bool:
151151
return False
152152

153153

154-
class TypeScriptFunctionChunker(CodeChunker):
154+
class _TypeScriptFunctionChunker(_CodeChunker):
155155
language: Language = Language.TYPESCRIPT
156156
ts_language: Any = Field(default=None)
157157
parser: Any = Field(default=None)
@@ -232,12 +232,12 @@ def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
232232
return {}
233233

234234

235-
class JavaScriptFunctionChunker(TypeScriptFunctionChunker):
235+
class _JavaScriptFunctionChunker(_TypeScriptFunctionChunker):
    """JavaScript code chunker built on the TypeScript chunker.

    The language is always pinned to ``Language.JAVASCRIPT``; every other
    keyword argument is forwarded to the parent initializer.
    """

    def __init__(self, **data):
        """Initialize the chunker for JavaScript.

        Args:
            **data: Field values forwarded to the parent initializer.
                Any caller-supplied ``language`` is overridden.
        """
        # Previously ``data`` was accepted but silently dropped, so caller
        # options never reached the chunker. Forward them, while still
        # forcing the language as before.
        # NOTE(review): assumes the parent initializer accepts arbitrary
        # field keywords (pydantic-style model) - confirm.
        data["language"] = Language.JAVASCRIPT
        super().__init__(**data)
238238

239239

240-
class CFunctionChunker(CodeChunker):
240+
class _CFunctionChunker(_CodeChunker):
241241
language: Language = Language.C
242242
ts_language: Any = Field(default=None)
243243
parser: Any = Field(default=None)
@@ -389,7 +389,7 @@ def collect_identifiers(node, depth=0):
389389
return used_macros
390390

391391

392-
class JavaFunctionChunker(CodeChunker):
392+
class _JavaFunctionChunker(_CodeChunker):
393393

394394
language: Language = Language.JAVA
395395
ts_language: Any = Field(default=None)

test/test_code_chunker.py

Lines changed: 18 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -9,13 +9,10 @@
99

1010
from docling_core.transforms.chunker.base_code_chunker import CodeChunk
1111
from docling_core.transforms.chunker.code_chunk_utils.utils import Language
12-
from docling_core.transforms.chunker.language_code_chunkers import (
13-
CFunctionChunker,
14-
JavaFunctionChunker,
15-
JavaScriptFunctionChunker,
16-
PythonFunctionChunker,
17-
TypeScriptFunctionChunker,
12+
from docling_core.transforms.chunker.code_chunking_strategy import (
13+
DefaultCodeChunkingStrategy,
1814
)
15+
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
1916
from docling_core.types.doc import DoclingDocument, DocumentOrigin
2017
from docling_core.types.doc.labels import DocItemLabel
2118
from docling_core.utils.legacy import _create_hash
@@ -96,31 +93,41 @@ def create_documents_from_repository(
9693
"Java",
9794
"/test/data/chunker_repo/repos/acmeair",
9895
"https://github.com/acmeair/acmeair",
99-
lambda: JavaFunctionChunker(max_tokens=5000),
96+
lambda: HierarchicalChunker(
97+
code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
98+
),
10099
),
101100
(
102101
"TypeScript",
103102
"/test/data/chunker_repo/repos/outline",
104103
"https://github.com/outline/outline",
105-
lambda: TypeScriptFunctionChunker(max_tokens=5000),
104+
lambda: HierarchicalChunker(
105+
code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
106+
),
106107
),
107108
(
108109
"JavaScript",
109110
"/test/data/chunker_repo/repos/jquery",
110111
"https://github.com/jquery/jquery",
111-
lambda: JavaScriptFunctionChunker(max_tokens=5000),
112+
lambda: HierarchicalChunker(
113+
code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
114+
),
112115
),
113116
(
114117
"Python",
115118
"/test/data/chunker_repo/repos/docling",
116119
"https://github.com/docling-project/docling",
117-
lambda: PythonFunctionChunker(max_tokens=5000),
120+
lambda: HierarchicalChunker(
121+
code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
122+
),
118123
),
119124
(
120125
"C",
121126
"/test/data/chunker_repo/repos/json-c",
122127
"https://github.com/json-c/json-c",
123-
lambda: CFunctionChunker(max_tokens=5000),
128+
lambda: HierarchicalChunker(
129+
code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
130+
),
124131
),
125132
]
126133

0 commit comments

Comments
 (0)