11from typing import Any , Dict , Iterator , Optional
22
3- from docling_core .transforms .chunker .base_code_chunker import CodeChunker
3+ from docling_core .transforms .chunker .base_code_chunker import _CodeChunker
44from docling_core .transforms .chunker .code_chunk_utils .utils import Language
55from docling_core .transforms .chunker .hierarchical_chunker import (
6- ChunkType ,
76 CodeChunk ,
7+ CodeChunkType ,
88 CodeDocMeta ,
99)
1010from docling_core .transforms .chunker .language_code_chunkers import (
11- CFunctionChunker ,
12- JavaFunctionChunker ,
13- JavaScriptFunctionChunker ,
14- PythonFunctionChunker ,
15- TypeScriptFunctionChunker ,
11+ _CFunctionChunker ,
12+ _JavaFunctionChunker ,
13+ _JavaScriptFunctionChunker ,
14+ _PythonFunctionChunker ,
15+ _TypeScriptFunctionChunker ,
1616)
1717from docling_core .types .doc .base import Size
1818from docling_core .types .doc .document import (
@@ -183,15 +183,15 @@ class CodeChunkingStrategyFactory:
183183 """Factory for creating language-specific code chunking strategies."""
184184
185185 @staticmethod
186- def create_chunker (language : Language , ** kwargs : Any ) -> CodeChunker :
186+ def create_chunker (language : Language , ** kwargs : Any ) -> _CodeChunker :
187187 """Create a language-specific code chunker."""
188188
189189 chunker_map = {
190- Language .PYTHON : PythonFunctionChunker ,
191- Language .TYPESCRIPT : TypeScriptFunctionChunker ,
192- Language .JAVASCRIPT : JavaScriptFunctionChunker ,
193- Language .C : CFunctionChunker ,
194- Language .JAVA : JavaFunctionChunker ,
190+ Language .PYTHON : _PythonFunctionChunker ,
191+ Language .TYPESCRIPT : _TypeScriptFunctionChunker ,
192+ Language .JAVASCRIPT : _JavaScriptFunctionChunker ,
193+ Language .C : _CFunctionChunker ,
194+ Language .JAVA : _JavaFunctionChunker ,
195195 }
196196
197197 chunker_class = chunker_map .get (language )
@@ -208,9 +208,9 @@ def __init__(self, **chunker_kwargs: Any):
208208 """Initialize the strategy with optional chunker parameters."""
209209
210210 self .chunker_kwargs = chunker_kwargs
211- self ._chunker_cache : Dict [Language , CodeChunker ] = {}
211+ self ._chunker_cache : Dict [Language , _CodeChunker ] = {}
212212
213- def _get_chunker (self , language : Language ) -> CodeChunker :
213+ def _get_chunker (self , language : Language ) -> _CodeChunker :
214214 """Get or create a chunker for the given language."""
215215
216216 if language not in self ._chunker_cache :
@@ -238,10 +238,12 @@ def chunk_code_item(
238238 filename = original_doc .origin .filename or "code_chunk"
239239 mimetype = original_doc .origin .mimetype or "text/plain"
240240 binary_hash = _create_hash (code_text )
241+ uri = getattr (original_doc .origin , "uri" , None )
241242 else :
242243 filename = "code_chunk"
243244 mimetype = "text/plain"
244245 binary_hash = _create_hash (code_text )
246+ uri = None
245247
246248 if original_item and hasattr (original_item , "self_ref" ):
247249 self_ref = original_item .self_ref
@@ -255,7 +257,7 @@ def chunk_code_item(
255257 texts = [code_item ],
256258 pages = {0 : PageItem (page_no = 0 , size = Size (width = 612.0 , height = 792.0 ))},
257259 origin = DocumentOrigin (
258- filename = filename , mimetype = mimetype , binary_hash = binary_hash
260+ filename = filename , mimetype = mimetype , binary_hash = binary_hash , uri = uri
259261 ),
260262 )
261263
@@ -279,7 +281,7 @@ def chunk_code_item(
279281 return
280282
281283 meta = CodeDocMeta (
282- chunk_type = ChunkType .CODE_BLOCK ,
284+ chunk_type = CodeChunkType .CODE_BLOCK ,
283285 start_line = 1 ,
284286 end_line = len (code_text .splitlines ()),
285287 )
0 commit comments