From a4a21e90d63da3aa46fd6f0681b065d6bf85305b Mon Sep 17 00:00:00 2001
From: Bridget McGinn
Date: Thu, 2 Oct 2025 15:02:41 -0400
Subject: [PATCH 01/12] initial code chunking for docling-core

---
 docling_core/transforms/chunker/__init__.py   |   13 +
 .../transforms/chunker/base_code_chunker.py   |  642 +++++++++
 .../chunker/code_chunk_utils/chunk_utils.py   |  383 ++++++
 .../chunker/code_chunk_utils/types.py         |   35 +
 .../chunker/code_chunk_utils/utils.py         |  150 ++
 .../chunker/language_code_chunkers.py         |  621 +++++++++
 pyproject.toml                                |    9 +-
 test/data/chunker_repo/C/repo_out_chunks.json |  661 +++++++++
 .../chunker_repo/Java/repo_out_chunks.json    |   94 ++
 .../JavaScript/repo_out_chunks.json           |  103 ++
 .../chunker_repo/Python/repo_out_chunks.json  | 1225 +++++++++++++++++
 .../TypeScript/repo_out_chunks.json           |  175 +++
 .../repos/acmeair/AcmeAirConstants.java       |    6 +
 .../repos/acmeair/CustomerLoader.java         |   37 +
 .../repos/acmeair/FlightLoader.java           |  133 ++
 .../chunker_repo/repos/docling/base_models.py |  435 ++++++
 .../data/chunker_repo/repos/docling/export.py |  146 ++
 .../repos/docling/jats_backend.py             |  718 ++++++++++
 test/data/chunker_repo/repos/jquery/access.js |   63 +
 test/data/chunker_repo/repos/jquery/data.js   |  175 +++
 .../chunker_repo/repos/jquery/serialize.js    |  129 ++
 .../chunker_repo/repos/json-c/json_pointer.c  |  415 ++++++
 .../data/chunker_repo/repos/json-c/linkhash.c |  718 ++++++++++
 .../repos/json-c/strerror_override.c          |  110 ++
 .../chunker_repo/repos/outline/Comment.ts     |  278 ++++
 .../chunker_repo/repos/outline/GroupUser.ts   |   27 +
 test/data/chunker_repo/repos/outline/index.ts |  365 +++++
 test/test_code_chunker.py                     |  103 ++
 test/test_utils_repo_ds.py                    |  140 ++
 uv.lock                                       |  145 +-
 30 files changed, 8248 insertions(+), 6 deletions(-)
 create mode 100644 docling_core/transforms/chunker/base_code_chunker.py
 create mode 100644 docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py
 create mode 100644 docling_core/transforms/chunker/code_chunk_utils/types.py
 create mode 100644 docling_core/transforms/chunker/code_chunk_utils/utils.py
 create mode 100644 docling_core/transforms/chunker/language_code_chunkers.py
 create mode 100644 test/data/chunker_repo/C/repo_out_chunks.json
 create mode 100644 test/data/chunker_repo/Java/repo_out_chunks.json
 create mode 100644 test/data/chunker_repo/JavaScript/repo_out_chunks.json
 create mode 100644 test/data/chunker_repo/Python/repo_out_chunks.json
 create mode 100644 test/data/chunker_repo/TypeScript/repo_out_chunks.json
 create mode 100644 test/data/chunker_repo/repos/acmeair/AcmeAirConstants.java
 create mode 100644 test/data/chunker_repo/repos/acmeair/CustomerLoader.java
 create mode 100644 test/data/chunker_repo/repos/acmeair/FlightLoader.java
 create mode 100644 test/data/chunker_repo/repos/docling/base_models.py
 create mode 100644 test/data/chunker_repo/repos/docling/export.py
 create mode 100755 test/data/chunker_repo/repos/docling/jats_backend.py
 create mode 100644 test/data/chunker_repo/repos/jquery/access.js
 create mode 100644 test/data/chunker_repo/repos/jquery/data.js
 create mode 100644 test/data/chunker_repo/repos/jquery/serialize.js
 create mode 100644 test/data/chunker_repo/repos/json-c/json_pointer.c
 create mode 100644 test/data/chunker_repo/repos/json-c/linkhash.c
 create mode 100644 test/data/chunker_repo/repos/json-c/strerror_override.c
 create mode 100644 test/data/chunker_repo/repos/outline/Comment.ts
 create mode 100644 test/data/chunker_repo/repos/outline/GroupUser.ts
 create mode 100644 test/data/chunker_repo/repos/outline/index.ts
 create mode 100644 test/test_code_chunker.py
 create mode 100644 test/test_utils_repo_ds.py

diff --git a/docling_core/transforms/chunker/__init__.py b/docling_core/transforms/chunker/__init__.py
index 46de031a..8522e75c 100644
--- a/docling_core/transforms/chunker/__init__.py
+++ b/docling_core/transforms/chunker/__init__.py
@@ -6,6 +6,19 @@
 """Define the chunker types."""
 
 from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta
+from docling_core.transforms.chunker.base_code_chunker import CodeChunker
+from docling_core.transforms.chunker.code_chunk_utils.chunk_utils import (
+    ChunkBuilder,
+    ChunkMetadataBuilder,
+    ChunkSizeProcessor,
+    RangeTracker,
+)
+from docling_core.transforms.chunker.code_chunk_utils.types import (
+    ChunkType,
+    CodeChunk,
+    CodeDocMeta,
+)
+from docling_core.transforms.chunker.code_chunk_utils.utils import Language
 from docling_core.transforms.chunker.hierarchical_chunker import (
     DocChunk,
     DocMeta,
diff --git a/docling_core/transforms/chunker/base_code_chunker.py b/docling_core/transforms/chunker/base_code_chunker.py
new file mode 100644
index 00000000..75643d64
--- /dev/null
+++ b/docling_core/transforms/chunker/base_code_chunker.py
@@ -0,0 +1,642 @@
+from typing import Any, Dict, Iterator, List, Optional, Tuple
+
+from tree_sitter import Node, Parser, Tree
+
+from docling_core.transforms.chunker import BaseChunker
+from docling_core.transforms.chunker.code_chunk_utils.chunk_utils import (
+    ChunkBuilder,
+    ChunkSizeProcessor,
+    RangeTracker,
+)
+from docling_core.transforms.chunker.code_chunk_utils.types import CodeChunk
+from docling_core.transforms.chunker.code_chunk_utils.utils import (
+    Language,
+    get_children,
+    to_str,
+)
+from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+from docling_core.types import DoclingDocument as DLDocument
+from docling_core.types.doc.labels import DocItemLabel
+
+
+class CodeChunker(BaseChunker):
+    """Data model for code chunker."""
+
+    language: Language
+    ts_language: Any
+    parser: Any
+    function_body: str
+    constructor_name: str
+    decorator_type: str
+    class_definition_types: List[str]
+    docs_types: List[str]
+    expression_types: List[str]
+    chunk_prefix: str
+    chunk_suffix: str
+    function_definition_types: List[str]
+    tokenizer: BaseTokenizer
+    min_chunk_size: int
+    max_tokens: int
+    class_body_field: str = "body"
+    utf8_encoding: str = "utf-8"
+    name_field: str = "name"
+    expression_statement: str = "expression_statement"
+    string_field: str = "string"
+    identifiers: List[str] = ["identifier", "type_identifier"]
+    definition_field: str = "definition"
+    copyright_words: List[str] = [
+        "copyright",
+        "license",
+        "licensed under",
+        "all rights reserved",
+    ]
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        if self.ts_language is None:
+            self.ts_language = self.language.get_tree_sitter_language()
+        if self.parser is None:
+            self.parser = Parser(self.ts_language)
+
+    @property
+    def max_tokens(self) -> int:
+        """Get maximum number of tokens allowed."""
+        return self.tokenizer.get_max_tokens()
+
+    def parse_code(self, code: str) -> Tree:
+        """Parse code with the tree-sitter parser."""
+        return self.parser.parse(bytes(code, self.utf8_encoding))
+
+    def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]:
+        """Chunk the provided code by methods."""
+        if not dl_doc.texts:
+            return
+
+        code_blocks = [t.text for t in dl_doc.texts if t.label == DocItemLabel.CODE]
+        if not code_blocks:
+            return
+
+        for code in code_blocks:
+            tree = self.parse_code(code)
+            import_nodes = self._get_imports(tree)
+            module_variables = self._get_module_variables(tree)
+            range_tracker = RangeTracker()
+            chunk_builder = ChunkBuilder(dl_doc.origin) if dl_doc.origin else None
+            size_processor = ChunkSizeProcessor(
+                self.tokenizer, self.max_tokens, self.min_chunk_size, chunker=self
+            )
+
+            self._mark_copyright_comments(tree.root_node, range_tracker)
+
+            all_chunks = []
+
+            functions = self._get_all_functions(tree.root_node, "")
+            for node in functions:
+                for chunk, chunk_used_ranges in self._yield_function_chunks_with_ranges(
+                    node, tree.root_node, import_nodes, chunk_builder, module_variables
+                ):
+                    range_tracker.extend(chunk_used_ranges)
+                    all_chunks.append((chunk, chunk_used_ranges))
+
+            if module_variables:
+                self._track_constructor_variables(
+                    tree.root_node, module_variables, range_tracker
+                )
+
+            empty_classes = self._get_classes_no_methods(tree.root_node, "")
+            for node in empty_classes:
+                if chunk_builder:
+                    for chunk, chunk_used_ranges in self._yield_class_chunk_with_ranges(
+                        node, import_nodes, chunk_builder
+                    ):
+                        range_tracker.extend(chunk_used_ranges)
+                        all_chunks.append((chunk, chunk_used_ranges))
+
+            if chunk_builder:
+                for chunk in chunk_builder.process_orphan_chunks(
+                    range_tracker.get_used_ranges(), dl_doc
+                ):
+                    all_chunks.append((chunk, []))
+
+            for chunk, _ in size_processor.process_chunks(all_chunks):
+                yield chunk
+
+    def _mark_copyright_comments(
+        self, root_node: Node, range_tracker: RangeTracker
+    ) -> None:
+        """Mark copyright comments as used."""
+        comment_nodes = get_children(root_node, self.docs_types)
+        for node in comment_nodes:
+            comment_text = to_str(node).lower()
+            if any(keyword in comment_text for keyword in self.copyright_words):
+                range_tracker.mark_node_used(node)
+
+    def _yield_function_chunks_with_ranges(
+        self,
+        node: Node,
+        root_node: Node,
+        import_nodes: Dict[str, Node],
+        chunk_builder: Optional[ChunkBuilder],
+        module_variables: Optional[Dict[str, Node]] = None,
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+
+        docstring = self._get_docstring(node)
+        additional_context, additional_context_no_docstring = (
+            self._build_additional_context(node, root_node)
+        )
+        imports = self._build_imports(
+            import_nodes, node, additional_context_no_docstring
+        )
+        function_line_start, _ = node.start_point
+        function_line_end, _ = node.end_point
+        signature_line_end, _ = self._get_function_signature_end(node)
+        function_name = self.language.get_function_name(node) or "unknown_function"
+        prefix, prefix_range = self._file_prefix(root_node)
+
+        used_ranges = []
+        used_ranges.append((node.start_byte, node.end_byte))
+
+        if imports:
+            used_imports = self._find_used_imports_in_function(
+                import_nodes, node, additional_context_no_docstring, module_variables
+            )
+            for import_name in sorted(used_imports):
+                if import_name in import_nodes:
+                    import_node = import_nodes[import_name]
+                    import_ranges = self._get_import_ranges_with_comments(import_node)
+                    used_ranges.extend(import_ranges)
+
+        if prefix:
+            used_ranges.extend(prefix_range)
+
+        if additional_context:
+            current_node = node
+            while current_node.parent:
+                if current_node.parent.type in self.class_definition_types:
+                    used_ranges.append(
+                        (current_node.parent.start_byte, current_node.parent.end_byte)
+                    )
+                    used_ranges.extend(
+                        self._get_class_member_ranges(current_node.parent)
+                    )
+                    break
+                current_node = current_node.parent
+
+        module_variable_definitions = ""
+        if module_variables:
+            used_variables = self._find_used_variables(node)
+            for var_name in sorted(used_variables):
+                if var_name in module_variables:
+                    var_def_node = module_variables[var_name]
+                    var_ranges = self._get_variable_ranges_with_comments(var_def_node)
+                    used_ranges.extend(var_ranges)
+                    var_node = self._get_variable_with_comments(var_def_node, root_node)
+                    var_text = to_str(var_node)
+                    module_variable_definitions += var_text + "\n"
+
+        function_content = self._build_function(node)
+        function_no_docstring = (
+            function_content.replace(docstring, "") if docstring else function_content
+        )
+
+        base_content = f"{prefix}{imports}{module_variable_definitions}{additional_context_no_docstring}{function_no_docstring}"
+
+        if chunk_builder:
+            yield chunk_builder.build_function_chunk(
+                base_content,
+                function_name,
+                docstring,
+                function_line_start,
+                function_line_end,
+                signature_line_end,
+            ), used_ranges
+
+    def _yield_class_chunk_with_ranges(
+        self, node: Node, import_nodes: Dict[str, Node], chunk_builder: ChunkBuilder
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+        docstring = self._get_docstring(node)
+        function_content = self._build_class_with_comments(node)
+        imports = self._build_imports(import_nodes, node, function_content)
+        function_line_start, _ = node.start_point
+        function_line_end, _ = node.end_point
+        class_name = self.language.get_function_name(node) or "unknown_class"
+
+        root_node = node
+        while root_node.parent:
+            root_node = root_node.parent
+        prefix, prefix_range = self._file_prefix(root_node)
+
+        used_ranges = []
+        class_ranges = self._get_class_ranges_with_comments(node)
+        used_ranges.extend(class_ranges)
+
+        if imports:
+            used_imports = self._find_used_imports_in_function(
+                import_nodes, node, function_content, None
+            )
+            for import_name in sorted(used_imports):
+                if import_name in import_nodes:
+                    import_node = import_nodes[import_name]
+                    import_ranges = self._get_import_ranges_with_comments(import_node)
+                    used_ranges.extend(import_ranges)
+
+        if prefix:
+            used_ranges.extend(prefix_range)
+
+        function_no_docstring = (
+            function_content.replace(docstring, "") if docstring else function_content
+        )
+        content_no_docstring = f"{prefix}{imports}{function_no_docstring}"
+
+        if chunk_builder:
+            yield chunk_builder.build_class_chunk(
+                content_no_docstring,
+                class_name,
+                docstring,
+                function_line_start,
+                function_line_end,
+            ), used_ranges
+
+    def _file_prefix(self, root_node: Node) -> Tuple[str, List]:
+        return "", []
+
+    def _get_function_body(self, node: Node) -> Optional[Node]:
+        return next(
+            (child for child in node.children if child.type == self.function_body), None
+        )
+
+    def _get_docstring(self, node: Node) -> str:
+        if node.prev_named_sibling and node.prev_named_sibling.type in self.docs_types:
+            text = node.prev_named_sibling.text
+            return text.decode(self.utf8_encoding) if text else ""
+        return ""
+
+    def _get_all_functions(self, node: Node, parent_type: str) -> List[Node]:
+        """Get all functions in the file."""
+        if not node or parent_type in self.function_definition_types:
+            return []
+
+        nodes = []
+
+        if node.type in self.function_definition_types:
+            if self.language.is_collectable_function(node, self.constructor_name):
+                nodes.append(node)
+            elif self._is_constructor(node):
+                if self._is_only_function_in_class(node):
+                    nodes.append(node)
+
+        for child in node.children:
+            nodes.extend(self._get_all_functions(child, node.type))
+
+        return nodes
+
+    def _get_classes_no_methods(self, node: Node, parent_type: str) -> List[Node]:
+        """Get classes and interfaces without methods."""
+
+        def has_methods(class_node: Node) -> bool:
+            return any(
+                child.type in self.function_definition_types
+                or any(
+                    grandchild.type in self.function_definition_types
+                    for grandchild in child.children
+                )
+                for child in class_node.children
+            )
+
+        if not node or parent_type in self.class_definition_types:
+            return []
+
+        nodes = []
+        if node.type in self.class_definition_types and not has_methods(node):
+            nodes.append(node)
+
+        for child in node.children:
+            nodes.extend(self._get_classes_no_methods(child, node.type))
+
+        return nodes
+
+    def _get_class_member_ranges(self, class_node: Node) -> List[Tuple[int, int]]:
+        return []
+
+    def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
+        """Get module-level variables/macros. Must be implemented by language-specific chunkers."""
+        raise NotImplementedError
+
+    def _find_used_variables(self, function_node: Node) -> set:
+        """Find variable/macro names used within a function. Default implementation returns empty set."""
+        return set()
+
+    def _get_variable_with_comments(self, var_node: Node, root_node: Node) -> Node:
+        """Get variable node including any preceding comments. Default implementation returns the node as-is."""
+        return var_node
+
+    def _get_function_signature_end(self, node: Node) -> Tuple[int, int]:
+        body_node = self._get_function_body(node)
+        return body_node.start_point if body_node else node.end_point
+
+    def _build_function(self, function_node: Node) -> str:
+        if function_node.parent and function_node.parent.type == self.decorator_type:
+            function_node = function_node.parent
+        return to_str(function_node)
+
+    def _build_class_with_comments(self, class_node: Node) -> str:
+        """Build class content including any preceding comments and docstrings."""
+        current = class_node.prev_sibling
+        comment_parts: List[str] = []
+
+        while current and current.type in self.docs_types:
+            current_end_line = current.end_point[0]
+            class_start_line = class_node.start_point[0]
+
+            if current_end_line <= class_start_line:
+                comment_parts.insert(0, to_str(current))
+                current = current.prev_sibling
+            else:
+                break
+
+        if comment_parts:
+            result = "".join(comment_parts) + "\n" + to_str(class_node)
+            return result
+        else:
+            return to_str(class_node)
+
+    def _build_imports(
+        self,
+        imports: Dict[str, Node],
+        function_node: Node,
+        additional_context: str = "",
+    ) -> str:
+        used, set_imports = set(), set()
+
+        def find_used_imports(node):
+            if (
+                node.type in self.identifiers
+                and node.text.decode(self.utf8_encoding) in imports
+            ):
+                used.add(node.text.decode(self.utf8_encoding))
+            for child in node.children:
+                find_used_imports(child)
+
+        find_used_imports(function_node)
+
+        if additional_context:
+            for import_name in imports.keys():
+                if import_name in additional_context:
+                    used.add(import_name)
+
+        for import_name, import_node in imports.items():
+            if "*" in import_name:
+                import_text = self._get_import_with_comments(import_node)
+                set_imports.add(import_text)
+
+        for u in used:
+            import_text = self._get_import_with_comments(imports[u])
+            set_imports.add(import_text)
+
+        return "\n".join(sorted(set_imports)) + "\n"
+
+    def _find_used_imports_in_function(
+        self,
+        imports: Dict[str, Node],
+        function_node: Node,
+        additional_context: str = "",
+        module_variables: Optional[Dict[str, Node]] = None,
+    ) -> set:
+        """Find which imports are used in a function and its additional context."""
+        used = set()
+
+        def find_used_imports(node):
+            if (
+                node.type in self.identifiers
+                and node.text.decode(self.utf8_encoding) in imports
+            ):
+                used.add(node.text.decode(self.utf8_encoding))
+            for child in node.children:
+                find_used_imports(child)
+
+        find_used_imports(function_node)
+
+        if additional_context:
+            for import_name in imports.keys():
+                if import_name in additional_context:
+                    used.add(import_name)
+
+        if module_variables:
+            used_variables = self._find_used_variables(function_node)
+
+            for var_name in used_variables:
+                if var_name in module_variables:
+                    var_def_node = module_variables[var_name]
+                    find_used_imports(var_def_node)
+
+        for import_name in imports.keys():
+            if "*" in import_name:
+                used.add(import_name)
+
+        return used
+
+    def _get_node_with_comments(self, node: Node) -> str:
+        """Get node text including any preceding comments."""
+
+        current = node.prev_sibling
+        comment_parts: List[str] = []
+
+        while current and current.type in self.docs_types:
+            current_end_line = current.end_point[0]
+            node_start_line = node.start_point[0]
+
+            if current_end_line <= node_start_line:
+                comment_parts.insert(0, to_str(current))
+                current = current.prev_sibling
+            else:
+                break
+
+        if comment_parts:
+            result = "".join(comment_parts) + "\n" + to_str(node)
+            return result
+        else:
+            return to_str(node)
+
+    def _get_import_with_comments(self, import_node: Node) -> str:
+        """Get import text including any preceding comments."""
+        return self._get_node_with_comments(import_node)
+
+    def _get_node_ranges_with_comments(self, node: Node) -> List[Tuple[int, int]]:
+        """Get node ranges including any preceding comments."""
+        ranges = []
+
+        current = node.prev_sibling
+
+        while current and current.type in self.docs_types:
+            current_end_line = current.end_point[0]
+            node_start_line = node.start_point[0]
+
+            if current_end_line <= node_start_line:
+                ranges.append((current.start_byte, current.end_byte))
+                current = current.prev_sibling
+            else:
+                break
+
+        ranges.append((node.start_byte, node.end_byte))
+
+        return ranges
+
+    def _get_variable_ranges_with_comments(
+        self, var_node: Node
+    ) -> List[Tuple[int, int]]:
+        """Get variable ranges including any preceding comments."""
+        return self._get_node_ranges_with_comments(var_node)
+
+    def _get_import_ranges_with_comments(
+        self, import_node: Node
+    ) -> List[Tuple[int, int]]:
+        """Get import ranges including any preceding comments."""
+        return self._get_node_ranges_with_comments(import_node)
+
+    def _get_class_ranges_with_comments(
+        self, class_node: Node
+    ) -> List[Tuple[int, int]]:
+        """Get class ranges including any preceding comments and docstrings."""
+        return self._get_node_ranges_with_comments(class_node)
+
+    def _build_additional_context(
+        self, function_node: Node, root_node: Node
+    ) -> Tuple[str, str]:
+        context = ""
+        context_no_docstring = ""
+        node = function_node
+
+        while node.parent:
+            if node.type in self.class_definition_types:
+                with_doc, without_doc = self._build_class_context(node, root_node)
+                context = f"{with_doc}\n{context}"
+                context_no_docstring = f"{without_doc}\n{context_no_docstring}"
+            node = node.parent
+
+        return context, context_no_docstring
+
+    def _is_docstring(self, node: Node) -> bool:
+        """Determines if a node is a docstring"""
+        return bool(
+            node.type == self.expression_statement
+            and node.named_children
+            and node.named_children[0].type == self.string_field
+        )
+
+    def _get_imports(self, tree: Tree) -> Dict[str, Node]:
+        """Get imports from the AST. Must be implemented by language-specific chunkers."""
+        raise NotImplementedError
+
+    def _build_class_context(
+        self, class_node: Node, root_node: Node
+    ) -> Tuple[str, str]:
+        class_indent = class_node.start_point.column
+        start_byte = class_node.start_byte
+
+        if class_node.parent and class_node.parent.type == self.decorator_type:
+            start_byte = class_node.parent.start_byte
+            class_indent = class_node.parent.start_point.column
+
+        body_node = class_node.child_by_field_name(self.class_body_field)
+
+        if not body_node:
+            return ("", "")
+
+        text = root_node.text
+        if text:
+            header_text = text[start_byte : body_node.start_byte].decode().rstrip()
+        else:
+            header_text = ""
+        header = f"{' ' * class_indent}{header_text}\n"
+        docstring = self._get_docstring(class_node)
+        header_with_docstring = (
+            f"{header}{' ' * (class_indent + 4)}{docstring}\n" if docstring else header
+        )
+
+        fields = [
+            to_str(child)
+            for child in body_node.children
+            if child.type in self.expression_types and not self._is_docstring(child)
+        ]
+        fields_text = "\n".join(fields)
+        constructor_node = self._find_constructor(body_node)
+        if constructor_node:
+            constructor_doc = self._get_docstring(constructor_node)
+            constructor_text = self._build_function(constructor_node)
+            constructor_text_no_doc = (
+                constructor_text.replace(constructor_doc, "")
+                if constructor_doc
+                else constructor_text
+            )
+        else:
+            constructor_text = constructor_text_no_doc = ""
+
+        with_doc = f"{header_with_docstring}\n{fields_text}\n{constructor_text}".strip()
+        without_doc = f"{header}\n{fields_text}\n{constructor_text_no_doc}".strip()
+
+        return with_doc, without_doc
+
+    def _find_constructor(self, body: Node) -> Optional[Node]:
+        for child in body.children:
+            definition_field = child.child_by_field_name(self.definition_field)
+            if self._is_constructor(child) or (
+                child.type == self.decorator_type
+                and definition_field
+                and self._is_constructor(definition_field)
+            ):
+                return child
+        return None
+
+    def _is_constructor(self, node: Node) -> bool:
+        if node is None:
+            return False
+
+        child = node.child_by_field_name(self.name_field)
+        if child is None:
+            return False
+
+        name_field = node.child_by_field_name(self.name_field)
+        if not name_field or not name_field.text:
+            return False
+        return (
+            node.type in self.function_definition_types
+            and name_field.text.decode(self.utf8_encoding) == self.constructor_name
+        )
+
+    def _is_only_function_in_class(self, constructor_node: Node) -> bool:
+        """Check if a constructor is the only function in its class."""
+        class_node = constructor_node.parent
+        while class_node and class_node.type not in self.class_definition_types:
+            class_node = class_node.parent
+
+        if not class_node:
+            return False
+
+        body_node = class_node.child_by_field_name(self.class_body_field)
+        if not body_node:
+            return False
+
+        function_count = 0
+        for child in body_node.children:
+            if (
+                child.type in self.function_definition_types
+                and child != constructor_node
+            ):
+                function_count += 1
+
+        return function_count == 0
+
+    def _track_constructor_variables(
+        self, node: Node, module_variables: Dict[str, Node], range_tracker: RangeTracker
+    ) -> None:
+        """Track variables used in constructor functions that aren't being chunked separately."""
+        if node.type in self.function_definition_types and self._is_constructor(node):
+            if not self._is_only_function_in_class(node):
+                used_variables = self._find_used_variables(node)
+                for var_name in used_variables:
+                    if var_name in module_variables:
+                        var_def_node = module_variables[var_name]
+                        range_tracker.mark_node_used(var_def_node)
+
+        for child in node.children:
+            self._track_constructor_variables(child, module_variables, range_tracker)
diff --git a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py
new file mode 100644
index 00000000..e8a6daf3
--- /dev/null
+++ b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py
@@ -0,0 +1,383 @@
+import hashlib
+from typing import Iterator, List, Tuple
+
+from tree_sitter import Node
+
+from docling_core.transforms.chunker.code_chunk_utils.types import (
+    ChunkType,
+    CodeChunk,
+    CodeDocMeta,
+)
+from docling_core.types.doc.document import DocumentOrigin
+
+
+def new_hash(code: str) -> int:
+    """Generate a SHA-1 hash for code."""
+    return int(hashlib.sha1(bytes(code, "utf-8")).hexdigest(), 16)
+
+
+class RangeTracker:
+    """Handles tracking and management of used byte ranges in code."""
+
+    def __init__(self):
+        self.used_ranges: List[Tuple[int, int]] = []
+
+    def mark_used(self, start_byte: int, end_byte: int) -> None:
+        """Mark a range as used."""
+        self.used_ranges.append((start_byte, end_byte))
+
+    def mark_node_used(self, node: Node) -> None:
+        """Mark a node's range as used."""
+        self.mark_used(node.start_byte, node.end_byte)
+
+    def merge_ranges(self) -> List[Tuple[int, int]]:
+        """Merge overlapping ranges and return sorted list."""
+        if not self.used_ranges:
+            return []
+
+        sorted_ranges = sorted(self.used_ranges)
+        merged: List[Tuple[int, int]] = []
+
+        for start, end in sorted_ranges:
+            if not merged or start > merged[-1][1]:
+                merged.append((start, end))
+            else:
+                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
+
+        return merged
+
+    def find_gaps(self, total_length: int) -> List[Tuple[int, int]]:
+        """Find gaps between used ranges."""
+        merged = self.merge_ranges()
+        gaps = []
+        last_end = 0
+
+        for start, end in merged:
+            if last_end < start:
+                gaps.append((last_end, start))
+            last_end = end
+
+        if last_end < total_length:
+            gaps.append((last_end, total_length))
+
+        return gaps
+
+    def get_used_ranges(self) -> List[Tuple[int, int]]:
+        """Get all used ranges."""
+        return self.used_ranges.copy()
+
+    def clear(self) -> None:
+        """Clear all used ranges."""
+        self.used_ranges.clear()
+
+    def extend(self, ranges: List[Tuple[int, int]]) -> None:
+        """Add multiple ranges at once."""
+        self.used_ranges.extend(ranges)
+
+
+class ChunkMetadataBuilder:
+    """Builds metadata for code chunks."""
+
+    def __init__(self, origin: DocumentOrigin):
+        self.origin = origin
+
+    def build_function_metadata(
+        self,
+        function_name: str,
+        docstring: str,
+        content: str,
+        start_line: int,
+        end_line: int,
+        signature_end_line: int,
+    ) -> CodeDocMeta:
+        """Build metadata for function chunks."""
+        return CodeDocMeta(
+            part_name=function_name,
+            docstring=docstring,
+            sha256=new_hash(content),
+            start_line=start_line,
+            end_line=end_line,
+            end_line_signature=signature_end_line,
+            origin=self.origin,
+            chunk_type=ChunkType.FUNCTION,
+        )
+
+    def build_class_metadata(
+        self,
+        class_name: str,
+        docstring: str,
+        content: str,
+        start_line: int,
+        end_line: int,
+    ) -> CodeDocMeta:
+        """Build metadata for class chunks."""
+        return CodeDocMeta(
+            part_name=class_name,
+            docstring=docstring,
+            sha256=new_hash(content),
+            start_line=start_line,
+            end_line=end_line,
+            end_line_signature=end_line,
+            origin=self.origin,
+            chunk_type=ChunkType.CLASS,
+        )
+
+    def build_preamble_metadata(
+        self, content: str, start_line: int, end_line: int
+    ) -> CodeDocMeta:
+        """Build metadata for preamble chunks."""
+        return CodeDocMeta(
+            sha256=new_hash(content),
+            start_line=start_line,
+            end_line=end_line,
+            origin=self.origin,
+            chunk_type=ChunkType.PREAMBLE,
+        )
+
+    def calculate_line_numbers(
+        self, code: str, start_byte: int, end_byte: int
+    ) -> Tuple[int, int]:
+        """Calculate line numbers from byte positions."""
+        start_line = code[:start_byte].count("\n") + 1
+        if end_byte > 0 and end_byte <= len(code):
+            end_line = code[:end_byte].count("\n") + 1
+            if end_byte < len(code) and code[end_byte - 1] == "\n":
+                end_line -= 1
+        else:
+            end_line = start_line
+        return start_line, end_line
+
+
+class ChunkBuilder:
+    """Builds code chunks from nodes and content."""
+
+    def __init__(self, origin: DocumentOrigin):
+        self.metadata_builder = ChunkMetadataBuilder(origin)
+
+    def build_function_chunk(
+        self,
+        content: str,
+        function_name: str,
+        docstring: str,
+        start_line: int,
+        end_line: int,
+        signature_end_line: int,
+    ) -> CodeChunk:
+        """Build a function chunk."""
+        metadata = self.metadata_builder.build_function_metadata(
+            function_name, docstring, content, start_line, end_line, signature_end_line
+        )
+        return CodeChunk(text=content, meta=metadata)
+
+    def build_class_chunk(
+        self,
+        content: str,
+        class_name: str,
+        docstring: str,
+        start_line: int,
+        end_line: int,
+    ) -> CodeChunk:
+        """Build a class chunk."""
+        metadata = self.metadata_builder.build_class_metadata(
+            class_name, docstring, content, start_line, end_line
+        )
+        return CodeChunk(text=content, meta=metadata)
+
+    def build_preamble_chunk(
+        self, content: str, start_line: int, end_line: int
+    ) -> CodeChunk:
+        """Build a preamble chunk."""
+        metadata = self.metadata_builder.build_preamble_metadata(
+            content, start_line, end_line
+        )
+        return CodeChunk(text=content, meta=metadata)
+
+    def process_orphan_chunks(
+        self, used_ranges: List[Tuple[int, int]], dl_doc
+    ) -> Iterator[CodeChunk]:
+        """Process orphan chunks (preamble) from unused code ranges."""
+        from docling_core.types.doc.labels import DocItemLabel
+
+        code = next(
+            (t.text for t in dl_doc.texts if t.label == DocItemLabel.CODE), None
+        )
+        if not code:
+            return
+
+        range_tracker = RangeTracker()
+        range_tracker.extend(used_ranges)
+
+        gaps = range_tracker.find_gaps(len(code))
+        orphan_pieces = []
+        for start_byte, end_byte in gaps:
+            orphan_text = code[start_byte:end_byte].strip()
+            if orphan_text:
+                orphan_pieces.append((orphan_text, start_byte, end_byte))
+
+        if orphan_pieces:
+            merged_content = "\n\n".join(piece[0] for piece in orphan_pieces)
+            first_start_byte = orphan_pieces[0][1]
+            last_end_byte = orphan_pieces[-1][2]
+
+            start_line, end_line = self.metadata_builder.calculate_line_numbers(
+                code, first_start_byte, last_end_byte
+            )
+            yield self.build_preamble_chunk(merged_content, start_line, end_line)
+
+
+class ChunkSizeProcessor:
+    """Processes chunks to split large ones into smaller pieces."""
+
+    def __init__(
+        self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None
+    ):
+        self.tokenizer = tokenizer
+        self.max_tokens = max_tokens
+        self.min_chunk_size = min_chunk_size
+        self.chunker = chunker
+
+    def process_chunks(
+        self, chunks_and_ranges: List[Tuple[CodeChunk, List[Tuple[int, int]]]]
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+        """Process chunks and split large ones if needed."""
+        for chunk, ranges in chunks_and_ranges:
+            token_count = self.tokenizer.count_tokens(chunk.text)
+
+            if token_count <= self.max_tokens:
+                yield chunk, ranges
+            else:
+                yield from self._split_large_chunk(chunk, ranges)
+
+    def _split_large_chunk(
+        self, chunk: CodeChunk, ranges: List[Tuple[int, int]]
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+        """Split a large chunk into smaller pieces."""
+        if chunk.meta.chunk_type in ["function", "method"]:
+            yield from self._split_function_chunk(chunk, ranges)
+        else:
+            yield from self._split_generic_chunk(chunk, ranges)
+
+    def _split_function_chunk(
+        self, chunk: CodeChunk, ranges: List[Tuple[int, int]]
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+        """Split a large function chunk using the original sophisticated logic."""
+        lines = chunk.text.split("\n")
+        if not lines:
+            yield chunk, ranges
+            return
+
+        signature_line = ""
+        body_start_idx = 0
+        for i, line in enumerate(lines):
+            if line.strip():
+                signature_line = line
+                body_start_idx = i + 1
+                break
+
+        if not signature_line:
+            yield chunk, ranges
+            return
+
+        body_lines = lines[body_start_idx:]
+        if not body_lines:
+            yield chunk, ranges
+            return
+
+        if body_lines and body_lines[-1].strip() == "}":
+            body_lines = body_lines[:-1]
+
+        chunks = []
+        current_chunk = [f"{signature_line}{self._get_chunk_prefix()}"]
+        current_size = 0
+
+        for line in body_lines:
+            line_tokens = self.tokenizer.count_tokens(line)
+
+            if current_size + line_tokens > self.max_tokens and len(current_chunk) > 1:
+                chunks.append("".join(current_chunk) + f"{self._get_chunk_suffix()}")
+                current_chunk = [f"{signature_line}{self._get_chunk_prefix()}"]
+                current_size = 0
+
+            current_chunk.append(line)
+            current_size += line_tokens
+
+        if current_chunk:
+            chunks.append("".join(current_chunk) + f"{self._get_chunk_suffix()}")
+
+        if len(chunks) > 1:
+            last_chunk = chunks.pop()
+            last_chunk_tokens = self.tokenizer.count_tokens(last_chunk)
+            if last_chunk_tokens < self.min_chunk_size:
+                chunks[-1] = (
+                    chunks[-1].removesuffix(self._get_chunk_suffix())
+                    + "\n"
+                    + last_chunk.removeprefix(signature_line + f"{self._get_chunk_prefix()}")
+                )
+            else:
+                chunks.append(last_chunk)
+
+        for i, chunk_text in enumerate(chunks):
+            if not chunk_text.strip():
+                continue
+
+            new_meta = chunk.meta.model_copy()
+            new_meta.part_name = (
+                f"{chunk.meta.part_name}_part_{i+1}"
+                if len(chunks) > 1
+                else chunk.meta.part_name
+            )
+
+            sub_chunk = CodeChunk(text=chunk_text, meta=new_meta)
+            yield sub_chunk, ranges
+
+    def _get_chunk_prefix(self) -> str:
+        """Get the chunk prefix for function splitting."""
+        if self.chunker and hasattr(self.chunker, "chunk_prefix"):
+            return self.chunker.chunk_prefix
+        return " {\n"
+
+    def _get_chunk_suffix(self) -> str:
+        """Get the chunk suffix for function splitting."""
+        if self.chunker and hasattr(self.chunker, "chunk_suffix"):
+            return self.chunker.chunk_suffix
+        return "\n}"
+
+    def _split_generic_chunk(
+        self, chunk: CodeChunk, ranges: List[Tuple[int, int]]
+    ) -> Iterator[Tuple[CodeChunk, List[Tuple[int, int]]]]:
+        """Split a generic chunk by lines."""
+        lines = chunk.text.split("\n")
+        current_chunk_lines: List[str] = []
+        current_size = 0
+        chunk_number = 1
+
+        for line in lines:
+            line_tokens = self.tokenizer.count_tokens(line)
+
+            if current_size + line_tokens > self.max_tokens and current_chunk_lines:
+                chunk_text = "\n".join(current_chunk_lines)
+                if self.tokenizer.count_tokens(chunk_text) >= self.min_chunk_size:
+                    yield self._create_split_chunk(
+                        chunk, chunk_text, chunk_number
+                    ), ranges
+                    chunk_number += 1
+
+                current_chunk_lines = [line]
+                current_size = line_tokens
+            else:
+                current_chunk_lines.append(line)
+                current_size += line_tokens
+
+        if current_chunk_lines:
+            chunk_text = "\n".join(current_chunk_lines)
+            if self.tokenizer.count_tokens(chunk_text) >= self.min_chunk_size:
+                yield self._create_split_chunk(chunk, chunk_text, chunk_number), ranges
+
+    def _create_split_chunk(
+        self, original_chunk: CodeChunk, text: str, chunk_number: int
+    ) -> CodeChunk:
+        """Create a new chunk from split text."""
+        new_meta = original_chunk.meta.model_copy()
+        new_meta.part_name = f"{original_chunk.meta.part_name}_part_{chunk_number}"
+
+        return CodeChunk(text=text, meta=new_meta)
diff --git a/docling_core/transforms/chunker/code_chunk_utils/types.py b/docling_core/transforms/chunker/code_chunk_utils/types.py
new file mode 100644
index 00000000..a272f226
--- /dev/null
+++ b/docling_core/transforms/chunker/code_chunk_utils/types.py
@@ -0,0 +1,35 @@
+from enum import Enum
+from typing import Optional
+
+from pydantic import Field
+
+from docling_core.transforms.chunker.base import BaseChunk, BaseMeta
+from docling_core.types.doc.document import DocumentOrigin
+
+
+class CodeDocMeta(BaseMeta):
+    """Data model for CodeChunker metadata."""
+
+    part_name: Optional[str] = Field(default=None)
+    docstring: Optional[str] = Field(default=None)
+    sha256: Optional[int] = Field(default=None)
+    start_line: Optional[int] = Field(default=None)
+    end_line: Optional[int] = Field(default=None)
+    end_line_signature: Optional[int] = Field(default=None)
+    origin: Optional[DocumentOrigin] = Field(default=None)
+    chunk_type: Optional[str] = Field(default=None)
+
+
+class ChunkType(str, Enum):
+    """Chunk type"""
+
+    FUNCTION = "function"
+    METHOD = "method"
+    PREAMBLE = "preamble"
+    CLASS = "class"
+
+
+class CodeChunk(BaseChunk):
+    """Data model for code chunks."""
+
+    meta: CodeDocMeta
diff --git a/docling_core/transforms/chunker/code_chunk_utils/utils.py b/docling_core/transforms/chunker/code_chunk_utils/utils.py
new file mode 100644
index 00000000..409893f6
--- /dev/null
+++ b/docling_core/transforms/chunker/code_chunk_utils/utils.py
@@ -0,0 +1,150 @@
+from enum import Enum
+from typing import List, Optional
+
+import tree_sitter_c as ts_c
+import tree_sitter_java as ts_java
+import tree_sitter_javascript as ts_js
+import tree_sitter_python as ts_python
+import tree_sitter_typescript as ts_ts
+from tree_sitter import Language as Lang
+from tree_sitter import Node, Tree
+
+from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+
+
+class Language(str, Enum):
+    PYTHON = "python"
+    JAVASCRIPT = "javascript"
+    TYPESCRIPT = "typescript"
+    JAVA = "java"
+    C = "c"
+
+    def file_extensions(self) -> List[str]:
+        if self == Language.PYTHON:
+            return [".py"]
+        elif self == Language.TYPESCRIPT:
+            return [".ts", ".tsx", ".cts", ".mts", ".d.ts"]
+        elif self == Language.JAVA:
+            return [".java"]
+        elif self == Language.JAVASCRIPT:
+            return [".js", ".jsx", ".cjs", ".mjs"]
+        elif self == Language.C:
+            return [".c"]
+        else:
+            return []
+
+    def get_tree_sitter_language(self):
+        if self == Language.PYTHON:
+            return Lang(ts_python.language())
+        elif self == Language.TYPESCRIPT:
+            return Lang(ts_ts.language_typescript())
+        elif self == Language.JAVA:
+            return Lang(ts_java.language())
+        elif self == Language.JAVASCRIPT:
+            return Lang(ts_js.language())
+        elif self == Language.C:
+            return Lang(ts_c.language())
+        else:
+            return None
+
+    def get_import_query(self) -> Optional[str]:
+        if self == Language.PYTHON:
+            return """
+            (import_statement) @import
+            (import_from_statement) @import
+            (future_import_statement) @import
+            """
+        elif self in (Language.TYPESCRIPT, Language.JAVASCRIPT):
+            return """
+            (import_statement) @import_full
+
+            (lexical_declaration
+              (variable_declarator
+                name: (identifier)
+                value: (call_expression
+                  function: (identifier) @require_function
+                  arguments: (arguments
+                    (string (string_fragment))
+                  )
+                  (#eq? @require_function "require")
+                )
+              )
+            ) @import_full
+
+            (lexical_declaration
+              (variable_declarator
+                name: (identifier)
+                value: (await_expression
+                  (call_expression
+                    function: (import)
+                    arguments: (arguments
+                      (string (string_fragment))
+                    )
+                  )
+                )
+              )
+            ) @import_full
+            """
+        else:
+            return None
+
+    def get_function_name(self, node: Node) -> Optional[str]:
+        if self == Language.C:
+            declarator = node.child_by_field_name("declarator")
+            if declarator:
+                inner_declarator = declarator.child_by_field_name("declarator")
+                if inner_declarator and inner_declarator.text:
+                    return inner_declarator.text.decode("utf8")
+            return None
+        else:
+            name_node = node.child_by_field_name("name")
+            if name_node and name_node.text:
+                return name_node.text.decode("utf8")
+            return None
+
+    def is_collectable_function(self, node: Node, constructor_name: str) -> bool:
+        if self == Language.C:
+            return True
+        else:
+            name = self.get_function_name(node)
+            if not name:
+                return False
+
+            return name != constructor_name
+
+
+def _get_default_tokenizer() -> "BaseTokenizer":
+    from docling_core.transforms.chunker.tokenizer.huggingface import (
+        HuggingFaceTokenizer,
+    )
+
+    return HuggingFaceTokenizer.from_pretrained(
+        model_name="sentence-transformers/all-MiniLM-L6-v2"
+    )
+
+
+def has_child(node: Node, child_name: str) -> bool:
+    return bool(node and node.child_by_field_name(child_name))
+
+
+def get_children(node: Node, child_types: List[str]) -> List[Node]:
+    if not node.children:
+        return []
+
+    return [child for child in node.children if child.type in child_types]
+
+
+def to_str(node: Node) -> str:
+    if not node or not node.text:
+        return ""
+    text = node.text.decode()
+    indent = node.start_point.column
+    return f"{' ' * indent}{text}".rstrip()
+
+
+def query_tree(language, tree: Tree, query: str):
+    """Query a tree-sitter tree with the given query string."""
+    if not language:
+        return []
+    q = language.query(query)
+    return q.captures(tree.root_node)
diff --git a/docling_core/transforms/chunker/language_code_chunkers.py b/docling_core/transforms/chunker/language_code_chunkers.py
new file mode 100644
index 00000000..c97430e5
--- /dev/null
+++ b/docling_core/transforms/chunker/language_code_chunkers.py
@@ -0,0 +1,621 @@
+from typing import Any, Dict, List, Tuple
+
+from pydantic import Field
+from tree_sitter import Node, Tree
+from typing_extensions import override
+
+from docling_core.transforms.chunker.base_code_chunker import CodeChunker
+from docling_core.transforms.chunker.code_chunk_utils.utils import (
+    Language,
+    _get_default_tokenizer,
+    get_children,
+    has_child,
+    query_tree,
+    to_str,
+)
+from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+
+
+class PythonFunctionChunker(CodeChunker):
+
+    language: Language = Language.PYTHON
+    ts_language: Any = Field(default=None)
+    parser: Any = Field(default=None)
+    function_definition_types: List[str] = ["function_definition"]
+    class_definition_types: List[str] = ["class_definition"]
+    constructor_name: str = "__init__"
+    decorator_type: str = "decorated_definition"
+    expression_types: List[str] = ["expression_statement"]
+    chunk_prefix: str = "\n\t"
+    chunk_suffix: str = ""
+    function_body: str = "block"
+    tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
+    min_chunk_size: int = 300
+    max_tokens: int = 50
+    docs_types: List[str] = ["body", "comment"]
+    dotted_name: str = "dotted_name"
+    aliased_import: str = "aliased_import"
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @override
+    def _get_docstring(self, node: Node) -> str:
+        body_node = node.child_by_field_name(self.function_body)
+        if not body_node or not body_node.named_children:
+            return ""
+
+        docstring_node = next(
+            (child for child in body_node.named_children if self._is_docstring(child)),
+            None,
+        )
+
+        if docstring_node and docstring_node.named_children:
+            text = docstring_node.named_children[0].text
+            return text.decode(self.utf8_encoding) if text else ""
+        return ""
+
+    @override
+    def _get_imports(self, tree: Tree) -> Dict[str, Node]:
+        """Get imports for Python."""
+        import_query = self.language.get_import_query()
+        if not import_query:
+            return {}
+        import_query_results = query_tree(self.ts_language, tree, import_query)
+        imports = {}
+
+        if import_query_results:
+            nodes = [node for node in import_query_results["import"]]
+            nodes.sort(key=lambda node: node.start_point)
+            for node in nodes:
+                import_names = []
+                aliases = node.named_children
+                for child in aliases:
+                    if child.type == self.dotted_name:
+                        import_names.append(child.text.decode(self.utf8_encoding))
+                    elif child.type == self.aliased_import:
+                        original = child.child(0).text.decode(self.utf8_encoding)
+                        alias = child.child(2).text.decode(self.utf8_encoding)
+                        import_names.append(alias)
+                        import_names.append(original)
+                for name in import_names:
+                    imports[name] = node
+        return imports
+
+    def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
+        """Get module-level variable assignments for Python."""
+        variables = {}
+        for child in tree.root_node.children:
+            if child.type in self.expression_types and child.named_children:
+                expr = child.named_children[0]
+                if expr.type == "assignment":
+                    if (
+                        expr.named_children
+                        and expr.named_children[0].type in self.identifiers
+                    ):
+                        text = expr.named_children[0].text
+                        var_name = text.decode(self.utf8_encoding) if text else ""
+                        extended_node = self._get_variable_with_comments(
+                            child, tree.root_node
+                        )
+                        variables[var_name] = extended_node
+        return variables
+
+    @override
+    def _get_variable_with_comments(self, var_node: Node, root_node: Node) -> Node:
+        """Get variable node including any preceding comments."""
+        return var_node
+
+    @override
+    def _find_used_variables(self, function_node: Node) -> set:
+        """Find variable names used within a function."""
+        used_vars = set()
+
+        def collect_identifiers(node, depth=0):
+            " " * depth
+            if node.type in self.identifiers:
+                var_name = node.text.decode(self.utf8_encoding)
+                is_local = self._is_local_assignment(node)
+                if not is_local:
+                    used_vars.add(var_name)
+            for child in node.children:
+                collect_identifiers(child, depth + 1)
+
+        body_node = function_node.child_by_field_name("block")
+        if not body_node:
+            body_node = function_node.child_by_field_name("body")
+        if not body_node:
+            for child in function_node.children:
+                if child.type in ["block", "suite", "compound_statement"]:
+                    body_node = child
+                    break
+
+        if body_node:
+            collect_identifiers(body_node)
+        else:
+            collect_identifiers(function_node)
+
+        return used_vars
+
+    def _is_local_assignment(self, identifier_node: Node) -> bool:
+        """Check if an identifier is part of a local assignment (not a reference)."""
+        current = identifier_node.parent
+        while current:
+            if current.type == "assignment":
+                if (
+                    current.named_children
+                    and current.named_children[0] == identifier_node
+                ):
+                    return True
+            current = current.parent
+        return False
+
+
+class TypeScriptFunctionChunker(CodeChunker):
+    language: Language = Language.TYPESCRIPT
+    ts_language: Any = Field(default=None)
+    parser: Any = Field(default=None)
+    function_definition_types: List[str] = [
+        "function_declaration",
+        "arrow_function",
+        "method_definition",
+        "function_expression",
+        "generator_function",
+        "generator_function_declaration",
+        "export_statement",
+    ]
+    class_definition_types: List[str] = ["class_declaration"]
+    constructor_name: str = "constructor"
+    decorator_type: str = "decorator"
+    function_body: str = "block"
+    expression_types: List[str] = ["expression_statement"]
+    tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
+    min_chunk_size: int = 300
+    max_tokens: int = 5000
+    chunk_prefix: str = " {"
+    chunk_suffix: str = "\n}"
+    docs_types: List[str] = ["comment"]
+    import_clause: str = "import_clause"
+    named_imports: str = "named_imports"
+    import_specifier: str = "import_specifier"
+    namespace_import: str = "namespace_import"
+    variable_declarator: str = "variable_declarator"
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @override
+    def _is_docstring(self, node: Node) -> bool:
+        return node.type in self.docs_types
+
+    @override
+    def _get_imports(self, tree: Tree) -> Dict[str, Node]:
+        import_query = self.language.get_import_query()
+        if not import_query:
+            return {}
+        import_query_results = query_tree(self.ts_language, tree, import_query)
+        imports = {}
+        for import_node in import_query_results.get("import_full", []):
+            identifiers = []
+            for child in import_node.children:
+                if child.type == self.import_clause:
+                    default_name = child.child_by_field_name(self.name_field)
+                    if default_name:
+                        identifiers.append(default_name.text.decode("utf8"))
+                    for sub_child in child.children:
+                        if sub_child.type == self.named_imports:
+                            for spec in sub_child.children:
+                                if spec.type == self.import_specifier:
+                                    name_node = spec.child_by_field_name(
+                                        self.name_field
+                                    )
+                                    if name_node:
+                                        identifiers.append(
+                                            name_node.text.decode("utf8")
+                                        )
+                        elif sub_child.type in self.identifiers:
+                            identifiers.append(sub_child.text.decode("utf8"))
+                        elif sub_child.type == self.namespace_import:
+                            for ns_child in sub_child.children:
+                                if ns_child.type in self.identifiers:
+                                    identifiers.append(ns_child.text.decode("utf8"))
+                elif child.type == self.variable_declarator:
+                    identifier = child.child_by_field_name(self.name_field)
+                    if identifier:
+                        identifiers.append(identifier.text.decode("utf8"))
+            for identifier_val in identifiers:
+                imports[identifier_val] = import_node
+        return imports
+
+    def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
+        """TypeScript/JavaScript don't have module-level variables like Python or C macros."""
+        return {}
+
+
+class JavaScriptFunctionChunker(TypeScriptFunctionChunker):
+    def __init__(self, **data):
+        super().__init__(language=Language.JAVASCRIPT, **data)
+
+
+class CFunctionChunker(CodeChunker):
+    language: Language = Language.C
+    ts_language: Any = Field(default=None)
+    parser: Any = Field(default=None)
+    function_definition_types: List[str] = ["function_definition"]
+    class_definition_types: List[str] = [""]
+    constructor_name: str = ""
+    decorator_type: str = ""
+    function_body: str = "compound_statement"
+    tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
+    min_chunk_size: int = 300
+    max_tokens: int = 5000
+    chunk_prefix: str = " {"
+    chunk_suffix: str = "\n}"
+    expression_types: List[str] = []
+    docs_types: List[str] = ["comment", "block_comment"]
+    structs: List[str] = ["struct_specifier", "preproc_def", "preproc_function_def"]
+    declaration: str = "declaration"
+    declarator: str = "declarator"
+    function_declaration: List[str] = ["type_definition", "function_declaration"]
+    type_field: str = "type"
+    identifiers: List[str] = ["identifier"]
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @override
+    def _is_docstring(self, node: Node) -> bool:
+        return node.type in self.docs_types
+
+    @override
+    def _get_docstring(self, node: Node) -> str:
+        docstring = ""
+        if node.prev_named_sibling and node.prev_named_sibling.type in self.docs_types:
+            while (
+                node.prev_named_sibling
+                and node.prev_named_sibling.type in self.docs_types
+            ):
+                text = node.prev_named_sibling.text
+                if text:
+                    docstring += text.decode(self.utf8_encoding)
+                node = node.prev_named_sibling
+            return docstring
+        return ""
+
+    @override
+    def _is_constructor(self, node: Node) -> bool:
+        return False
+
+    def _get_imports(self, tree: Tree) -> Dict[str, Node]:
+        structs = {}
+
+        def _clean_name(name_text: str) -> str:
+            for char in ["[", "("]:
+                if char in name_text:
+                    name_text = name_text.split(char)[0]
+            return name_text.strip()
+
+        def _structs(node):
+            if node.type in self.structs and node.child_by_field_name(self.name_field):
+                name = node.child_by_field_name(self.name_field)
+                clean_name = _clean_name(name.text.decode("utf8"))
+                if clean_name:
+                    structs[clean_name] = node
+            elif node.type in [self.declaration]:
+                if has_child(
+                    node.child_by_field_name(self.declarator), self.declarator
+                ):
+                    name = node.child_by_field_name(
+                        self.declarator
+                    ).child_by_field_name(self.declarator)
+                else:
+                    name = node.child_by_field_name(self.declarator)
+                if name:
+                    clean_name = _clean_name(name.text.decode("utf8"))
+                    if clean_name:
+                        structs[clean_name] = node
+            elif node.type in self.function_declaration:
+                if has_child(
+                    node.child_by_field_name(self.type_field), self.name_field
+                ):
+                    name = node.child_by_field_name(
+                        self.type_field
+                    ).child_by_field_name(self.name_field)
+                else:
+                    name = node.child_by_field_name(self.type_field)
+                if name:
+                    clean_name = _clean_name(name.text.decode("utf8"))
+                    if clean_name:
+                        structs[clean_name] = node
+            if node.type not in ["compound_statement", "block"]:
+                for child in node.children:
+                    _structs(child)
+
+        for child in tree.root_node.children:
+            _structs(child)
+
+        return {**structs}
+
+    def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
+        """Get module-level #define macros for C."""
+        macros = {}
+        for child in tree.root_node.children:
+            if child.type == "preproc_def":
+                macro_name = self._extract_macro_name(child)
+                if macro_name:
+                    extended_node = self._get_macro_with_comments(child, tree.root_node)
+                    macros[macro_name] = extended_node
+        return macros
+
+    def _extract_macro_name(self, define_node: Node) -> str:
+        """Extract the macro name from a #define node."""
+        for child in define_node.children:
+            if child.type in self.identifiers:
+                text = child.text
+                return text.decode(self.utf8_encoding) if text else ""
+        return ""
+
+    def _get_macro_with_comments(self, macro_node: Node, root_node: Node) -> Node:
+        """Get macro node including any preceding comments."""
+        return macro_node
+
+    @override
+    def _find_used_variables(self, function_node: Node) -> set:
+        """Find macro names used within a function."""
+        used_macros = set()
+
+        def collect_identifiers(node, depth=0):
+            " " * depth
+            if node.type in self.identifiers:
+                macro_name = node.text.decode(self.utf8_encoding)
+                used_macros.add(macro_name)
+            for child in node.children:
+                collect_identifiers(child, depth + 1)
+
+        body_node = function_node.child_by_field_name(self.function_body)
+        if not body_node:
+            body_node = function_node.child_by_field_name("body")
+        if not body_node:
+            for child in function_node.children:
+                if child.type in ["compound_statement", "block"]:
+                    body_node = child
+                    break
+
+        if body_node:
+            collect_identifiers(body_node)
+        else:
+            collect_identifiers(function_node)
+
+        return used_macros
+
+
+class JavaFunctionChunker(CodeChunker):
+
+    language: Language = Language.JAVA
+    ts_language: Any = Field(default=None)
+    parser: Any = Field(default=None)
+    method_declaration: str = "method_declaration"
+    function_definition_types: List[str] = [
+        method_declaration,
+        "constructor_declaration",
+        "static_initializer",
+    ]
+    class_definition_types: List[str] = ["class_declaration", "interface_declaration"]
+    constructor_name: str = ""
+    decorator_type: str = "annotation"
+    function_body: str = "block"
+    expression_types: List[str] = []
+    tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer)
+    min_chunk_size: int = 300
+    max_tokens: int = 5000
+    chunk_prefix: str = " {"
+    chunk_suffix: str = "\n}"
+    docs_types: List[str] = ["block_comment", "comment"]
+    package_declaration: str = "package_declaration"
+    import_declaration: str = "import_declaration"
+    class_declaration: str = "class_declaration"
+    record_declaration: str = "record_declaration"
+    enum_declaration: str = "enum_declaration"
+    interface_declaration: str = "interface_declaration"
+    field_declaration: str = "field_declaration"
+    static_initializer: str = "static_initializer"
+    constructor_declaration: str = "constructor_declaration"
+    compact_constructor_declaration: str = "compact_constructor_declaration"
+    enum_constant: str = "enum_constant"
+    enum_body_declarations: str = "enum_body_declarations"
+    constant_declaration: str = "constant_declaration"
+
+    enum_inner_types: List[str] = [
+        field_declaration,
+        method_declaration,
+        function_body,
+        constructor_declaration,
+        compact_constructor_declaration,
+    ]
+    class_header_inner_types: List[str] = [
+        field_declaration,
+        static_initializer,
+        function_body,
+    ]
+    object_declarations: List[str] = [
+        class_declaration,
+        record_declaration,
+        enum_declaration,
+        interface_declaration,
+    ]
+
+    def __init__(self, **data):
+        super().__init__(**data)
+
+    @override
+    def _file_prefix(self, root_node: Node) -> Tuple[str, List[Tuple[int, int]]]:
+        used_ranges = []
+        for child in root_node.children:
+            if child.type == self.package_declaration:
+                prefix = to_str(child).strip() + "\n"
+                package_nodes = get_children(root_node, [self.package_declaration])
+                for package_node in package_nodes:
+                    used_ranges.append((package_node.start_byte, package_node.end_byte))
+                return prefix, used_ranges
+
+    @override
+    def _get_imports(self, tree: Tree) -> Dict[str, Node]:
+        import_nodes = get_children(tree.root_node, [self.import_declaration])
+        import_dict = {}
+        for import_node in import_nodes:
+            last_child = import_node.children[-2].children[-1]
+            import_name = to_str(last_child).strip()
+            if import_name == "*":
+                import_name = to_str(import_node)
+            import_dict[import_name] = import_node
+        return import_dict
+
+    @override
+    def _build_additional_context(
+        self, function_node: Node, root_node: Node
+    ) -> Tuple[str, str]:
+        context: List[str] = []
+        context_no_doc: List[str] = []
+        while function_node.parent is not None:
+            if function_node.type in self.object_declarations:
+                with_doc, without_doc = self._build_java_object_context(
+                    function_node, root_node
+                )
+                context.insert(0, with_doc)
+                context_no_doc.insert(0, without_doc)
+            function_node = function_node.parent
+        with_doc = "".join(context).rstrip()
+        without_doc = "".join(context_no_doc).rstrip()
+        return (
+            with_doc + ("" if with_doc else ""),
+            without_doc + ("" if without_doc else ""),
+        )
+
+    def _build_java_object_context(
+        self, obj_node: Node, root_node: Node
+    ) -> Tuple[str, str]:
+        """Build context for Java objects (classes, enums, interfaces)."""
+        obj_type = obj_node.type
+
+        if obj_type in (self.class_declaration, self.record_declaration):
+            return self._build_java_class_like_context(obj_node, root_node, "class")
+        elif obj_type == self.enum_declaration:
+            return self._build_java_class_like_context(obj_node, root_node, "enum")
+        elif obj_type == self.interface_declaration:
+            return self._build_java_class_like_context(obj_node, root_node, "interface")
+
+        return ("", "")
+
+    def _build_java_class_like_context(
+        self, node: Node, root_node: Node, context_type: str
+    ) -> Tuple[str, str]:
+        """Unified context building for Java classes, enums, and interfaces."""
+        body = node.child_by_field_name(self.class_body_field)
+        if not body:
+            text = to_str(node)
+            return (text, text)
+
+        header = self._get_function_signature(node, root_node)
+        doc = self._get_docstring(node)
+        header_with_doc = (
+            f"{header}{' ' * (node.start_point.column + 4)}{doc}" if doc else header
+        )
+
+        inner_parts = []
+
+        if context_type == "enum":
+            constants = [
+                to_str(child)
+                for child in body.children
+                if child.type == self.enum_constant
+            ]
+            const_block = (",".join(constants) + ";") if constants else ""
+            inner_parts.append(const_block)
+
+            decl = next(
+                (
+                    child
+                    for child in body.children
+                    if child.type == self.enum_body_declarations
+                ),
+                None,
+            )
+            if decl:
+                decl_parts = [
+                    to_str(child)
+                    for child in decl.children
+                    if child.type in self.enum_inner_types
+                ]
+                inner_parts.append("".join(decl_parts))
+
+        elif context_type == "interface":
+            constants = [
+                to_str(child)
+                for child in body.children
+                if child.type == self.constant_declaration
+            ]
+            methods = [
+                to_str(child)
+                for child in body.children
+                if child.type in self.function_definition_types
+            ]
+            inner_parts.extend(["".join(constants), "".join(methods)])
+
+        else:
+            parts = [
+                to_str(child)
+                for child in body.children
+                if child.type in self.class_header_inner_types
+            ]
+            inner_parts.extend(parts)
+
+            ctor = self._find_constructor(body)
+            if ctor:
+                inner_parts.append(self._build_node_with_decorators(ctor))
+
+        inner = "".join(part for part in inner_parts if part.strip())
+        close = (" " * node.start_point.column) + "}"
+
+        with_doc = (
+            "\n\n".join(x for x in [header_with_doc, inner] if x).rstrip() + close
+        )
+        without_doc = "\n\n".join(x for x in [header, inner] if x).rstrip() + close
+
+        return with_doc, without_doc
+
+    def _get_function_signature(self, node: Node, root_node: Node) -> str:
+        indent = node.start_point.column
+        body_node = node.child_by_field_name(self.class_body_field)
+        if not body_node:
+            return to_str(node)
+        text = root_node.text
+        if text:
+            sig = text[node.start_byte : body_node.start_byte].decode().rstrip()
+        else:
+            sig = ""
+        return (" " * indent) + sig + " {"
+
+    def _get_class_member_ranges(self, current_node: Node) -> List[Tuple[int, int]]:
+        used_ranges = []
+
+        parent = current_node.parent
+        if parent:
+            field_nodes = get_children(parent, [self.field_declaration])
+            for field_node in field_nodes:
+                used_ranges.append((field_node.start_byte, field_node.end_byte))
+
+            constant_nodes = get_children(parent, [self.constant_declaration])
+            for constant_node in constant_nodes:
+                used_ranges.append((constant_node.start_byte, constant_node.end_byte))
+
+        return used_ranges
+
+    def _get_module_variables(self, tree: Tree) -> Dict[str, Node]:
+        """Java doesn't have module-level variables like Python or C macros."""
+        return {}
+
+    def _build_node_with_decorators(self, node: Node) -> str:
+        """Build a node including any decorators/annotations."""
+        if node.parent and node.parent.type == self.decorator_type:
+            return to_str(node.parent)
+        return to_str(node)
diff --git a/pyproject.toml b/pyproject.toml
index a0dc0fd0..01b03326 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -48,8 +48,15 @@ dependencies = [
     'pillow (>=10.0.0,<13.0.0)',
     'pyyaml (>=5.1,<7.0.0)',
     'typing-extensions (>=4.12.2,<5.0.0)',
-    'typer (>=0.12.5,<0.20.0)',
+    'typer (>=0.12.5,<0.17.0)',
     'latex2mathml (>=3.77.0,<4.0.0)',
+    "tree-sitter==0.23.2",
+    "tree-sitter-python==0.23.6",
+    "tree-sitter-c==0.23.4",
+    "tree-sitter-java==0.23.5",
+    "tree-sitter-javascript==0.23.1",
+    "tree-sitter-typescript==0.23.2",
+
 ]
 
 [project.urls]
diff --git a/test/data/chunker_repo/C/repo_out_chunks.json b/test/data/chunker_repo/C/repo_out_chunks.json
new file mode 100644
index 00000000..f8628120
--- /dev/null
+++ b/test/data/chunker_repo/C/repo_out_chunks.json
@@ -0,0 +1,661 @@
+{
+  "root": [
+    {
+      "text": "\nstatic void string_replace_all_occurrences_with_char(char *s, const char *occur, char repl_char)\n{\n\tsize_t slen = strlen(s);\n\tsize_t skip = strlen(occur) - 1; /* length of the occurrence, minus the char we're replacing */\n\tchar *p = s;\n\twhile ((p = strstr(p, occur)))\n\t{\n\t\t*p = repl_char;\n\t\tp++;\n\t\tslen -= skip;\n\t\tmemmove(p, (p + skip), slen - (p - s) + 1); /* includes null char too */\n\t}\n}",
+      "meta": {
+        "part_name": "string_replace_all_occurrences_with_char",
+        "docstring": "/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */",
+        "sha256": 1117482735928585729815737415012422172962871245598,
+        "start_line": 31,
+        "end_line": 43,
+        "end_line_signature": 32,
+        "origin": {
+          "mimetype": "text/plain",
+          "binary_hash": 3389072908273760774,
+          "filename": "json_pointer.c",
+          "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c"
+        },
+        "chunk_type": "function"
+      }
+    },
+    {
+      "text": "/* Avoid ctype.h and locale overhead */\n#define is_plain_digit(c) ((c) >= '0' && (c) <= '9')\nstatic int is_valid_index(const char *path, size_t *idx)\n{\n\tsize_t i, len = strlen(path);\n\t/* this code-path optimizes a bit, for when we reference the 0-9 index range\n\t * in a JSON array and because leading zeros not allowed\n\t */\n\tif (len == 1)\n\t{\n\t\tif (is_plain_digit(path[0]))\n\t\t{\n\t\t\t*idx = (path[0] - '0');\n\t\t\treturn 1;\n\t\t}\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* leading zeros not allowed per RFC */\n\tif (path[0] == '0')\n\t{\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* RFC states base-10 decimals */\n\tfor (i = 0; i < len; i++)\n\t{\n\t\tif (!is_plain_digit(path[i]))\n\t\t{\n\t\t\terrno = EINVAL;\n\t\t\treturn 0;\n\t\t}\n\t}\n\n\t// We know it's all digits, so the only error case here is overflow,\n\t// but ULLONG_MAX will be longer than any array length so that's ok.\n\t*idx = strtoull(path, NULL, 10);\n\n\treturn 1;\n}",
+      "meta": {
+        "part_name": "is_valid_index",
+        "docstring": "",
+        "sha256": 234354095953395323597807168380238510580195482334,
+        "start_line": 45,
+        "end_line": 82,
+        "end_line_signature": 46,
+        "origin": {
+          "mimetype": "text/plain",
+          "binary_hash": 3389072908273760774,
+          "filename": "json_pointer.c",
+          "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c"
+        },
+        "chunk_type": "function"
+      }
+    },
+    {
+      "text": "\nstatic int json_pointer_get_single_path(struct json_object *obj, char *path,\n                                        struct json_object **value, size_t *idx)\n{\n\tif (json_object_is_type(obj, json_type_array))\n\t{\n\t\tif (!is_valid_index(path, idx))\n\t\t\treturn -1;\n\t\tif (*idx >= json_object_array_length(obj))\n\t\t{\n\t\t\terrno = ENOENT;\n\t\t\treturn -1;\n\t\t}\n\n\t\tobj = json_object_array_get_idx(obj, *idx);\n\t\tif (obj)\n\t\t{\n\t\t\tif (value)\n\t\t\t\t*value = obj;\n\t\t\treturn 0;\n\t\t}\n\t\t/* Entry not found */\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\t/* RFC states that we first must eval all ~1 then all ~0 */\n\tstring_replace_all_occurrences_with_char(path, \"~1\", '/');\n\tstring_replace_all_occurrences_with_char(path, \"~0\", '~');\n\n\tif (!json_object_object_get_ex(obj, path, value))\n\t{\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\treturn 0;\n}",
+      "meta": {
+        "part_name": "json_pointer_get_single_path",
+        "docstring": "",
+        "sha256": 85913314315132048628912722197929586436214235955,
+        "start_line": 84,
+        "end_line": 120,
+        "end_line_signature": 86,
+        "origin": {
+          "mimetype": "text/plain",
+          "binary_hash": 3389072908273760774,
+          "filename": "json_pointer.c",
+          "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c"
+        },
+        "chunk_type": "function"
+      }
+    },
+    {
+      "text": "\nstatic int json_object_array_put_idx_cb(struct json_object *parent, size_t idx,\n\t\t\t\t\tstruct json_object *value, void *priv)\n{\n\treturn json_object_array_put_idx(parent, idx, value);\n}",
+      "meta": {
+        "part_name": "json_object_array_put_idx_cb",
+        "docstring": "",
+        "sha256": 515670096298758350505203262066130806756892931374,
+        "start_line": 122,
+        "end_line": 126,
+        "end_line_signature": 124,
+        "origin": {
+          "mimetype": "text/plain",
+          "binary_hash": 3389072908273760774,
+          "filename": "json_pointer.c",
+          "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c"
+        },
+        "chunk_type": "function"
+      }
+    },
+    {
+      "text": "\nstatic int json_pointer_set_single_path(struct json_object *parent, const char *path,\n                                        struct json_object *value,\n\t\t\t\t\tjson_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tif (json_object_is_type(parent, json_type_array))\n\t{\n\t\tsize_t idx;\n\t\t/* RFC (Chapter 4) states that '-' may be used to add new elements to an array */\n\t\tif (path[0] == '-' && path[1] == '\\0')\n\t\t\treturn json_object_array_add(parent, value);\n\t\tif (!is_valid_index(path, &idx))\n\t\t\treturn -1;\n\t\treturn array_set_cb(parent, idx, value, priv);\n\t}\n\n\t/* path replacements should have been done in json_pointer_get_single_path(),\n\t * and we should still be good here\n\t */\n\tif (json_object_is_type(parent, json_type_object))\n\t\treturn json_object_object_add(parent, path, value);\n\n\t/* Getting here means that we tried to \"dereference\" a primitive JSON type\n\t * (like string, int, bool).i.e. 
add a sub-object to it\n\t */\n\terrno = ENOENT;\n\treturn -1;\n}", + "meta": { + "part_name": "json_pointer_set_single_path", + "docstring": "", + "sha256": 744226804185536688172092186408538018610881378934, + "start_line": 128, + "end_line": 154, + "end_line_signature": 131, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nstatic int json_pointer_result_get_recursive(struct json_object *obj, char *path,\n struct json_pointer_get_result *res)\n{\n\tstruct json_object *parent_obj = obj;\n\tsize_t idx = 0;\n\tchar *endp;\n\tint rc;\n\n\t/* All paths (on each recursion level must have a leading '/' */\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\tpath++;\n\n\tendp = strchr(path, '/');\n\tif (endp)\n\t\t*endp = '\\0';\n\n\t/* If we err-ed here, return here */\n\tif ((rc = json_pointer_get_single_path(obj, path, &obj, &idx)))\n\t\treturn rc;\n\n\tif (endp)\n\t{\n\t\t/* Put the slash back, so that the sanity check passes on next recursion level */\n\t\t*endp = '/';\n\t\treturn json_pointer_result_get_recursive(obj, endp, res);\n\t}\n\n\t/* We should be at the end of the recursion here */\n\tif (res) {\n\t\tres->parent = parent_obj;\n\t\tres->obj = obj;\n\t\tif (json_object_is_type(res->parent, json_type_array))\n\t\t\tres->index_in_parent = idx;\n\t\telse\n\t\t\tres->key_in_parent = path;\n\t}\n\n\treturn 0;\n}", + "meta": { + "part_name": "json_pointer_result_get_recursive", + "docstring": "", + "sha256": 518298991245464116417798779750096461462494486587, + "start_line": 156, + "end_line": 198, + "end_line_signature": 158, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nstatic int json_pointer_object_get_recursive(struct json_object *obj, char *path,\n struct json_object **value)\n{\n\tstruct json_pointer_get_result res;\n\tint rc;\n\n\trc = json_pointer_result_get_recursive(obj, path, &res);\n\tif (rc)\n\t\treturn rc;\n\n\tif (value)\n\t\t*value = res.obj;\n\n\treturn 0;\n}", + "meta": { + "part_name": "json_pointer_object_get_recursive", + "docstring": "", + "sha256": 1217293748232453207346015288718037001737705783321, + "start_line": 200, + "end_line": 214, + "end_line_signature": 202, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_get_internal(struct json_object *obj, const char *path,\n struct json_pointer_get_result *res)\n{\n\tchar *path_copy = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == '\\0')\n\t{\n\t\tres->parent = NULL;\n\t\tres->obj = obj;\n\t\tres->key_in_parent = NULL;\n\t\tres->index_in_parent = UINT32_MAX;\n\t\treturn 0;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\trc = json_pointer_result_get_recursive(obj, path_copy, res);\n\t/* re-map the path string to the const-path string */\n\tif (rc == 0 && json_object_is_type(res->parent, json_type_object) && 
res->key_in_parent)\n\t\tres->key_in_parent = path + (res->key_in_parent - path_copy);\n\tfree(path_copy);\n\n\treturn rc;\n}", + "meta": { + "part_name": "json_pointer_get_internal", + "docstring": "", + "sha256": 196996869167588750666460162571361715333822997162, + "start_line": 216, + "end_line": 250, + "end_line_signature": 218, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_get(struct json_object *obj, const char *path, struct json_object **res)\n{\n\tstruct json_pointer_get_result jpres;\n\tint rc;\n\n\trc = json_pointer_get_internal(obj, path, &jpres);\n\tif (rc)\n\t\treturn rc;\n\n\tif (res)\n\t\t*res = jpres.obj;\n\n\treturn 0;\n}", + "meta": { + "part_name": "json_pointer_get", + "docstring": "", + "sha256": 463273473259540096316239720380761023977777440343, + "start_line": 252, + "end_line": 265, + "end_line_signature": 253, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_getf(struct json_object *obj, struct json_object **res, const char *path_fmt, ...)\n{\n\tchar *path_copy = NULL;\n\tint rc = 0;\n\tva_list args;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tif (res)\n\t\t\t*res = obj;\n\t\tgoto out;\n\t}\n\n\trc = json_pointer_object_get_recursive(obj, path_copy, res);\nout:\n\tfree(path_copy);\n\n\treturn rc;\n}", + "meta": { + "part_name": "json_pointer_getf", + "docstring": "", + "sha256": 924347282411192461265505339007126264782988122151, + "start_line": 267, + "end_line": 298, + "end_line_signature": 268, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_set_with_array_cb(struct json_object **obj, const char *path,\n\t\t\t\t struct json_object *value,\n\t\t\t\t json_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tconst char *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\treturn 0;\n\t}\n\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path, '/')) == path)\n\t{\n\t\tpath++;\n\t\treturn json_pointer_set_single_path(*obj, path, value, array_set_cb, priv);\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\tpath_copy[endp - path] = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\tfree(path_copy);\n\n\tif (rc)\n\t\treturn rc;\n\n\tendp++;\n\treturn json_pointer_set_single_path(set, endp, value, array_set_cb, priv);\n}", + "meta": { + "part_name": "json_pointer_set_with_array_cb", + "docstring": "", + "sha256": 
262882567182967450570864701845081576454846403316, + "start_line": 300, + "end_line": 350, + "end_line_signature": 303, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_set(struct json_object **obj, const char *path, struct json_object *value)\n{\n\treturn json_pointer_set_with_array_cb(obj, path, value, json_object_array_put_idx_cb, NULL);\n}", + "meta": { + "part_name": "json_pointer_set", + "docstring": "", + "sha256": 23353041941238655187424843169018755147284430949, + "start_line": 352, + "end_line": 355, + "end_line_signature": 353, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "\nint json_pointer_setf(struct json_object **obj, struct json_object *value, const char *path_fmt,\n ...)\n{\n\tchar *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tva_list args;\n\tint rc = 0;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\tgoto out;\n\t}\n\n\tif (path_copy[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\trc = -1;\n\t\tgoto out;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path_copy, '/')) == path_copy)\n\t{\n\t\tset = *obj;\n\t\tgoto set_single_path;\n\t}\n\n\t*endp = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\n\tif (rc)\n\t\tgoto out;\n\nset_single_path:\n\tendp++;\n\trc = json_pointer_set_single_path(set, endp, value,\n\t\t\t\t\t json_object_array_put_idx_cb, NULL);\nout:\n\tfree(path_copy);\n\treturn rc;\n}", + "meta": { + "part_name": "json_pointer_setf", + "docstring": "", + "sha256": 278409402010463874805413705333555052224985193220, + "start_line": 357, + "end_line": 414, + "end_line_signature": 359, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "function" + } + }, + { + "text": "#include \"config.h\"\n\n#include \"strerror_override.h\"\n\n#include \n#include \n#include \n#include \n\n#include \"json_object_private.h\"\n#include \"json_pointer.h\"\n#include \"json_pointer_private.h\"\n#include \"strdup_compat.h\"\n#include \"vasprintf_compat.h\"\n\n/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */", + "meta": { + "sha256": 1217234116973748366829093199878078246801755936207, + "start_line": 7, + "end_line": 31, + "origin": { + "mimetype": "text/plain", + "binary_hash": 3389072908273760774, + "filename": "json_pointer.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" + }, + "chunk_type": "preamble" + } + }, + { + "text": "/* hash functions */\nstatic unsigned long lh_char_hash(const void *k);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_perllike_str_hash(const void *k);\nint json_global_set_string_hash(const int h)\n{\n\tswitch 
(h)\n\t{\n\tcase JSON_C_STR_HASH_DFLT: char_hash_fn = lh_char_hash; break;\n\tcase JSON_C_STR_HASH_PERLLIKE: char_hash_fn = lh_perllike_str_hash; break;\n\tdefault: return -1;\n\t}\n\treturn 0;\n}", + "meta": { + "part_name": "json_global_set_string_hash", + "docstring": "", + "sha256": 998221257334549775068212743280296040491437343457, + "start_line": 45, + "end_line": 54, + "end_line_signature": 46, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_ptr_hash(const void *k)\n{\n\t/* CAW: refactored to be 64bit nice */\n\treturn (unsigned long)((((ptrdiff_t)k * LH_PRIME) >> 4) & ULONG_MAX);\n}", + "meta": { + "part_name": "lh_ptr_hash", + "docstring": "", + "sha256": 1293894620828796812611104590645246395957873389975, + "start_line": 56, + "end_line": 60, + "end_line_signature": 57, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_ptr_equal(const void *k1, const void *k2)\n{\n\treturn (k1 == k2);\n}", + "meta": { + "part_name": "lh_ptr_equal", + "docstring": "", + "sha256": 561855950266729137986207467028016863357001140681, + "start_line": 62, + "end_line": 65, + "end_line_signature": 63, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "#define HASH_LITTLE_ENDIAN 0\n/*\n-------------------------------------------------------------------------------\nmix -- mix 3 32-bit values reversibly.\n\nThis is reversible, so any information in (a,b,c) before mix() is\nstill in (a,b,c) after mix().\n\nIf four pairs of (a,b,c) inputs are run through mix(), or through\nmix() in reverse, there are at least 32 bits of the output that\nare sometimes the same for one pair and different for another pair.\nThis was tested for:\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nSome k values for my \"a-=c; a^=rot(c,k); c+=b;\" arrangement that\nsatisfy this are\n 4 6 8 16 19 4\n 9 15 3 18 27 15\n 14 9 3 7 17 3\nWell, \"9 15 3 18 27 15\" didn't quite get 32 bits diffing\nfor \"differ\" defined as + with a one-bit base and a two-bit delta. I\nused https://burtleburtle.net/bob/hash/avalanche.html to choose\nthe operations, constants, and arrangements of the variables.\n\nThis does not achieve avalanche. There are input bits of (a,b,c)\nthat fail to affect some output bits of (a,b,c), especially of a. The\nmost thoroughly mixed value is c, but it doesn't really even achieve\navalanche in c.\n\nThis allows some parallelism. 
Read-after-writes are good at doubling\nthe number of bits affected, so the goal of mixing pulls in the opposite\ndirection as the goal of parallelism. I did what I could. Rotates\nseem to cost as much as shifts on every machine I could lay my hands\non, and rotates are much kinder to the top and bottom bits, so I used\nrotates.\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define mix(a,b,c) \\\n{ \\\n\ta -= c; a ^= rot(c, 4); c += b; \\\n\tb -= a; b ^= rot(a, 6); a += c; \\\n\tc -= b; c ^= rot(b, 8); b += a; \\\n\ta -= c; a ^= rot(c,16); c += b; \\\n\tb -= a; b ^= rot(a,19); a += c; \\\n\tc -= b; c ^= rot(b, 4); b += a; \\\n}\n/* clang-format on *//*\n-------------------------------------------------------------------------------\nfinal -- final mixing of 3 32-bit values (a,b,c) into c\n\nPairs of (a,b,c) values differing in only a few bits will usually\nproduce values of c that look totally different. This was tested for\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nThese constants passed:\n 14 11 25 16 4 14 24\n 12 14 25 16 4 14 24\nand these came close:\n 4 8 15 26 3 22 24\n 10 8 15 26 3 22 24\n 11 8 15 26 3 22 24\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define final(a,b,c) \\\n{ \\\n\tc ^= b; c -= rot(b,14); \\\n\ta ^= c; a -= rot(c,11); \\\n\tb ^= a; b -= rot(a,25); \\\n\tc ^= b; c -= rot(b,16); \\\n\ta ^= c; a -= rot(c,4); \\\n\tb ^= a; b -= rot(a,14); \\\n\tc ^= b; c -= rot(b,24); \\\n}\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic uint32_t hashlittle(const void *key, size_t length, uint32_t initval)\n{\n\tuint32_t a,b,c; /* internal state */\n\tunion\n\t{\n\t\tconst void *ptr;\n\t\tsize_t i;\n\t} u; /* needed for Mac Powerbook G4 */\n\n\t/* Set up the internal state */\n\ta = b = c = 0xdeadbeef + ((uint32_t)length) + initval;\n\n\tu.ptr = key;\n\tif (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {\n\t\tconst uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */\n\n\t\t/*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\tb += k[1];\n\t\t\tc += k[2];\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 3;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\t/*\n\t\t * \"k[2]&0xffffff\" actually reads beyond the end of the string, but\n\t\t * then masks off the part it's not allowed to read. Because the\n\t\t * string is aligned, the masked-off tail is in the same word as the\n\t\t * rest of the string. Every machine with memory protection I've seen\n\t\t * does it on word boundaries, so is OK with this. But VALGRIND will\n\t\t * still catch it and complain. The masking trick does make the hash\n\t\t * noticeably faster for short strings (like English words).\n\t\t * AddressSanitizer is similarly picky about overrunning\n\t\t * the buffer. 
(https://clang.llvm.org/docs/AddressSanitizer.html)\n\t\t */\n#ifdef VALGRIND\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__SANITIZE_ADDRESS__) /* GCC's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__has_feature)\n#if __has_feature(address_sanitizer) /* Clang's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#endif\n#endif\n#ifndef PRECISE_MEMORY_ACCESS\n\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=k[1]&0xffffff; a+=k[0]; break;\n\t\tcase 6 : b+=k[1]&0xffff; a+=k[0]; break;\n\t\tcase 5 : b+=k[1]&0xff; a+=k[0]; break;\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=k[0]&0xffffff; break;\n\t\tcase 2 : a+=k[0]&0xffff; break;\n\t\tcase 1 : a+=k[0]&0xff; break;\n\t\tcase 0 : return c; /* zero length strings require no mixing */\n\t\t}\n\n#else /* make valgrind happy */\n\n\t\tconst uint8_t *k8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=((uint32_t)k8[9])<<8; /* fall through */\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=((uint32_t)k8[5])<<8; /* fall through */\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=((uint32_t)k8[1])<<8; /* fall through */\n\t\tcase 1 : a+=k8[0]; break;\n\t\tcase 0 : return c;\n\t\t}\n\n#endif /* !valgrind */\n\n\t}\n\telse if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0))\n\t{\n\t\tconst uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */\n\t\tconst uint8_t *k8;\n\n\t\t/*--------------- all but last block: aligned reads and different mixing */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0] + (((uint32_t)k[1])<<16);\n\t\t\tb += k[2] + (((uint32_t)k[3])<<16);\n\t\t\tc += k[4] + (((uint32_t)k[5])<<16);\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 6;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\tk8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[4]+(((uint32_t)k[5])<<16);\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=k[4];\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=k[2];\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=k[0];\n\t\t\t break;\n\t\tcase 1 : a+=k8[0];\n\t\t\t break;\n\t\tcase 0 : return c; /* zero length requires no mixing */\n\t\t}\n\n\t}\n\telse\n\t{\n\t\t/* need to read the key one byte at a time */\n\t\tconst uint8_t *k = (const uint8_t *)key;\n\n\t\t/*--------------- all but the last block: affect some 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\ta += 
((uint32_t)k[1])<<8;\n\t\t\ta += ((uint32_t)k[2])<<16;\n\t\t\ta += ((uint32_t)k[3])<<24;\n\t\t\tb += k[4];\n\t\t\tb += ((uint32_t)k[5])<<8;\n\t\t\tb += ((uint32_t)k[6])<<16;\n\t\t\tb += ((uint32_t)k[7])<<24;\n\t\t\tc += k[8];\n\t\t\tc += ((uint32_t)k[9])<<8;\n\t\t\tc += ((uint32_t)k[10])<<16;\n\t\t\tc += ((uint32_t)k[11])<<24;\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 12;\n\t\t}\n\n\t\t/*-------------------------------- last block: affect all 32 bits of (c) */\n\t\tswitch(length) /* all the case statements fall through */\n\t\t{\n\t\tcase 12: c+=((uint32_t)k[11])<<24; /* FALLTHRU */\n\t\tcase 11: c+=((uint32_t)k[10])<<16; /* FALLTHRU */\n\t\tcase 10: c+=((uint32_t)k[9])<<8; /* FALLTHRU */\n\t\tcase 9 : c+=k[8]; /* FALLTHRU */\n\t\tcase 8 : b+=((uint32_t)k[7])<<24; /* FALLTHRU */\n\t\tcase 7 : b+=((uint32_t)k[6])<<16; /* FALLTHRU */\n\t\tcase 6 : b+=((uint32_t)k[5])<<8; /* FALLTHRU */\n\t\tcase 5 : b+=k[4]; /* FALLTHRU */\n\t\tcase 4 : a+=((uint32_t)k[3])<<24; /* FALLTHRU */\n\t\tcase 3 : a+=((uint32_t)k[2])<<16; /* FALLTHRU */\n\t\tcase 2 : a+=((uint32_t)k[1])<<8; /* FALLTHRU */\n\t\tcase 1 : a+=k[0];\n\t\t\t break;\n\t\tcase 0 : return c;\n\t\t}\n\t}\n\n\tfinal(a,b,c);\n\treturn c;\n}", + "meta": { + "part_name": "hashlittle", + "docstring": "/* clang-format off *//*\n-------------------------------------------------------------------------------\nhashlittle() -- hash a variable-length key into a 32-bit value\n k : the key (the unaligned variable-length array of bytes)\n length : the length of the key, counting by bytes\n initval : can be any 4-byte value\nReturns a 32-bit value. Every bit of the key affects every bit of\nthe return value. Two keys differing by one or two bits will have\ntotally different hash values.\n\nThe best hash table sizes are powers of 2. There is no need to do\nmod a prime (mod is sooo slow!). If you need less than 32 bits,\nuse a bitmask. 
For example, if you need only 10 bits, do\n h = (h & hashmask(10));\nIn which case, the hash table should have hashsize(10) elements.\n\nIf you are hashing n strings (uint8_t **)k, do it like this:\n for (i=0, h=0; i 0);\n\tt = (struct lh_table *)calloc(1, sizeof(struct lh_table));\n\tif (!t)\n\t\treturn NULL;\n\n\tt->count = 0;\n\tt->size = size;\n\tt->table = (struct lh_entry *)calloc(size, sizeof(struct lh_entry));\n\tif (!t->table)\n\t{\n\t\tfree(t);\n\t\treturn NULL;\n\t}\n\tt->free_fn = free_fn;\n\tt->hash_fn = hash_fn;\n\tt->equal_fn = equal_fn;\n\tfor (i = 0; i < size; i++)\n\t\tt->table[i].k = LH_EMPTY;\n\treturn t;\n}", + "meta": { + "part_name": "lh_table_new(int size, lh_entry_free_fn *free_fn, lh_hash_fn *hash_fn,\n lh_equal_fn *equal_fn)", + "docstring": "", + "sha256": 1353978832949898607839900715285314979449043361401, + "start_line": 498, + "end_line": 524, + "end_line_signature": 500, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "/* comparison functions */\nint lh_char_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, char_hash_fn, lh_char_equal);\n}", + "meta": { + "part_name": "lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)", + "docstring": "", + "sha256": 1143372083340073080177483187079554398021595080815, + "start_line": 526, + "end_line": 529, + "end_line_signature": 527, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, lh_ptr_hash, lh_ptr_equal);\n}", + "meta": { + "part_name": "lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)", + "docstring": "", + "sha256": 5889006246199618862907451444252129237803738713, + "start_line": 531, + "end_line": 534, + "end_line_signature": 532, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_resize(struct lh_table *t, int new_size)\n{\n\tstruct lh_table *new_t;\n\tstruct lh_entry *ent;\n\n\tnew_t = lh_table_new(new_size, NULL, t->hash_fn, t->equal_fn);\n\tif (new_t == NULL)\n\t\treturn -1;\n\n\tfor (ent = t->head; ent != NULL; ent = ent->next)\n\t{\n\t\tunsigned long h = lh_get_hash(new_t, ent->k);\n\t\tunsigned int opts = 0;\n\t\tif (ent->k_is_constant)\n\t\t\topts = JSON_C_OBJECT_ADD_CONSTANT_KEY;\n\t\tif (lh_table_insert_w_hash(new_t, ent->k, ent->v, h, opts) != 0)\n\t\t{\n\t\t\tlh_table_free(new_t);\n\t\t\treturn -1;\n\t\t}\n\t}\n\tfree(t->table);\n\tt->table = new_t->table;\n\tt->size = new_size;\n\tt->head = new_t->head;\n\tt->tail = new_t->tail;\n\tfree(new_t);\n\n\treturn 0;\n}", + "meta": { + "part_name": "lh_table_resize", + "docstring": "", + "sha256": 513675985135766203593761374923281677204896804938, + "start_line": 536, + 
"end_line": 565, + "end_line_signature": 537, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nvoid lh_table_free(struct lh_table *t)\n{\n\tstruct lh_entry *c;\n\tif (t->free_fn)\n\t{\n\t\tfor (c = t->head; c != NULL; c = c->next)\n\t\t\tt->free_fn(c);\n\t}\n\tfree(t->table);\n\tfree(t);\n}", + "meta": { + "part_name": "lh_table_free", + "docstring": "", + "sha256": 1461287464285525024180362726694195219608876587842, + "start_line": 567, + "end_line": 577, + "end_line_signature": 568, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert_w_hash(struct lh_table *t, const void *k, const void *v, const unsigned long h,\n const unsigned opts)\n{\n\tunsigned long n;\n\n\tif (t->count >= t->size * LH_LOAD_FACTOR)\n\t{\n\t\t/* Avoid signed integer overflow with large tables. */\n\t\tint new_size = (t->size > INT_MAX / 2) ? INT_MAX : (t->size * 2);\n\t\tif (t->size == INT_MAX || lh_table_resize(t, new_size) != 0)\n\t\t\treturn -1;\n\t}\n\n\tn = h % t->size;\n\n\twhile (1)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\t\tbreak;\n\t\tif ((int)++n == t->size)\n\t\t\tn = 0;\n\t}\n\n\tt->table[n].k = k;\n\tt->table[n].k_is_constant = (opts & JSON_C_OBJECT_ADD_CONSTANT_KEY);\n\tt->table[n].v = v;\n\tt->count++;\n\n\tif (t->head == NULL)\n\t{\n\t\tt->head = t->tail = &t->table[n];\n\t\tt->table[n].next = t->table[n].prev = NULL;\n\t}\n\telse\n\t{\n\t\tt->tail->next = &t->table[n];\n\t\tt->table[n].prev = t->tail;\n\t\tt->table[n].next = NULL;\n\t\tt->tail = &t->table[n];\n\t}\n\n\treturn 0;\n}", + "meta": { + "part_name": "lh_table_insert_w_hash", + "docstring": "", + "sha256": 1457272684322346273275672024449638738347377045483, + "start_line": 579, + "end_line": 621, + "end_line_signature": 581, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert(struct lh_table *t, const void *k, const void *v)\n{\n\treturn lh_table_insert_w_hash(t, k, v, lh_get_hash(t, k), 0);\n}", + "meta": { + "part_name": "lh_table_insert", + "docstring": "", + "sha256": 914976466412251973023999660063657611060463868578, + "start_line": 622, + "end_line": 625, + "end_line_signature": 623, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)\n{\n\tunsigned long n = h % t->size;\n\tint count = 0;\n\n\twhile (count < t->size)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY)\n\t\t\treturn NULL;\n\t\tif (t->table[n].k != LH_FREED && t->equal_fn(t->table[n].k, k))\n\t\t\treturn &t->table[n];\n\t\tif ((int)++n == t->size)\n\t\t\tn = 
0;\n\t\tcount++;\n\t}\n\treturn NULL;\n}", + "meta": { + "part_name": "lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)", + "docstring": "", + "sha256": 1235017991348899387013216690716133251889003151179, + "start_line": 627, + "end_line": 644, + "end_line_signature": 629, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry(struct lh_table *t, const void *k)\n{\n\treturn lh_table_lookup_entry_w_hash(t, k, lh_get_hash(t, k));\n}", + "meta": { + "part_name": "lh_table_lookup_entry(struct lh_table *t, const void *k)", + "docstring": "", + "sha256": 1030652463340488651179217356281066519111633079656, + "start_line": 646, + "end_line": 649, + "end_line_signature": 647, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\njson_bool lh_table_lookup_ex(struct lh_table *t, const void *k, void **v)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (e != NULL)\n\t{\n\t\tif (v != NULL)\n\t\t\t*v = lh_entry_v(e);\n\t\treturn 1; /* key found */\n\t}\n\tif (v != NULL)\n\t\t*v = NULL;\n\treturn 0; /* key not found */\n}", + "meta": { + "part_name": "lh_table_lookup_ex", + "docstring": "", + "sha256": 634848787249761171541292303028190495411132430955, + "start_line": 651, + "end_line": 663, + "end_line_signature": 652, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete_entry(struct lh_table *t, struct lh_entry *e)\n{\n\t/* CAW: fixed to be 64bit nice, still need the crazy negative case... */\n\tptrdiff_t n = (ptrdiff_t)(e - t->table);\n\n\t/* CAW: this is bad, really bad, maybe stack goes other direction on this machine... 
*/\n\tif (n < 0)\n\t{\n\t\treturn -2;\n\t}\n\n\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\treturn -1;\n\tt->count--;\n\tif (t->free_fn)\n\t\tt->free_fn(e);\n\tt->table[n].v = NULL;\n\tt->table[n].k = LH_FREED;\n\tif (t->tail == &t->table[n] && t->head == &t->table[n])\n\t{\n\t\tt->head = t->tail = NULL;\n\t}\n\telse if (t->head == &t->table[n])\n\t{\n\t\tt->head->next->prev = NULL;\n\t\tt->head = t->head->next;\n\t}\n\telse if (t->tail == &t->table[n])\n\t{\n\t\tt->tail->prev->next = NULL;\n\t\tt->tail = t->tail->prev;\n\t}\n\telse\n\t{\n\t\tt->table[n].prev->next = t->table[n].next;\n\t\tt->table[n].next->prev = t->table[n].prev;\n\t}\n\tt->table[n].next = t->table[n].prev = NULL;\n\treturn 0;\n}", + "meta": { + "part_name": "lh_table_delete_entry", + "docstring": "", + "sha256": 951281510321322595326628350604221188298138063502, + "start_line": 665, + "end_line": 704, + "end_line_signature": 666, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete(struct lh_table *t, const void *k)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (!e)\n\t\treturn -1;\n\treturn lh_table_delete_entry(t, e);\n}", + "meta": { + "part_name": "lh_table_delete", + "docstring": "", + "sha256": 44220499716621050420359222788414516841562449761, + "start_line": 706, + "end_line": 712, + "end_line_signature": 707, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_length(struct lh_table *t)\n{\n\treturn t->count;\n}", + "meta": { + "part_name": "lh_table_length", + "docstring": "", + "sha256": 719864955613574534766865380227650980669430095114, + "start_line": 714, + "end_line": 717, + "end_line_signature": 715, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12737018040358436176, + "filename": "linkhash.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" + }, + "chunk_type": "function" + } + }, + { + "text": "#include \"config.h\"\n\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n\n#ifdef HAVE_ENDIAN_H\n#include /* attempt to define endianness */\n#endif\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\n#ifndef WIN32_LEAN_AND_MEAN\n#define WIN32_LEAN_AND_MEAN\n#endif\n#include /* Get InterlockedCompareExchange */\n#endif\n\n#include \"linkhash.h\"\n#include \"random_seed.h\"\n\n/*\n * hashlittle from lookup3.c, by Bob Jenkins, May 2006, Public Domain.\n * https://burtleburtle.net/bob/c/lookup3.c\n * minor modifications to make functions static so no symbols are exported\n * minor modifications to compile with -Werror\n */\n\n/*\n-------------------------------------------------------------------------------\nlookup3.c, by Bob Jenkins, May 2006, Public Domain.\n\nThese are functions for producing 32-bit hashes for hash table lookup.\nhashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()\nare externally useful functions. Routines to test the hash are included\nif SELF_TEST is defined. You can use this free for any purpose. It's in\nthe public domain. It has no warranty.\n\nYou probably want to use hashlittle(). 
hashlittle() and hashbig()\nhash byte arrays. hashlittle() is faster than hashbig() on\nlittle-endian machines. Intel and AMD are little-endian machines.\nOn second thought, you probably want hashlittle2(), which is identical to\nhashlittle() except it returns two 32-bit hashes for the price of one.\nYou could implement hashbig2() if you wanted but I haven't bothered here.\n\nIf you want to find a hash of, say, exactly 7 integers, do\n a = i1; b = i2; c = i3;\n mix(a,b,c);\n a += i4; b += i5; c += i6;\n mix(a,b,c);\n a += i7;\n final(a,b,c);\nthen use c as the hash value. If you have a variable length array of\n4-byte integers to hash, use hashword(). If you have a byte array (like\na character string), use hashlittle(). If you have several byte arrays, or\na mix of things, see the comments above hashlittle().\n\nWhy is this so big? I read 12 bytes at a time into 3 4-byte integers,\nthen mix those integers. This is fast (you can do a lot more thorough\nmixing with 12*3 instructions on 3 integers than you can with 3 instructions\non 1 byte), but shoehorning those bytes into integers efficiently is messy.\n-------------------------------------------------------------------------------\n*/\n\n/*\n * My best guess at if you are big-endian or little-endian. This may\n * need adjustment.\n */\n#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \\\n (defined(i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || \\\n defined(__i686__) || defined(vax) || defined(MIPSEL))\n#define HASH_LITTLE_ENDIAN 1\n#define HASH_BIG_ENDIAN 0\n#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \\\n (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))\n#define HASH_LITTLE_ENDIAN 0\n#define HASH_BIG_ENDIAN 1\n#else\n\n#define HASH_BIG_ENDIAN 0\n#endif\n\n#define hashsize(n) ((uint32_t)1 << (n))\n#define hashmask(n) (hashsize(n) - 1)\n#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))\n\n/* clang-format on */\n\n/*\n-------------------------------------------------------------------------------\nhashlittle() -- hash a variable-length key into a 32-bit value\n k : the key (the unaligned variable-length array of bytes)\n length : the length of the key, counting by bytes\n initval : can be any 4-byte value\nReturns a 32-bit value. Every bit of the key affects every bit of\nthe return value. Two keys differing by one or two bits will have\ntotally different hash values.\n\nThe best hash table sizes are powers of 2. There is no need to do\nmod a prime (mod is sooo slow!). If you need less than 32 bits,\nuse a bitmask. 
For example, if you need only 10 bits, do\n h = (h & hashmask(10));\nIn which case, the hash table should have hashsize(10) elements.\n\nIf you are hashing n strings (uint8_t **)k, do it like this:\n for (i=0, h=0; i= 10; errno_in /= 10, ii++)\n\t{\n\t\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\t}\n\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\n\t// Reverse the digits\n\tfor (start_idx = sizeof(PREFIX) - 1; ii >= 0; ii--, start_idx++)\n\t{\n\t\terrno_buf[start_idx] = digbuf[ii];\n\t}\n\terrno_buf[start_idx] = '\\0';\n\treturn errno_buf;\n}", + "meta": { + "part_name": "_json_c_strerror(int errno_in)", + "docstring": "", + "sha256": 70696874837601163637337209327435673270874963588, + "start_line": 66, + "end_line": 109, + "end_line_signature": 67, + "origin": { + "mimetype": "text/plain", + "binary_hash": 14386364040007058020, + "filename": "strerror_override.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/strerror_override.c" + }, + "chunk_type": "function" + } + }, + { + "text": "#define STRERROR_OVERRIDE_IMPL 1\n#include \"strerror_override.h\"", + "meta": { + "sha256": 1284269608364859541196511107996937073581407786389, + "start_line": 1, + "end_line": 3, + "origin": { + "mimetype": "text/plain", + "binary_hash": 14386364040007058020, + "filename": "strerror_override.c", + "uri": "https://github.com/json-c/json-c/blob/abc123def456/strerror_override.c" + }, + "chunk_type": "preamble" + } + } + ] +} diff --git a/test/data/chunker_repo/Java/repo_out_chunks.json b/test/data/chunker_repo/Java/repo_out_chunks.json new file mode 100644 index 00000000..693630e4 --- /dev/null +++ b/test/data/chunker_repo/Java/repo_out_chunks.json @@ -0,0 +1,94 @@ +{ + "root": [ + { + "text": "package com.acmeair;\n\npublic interface AcmeAirConstants {\n\n\t\n}", + "meta": { + "part_name": "AcmeAirConstants", + "docstring": "", + "sha256": 802233348002430704981298093765030369309512619867, + "start_line": 2, + "end_line": 5, + "end_line_signature": 5, + "origin": { + "mimetype": "text/plain", + "binary_hash": 4721786809665574388, + "filename": "AcmeAirConstants.java", + "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/AcmeAirConstants.java" + }, + "chunk_type": "class" + } + }, + { + "text": "package com.acmeair.loader;\nimport com.acmeair.entities.Customer.PhoneType;\nimport com.acmeair.entities.Customer;\nimport com.acmeair.entities.CustomerAddress;\nimport com.acmeair.service.CustomerService;\nimport com.acmeair.service.ServiceLocator;\npublic class CustomerLoader {\n\n private CustomerService customerService = ServiceLocator.instance().getService(CustomerService.class);} public void loadCustomers(long numCustomers) {\n\t\tCustomerAddress address = customerService.createAddress(\"123 Main St.\", null, \"Anytown\", \"NC\", \"USA\", \"27617\");\n\t\tfor (long ii = 0; ii < numCustomers; ii++) {\n\t\t\tcustomerService.createCustomer(\"uid\"+ii+\"@email.com\", \"password\", Customer.MemberShipStatus.GOLD, 1000000, 1000, \"919-123-4567\", PhoneType.BUSINESS, address);\n\t\t}\n\t}", + "meta": { + "part_name": "loadCustomers", + "docstring": "", + "sha256": 216694729768327235646074424271588504079937437501, + "start_line": 29, + "end_line": 34, + "end_line_signature": 29, + "origin": { + "mimetype": "text/plain", + "binary_hash": 16652446628586613798, + "filename": "CustomerLoader.java", + "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/CustomerLoader.java" + }, + "chunk_type": "function" + } + }, + { + "text": "package com.acmeair.loader;\nimport 
com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.io.InputStream;\nimport java.io.InputStreamReader;\nimport java.io.LineNumberReader;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} public void loadFlights() throws Exception {\n\t\tInputStream csvInputStream = FlightLoader.class.getResourceAsStream(\"/mileage.csv\");\n\t\t\n\t\tLineNumberReader lnr = new LineNumberReader(new InputStreamReader(csvInputStream));\n\t\tString line1 = lnr.readLine();\n\t\tStringTokenizer st = new StringTokenizer(line1, \",\");\n\t\tArrayList airports = new ArrayList();\n\t\t\n\t\t// read the first line which are airport names\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(null, st.nextToken());\n\t\t//\tacm.setAirportName(st.nextToken());\n\t\t\tairports.add(acm);\n\t\t}\n\t\t// read the second line which contains matching airport codes for the first line\n\t\tString line2 = lnr.readLine();\n\t\tst = new StringTokenizer(line2, \",\");\n\t\tint ii = 0;\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tString airportCode = st.nextToken();\n\t\t\tairports.get(ii).setAirportCode(airportCode);\n\t\t\tii++;\n\t\t}\n\t\t// read the other lines which are of format:\n\t\t// airport name, aiport code, distance from this airport to whatever airport is in the column from lines one and two\n\t\tString line;\n\t\tint flightNumber = 0;\n\t\twhile (true) {\n\t\t\tline = lnr.readLine();\n\t\t\tif (line == null || line.trim().equals(\"\")) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tst = new StringTokenizer(line, \",\");\n\t\t\tString airportName = st.nextToken();\n\t\t\tString airportCode = st.nextToken();\n\t\t\tif (!alreadyInCollection(airportCode, airports)) {\n\t\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(airportCode, airportName);\n\t\t\t\tairports.add(acm);\n\t\t\t}\n\t\t\tint indexIntoTopLine = 0;\n\t\t\twhile (st.hasMoreTokens()) {\n\t\t\t\tString milesString = st.nextToken();\n\t\t\t\tif (milesString.equals(\"NA\")) {\n\t\t\t\t\tindexIntoTopLine++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\tint miles = Integer.parseInt(milesString);\n\t\t\t\tString toAirport = airports.get(indexIntoTopLine).getAirportCode();\n\t\t\t\tString flightId = \"AA\" + flightNumber;\t\t\t\n\t\t\t\tflightService.storeFlightSegment(flightId, airportCode, toAirport, miles);\n\t\t\t\tDate now = new Date();\n\t\t\t\tfor (int daysFromNow = 0; daysFromNow < MAX_FLIGHTS_PER_SEGMENT; daysFromNow++) {\n\t\t\t\t\tCalendar c = Calendar.getInstance();\n\t\t\t\t\tc.setTime(now);\n\t\t\t\t\tc.set(Calendar.HOUR_OF_DAY, 0);\n\t\t\t\t c.set(Calendar.MINUTE, 0);\n\t\t\t\t c.set(Calendar.SECOND, 0);\n\t\t\t\t c.set(Calendar.MILLISECOND, 0);\n\t\t\t\t\tc.add(Calendar.DATE, daysFromNow);\n\t\t\t\t\tDate departureTime = c.getTime();\n\t\t\t\t\tDate arrivalTime = getArrivalTime(departureTime, miles);\n\t\t\t\t\tflightService.createNewFlight(flightId, departureTime, arrivalTime, new BigDecimal(500), new BigDecimal(200), 10, 200, \"B747\");\n\t\t\t\t\t\n\t\t\t\t}\n\t\t\t\tflightNumber++;\n\t\t\t\tindexIntoTopLine++;\n\t\t\t}\n\t\t}\n\t\t\n\t\tfor (int jj = 0; jj < airports.size(); jj++) {\n\t\t\tflightService.storeAirportMapping(airports.get(jj));\n\t\t}\n\t\tlnr.close();\n\t}", + "meta": { + "part_name": "loadFlights", + "docstring": "", + 
"sha256": 1402513010551547847601046795871134853016900337764, + "start_line": 37, + "end_line": 110, + "end_line_signature": 37, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13929779701984022643, + "filename": "FlightLoader.java", + "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" + }, + "chunk_type": "function" + } + }, + { + "text": "package com.acmeair.loader;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} private static Date getArrivalTime(Date departureTime, int mileage) {\n\t\tdouble averageSpeed = 600.0; // 600 miles/hours\n\t\tdouble hours = (double) mileage / averageSpeed; // miles / miles/hour = hours\n\t\tdouble partsOfHour = hours % 1.0;\n\t\tint minutes = (int)(60.0 * partsOfHour);\n\t\tCalendar c = Calendar.getInstance();\n\t\tc.setTime(departureTime);\n\t\tc.add(Calendar.HOUR, (int)hours);\n\t\tc.add(Calendar.MINUTE, minutes);\n\t\treturn c.getTime();\n\t}", + "meta": { + "part_name": "getArrivalTime", + "docstring": "", + "sha256": 498682806925350255052209347840878724203772545481, + "start_line": 112, + "end_line": 122, + "end_line_signature": 112, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13929779701984022643, + "filename": "FlightLoader.java", + "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" + }, + "chunk_type": "function" + } + }, + { + "text": "package com.acmeair.loader;\nimport com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} static private boolean alreadyInCollection(String airportCode, ArrayList airports) {\n\t\tfor (int ii = 0; ii < airports.size(); ii++) {\n\t\t\tif (airports.get(ii).getAirportCode().equals(airportCode)) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\t\treturn false;\n\t}", + "meta": { + "part_name": "alreadyInCollection", + "docstring": "", + "sha256": 846281520348793854081127747386039365962060314516, + "start_line": 124, + "end_line": 131, + "end_line_signature": 124, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13929779701984022643, + "filename": "FlightLoader.java", + "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" + }, + "chunk_type": "function" + } + } + ] +} diff --git a/test/data/chunker_repo/JavaScript/repo_out_chunks.json b/test/data/chunker_repo/JavaScript/repo_out_chunks.json new file mode 100644 index 00000000..20024a15 --- /dev/null +++ b/test/data/chunker_repo/JavaScript/repo_out_chunks.json @@ -0,0 +1,103 @@ +{ + "root": [ + { + "text": "import { jQuery } from \"../core.js\";\nimport { toType } from \"../core/toType.js\";\n\n// Multifunctional method to get and set values of a collection\n// The value/s can optionally be executed if it's a function\nexport function access( elems, fn, key, value, chainable, emptyGet, raw ) {\n\tvar i = 0,\n\t\tlen = elems.length,\n\t\tbulk = key == null;\n\n\t// Sets many values\n\tif ( toType( key ) === \"object\" ) {\n\t\tchainable = true;\n\t\tfor ( i in key ) {\n\t\t\taccess( elems, fn, i, key[ i ], 
true, emptyGet, raw );\n\t\t}\n\n\t// Sets one value\n\t} else if ( value !== undefined ) {\n\t\tchainable = true;\n\n\t\tif ( typeof value !== \"function\" ) {\n\t\t\traw = true;\n\t\t}\n\n\t\tif ( bulk ) {\n\n\t\t\t// Bulk operations run against the entire set\n\t\t\tif ( raw ) {\n\t\t\t\tfn.call( elems, value );\n\t\t\t\tfn = null;\n\n\t\t\t// ...except when executing function values\n\t\t\t} else {\n\t\t\t\tbulk = fn;\n\t\t\t\tfn = function( elem, _key, value ) {\n\t\t\t\t\treturn bulk.call( jQuery( elem ), value );\n\t\t\t\t};\n\t\t\t}\n\t\t}\n\n\t\tif ( fn ) {\n\t\t\tfor ( ; i < len; i++ ) {\n\t\t\t\tfn(\n\t\t\t\t\telems[ i ], key, raw ?\n\t\t\t\t\t\tvalue :\n\t\t\t\t\t\tvalue.call( elems[ i ], i, fn( elems[ i ], key ) )\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tif ( chainable ) {\n\t\treturn elems;\n\t}\n\n\t// Gets\n\tif ( bulk ) {\n\t\treturn fn.call( elems );\n\t}\n\n\treturn len ? fn( elems[ 0 ], key ) : emptyGet;\n}", + "meta": { + "sha256": 19281888941792979874208112177048718444947121672, + "start_line": 1, + "end_line": 64, + "origin": { + "mimetype": "text/plain", + "binary_hash": 6135287906716252438, + "filename": "access.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/access.js" + }, + "chunk_type": "preamble" + } + }, + { + "text": "\nfunction getData( data ) {\n\tif ( data === \"true\" ) {\n\t\treturn true;\n\t}\n\n\tif ( data === \"false\" ) {\n\t\treturn false;\n\t}\n\n\tif ( data === \"null\" ) {\n\t\treturn null;\n\t}\n\n\t// Only convert to a number if it doesn't change the string\n\tif ( data === +data + \"\" ) {\n\t\treturn +data;\n\t}\n\n\tif ( rbrace.test( data ) ) {\n\t\treturn JSON.parse( data );\n\t}\n\n\treturn data;\n}", + "meta": { + "part_name": "getData", + "docstring": "", + "sha256": 726798474764155913807876762001398681472967415464, + "start_line": 19, + "end_line": 42, + "end_line_signature": 42, + "origin": { + "mimetype": "text/plain", + "binary_hash": 15216584529958576692, + "filename": "data.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" + }, + "chunk_type": "function" + } + }, + { + "text": "import { dataUser } from \"./data/var/dataUser.js\";\nfunction dataAttr( elem, key, data ) {\n\tvar name;\n\n\t// If nothing was found internally, try to fetch any\n\t// data from the HTML5 data-* attribute\n\tif ( data === undefined && elem.nodeType === 1 ) {\n\t\tname = \"data-\" + key.replace( rmultiDash, \"-$&\" ).toLowerCase();\n\t\tdata = elem.getAttribute( name );\n\n\t\tif ( typeof data === \"string\" ) {\n\t\t\ttry {\n\t\t\t\tdata = getData( data );\n\t\t\t} catch ( e ) {}\n\n\t\t\t// Make sure we set the data so it isn't changed later\n\t\t\tdataUser.set( elem, key, data );\n\t\t} else {\n\t\t\tdata = undefined;\n\t\t}\n\t}\n\treturn data;\n}", + "meta": { + "part_name": "dataAttr", + "docstring": "", + "sha256": 1201089615638546656833156594995163567755610461195, + "start_line": 44, + "end_line": 65, + "end_line_signature": 65, + "origin": { + "mimetype": "text/plain", + "binary_hash": 15216584529958576692, + "filename": "data.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" + }, + "chunk_type": "function" + } + }, + { + "text": "import { jQuery } from \"./core.js\";\nimport { access } from \"./core/access.js\";\nimport { camelCase } from \"./core/camelCase.js\";\nimport { dataPriv } from \"./data/var/dataPriv.js\";\n\n//\tImplementation Summary\n//\n//\t1. Enforce API surface and semantic compatibility with 1.9.x branch\n//\t2. 
Improve the module's maintainability by reducing the storage\n//\t\tpaths to a single mechanism.\n//\t3. Use the same single mechanism to support \"private\" and \"user\" data.\n//\t4. _Never_ expose \"private\" data to user code (TODO: Drop _data, _removeData)\n//\t5. Avoid exposing implementation details on user objects (eg. expando properties)\n//\t6. Provide a clear path for implementation upgrade to WeakMap in 2014\n\nvar rbrace = /^(?:\\{[\\w\\W]*\\}|\\[[\\w\\W]*\\])$/,\n\trmultiDash = /[A-Z]/g;\n\njQuery.extend( {\n\thasData: function( elem ) {\n\t\treturn dataUser.hasData( elem ) || dataPriv.hasData( elem );\n\t},\n\n\tdata: function( elem, name, data ) {\n\t\treturn dataUser.access( elem, name, data );\n\t},\n\n\tremoveData: function( elem, name ) {\n\t\tdataUser.remove( elem, name );\n\t},\n\n\t// TODO: Now that all calls to _data and _removeData have been replaced\n\t// with direct calls to dataPriv methods, these can be deprecated.\n\t_data: function( elem, name, data ) {\n\t\treturn dataPriv.access( elem, name, data );\n\t},\n\n\t_removeData: function( elem, name ) {\n\t\tdataPriv.remove( elem, name );\n\t}\n} );\n\njQuery.fn.extend( {\n\tdata: function( key, value ) {\n\t\tvar i, name, data,\n\t\t\telem = this[ 0 ],\n\t\t\tattrs = elem && elem.attributes;\n\n\t\t// Gets all values\n\t\tif ( key === undefined ) {\n\t\t\tif ( this.length ) {\n\t\t\t\tdata = dataUser.get( elem );\n\n\t\t\t\tif ( elem.nodeType === 1 && !dataPriv.get( elem, \"hasDataAttrs\" ) ) {\n\t\t\t\t\ti = attrs.length;\n\t\t\t\t\twhile ( i-- ) {\n\n\t\t\t\t\t\t// Support: IE 11+\n\t\t\t\t\t\t// The attrs elements can be null (trac-14894)\n\t\t\t\t\t\tif ( attrs[ i ] ) {\n\t\t\t\t\t\t\tname = attrs[ i ].name;\n\t\t\t\t\t\t\tif ( name.indexOf( \"data-\" ) === 0 ) {\n\t\t\t\t\t\t\t\tname = camelCase( name.slice( 5 ) );\n\t\t\t\t\t\t\t\tdataAttr( elem, name, data[ name ] );\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\tdataPriv.set( elem, \"hasDataAttrs\", true );\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn data;\n\t\t}\n\n\t\t// Sets multiple values\n\t\tif ( typeof key === \"object\" ) {\n\t\t\treturn this.each( function() {\n\t\t\t\tdataUser.set( this, key );\n\t\t\t} );\n\t\t}\n\n\t\treturn access( this, function( value ) {\n\t\t\tvar data;\n\n\t\t\t// The calling jQuery object (element matches) is not empty\n\t\t\t// (and therefore has an element appears at this[ 0 ]) and the\n\t\t\t// `value` parameter was not undefined. 
An empty jQuery object\n\t\t\t// will result in `undefined` for elem = this[ 0 ] which will\n\t\t\t// throw an exception if an attempt to read a data cache is made.\n\t\t\tif ( elem && value === undefined ) {\n\n\t\t\t\t// Attempt to get data from the cache\n\t\t\t\t// The key will always be camelCased in Data\n\t\t\t\tdata = dataUser.get( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// Attempt to \"discover\" the data in\n\t\t\t\t// HTML5 custom data-* attrs\n\t\t\t\tdata = dataAttr( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// We tried really hard, but the data doesn't exist.\n\t\t\t\treturn;\n\t\t\t}\n\n\t\t\t// Set the data...\n\t\t\tthis.each( function() {\n\n\t\t\t\t// We always store the camelCased key\n\t\t\t\tdataUser.set( this, key, value );\n\t\t\t} );\n\t\t}, null, value, arguments.length > 1, null, true );\n\t},\n\n\tremoveData: function( key ) {\n\t\treturn this.each( function() {\n\t\t\tdataUser.remove( this, key );\n\t\t} );\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", + "meta": { + "sha256": 541558141920421205501086138012356951496054039953, + "start_line": 1, + "end_line": 176, + "origin": { + "mimetype": "text/plain", + "binary_hash": 15216584529958576692, + "filename": "data.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" + }, + "chunk_type": "preamble" + } + }, + { + "text": "import { jQuery } from \"./core.js\";\nimport { toType } from \"./core/toType.js\";\nfunction buildParams( prefix, obj, traditional, add ) {\n\tvar name;\n\n\tif ( Array.isArray( obj ) ) {\n\n\t\t// Serialize array item.\n\t\tjQuery.each( obj, function( i, v ) {\n\t\t\tif ( traditional || rbracket.test( prefix ) ) {\n\n\t\t\t\t// Treat each array item as a scalar.\n\t\t\t\tadd( prefix, v );\n\n\t\t\t} else {\n\n\t\t\t\t// Item is non-scalar (array or object), encode its numeric index.\n\t\t\t\tbuildParams(\n\t\t\t\t\tprefix + \"[\" + ( typeof v === \"object\" && v != null ? 
i : \"\" ) + \"]\",\n\t\t\t\t\tv,\n\t\t\t\t\ttraditional,\n\t\t\t\t\tadd\n\t\t\t\t);\n\t\t\t}\n\t\t} );\n\n\t} else if ( !traditional && toType( obj ) === \"object\" ) {\n\n\t\t// Serialize object item.\n\t\tfor ( name in obj ) {\n\t\t\tbuildParams( prefix + \"[\" + name + \"]\", obj[ name ], traditional, add );\n\t\t}\n\n\t} else {\n\n\t\t// Serialize scalar item.\n\t\tadd( prefix, obj );\n\t}\n}", + "meta": { + "part_name": "buildParams", + "docstring": "", + "sha256": 1111988990908705094703814693986724759434620677873, + "start_line": 14, + "end_line": 50, + "end_line_signature": 50, + "origin": { + "mimetype": "text/plain", + "binary_hash": 7904055776319460817, + "filename": "serialize.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/serialize.js" + }, + "chunk_type": "function" + } + }, + { + "text": "import { rcheckableType } from \"./var/rcheckableType.js\";\n\nimport \"./core/init.js\";\nimport \"./traversing.js\"; // filter\nimport \"./attributes/prop.js\";\n\nvar\n\trbracket = /\\[\\]$/,\n\trCRLF = /\\r?\\n/g,\n\trsubmitterTypes = /^(?:submit|button|image|reset|file)$/i,\n\trsubmittable = /^(?:input|select|textarea|keygen)/i;\n\n// Serialize an array of form elements or a set of\n// key/values into a query string\njQuery.param = function( a, traditional ) {\n\tvar prefix,\n\t\ts = [],\n\t\tadd = function( key, valueOrFunction ) {\n\n\t\t\t// If value is a function, invoke it and use its return value\n\t\t\tvar value = typeof valueOrFunction === \"function\" ?\n\t\t\t\tvalueOrFunction() :\n\t\t\t\tvalueOrFunction;\n\n\t\t\ts[ s.length ] = encodeURIComponent( key ) + \"=\" +\n\t\t\t\tencodeURIComponent( value == null ? \"\" : value );\n\t\t};\n\n\tif ( a == null ) {\n\t\treturn \"\";\n\t}\n\n\t// If an array was passed in, assume that it is an array of form elements.\n\tif ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) {\n\n\t\t// Serialize the form elements\n\t\tjQuery.each( a, function() {\n\t\t\tadd( this.name, this.value );\n\t\t} );\n\n\t} else {\n\n\t\t// If traditional, encode the \"old\" way (the way 1.3.2 or older\n\t\t// did it), otherwise encode params recursively.\n\t\tfor ( prefix in a ) {\n\t\t\tbuildParams( prefix, a[ prefix ], traditional, add );\n\t\t}\n\t}\n\n\t// Return the resulting serialization\n\treturn s.join( \"&\" );\n};\n\njQuery.fn.extend( {\n\tserialize: function() {\n\t\treturn jQuery.param( this.serializeArray() );\n\t},\n\tserializeArray: function() {\n\t\treturn this.map( function() {\n\n\t\t\t// Can add propHook for \"elements\" to filter or add form elements\n\t\t\tvar elements = jQuery.prop( this, \"elements\" );\n\t\t\treturn elements ? 
jQuery.makeArray( elements ) : this;\n\t\t} ).filter( function() {\n\t\t\tvar type = this.type;\n\n\t\t\t// Use .is( \":disabled\" ) so that fieldset[disabled] works\n\t\t\treturn this.name && !jQuery( this ).is( \":disabled\" ) &&\n\t\t\t\trsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) &&\n\t\t\t\t( this.checked || !rcheckableType.test( type ) );\n\t\t} ).map( function( _i, elem ) {\n\t\t\tvar val = jQuery( this ).val();\n\n\t\t\tif ( val == null ) {\n\t\t\t\treturn null;\n\t\t\t}\n\n\t\t\tif ( Array.isArray( val ) ) {\n\t\t\t\treturn jQuery.map( val, function( val ) {\n\t\t\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t\t\t} );\n\t\t\t}\n\n\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t} ).get();\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", + "meta": { + "sha256": 1122560629718129408618766425038972734060154979936, + "start_line": 2, + "end_line": 130, + "origin": { + "mimetype": "text/plain", + "binary_hash": 7904055776319460817, + "filename": "serialize.js", + "uri": "https://github.com/jquery/jquery/blob/abc123def456/serialize.js" + }, + "chunk_type": "preamble" + } + } + ] +} diff --git a/test/data/chunker_repo/Python/repo_out_chunks.json b/test/data/chunker_repo/Python/repo_out_chunks.json new file mode 100644 index 00000000..386b99ad --- /dev/null +++ b/test/data/chunker_repo/Python/repo_out_chunks.json @@ -0,0 +1,1225 @@ +{ + "root": [ + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = []\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", + "meta": { + "part_name": "_serialize", + "docstring": "", + "sha256": 1370311415977656221876886741900648971627414401247, + "start_line": 150, + "end_line": 151, + "end_line_signature": 151, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureElement(BasePageElement):\n\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: Optional[float], info: FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, 
PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", + "meta": { + "part_name": "_serialize", + "docstring": "", + "sha256": 548765170194758372904020338821756398576566540703, + "start_line": 206, + "end_line": 213, + "end_line_signature": 209, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def cells(self) -> List[TextCell]:\n \"\"\"Return text cells as a read-only view of parsed_page.textline_cells.\"\"\"\n if self.parsed_page is not None:\n return self.parsed_page.textline_cells\n else:\n return []", + "meta": { + "part_name": "cells", + "docstring": "", + "sha256": 808130656114478424554213379229194132787588082937, + "start_line": 269, + "end_line": 274, + "end_line_signature": 270, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n def get_image(\n self,\n scale: float = 1.0,\n max_size: Optional[int] = None,\n cropbox: Optional[BoundingBox] = None,\n ) -> Optional[Image]:\n if self._backend is None:\n return self._image_cache.get(scale, None)\n\n if max_size:\n assert self.size is not None\n scale = min(scale, max_size / max(self.size.as_tuple()))\n\n if scale not in self._image_cache:\n if cropbox is None:\n self._image_cache[scale] = self._backend.get_page_image(scale=scale)\n else:\n return self._backend.get_page_image(scale=scale, cropbox=cropbox)\n\n if cropbox is None:\n return self._image_cache[scale]\n else:\n page_im = self._image_cache[scale]\n assert self.size is not None\n return page_im.crop(\n cropbox.to_top_left_origin(page_height=self.size.height)\n .scaled(scale=scale)\n .as_tuple()\n )", + "meta": { + "part_name": "get_image", + "docstring": "", + "sha256": 730486712684958979505969494702830191562221418826, + "start_line": 276, + "end_line": 304, + "end_line_signature": 282, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def image(self) -> Optional[Image]:\n return self.get_image(scale=self._default_image_scale)", + "meta": { + "part_name": "image", + "docstring": "", + "sha256": 411118430431318207465607893315291238177524289712, + "start_line": 307, + "end_line": 308, + "end_line_signature": 308, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n def _score_to_grade(self, score: ScoreValue) -> QualityGrade:\n if score < 0.5:\n return QualityGrade.POOR\n elif score < 0.8:\n return QualityGrade.FAIR\n elif score < 0.9:\n return QualityGrade.GOOD\n elif score >= 0.9:\n return QualityGrade.EXCELLENT\n\n return QualityGrade.UNSPECIFIED", + "meta": { + "part_name": "_score_to_grade", + "docstring": "", + "sha256": 1226961229592084659714241042350075927273793479169, + "start_line": 361, + "end_line": 371, + "end_line_signature": 362, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_grade(self) -> QualityGrade:\n return self._score_to_grade(self.mean_score)", + "meta": { + "part_name": "mean_grade", + "docstring": "", + "sha256": 970148436571335637993437576490782463715252886019, + "start_line": 375, + "end_line": 376, + "end_line_signature": 376, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_grade(self) -> QualityGrade:\n return self._score_to_grade(self.low_score)", + "meta": { + "part_name": "low_grade", + "docstring": "", + "sha256": 1414417851083571439151429300774211251904833950620, + "start_line": 380, + "end_line": 381, + "end_line_signature": 381, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + 
}, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ]\n )\n )", + "meta": { + "part_name": "mean_score", + "docstring": "", + "sha256": 1258375186580609407958319910033845627450381020082, + "start_line": 385, + "end_line": 395, + "end_line_signature": 386, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanquantile(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ],\n q=0.05,\n )\n )", + "meta": { + "part_name": "low_score", + "docstring": "", + "sha256": 530920199340573617576130514840888087666895770482, + "start_line": 399, + "end_line": 410, + "end_line_signature": 400, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in self.pages.values()],\n )\n )", + "meta": { + "part_name": "mean_score", + "docstring": "", + "sha256": 132450849266989335217771535733536814236612441736, + "start_line": 420, + "end_line": 425, + "end_line_signature": 421, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # 
type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", + "meta": { + "part_name": "low_score", + "docstring": "", + "sha256": 970153367204825604327172702664272609484373919390, + "start_line": 429, + "end_line": 434, + "end_line_signature": 430, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from enum import Enum\nclass ConversionStatus(str, Enum):\n PENDING = \"pending\"\n STARTED = \"started\"\n FAILURE = \"failure\"\n SUCCESS = \"success\"\n PARTIAL_SUCCESS = \"partial_success\"\n SKIPPED = \"skipped\"", + "meta": { + "part_name": "ConversionStatus", + "docstring": "", + "sha256": 620766103743608450410859564155193221612617787030, + "start_line": 32, + "end_line": 38, + "end_line_signature": 38, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from enum import Enum\nimport numpy as np\nclass InputFormat(str, Enum):\n \"\"\"A document format supported by document backend parsers.\"\"\"\n\n DOCX = \"docx\"\n PPTX = \"pptx\"\n HTML = \"html\"\n IMAGE = \"image\"\n PDF = \"pdf\"\n ASCIIDOC = \"asciidoc\"\n MD = \"md\"\n CSV = \"csv\"\n XLSX = \"xlsx\"\n XML_USPTO = \"xml_uspto\"\n XML_JATS = \"xml_jats\"\n JSON_DOCLING = \"json_docling\"\n AUDIO = \"audio\"", + "meta": { + "part_name": "InputFormat", + "docstring": "", + "sha256": 892216703579506331579469699340486405594949995133, + "start_line": 41, + "end_line": 56, + "end_line_signature": 56, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from enum import Enum\nclass OutputFormat(str, Enum):\n MARKDOWN = \"md\"\n JSON = \"json\"\n HTML = \"html\"\n HTML_SPLIT_PAGE = \"html_split_page\"\n TEXT = \"text\"\n DOCTAGS = \"doctags\"", + "meta": { + "part_name": "OutputFormat", + "docstring": "", + "sha256": 1347846176447908013052242174477254615853046589425, + "start_line": 59, + "end_line": 65, + "end_line_signature": 65, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from enum import Enum\nimport numpy as np\nclass DocInputType(str, Enum):\n PATH = \"path\"\n STREAM = \"stream\"", + "meta": { + "part_name": "DocInputType", + "docstring": "", + "sha256": 1223714591888346503494526053642460574025115616826, + "start_line": 123, + "end_line": 125, + "end_line_signature": 125, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from enum import Enum\nimport numpy as np\nclass DoclingComponentType(str, Enum):\n DOCUMENT_BACKEND = \"document_backend\"\n MODEL = \"model\"\n DOC_ASSEMBLER = \"doc_assembler\"\n USER_INPUT = \"user_input\"", + "meta": { + 
"part_name": "DoclingComponentType", + "docstring": "", + "sha256": 1044618506138011322142671349003262389011658625241, + "start_line": 128, + "end_line": 132, + "end_line_signature": 132, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass ErrorItem(BaseModel):\n component_type: DoclingComponentType\n module_name: str\n error_message: str", + "meta": { + "part_name": "ErrorItem", + "docstring": "", + "sha256": 1086117531920474775755100836319966340621867981803, + "start_line": 135, + "end_line": 138, + "end_line_signature": 138, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = [] # Add child cluster support\n\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", + "meta": { + "part_name": "Cluster", + "docstring": "", + "sha256": 1265293438447400808420129430722899990479108904146, + "start_line": 141, + "end_line": 151, + "end_line_signature": 151, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass BasePageElement(BaseModel):\n label: DocItemLabel\n id: int\n page_no: int\n cluster: Cluster\n text: Optional[str] = None", + "meta": { + "part_name": "BasePageElement", + "docstring": "", + "sha256": 27686403131898710443755657765582374638708518770, + "start_line": 154, + "end_line": 159, + "end_line_signature": 159, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass LayoutPrediction(BaseModel):\n clusters: 
List[Cluster] = []", + "meta": { + "part_name": "LayoutPrediction", + "docstring": "", + "sha256": 987561170389338380550072794621774259794654494830, + "start_line": 162, + "end_line": 163, + "end_line_signature": 163, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPredictionToken(BaseModel):\n text: str = \"\"\n token: int = -1\n logprob: float = -1", + "meta": { + "part_name": "VlmPredictionToken", + "docstring": "", + "sha256": 65603519381706971863039126377096035311378651150, + "start_line": 166, + "end_line": 169, + "end_line_signature": 169, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPrediction(BaseModel):\n text: str = \"\"\n generated_tokens: list[VlmPredictionToken] = []\n generation_time: float = -1", + "meta": { + "part_name": "VlmPrediction", + "docstring": "", + "sha256": 1001170426609364857440748112674472207227386338107, + "start_line": 172, + "end_line": 175, + "end_line_signature": 175, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ContainerElement(\n BasePageElement\n): # Used for Form and Key-Value-Regions, only for typing.\n pass", + "meta": { + "part_name": "ContainerElement", + "docstring": "", + "sha256": 595661713802144347628508983545186540799570992038, + "start_line": 178, + "end_line": 181, + "end_line_signature": 181, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Table(BasePageElement):\n otsl_seq: List[str]\n num_rows: int = 0\n num_cols: int = 0\n table_cells: List[TableCell]", + "meta": { + "part_name": "Table", + "docstring": "", + "sha256": 899700098549855605262894705349504270338610789745, + "start_line": 184, + "end_line": 188, + "end_line_signature": 188, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass TableStructurePrediction(BaseModel):\n table_map: Dict[int, Table] = {}", + "meta": { + "part_name": "TableStructurePrediction", 
+ "docstring": "", + "sha256": 10390869097689903408310238897062292721124859701, + "start_line": 191, + "end_line": 192, + "end_line_signature": 192, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "\nclass TextElement(BasePageElement):\n text: str", + "meta": { + "part_name": "TextElement", + "docstring": "", + "sha256": 910599684044725278502405469110289841532247426179, + "start_line": 195, + "end_line": 196, + "end_line_signature": 196, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureElement(BasePageElement):\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: Optional[float], info: FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", + "meta": { + "part_name": "FigureElement", + "docstring": "", + "sha256": 49150437793556841766787782882109128052965382281, + "start_line": 199, + "end_line": 213, + "end_line_signature": 213, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureClassificationPrediction(BaseModel):\n figure_count: int = 0\n figure_map: Dict[int, FigureElement] = {}", + "meta": { + "part_name": "FigureClassificationPrediction", + "docstring": "", + "sha256": 393470850502283580013047324181028663675467213132, + "start_line": 216, + "end_line": 218, + "end_line_signature": 218, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass EquationPrediction(BaseModel):\n equation_count: int = 0\n equation_map: Dict[int, TextElement] = {}", + "meta": { + "part_name": "EquationPrediction", + "docstring": "", + "sha256": 283019580808330812046385053785548970241882616440, + "start_line": 221, + "end_line": 223, + "end_line_signature": 223, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + 
"filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass PagePredictions(BaseModel):\n layout: Optional[LayoutPrediction] = None\n tablestructure: Optional[TableStructurePrediction] = None\n figures_classification: Optional[FigureClassificationPrediction] = None\n equations_prediction: Optional[EquationPrediction] = None\n vlm_response: Optional[VlmPrediction] = None", + "meta": { + "part_name": "PagePredictions", + "docstring": "", + "sha256": 1048734808182693909263635311132078575228024326882, + "start_line": 226, + "end_line": 231, + "end_line_signature": 231, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass AssembledUnit(BaseModel):\n elements: List[PageElement] = []\n body: List[PageElement] = []\n headers: List[PageElement] = []", + "meta": { + "part_name": "AssembledUnit", + "docstring": "", + "sha256": 1094387419928066226155485436524594727573943352114, + "start_line": 237, + "end_line": 240, + "end_line_signature": 240, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ItemAndImageEnrichmentElement(BaseModel):\n model_config = ConfigDict(arbitrary_types_allowed=True)\n\n item: NodeItem\n image: Image", + "meta": { + "part_name": "ItemAndImageEnrichmentElement", + "docstring": "", + "sha256": 30748452496409606175686443467765939596665570803, + "start_line": 243, + "end_line": 247, + "end_line_signature": 247, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\n## OpenAI API Request / Response Models ##\nclass OpenAiChatMessage(BaseModel):\n role: str\n content: str", + "meta": { + "part_name": "OpenAiChatMessage", + "docstring": "", + "sha256": 515012574841107792563852565513091992302046287434, + "start_line": 314, + "end_line": 316, + "end_line_signature": 316, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + 
"chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiResponseChoice(BaseModel):\n index: int\n message: OpenAiChatMessage\n finish_reason: Optional[str]", + "meta": { + "part_name": "OpenAiResponseChoice", + "docstring": "", + "sha256": 337899610582669912657797333467719843029840509833, + "start_line": 319, + "end_line": 322, + "end_line_signature": 322, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass OpenAiResponseUsage(BaseModel):\n prompt_tokens: int\n completion_tokens: int\n total_tokens: int", + "meta": { + "part_name": "OpenAiResponseUsage", + "docstring": "", + "sha256": 130433137267720616513765827820400685577191918977, + "start_line": 325, + "end_line": 328, + "end_line_signature": 328, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiApiResponse(BaseModel):\n model_config = ConfigDict(\n protected_namespaces=(),\n )\n\n id: str\n model: Optional[str] = None # returned by openai\n choices: List[OpenAiResponseChoice]\n created: int\n usage: OpenAiResponseUsage", + "meta": { + "part_name": "OpenAiApiResponse", + "docstring": "", + "sha256": 891404258682255341223304052944928949828473242768, + "start_line": 331, + "end_line": 340, + "end_line_signature": 340, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from enum import Enum\nclass QualityGrade(str, Enum):\n POOR = \"poor\"\n FAIR = \"fair\"\n GOOD = \"good\"\n EXCELLENT = \"excellent\"\n UNSPECIFIED = \"unspecified\"", + "meta": { + "part_name": "QualityGrade", + "docstring": "", + "sha256": 193399221256625706292721394797930754087225969626, + "start_line": 347, + "end_line": 352, + "end_line_signature": 352, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nclass ConfidenceReport(PageConfidenceScores):\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in 
self.pages.values()],\n )\n )\n\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", + "meta": { + "part_name": "ConfidenceReport", + "docstring": "", + "sha256": 1446615132461763223157668983795322811472146148315, + "start_line": 413, + "end_line": 434, + "end_line_signature": 434, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "class" + } + }, + { + "text": "if TYPE_CHECKING:\n\nFormatToExtensions: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\"docx\", \"dotx\", \"docm\", \"dotm\"],\n InputFormat.PPTX: [\"pptx\", \"potx\", \"ppsx\", \"pptm\", \"potm\", \"ppsm\"],\n InputFormat.PDF: [\"pdf\"],\n InputFormat.MD: [\"md\"],\n InputFormat.HTML: [\"html\", \"htm\", \"xhtml\"],\n InputFormat.XML_JATS: [\"xml\", \"nxml\"],\n InputFormat.IMAGE: [\"jpg\", \"jpeg\", \"png\", \"tif\", \"tiff\", \"bmp\", \"webp\"],\n InputFormat.ASCIIDOC: [\"adoc\", \"asciidoc\", \"asc\"],\n InputFormat.CSV: [\"csv\"],\n InputFormat.XLSX: [\"xlsx\", \"xlsm\"],\n InputFormat.XML_USPTO: [\"xml\", \"txt\"],\n InputFormat.JSON_DOCLING: [\"json\"],\n InputFormat.AUDIO: [\"wav\", \"mp3\"],\n}\n\nFormatToMimeType: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.template\",\n ],\n InputFormat.PPTX: [\n \"application/vnd.openxmlformats-officedocument.presentationml.template\",\n \"application/vnd.openxmlformats-officedocument.presentationml.slideshow\",\n \"application/vnd.openxmlformats-officedocument.presentationml.presentation\",\n ],\n InputFormat.HTML: [\"text/html\", \"application/xhtml+xml\"],\n InputFormat.XML_JATS: [\"application/xml\"],\n InputFormat.IMAGE: [\n \"image/png\",\n \"image/jpeg\",\n \"image/tiff\",\n \"image/gif\",\n \"image/bmp\",\n \"image/webp\",\n ],\n InputFormat.PDF: [\"application/pdf\"],\n InputFormat.ASCIIDOC: [\"text/asciidoc\"],\n InputFormat.MD: [\"text/markdown\", \"text/x-markdown\"],\n InputFormat.CSV: [\"text/csv\"],\n InputFormat.XLSX: [\n \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\"\n ],\n InputFormat.XML_USPTO: [\"application/xml\", \"text/plain\"],\n InputFormat.JSON_DOCLING: [\"application/json\"],\n InputFormat.AUDIO: [\"audio/x-wav\", \"audio/mpeg\", \"audio/wav\", \"audio/mp3\"],\n}\n\nMimeTypeToFormat: dict[str, list[InputFormat]] = {\n mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]\n for value in FormatToMimeType.values()\n for mime in value\n}\n\nPageElement = Union[TextElement, Table, FigureElement, ContainerElement]", + "meta": { + "sha256": 937534938268631177739242095765995242760409532040, + "start_line": 27, + "end_line": 237, + "origin": { + "mimetype": "text/plain", + "binary_hash": 17127733993255342652, + "filename": "base_models.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" + }, + "chunk_type": "preamble" + } + }, + { + "text": "from collections.abc import Iterable\nfrom docling.datamodel.document import ConversionResult, Page\nfrom docling_core.types.doc import BoundingBox, CoordOrigin\nfrom docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table\nfrom typing import Any, Dict, List, Tuple, Union\n_log = 
logging.getLogger(__name__)\ndef generate_multimodal_pages(\n doc_result: ConversionResult,\n) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:\n label_to_doclaynet = {\n \"title\": \"title\",\n \"table-of-contents\": \"document_index\",\n \"subtitle-level-1\": \"section_header\",\n \"checkbox-selected\": \"checkbox_selected\",\n \"checkbox-unselected\": \"checkbox_unselected\",\n \"caption\": \"caption\",\n \"page-header\": \"page_header\",\n \"page-footer\": \"page_footer\",\n \"footnote\": \"footnote\",\n \"table\": \"table\",\n \"formula\": \"formula\",\n \"list-item\": \"list_item\",\n \"code\": \"code\",\n \"figure\": \"picture\",\n \"picture\": \"picture\",\n \"reference\": \"text\",\n \"paragraph\": \"text\",\n \"text\": \"text\",\n }\n\n content_text = \"\"\n page_no = 0\n start_ix = 0\n end_ix = 0\n doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []\n\n doc = doc_result.legacy_document\n\n def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):\n segments = []\n\n for ix, item in doc_items:\n item_type = item.obj_type\n label = label_to_doclaynet.get(item_type, None)\n\n if label is None or item.prov is None or page.size is None:\n continue\n\n bbox = BoundingBox.from_tuple(\n tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT\n )\n new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(\n page_size=page.size\n )\n\n new_segment = {\n \"index_in_doc\": ix,\n \"label\": label,\n \"text\": item.text if item.text is not None else \"\",\n \"bbox\": new_bbox.as_tuple(),\n \"data\": [],\n }\n\n if isinstance(item, Table):\n table_html = item.export_to_html()\n new_segment[\"data\"].append(\n {\n \"html_seq\": table_html,\n \"otsl_seq\": \"\",\n }\n )\n\n segments.append(new_segment)\n\n return segments\n\n def _process_page_cells(page: Page):\n cells: List[dict] = []\n if page.size is None:\n return cells\n for cell in page.cells:\n new_bbox = (\n cell.rect.to_bounding_box()\n .to_top_left_origin(page_height=page.size.height)\n .normalized(page_size=page.size)\n )\n is_ocr = cell.from_ocr\n ocr_confidence = cell.confidence\n cells.append(\n {\n \"text\": cell.text,\n \"bbox\": new_bbox.as_tuple(),\n \"ocr\": is_ocr,\n \"ocr_confidence\": ocr_confidence,\n }\n )\n return cells\n\n def _process_page():\n page_ix = page_no - 1\n page = doc_result.pages[page_ix]\n\n page_cells = _process_page_cells(page=page)\n page_segments = _process_page_segments(doc_items=doc_items, page=page)\n content_md = doc.export_to_markdown(\n main_text_start=start_ix, main_text_stop=end_ix\n )\n # No page-tagging since we only do 1 page at the time\n content_dt = doc.export_to_document_tokens(\n main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False\n )\n\n return content_text, content_md, content_dt, page_cells, page_segments, page\n\n if doc.main_text is None:\n return\n for ix, orig_item in enumerate(doc.main_text):\n item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item\n if item is None or item.prov is None or len(item.prov) == 0:\n _log.debug(f\"Skipping item {orig_item}\")\n continue\n\n item_page = item.prov[0].page\n\n # Page is complete\n if page_no > 0 and item_page > page_no:\n yield _process_page()\n\n start_ix = ix\n doc_items = []\n content_text = \"\"\n\n page_no = item_page\n end_ix = ix\n doc_items.append((ix, item))\n if item.text is not None and item.text != \"\":\n content_text += item.text + \" \"\n\n if len(doc_items) > 0:\n yield _process_page()", + "meta": { 
+ "part_name": "generate_multimodal_pages", + "docstring": "", + "sha256": 1004790262158132739538587728436826033595593751761, + "start_line": 12, + "end_line": 145, + "end_line_signature": 15, + "origin": { + "mimetype": "text/plain", + "binary_hash": 11028592083014135829, + "filename": "export.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/export.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def is_valid(self) -> bool:\n return self.valid", + "meta": { + "part_name": "is_valid", + "docstring": "", + "sha256": 1389299177428647533914300122685171886284474960790, + "start_line": 120, + "end_line": 121, + "end_line_signature": 121, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = 
True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supports_pagination(cls) -> bool:\n return False", + "meta": { + "part_name": "supports_pagination", + "docstring": "", + "sha256": 189133244729867257087740036829509886476419053207, + "start_line": 125, + "end_line": 126, + "end_line_signature": 126, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def unload(self):\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.close()\n self.path_or_stream = None", + "meta": { + "part_name": "unload", + "docstring": "", + "sha256": 19733788426265514145027761479429042000417200591, + "start_line": 129, + "end_line": 132, + "end_line_signature": 130, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.base_models import InputFormat\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, 
Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supported_formats(cls) -> set[InputFormat]:\n return {InputFormat.XML_JATS}", + "meta": { + "part_name": "supported_formats", + "docstring": "", + "sha256": 95992898799884786951251283661078353705554191150, + "start_line": 136, + "end_line": 137, + "end_line_signature": 137, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nimport traceback\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def convert(self) -> DoclingDocument:\n try:\n # Create empty document\n origin = DocumentOrigin(\n filename=self.file.name or \"file\",\n mimetype=\"application/xml\",\n binary_hash=self.document_hash,\n )\n doc = DoclingDocument(name=self.file.stem or \"file\", origin=origin)\n self.hlevel = 0\n\n # Get metadata XML components\n xml_components: XMLComponents = self._parse_metadata()\n\n # Add metadata to the document\n self._add_metadata(doc, xml_components)\n\n # walk over the XML body\n body = self.tree.xpath(\"//body\")\n if self.root and len(body) > 0:\n self._walk_linear(doc, 
self.root, body[0])\n\n # walk over the XML back matter\n back = self.tree.xpath(\"//back\")\n if self.root and len(back) > 0:\n self._walk_linear(doc, self.root, back[0])\n except Exception:\n _log.error(traceback.format_exc())\n\n return doc", + "meta": { + "part_name": "convert", + "docstring": "", + "sha256": 401268109311259531613842418991708895011320755673, + "start_line": 140, + "end_line": 169, + "end_line_signature": 141, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @staticmethod\n def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:\n skip_tags = [\"term\", \"disp-formula\", \"inline-formula\"]\n text: str = (\n node.text.replace(\"\\n\", \" \")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n for child in list(node):\n if child.tag not in skip_tags:\n # TODO: apply styling according to child.tag when supported by docling-core\n text += JatsDocumentBackend._get_text(child, sep)\n if sep:\n text = text.rstrip(sep) + sep\n text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n return text", + "meta": { + "part_name": "_get_text", + "docstring": "", + "sha256": 766714162982515447138884963963637052165120920700, + "start_line": 172, + "end_line": 187, + "end_line_signature": 173, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import 
Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _find_metadata(self) -> Optional[etree._Element]:\n meta_names: list[str] = [\"article-meta\", \"book-part-meta\"]\n meta: Optional[etree._Element] = None\n for name in meta_names:\n node = self.tree.xpath(f\".//{name}\")\n if len(node) > 0:\n meta = node[0]\n break\n\n return meta", + "meta": { + "part_name": "_find_metadata", + "docstring": "", + "sha256": 864269816803865464490166888555034068213906212092, + "start_line": 189, + "end_line": 198, + "end_line_signature": 190, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_abstract(self) -> list[Abstract]:\n # TODO: address cases with multiple sections\n abs_list: list[Abstract] = []\n\n for abs_node in self.tree.xpath(\".//abstract\"):\n abstract: Abstract = dict(label=\"\", content=\"\")\n texts = []\n for 
abs_par in abs_node.xpath(\"p\"):\n texts.append(JatsDocumentBackend._get_text(abs_par).strip())\n abstract[\"content\"] = \" \".join(texts)\n\n label_node = abs_node.xpath(\"title|label\")\n if len(label_node) > 0:\n abstract[\"label\"] = label_node[0].text.strip()\n\n abs_list.append(abstract)\n\n return abs_list", + "meta": { + "part_name": "_parse_abstract", + "docstring": "", + "sha256": 270615732461062541428727022585950585711717687697, + "start_line": 200, + "end_line": 217, + "end_line_signature": 202, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_authors(self) -> list[Author]:\n # Get mapping between affiliation ids and names\n authors: list[Author] = []\n meta: Optional[etree._Element] = self._find_metadata()\n if meta is None:\n return authors\n\n affiliation_names = []\n for affiliation_node in meta.xpath(\".//aff[@id]\"):\n aff = \", \".join([t for t in affiliation_node.itertext() if t.strip()])\n aff = aff.replace(\"\\n\", \" \")\n label = affiliation_node.xpath(\"label\")\n if label:\n # TODO: once superscript is supported, add label with formatting\n aff = aff.removeprefix(f\"{label[0].text}, \")\n affiliation_names.append(aff)\n affiliation_ids_names = dict(\n zip(meta.xpath(\".//aff[@id]/@id\"), affiliation_names)\n )\n\n # Get author names and affiliation names\n for author_node in meta.xpath(\n './/contrib-group/contrib[@contrib-type=\"author\"]'\n ):\n author: Author = {\n \"name\": \"\",\n \"affiliation_names\": [],\n }\n\n # Affiliation names\n affiliation_ids = [\n a.attrib[\"rid\"] for a in author_node.xpath('xref[@ref-type=\"aff\"]')\n ]\n for id in affiliation_ids:\n if id in affiliation_ids_names:\n author[\"affiliation_names\"].append(affiliation_ids_names[id])\n\n # Name\n author[\"name\"] = (\n author_node.xpath(\"name/given-names\")[0].text\n + \" \"\n + author_node.xpath(\"name/surname\")[0].text\n )\n\n 
authors.append(author)\n\n return authors", + "meta": { + "part_name": "_parse_authors", + "docstring": "", + "sha256": 285578325238635728078594781795591481429703102959, + "start_line": 219, + "end_line": 265, + "end_line_signature": 221, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_title(self) -> str:\n meta_names: list[str] = [\n \"article-meta\",\n \"collection-meta\",\n \"book-meta\",\n \"book-part-meta\",\n ]\n title_names: list[str] = [\"article-title\", \"subtitle\", \"title\", \"label\"]\n titles: list[str] = [\n \" \".join(\n elem.text.replace(\"\\n\", \" \").strip()\n for elem in list(title_node)\n if elem.tag in title_names\n ).strip()\n for title_node in self.tree.xpath(\n \"|\".join([f\".//{item}/title-group\" for item in meta_names])\n )\n ]\n\n text = \" - \".join(titles)\n\n return text", + "meta": { + "part_name": "_parse_title", + "docstring": "", + "sha256": 211107707856227464327571216206163890165368281377, + "start_line": 267, + "end_line": 288, + "end_line_signature": 268, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n 
) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_metadata(self) -> XMLComponents:\n \"\"\"Parsing JATS document metadata.\"\"\"\n xml_components: XMLComponents = {\n \"title\": self._parse_title(),\n \"authors\": self._parse_authors(),\n \"abstract\": self._parse_abstract(),\n }\n return xml_components", + "meta": { + "part_name": "_parse_metadata", + "docstring": "", + "sha256": 6721807935708893971952263073653597579122214521, + "start_line": 290, + "end_line": 297, + "end_line_signature": 291, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ABSTRACT: Final = \"Abstract\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_abstract(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n for abstract in xml_components[\"abstract\"]:\n text: str = abstract[\"content\"]\n title: str = abstract[\"label\"] or DEFAULT_HEADER_ABSTRACT\n if not text:\n continue\n parent = doc.add_heading(\n parent=self.root, text=title, level=self.hlevel + 1\n )\n doc.add_text(\n parent=parent,\n text=text,\n label=DocItemLabel.TEXT,\n )\n\n return", + "meta": { + 
"part_name": "_add_abstract", + "docstring": "", + "sha256": 532581450273390935568634548645819886174208360025, + "start_line": 299, + "end_line": 316, + "end_line_signature": 302, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n # TODO: once docling supports text formatting, add affiliation reference to\n # author names through superscripts\n authors: list = [item[\"name\"] for item in xml_components[\"authors\"]]\n authors_str = \", \".join(authors)\n affiliations: list = [\n item\n for author in xml_components[\"authors\"]\n for item in author[\"affiliation_names\"]\n ]\n affiliations_str = \"; \".join(list(dict.fromkeys(affiliations)))\n if authors_str:\n doc.add_text(\n parent=self.root,\n text=authors_str,\n label=DocItemLabel.PARAGRAPH,\n )\n if affiliations_str:\n doc.add_text(\n parent=self.root,\n text=affiliations_str,\n label=DocItemLabel.PARAGRAPH,\n )\n\n return", + "meta": { + "part_name": "_add_authors", + "docstring": "", + "sha256": 1338802139689470366285923911940829378081971981022, + "start_line": 318, + "end_line": 342, + "end_line_signature": 321, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass 
JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:\n if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:\n doc.add_list_item(text=text, enumerated=False, parent=parent)\n else:\n doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)\n\n return", + "meta": { + "part_name": "_add_citation", + "docstring": "", + "sha256": 886159930357800932034845911444780765912823069638, + "start_line": 344, + "end_line": 350, + "end_line_signature": 345, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_TEXT_ETAL: Final = \"et al.\"\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901\n citation: Citation = {\n \"author_names\": \"\",\n \"title\": \"\",\n \"source\": \"\",\n \"year\": \"\",\n \"volume\": \"\",\n \"page\": \"\",\n \"pub_id\": 
\"\",\n \"publisher_name\": \"\",\n \"publisher_loc\": \"\",\n }\n\n _log.debug(\"Citation parsing started\")\n\n # Author names\n names = []\n for name_node in node.xpath(\".//name\"):\n name_str = (\n name_node.xpath(\"surname\")[0].text.replace(\"\\n\", \" \").strip()\n + \" \"\n + name_node.xpath(\"given-names\")[0].text.replace(\"\\n\", \" \").strip()\n )\n names.append(name_str)\n etal_node = node.xpath(\".//etal\")\n if len(etal_node) > 0:\n etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL\n names.append(etal_text)\n citation[\"author_names\"] = \", \".join(names)\n\n titles: list[str] = [\n \"article-title\",\n \"chapter-title\",\n \"data-title\",\n \"issue-title\",\n \"part-title\",\n \"trans-title\",\n ]\n title_node: Optional[etree._Element] = None\n for name in titles:\n name_node = node.xpath(name)\n if len(name_node) > 0:\n title_node = name_node[0]\n break\n citation[\"title\"] = (\n JatsDocumentBackend._get_text(title_node)\n if title_node is not None\n else node.text.replace(\"\\n\", \" \").strip()\n )\n\n # Journal, year, publisher name, publisher location, volume, elocation\n fields: list[str] = [\n \"source\",\n \"year\",\n \"publisher-name\",\n \"publisher-loc\",\n \"volume\",\n ]\n for item in fields:\n item_node = node.xpath(item)\n if len(item_node) > 0:\n citation[item.replace(\"-\", \"_\")] = ( # type: ignore[literal-required]\n item_node[0].text.replace(\"\\n\", \" \").strip()\n )\n\n # Publication identifier\n if len(node.xpath(\"pub-id\")) > 0:\n pub_id: list[str] = []\n for id_node in node.xpath(\"pub-id\"):\n id_type = id_node.get(\"assigning-authority\") or id_node.get(\n \"pub-id-type\"\n )\n id_text = id_node.text\n if id_type and id_text:\n pub_id.append(\n id_type.replace(\"\\n\", \" \").strip().upper()\n + \": \"\n + id_text.replace(\"\\n\", \" \").strip()\n )\n if pub_id:\n citation[\"pub_id\"] = \", \".join(pub_id)\n\n # Pages\n if len(node.xpath(\"elocation-id\")) > 0:\n citation[\"page\"] = (\n node.xpath(\"elocation-id\")[0].text.replace(\"\\n\", \" \").strip()\n )\n elif len(node.xpath(\"fpage\")) > 0:\n citation[\"page\"] = node.xpath(\"fpage\")[0].text.replace(\"\\n\", \" \").strip()\n if len(node.xpath(\"lpage\")) > 0:\n citation[\"page\"] += (\n \"\u2013\"\n + node.xpath(\"lpage\")[0]\n .text.replace(\"\\n\", \" \")\n .strip() # noqa: RUF001\n )\n\n # Flatten the citation to string\n\n text = \"\"\n if citation[\"author_names\"]:\n text += citation[\"author_names\"].rstrip(\".\") + \". \"\n if citation[\"title\"]:\n text += citation[\"title\"] + \". \"\n if citation[\"source\"]:\n text += citation[\"source\"] + \". \"\n if citation[\"publisher_name\"]:\n if citation[\"publisher_loc\"]:\n text += f\"{citation['publisher_loc']}: \"\n text += citation[\"publisher_name\"] + \". \"\n if citation[\"volume\"]:\n text = text.rstrip(\". \")\n text += f\" {citation['volume']}. \"\n if citation[\"page\"]:\n text = text.rstrip(\". \")\n if citation[\"volume\"]:\n text += \":\"\n text += citation[\"page\"] + \". \"\n if citation[\"year\"]:\n text = text.rstrip(\". \")\n text += f\" ({citation['year']}).\"\n if citation[\"pub_id\"]:\n text = text.rstrip(\".\") + \". 
\"\n text += citation[\"pub_id\"]\n\n _log.debug(\"Citation flattened\")\n\n return text", + "meta": { + "part_name": "_parse_element_citation", + "docstring": "", + "sha256": 270746555936847738057126604179455710215169149513, + "start_line": 352, + "end_line": 479, + "end_line_signature": 353, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_equation(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n math_text = node.text\n math_parts = math_text.split(\"$$\")\n if len(math_parts) == 3:\n math_formula = math_parts[1]\n doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)\n\n return", + "meta": { + "part_name": "_add_equation", + "docstring": "", + "sha256": 1414534615925307980331912092067693530565141108001, + "start_line": 481, + "end_line": 490, + "end_line_signature": 484, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: 
Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_figure_captions(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n label_node = node.xpath(\"label\")\n label: Optional[str] = (\n JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else \"\"\n )\n\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if len(caption_node) > 0:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n\n # TODO: format label vs caption once styling is supported\n fig_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n fig_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)\n if fig_text\n else None\n )\n\n doc.add_picture(parent=parent, caption=fig_caption)\n\n return", + "meta": { + "part_name": "_add_figure_captions", + "docstring": "", + "sha256": 835512377408024386136672483548752387568193882073, + "start_line": 492, + "end_line": 522, + "end_line_signature": 495, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from 
exc\n def _add_metadata(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n self._add_title(doc, xml_components)\n self._add_authors(doc, xml_components)\n self._add_abstract(doc, xml_components)\n\n return", + "meta": { + "part_name": "_add_metadata", + "docstring": "", + "sha256": 153982142573938397014666471275555982493113402407, + "start_line": 531, + "end_line": 538, + "end_line_signature": 534, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from bs4 import BeautifulSoup, Tag\nfrom docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.backend.html_backend import HTMLDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_table(\n self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table\n ) -> None:\n soup = BeautifulSoup(table_xml_component[\"content\"], \"html.parser\")\n table_tag = soup.find(\"table\")\n if not isinstance(table_tag, Tag):\n return\n\n data = HTMLDocumentBackend.parse_table_data(table_tag)\n\n # TODO: format label vs caption once styling is supported\n label = table_xml_component[\"label\"]\n caption = table_xml_component[\"caption\"]\n table_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n table_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=table_text)\n if table_text\n else None\n )\n\n if data is not None:\n doc.add_table(data=data, parent=parent, caption=table_caption)\n\n return", + "meta": { + "part_name": "_add_table", + "docstring": "", + "sha256": 1342802968890476187190364473245592459773739883169, + "start_line": 540, + "end_line": 563, + "end_line_signature": 543, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom 
docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_tables(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n table: Table = {\"label\": \"\", \"caption\": \"\", \"content\": \"\"}\n\n # Content\n if len(node.xpath(\"table\")) > 0:\n table_content_node = node.xpath(\"table\")[0]\n elif len(node.xpath(\"alternatives/table\")) > 0:\n table_content_node = node.xpath(\"alternatives/table\")[0]\n else:\n table_content_node = None\n if table_content_node is not None:\n table[\"content\"] = etree.tostring(table_content_node).decode(\"utf-8\")\n\n # Caption\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if caption_node:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n if caption is not None:\n table[\"caption\"] = caption\n\n # Label\n if len(node.xpath(\"label\")) > 0:\n table[\"label\"] = node.xpath(\"label\")[0].text\n\n try:\n self._add_table(doc, parent, table)\n except Exception:\n _log.warning(f\"Skipping unsupported table in {self.file!s}\")\n\n return", + "meta": { + "part_name": "_add_tables", + "docstring": "", + "sha256": 1033621645055163687215987453641177660800797256694, + "start_line": 565, + "end_line": 604, + "end_line_signature": 568, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass 
JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n self.root = doc.add_text(\n parent=None,\n text=xml_components[\"title\"],\n label=DocItemLabel.TITLE,\n )\n return", + "meta": { + "part_name": "_add_title", + "docstring": "", + "sha256": 974332749105219020038962392126855200985294853221, + "start_line": 606, + "end_line": 612, + "end_line_signature": 607, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ACKNOWLEDGMENTS: Final = \"Acknowledgments\"\nDEFAULT_HEADER_REFERENCES: Final = \"References\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _walk_linear( # noqa: C901\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> str:\n skip_tags = [\"term\"]\n flush_tags = [\"ack\", \"sec\", \"list\", \"boxed-text\", \"disp-formula\", \"fig\"]\n new_parent: NodeItem = parent\n node_text: str = (\n node.text.replace(\"\\n\", \" 
\")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n\n for child in list(node):\n stop_walk: bool = False\n\n # flush text into TextItem for some tags in paragraph nodes\n if node.tag == \"p\" and node_text.strip() and child.tag in flush_tags:\n doc.add_text(\n label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent\n )\n node_text = \"\"\n\n # add elements and decide whether to stop walking\n if child.tag in (\"sec\", \"ack\"):\n header = child.xpath(\"title|label\")\n text: Optional[str] = None\n if len(header) > 0:\n text = JatsDocumentBackend._get_text(header[0])\n elif child.tag == \"ack\":\n text = DEFAULT_HEADER_ACKNOWLEDGMENTS\n if text:\n self.hlevel += 1\n new_parent = doc.add_heading(\n text=text, parent=parent, level=self.hlevel\n )\n elif child.tag == \"list\":\n new_parent = doc.add_group(\n label=GroupLabel.LIST, name=\"list\", parent=parent\n )\n elif child.tag == \"list-item\":\n # TODO: address any type of content (another list, formula,...)\n # TODO: address list type and item label\n text = JatsDocumentBackend._get_text(child).strip()\n new_parent = doc.add_list_item(text=text, parent=parent)\n stop_walk = True\n elif child.tag == \"fig\":\n self._add_figure_captions(doc, parent, child)\n stop_walk = True\n elif child.tag == \"table-wrap\":\n self._add_tables(doc, parent, child)\n stop_walk = True\n elif child.tag == \"suplementary-material\":\n stop_walk = True\n elif child.tag == \"fn-group\":\n # header = child.xpath(\".//title\") or child.xpath(\".//label\")\n # if header:\n # text = JatsDocumentBackend._get_text(header[0])\n # fn_parent = doc.add_heading(text=text, parent=new_parent)\n # self._add_footnote_group(doc, fn_parent, child)\n stop_walk = True\n elif child.tag == \"ref-list\" and node.tag != \"ref-list\":\n header = child.xpath(\"title|label\")\n text = (\n JatsDocumentBackend._get_text(header[0])\n if len(header) > 0\n else DEFAULT_HEADER_REFERENCES\n )\n new_parent = doc.add_heading(text=text, parent=parent)\n new_parent = doc.add_group(\n parent=new_parent, label=GroupLabel.LIST, name=\"list\"\n )\n elif child.tag == \"element-citation\":\n text = self._parse_element_citation(child)\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"mixed-citation\":\n text = JatsDocumentBackend._get_text(child).strip()\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"tex-math\":\n self._add_equation(doc, parent, child)\n stop_walk = True\n elif child.tag == \"inline-formula\":\n # TODO: address inline formulas when supported by docling-core\n stop_walk = True\n\n # step into child\n if not stop_walk:\n new_text = self._walk_linear(doc, new_parent, child)\n if not (node.getparent().tag == \"p\" and node.tag in flush_tags):\n node_text += new_text\n if child.tag in (\"sec\", \"ack\") and text:\n self.hlevel -= 1\n\n # pick up the tail text\n node_text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n # create paragraph\n if node.tag == \"p\" and node_text.strip():\n doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)\n return \"\"\n else:\n # backpropagate the text\n return node_text", + "meta": { + "part_name": "_walk_linear", + "docstring": "", + "sha256": 1396734027815047329843267767563675798505988021776, + "start_line": 614, + "end_line": 717, + "end_line_signature": 617, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": 
"https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "function" + } + }, + { + "text": "from typing_extensions import TypedDict, override\nclass Abstract(TypedDict):\n label: str\n content: str", + "meta": { + "part_name": "Abstract", + "docstring": "", + "sha256": 794614712107844233737490895996588109287861339811, + "start_line": 33, + "end_line": 35, + "end_line_signature": 35, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Author(TypedDict):\n name: str\n affiliation_names: list[str]", + "meta": { + "part_name": "Author", + "docstring": "", + "sha256": 618469300419808735784045889717450654715997143657, + "start_line": 38, + "end_line": 40, + "end_line_signature": 40, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Citation(TypedDict):\n author_names: str\n title: str\n source: str\n year: str\n volume: str\n page: str\n pub_id: str\n publisher_name: str\n publisher_loc: str", + "meta": { + "part_name": "Citation", + "docstring": "", + "sha256": 47216956481538603575192296942081985433567090375, + "start_line": 43, + "end_line": 52, + "end_line_signature": 52, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Table(TypedDict):\n label: str\n caption: str\n content: str", + "meta": { + "part_name": "Table", + "docstring": "", + "sha256": 652205560496743097978957542262426472701689171417, + "start_line": 55, + "end_line": 58, + "end_line_signature": 58, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "class" + } + }, + { + "text": "from typing_extensions import TypedDict, override\nclass XMLComponents(TypedDict):\n title: str\n authors: list[Author]\n abstract: list[Abstract]", + "meta": { + "part_name": "XMLComponents", + "docstring": "", + "sha256": 1130452765636835800645360676517087324676223612005, + "start_line": 61, + "end_line": 64, + "end_line_signature": 64, + "origin": { + "mimetype": "text/plain", + "binary_hash": 12767849390864590006, + "filename": "jats_backend.py", + "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" + }, + "chunk_type": "class" + } + } + ] +} diff --git a/test/data/chunker_repo/TypeScript/repo_out_chunks.json b/test/data/chunker_repo/TypeScript/repo_out_chunks.json new file mode 100644 index 00000000..a192ed53 --- /dev/null +++ b/test/data/chunker_repo/TypeScript/repo_out_chunks.json @@ -0,0 +1,175 @@ +{ + "root": [ + { + "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isResolved(): boolean {\n return !!this.resolvedAt || 
!!this.parentComment?.isResolved;\n }", + "meta": { + "part_name": "isResolved", + "docstring": "", + "sha256": 1268395403700592019784717617222283727541873921424, + "start_line": 100, + "end_line": 102, + "end_line_signature": 102, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13313267827846711454, + "filename": "Comment.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isReply() {\n return !!this.parentCommentId;\n }", + "meta": { + "part_name": "isReply", + "docstring": "", + "sha256": 1365364938419899639010891183359481253853232355963, + "start_line": 108, + "end_line": 110, + "end_line_signature": 110, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13313267827846711454, + "filename": "Comment.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public resolve() {\n return this.store.rootStore.comments.resolve(this.id);\n }", + "meta": { + "part_name": "resolve", + "docstring": "/**\n * Resolve the comment\n */", + "sha256": 991119951853749619459124936919291768908369832281, + "start_line": 115, + "end_line": 117, + "end_line_signature": 117, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13313267827846711454, + "filename": "Comment.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public unresolve() {\n return this.store.rootStore.comments.unresolve(this.id);\n }", + "meta": { + "part_name": "unresolve", + "docstring": "/**\n * Unresolve the comment\n */", + "sha256": 737181169666352833175846995267642651601564437701, + "start_line": 122, + "end_line": 124, + "end_line_signature": 124, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13313267827846711454, + "filename": "Comment.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import invariant from \"invariant\";\nimport uniq from \"lodash/uniq\";\nimport { action, computed, observable } from \"mobx\";\nimport { Pagination } from \"@shared/constants\";\nimport type { ProsemirrorData, ReactionSummary } from \"@shared/types\";\nimport User from \"~/models/User\";\nimport { client } from \"~/utils/ApiClient\";\nimport Document from \"./Document\";\n\nimport Field from \"./decorators/Field\";\nimport Relation from \"./decorators/Relation\";\n\nexport default Comment;", + "meta": { + "sha256": 127587344566918131981664969548384712413573188523, + "start_line": 1, + "end_line": 279, + "origin": { + "mimetype": "text/plain", + "binary_hash": 13313267827846711454, + "filename": "Comment.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" + }, + "chunk_type": "preamble" + } + }, + { + "text": "import Group from \"./Group\";\nimport Model from \"./base/Model\";\nimport Relation from \"./decorators/Relation\";\nimport User from \"./User\";\n\nclass GroupUser extends Model {\n static modelName = \"GroupUser\";\n\n /** The ID of the user. */\n userId: string;\n\n /** The user that belongs to the group. */\n @Relation(() => User, { onDelete: \"cascade\" })\n user: User;\n\n /** The ID of the group. 
*/\n groupId: string;\n\n /** The group that the user belongs to. */\n @Relation(() => Group, { onDelete: \"cascade\" })\n group: Group;\n}", + "meta": { + "part_name": "GroupUser", + "docstring": "/**\n * Represents a user's membership to a group.\n */", + "sha256": 819039209099366519772307112685515925657900275191, + "start_line": 8, + "end_line": 24, + "end_line_signature": 24, + "origin": { + "mimetype": "text/plain", + "binary_hash": 1986469258069411733, + "filename": "GroupUser.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/GroupUser.ts" + }, + "chunk_type": "class" + } + }, + { + "text": "export default GroupUser;", + "meta": { + "sha256": 1202573002644555545724623497903246581585285637847, + "start_line": 25, + "end_line": 28, + "origin": { + "mimetype": "text/plain", + "binary_hash": 1986469258069411733, + "filename": "GroupUser.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/GroupUser.ts" + }, + "chunk_type": "preamble" + } + }, + { + "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction resolve(value: any, context: ActionContext): T {\n return typeof value === \"function\" ? value(context) : value;\n}", + "meta": { + "part_name": "resolve", + "docstring": "", + "sha256": 1289291728661617648625599715448098966958049316632, + "start_line": 24, + "end_line": 26, + "end_line_signature": 26, + "origin": { + "mimetype": "text/plain", + "binary_hash": 16803020185603763773, + "filename": "index.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction hasVisibleItems(items: MenuItem[]) {\n const applicableTypes = [\"button\", \"link\", \"route\", \"group\", \"submenu\"];\n return items.some(\n (item) => applicableTypes.includes(item.type) && item.visible\n );\n}", + "meta": { + "part_name": "hasVisibleItems", + "docstring": "", + "sha256": 1279869349240065760172944255546797254943234495037, + "start_line": 359, + "end_line": 364, + "end_line_signature": 364, + "origin": { + "mimetype": "text/plain", + "binary_hash": 16803020185603763773, + "filename": "index.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" + }, + "chunk_type": "function" + } + }, + { + "text": "import flattenDeep from \"lodash/flattenDeep\";\nimport { toast } from \"sonner\";\nimport { Optional } from \"utility-types\";\nimport { v4 as uuidv4 } from \"uuid\";\n\nimport Analytics from \"~/utils/Analytics\";\nimport history from \"~/utils/history\";\n\nexport function createAction(definition: Optional): Action {\n return {\n ...definition,\n perform: definition.perform\n ? (context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? 
\"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform?.(context);\n }\n : undefined,\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function actionToMenuItem(\n action: Action,\n context: ActionContext\n): MenuItemButton | MenuExternalLink | MenuInternalLink | MenuItemWithChildren {\n const resolvedIcon = resolve>(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const visible = action.visible ? action.visible(context) : true;\n const title = resolve(action.name, context);\n const icon =\n resolvedIcon && action.iconInContextMenu !== false\n ? resolvedIcon\n : undefined;\n\n if (resolvedChildren) {\n const items = resolvedChildren\n .map((a) => actionToMenuItem(a, context))\n .filter(Boolean)\n .filter((a) => a.visible);\n\n return {\n type: \"submenu\",\n title,\n icon,\n items,\n visible: visible && items.length > 0,\n };\n }\n\n if (action.to) {\n return typeof action.to === \"string\"\n ? {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n selected: action.selected?.(context),\n }\n : {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.to,\n selected: action.selected?.(context),\n };\n }\n\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performAction(action, context),\n selected: action.selected?.(context),\n };\n}\n\nexport function actionToKBar(\n action: Action,\n context: ActionContext\n): CommandBarAction[] {\n if (typeof action.visible === \"function\" && !action.visible(context)) {\n return [];\n }\n\n const resolvedIcon = resolve(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const resolvedSection = resolve(action.section, context);\n const resolvedName = resolve(action.name, context);\n const resolvedPlaceholder = resolve(action.placeholder, context);\n const children = resolvedChildren\n ? flattenDeep(resolvedChildren.map((a) => actionToKBar(a, context))).filter(\n (a) => !!a\n )\n : [];\n\n const sectionPriority =\n typeof action.section !== \"string\" && \"priority\" in action.section\n ? ((action.section.priority as number) ?? 0)\n : 0;\n\n return [\n {\n id: action.id,\n name: resolvedName,\n analyticsName: action.analyticsName,\n section: resolvedSection,\n placeholder: resolvedPlaceholder,\n keywords: action.keywords ?? \"\",\n shortcut: action.shortcut || [],\n icon: resolvedIcon,\n priority: (1 + (action.priority ?? 0)) * (1 + (sectionPriority ?? 0)),\n perform:\n action.perform || action.to\n ? () => performAction(action, context)\n : undefined,\n },\n ].concat(\n // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.\n children.map((child) => ({ ...child, parent: child.parent ?? action.id }))\n );\n}\n\nexport async function performAction(action: Action, context: ActionContext) {\n const result = action.perform\n ? action.perform(context)\n : action.to\n ? typeof action.to === \"string\"\n ? history.push(action.to)\n : window.open(action.to.url, action.to.target)\n : undefined;\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}\n\n/** Actions V2 */\n\nexport const ActionV2Separator: TActionV2Separator = {\n type: \"action_separator\",\n};\n\nexport function createActionV2(\n definition: Optional, \"id\">\n): ActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"action\",\n perform: definition.perform\n ? 
(context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? \"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform(context);\n }\n : () => {},\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createInternalLinkActionV2(\n definition: Optional, \"id\">\n): InternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"internal_link\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createExternalLinkActionV2(\n definition: Optional, \"id\">\n): ExternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"external_link\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2WithChildren(\n definition: Optional, \"id\">\n): ActionV2WithChildren {\n return {\n ...definition,\n type: \"action\",\n variant: \"action_with_children\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2Group(\n definition: Omit\n): ActionV2Group {\n return {\n ...definition,\n type: \"action_group\",\n };\n}\n\nexport function createRootMenuAction(\n actions: (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n): ActionV2WithChildren {\n return {\n id: uuidv4(),\n type: \"action\",\n variant: \"action_with_children\",\n name: \"root_action\",\n section: \"Root\",\n children: actions,\n };\n}\n\nexport function actionV2ToMenuItem(\n action: ActionV2Variant | ActionV2Group | TActionV2Separator,\n context: ActionContext\n): MenuItem {\n switch (action.type) {\n case \"action\": {\n const title = resolve(action.name, context);\n const visible = resolve(action.visible, context);\n const icon =\n !!action.icon && action.iconInContextMenu !== false\n ? action.icon\n : undefined;\n\n switch (action.variant) {\n case \"action\":\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performActionV2(action, context),\n };\n\n case \"internal_link\":\n return {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n };\n\n case \"external_link\":\n return {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.target\n ? 
{ url: action.url, target: action.target }\n : action.url,\n };\n\n case \"action_with_children\": {\n const children = resolve<\n (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n >(action.children, context);\n const subMenuItems = children.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"submenu\",\n title,\n icon,\n items: subMenuItems,\n visible: visible && hasVisibleItems(subMenuItems),\n };\n }\n\n default:\n throw Error(\"invalid action variant\");\n }\n }\n\n case \"action_group\": {\n const groupItems = action.actions.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"group\",\n title: resolve(action.name, context),\n visible: hasVisibleItems(groupItems),\n items: groupItems,\n };\n }\n\n case \"action_separator\":\n return { type: \"separator\" };\n }\n}\n\nexport async function performActionV2(\n action: ActionV2,\n context: ActionContext\n) {\n const result = action.perform(context);\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}", + "meta": { + "sha256": 1201606939402032044701817936630056161927504135324, + "start_line": 1, + "end_line": 359, + "origin": { + "mimetype": "text/plain", + "binary_hash": 16803020185603763773, + "filename": "index.ts", + "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" + }, + "chunk_type": "preamble" + } + } + ] +} diff --git a/test/data/chunker_repo/repos/acmeair/AcmeAirConstants.java b/test/data/chunker_repo/repos/acmeair/AcmeAirConstants.java new file mode 100644 index 00000000..dde7e77d --- /dev/null +++ b/test/data/chunker_repo/repos/acmeair/AcmeAirConstants.java @@ -0,0 +1,6 @@ +package com.acmeair; + +public interface AcmeAirConstants { + + +} diff --git a/test/data/chunker_repo/repos/acmeair/CustomerLoader.java b/test/data/chunker_repo/repos/acmeair/CustomerLoader.java new file mode 100644 index 00000000..890347a7 --- /dev/null +++ b/test/data/chunker_repo/repos/acmeair/CustomerLoader.java @@ -0,0 +1,37 @@ +/******************************************************************************* +* Copyright (c) 2013 IBM Corp. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ +package com.acmeair.loader; + +import com.acmeair.entities.Customer; +import com.acmeair.entities.CustomerAddress; +import com.acmeair.entities.Customer.PhoneType; +import com.acmeair.service.CustomerService; +import com.acmeair.service.ServiceLocator; + + +public class CustomerLoader { + + private CustomerService customerService = ServiceLocator.instance().getService(CustomerService.class); + + + public void loadCustomers(long numCustomers) { + CustomerAddress address = customerService.createAddress("123 Main St.", null, "Anytown", "NC", "USA", "27617"); + for (long ii = 0; ii < numCustomers; ii++) { + customerService.createCustomer("uid"+ii+"@email.com", "password", Customer.MemberShipStatus.GOLD, 1000000, 1000, "919-123-4567", PhoneType.BUSINESS, address); + } + } + +} \ No newline at end of file diff --git a/test/data/chunker_repo/repos/acmeair/FlightLoader.java b/test/data/chunker_repo/repos/acmeair/FlightLoader.java new file mode 100644 index 00000000..9b2a1adc --- /dev/null +++ b/test/data/chunker_repo/repos/acmeair/FlightLoader.java @@ -0,0 +1,133 @@ +/******************************************************************************* +* Copyright (c) 2013 IBM Corp. +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/
+package com.acmeair.loader;
+
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.LineNumberReader;
+import java.util.*;
+import java.math.*;
+
+import com.acmeair.entities.AirportCodeMapping;
+import com.acmeair.service.FlightService;
+import com.acmeair.service.ServiceLocator;
+
+
+
+
+public class FlightLoader {
+
+ private static final int MAX_FLIGHTS_PER_SEGMENT = 30;
+
+
+ private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);
+
+ public void loadFlights() throws Exception {
+ InputStream csvInputStream = FlightLoader.class.getResourceAsStream("/mileage.csv");
+
+ LineNumberReader lnr = new LineNumberReader(new InputStreamReader(csvInputStream));
+ String line1 = lnr.readLine();
+ StringTokenizer st = new StringTokenizer(line1, ",");
+ ArrayList<AirportCodeMapping> airports = new ArrayList<AirportCodeMapping>();
+
+ // read the first line, which contains the airport names
+ while (st.hasMoreTokens()) {
+ AirportCodeMapping acm = flightService.createAirportCodeMapping(null, st.nextToken());
+ // acm.setAirportName(st.nextToken());
+ airports.add(acm);
+ }
+ // read the second line which contains matching airport codes for the first line
+ String line2 = lnr.readLine();
+ st = new StringTokenizer(line2, ",");
+ int ii = 0;
+ while (st.hasMoreTokens()) {
+ String airportCode = st.nextToken();
+ airports.get(ii).setAirportCode(airportCode);
+ ii++;
+ }
+ // read the other lines, which are of the format:
+ // airport name, airport code, distance from this airport to whatever airport is in the column from lines one and two
+ String line;
+ int flightNumber = 0;
+ while (true) {
+ line = lnr.readLine();
+ if (line == null || line.trim().equals("")) {
+ break;
+ }
+ st = new StringTokenizer(line, ",");
+ String airportName = st.nextToken();
+ String airportCode = st.nextToken();
+ if (!alreadyInCollection(airportCode, airports)) {
+ AirportCodeMapping acm = flightService.createAirportCodeMapping(airportCode, airportName);
+ airports.add(acm);
+ }
+ int indexIntoTopLine = 0;
+ while (st.hasMoreTokens()) {
+ String milesString = st.nextToken();
+ if (milesString.equals("NA")) {
+ indexIntoTopLine++;
+ continue;
+ }
+ int miles = Integer.parseInt(milesString);
+ String toAirport = airports.get(indexIntoTopLine).getAirportCode();
+ String flightId = "AA" + flightNumber;
+ flightService.storeFlightSegment(flightId, airportCode, toAirport, miles);
+ Date now = new Date();
+ for (int daysFromNow = 0; daysFromNow < MAX_FLIGHTS_PER_SEGMENT; daysFromNow++) {
+ Calendar c = Calendar.getInstance();
+ c.setTime(now);
+ c.set(Calendar.HOUR_OF_DAY, 0);
+ c.set(Calendar.MINUTE, 0);
+ c.set(Calendar.SECOND, 0);
+ c.set(Calendar.MILLISECOND, 0);
+ c.add(Calendar.DATE, daysFromNow);
+ Date departureTime = c.getTime();
+ Date arrivalTime = getArrivalTime(departureTime, miles);
+ flightService.createNewFlight(flightId, departureTime, arrivalTime, new BigDecimal(500), new BigDecimal(200), 10, 200, "B747");
+
+ }
+ flightNumber++;
+ indexIntoTopLine++;
+ }
+ }
+
+ for (int jj = 0; jj < airports.size(); jj++) {
+ flightService.storeAirportMapping(airports.get(jj));
+ }
+ lnr.close();
+ }
+
+ private static Date getArrivalTime(Date departureTime, int mileage) {
+ double averageSpeed = 600.0; // 600 miles/hour
+ double hours = (double) mileage / averageSpeed; // miles / miles/hour = hours
+ double partsOfHour = hours % 1.0;
+ int minutes = (int)(60.0 * partsOfHour);
+ Calendar c = Calendar.getInstance();
+ 
c.setTime(departureTime);
+ c.add(Calendar.HOUR, (int)hours);
+ c.add(Calendar.MINUTE, minutes);
+ return c.getTime();
+ }
+
+ static private boolean alreadyInCollection(String airportCode, ArrayList<AirportCodeMapping> airports) {
+ for (int ii = 0; ii < airports.size(); ii++) {
+ if (airports.get(ii).getAirportCode().equals(airportCode)) {
+ return true;
+ }
+ }
+ return false;
+ }
+}
diff --git a/test/data/chunker_repo/repos/docling/base_models.py b/test/data/chunker_repo/repos/docling/base_models.py
new file mode 100644
index 00000000..0c8608d7
--- /dev/null
+++ b/test/data/chunker_repo/repos/docling/base_models.py
@@ -0,0 +1,435 @@
+from collections import defaultdict
+from enum import Enum
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
+
+import numpy as np
+
+# DO NOT REMOVE; explicitly exposed from this location
+from PIL.Image import Image
+from pydantic import (
+ BaseModel,
+ ConfigDict,
+ Field,
+ FieldSerializationInfo,
+ computed_field,
+ field_serializer,
+)
+
+from docling_core.types.doc import (
+ BoundingBox,
+ DocItemLabel,
+ NodeItem,
+ PictureDataType,
+ Size,
+ TableCell,
+)
+from docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float
+from docling_core.types.doc.page import SegmentedPdfPage, TextCell
+
+if TYPE_CHECKING:
+ from docling.backend.pdf_backend import PdfPageBackend
+
+
+class ConversionStatus(str, Enum):
+ PENDING = "pending"
+ STARTED = "started"
+ FAILURE = "failure"
+ SUCCESS = "success"
+ PARTIAL_SUCCESS = "partial_success"
+ SKIPPED = "skipped"
+
+
+class InputFormat(str, Enum):
+ """A document format supported by document backend parsers."""
+
+ DOCX = "docx"
+ PPTX = "pptx"
+ HTML = "html"
+ IMAGE = "image"
+ PDF = "pdf"
+ ASCIIDOC = "asciidoc"
+ MD = "md"
+ CSV = "csv"
+ XLSX = "xlsx"
+ XML_USPTO = "xml_uspto"
+ XML_JATS = "xml_jats"
+ JSON_DOCLING = "json_docling"
+ AUDIO = "audio"
+
+
+class OutputFormat(str, Enum):
+ MARKDOWN = "md"
+ JSON = "json"
+ HTML = "html"
+ HTML_SPLIT_PAGE = "html_split_page"
+ TEXT = "text"
+ DOCTAGS = "doctags"
+
+
+FormatToExtensions: Dict[InputFormat, List[str]] = {
+ InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
+ InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
+ InputFormat.PDF: ["pdf"],
+ InputFormat.MD: ["md"],
+ InputFormat.HTML: ["html", "htm", "xhtml"],
+ InputFormat.XML_JATS: ["xml", "nxml"],
+ InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
+ InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
+ InputFormat.CSV: ["csv"],
+ InputFormat.XLSX: ["xlsx", "xlsm"],
+ InputFormat.XML_USPTO: ["xml", "txt"],
+ InputFormat.JSON_DOCLING: ["json"],
+ InputFormat.AUDIO: ["wav", "mp3"],
+}
+
+FormatToMimeType: Dict[InputFormat, List[str]] = {
+ InputFormat.DOCX: [
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+ ],
+ InputFormat.PPTX: [
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ ],
+ InputFormat.HTML: ["text/html", "application/xhtml+xml"],
+ InputFormat.XML_JATS: ["application/xml"],
+ InputFormat.IMAGE: [
+ "image/png",
+ "image/jpeg",
+ "image/tiff",
+ "image/gif",
+ "image/bmp",
+ "image/webp",
+ ],
+ InputFormat.PDF: ["application/pdf"],
+ InputFormat.ASCIIDOC: ["text/asciidoc"],
+ InputFormat.MD: ["text/markdown", "text/x-markdown"],
+ InputFormat.CSV: 
["text/csv"], + InputFormat.XLSX: [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ], + InputFormat.XML_USPTO: ["application/xml", "text/plain"], + InputFormat.JSON_DOCLING: ["application/json"], + InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"], +} + +MimeTypeToFormat: dict[str, list[InputFormat]] = { + mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]] + for value in FormatToMimeType.values() + for mime in value +} + + +class DocInputType(str, Enum): + PATH = "path" + STREAM = "stream" + + +class DoclingComponentType(str, Enum): + DOCUMENT_BACKEND = "document_backend" + MODEL = "model" + DOC_ASSEMBLER = "doc_assembler" + USER_INPUT = "user_input" + + +class ErrorItem(BaseModel): + component_type: DoclingComponentType + module_name: str + error_message: str + + +class Cluster(BaseModel): + id: int + label: DocItemLabel + bbox: BoundingBox + confidence: float = 1.0 + cells: List[TextCell] = [] + children: List["Cluster"] = [] # Add child cluster support + + @field_serializer("confidence") + def _serialize(self, value: float, info: FieldSerializationInfo) -> float: + return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC) + + +class BasePageElement(BaseModel): + label: DocItemLabel + id: int + page_no: int + cluster: Cluster + text: Optional[str] = None + + +class LayoutPrediction(BaseModel): + clusters: List[Cluster] = [] + + +class VlmPredictionToken(BaseModel): + text: str = "" + token: int = -1 + logprob: float = -1 + + +class VlmPrediction(BaseModel): + text: str = "" + generated_tokens: list[VlmPredictionToken] = [] + generation_time: float = -1 + + +class ContainerElement( + BasePageElement +): # Used for Form and Key-Value-Regions, only for typing. 
+ pass + + +class Table(BasePageElement): + otsl_seq: List[str] + num_rows: int = 0 + num_cols: int = 0 + table_cells: List[TableCell] + + +class TableStructurePrediction(BaseModel): + table_map: Dict[int, Table] = {} + + +class TextElement(BasePageElement): + text: str + + +class FigureElement(BasePageElement): + annotations: List[PictureDataType] = [] + provenance: Optional[str] = None + predicted_class: Optional[str] = None + confidence: Optional[float] = None + + @field_serializer("confidence") + def _serialize( + self, value: Optional[float], info: FieldSerializationInfo + ) -> Optional[float]: + return ( + round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC) + if value is not None + else None + ) + + +class FigureClassificationPrediction(BaseModel): + figure_count: int = 0 + figure_map: Dict[int, FigureElement] = {} + + +class EquationPrediction(BaseModel): + equation_count: int = 0 + equation_map: Dict[int, TextElement] = {} + + +class PagePredictions(BaseModel): + layout: Optional[LayoutPrediction] = None + tablestructure: Optional[TableStructurePrediction] = None + figures_classification: Optional[FigureClassificationPrediction] = None + equations_prediction: Optional[EquationPrediction] = None + vlm_response: Optional[VlmPrediction] = None + + +PageElement = Union[TextElement, Table, FigureElement, ContainerElement] + + +class AssembledUnit(BaseModel): + elements: List[PageElement] = [] + body: List[PageElement] = [] + headers: List[PageElement] = [] + + +class ItemAndImageEnrichmentElement(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + item: NodeItem + image: Image + + +class Page(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + page_no: int + # page_hash: Optional[str] = None + size: Optional[Size] = None + parsed_page: Optional[SegmentedPdfPage] = None + predictions: PagePredictions = PagePredictions() + assembled: Optional[AssembledUnit] = None + + _backend: Optional["PdfPageBackend"] = ( + None # Internal PDF backend. By default it is cleared during assembling. + ) + _default_image_scale: float = 1.0 # Default image scale for external usage. + _image_cache: Dict[float, Image] = ( + {} + ) # Cache of images in different scales. By default it is cleared during assembling. 
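+    # The cache is keyed by the image scale factor and is filled lazily by get_image() below.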
+ + @property + def cells(self) -> List[TextCell]: + """Return text cells as a read-only view of parsed_page.textline_cells.""" + if self.parsed_page is not None: + return self.parsed_page.textline_cells + else: + return [] + + def get_image( + self, + scale: float = 1.0, + max_size: Optional[int] = None, + cropbox: Optional[BoundingBox] = None, + ) -> Optional[Image]: + if self._backend is None: + return self._image_cache.get(scale, None) + + if max_size: + assert self.size is not None + scale = min(scale, max_size / max(self.size.as_tuple())) + + if scale not in self._image_cache: + if cropbox is None: + self._image_cache[scale] = self._backend.get_page_image(scale=scale) + else: + return self._backend.get_page_image(scale=scale, cropbox=cropbox) + + if cropbox is None: + return self._image_cache[scale] + else: + page_im = self._image_cache[scale] + assert self.size is not None + return page_im.crop( + cropbox.to_top_left_origin(page_height=self.size.height) + .scaled(scale=scale) + .as_tuple() + ) + + @property + def image(self) -> Optional[Image]: + return self.get_image(scale=self._default_image_scale) + + +## OpenAI API Request / Response Models ## + + +class OpenAiChatMessage(BaseModel): + role: str + content: str + + +class OpenAiResponseChoice(BaseModel): + index: int + message: OpenAiChatMessage + finish_reason: Optional[str] + + +class OpenAiResponseUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class OpenAiApiResponse(BaseModel): + model_config = ConfigDict( + protected_namespaces=(), + ) + + id: str + model: Optional[str] = None # returned by openai + choices: List[OpenAiResponseChoice] + created: int + usage: OpenAiResponseUsage + + +# Create a type alias for score values +ScoreValue = float + + +class QualityGrade(str, Enum): + POOR = "poor" + FAIR = "fair" + GOOD = "good" + EXCELLENT = "excellent" + UNSPECIFIED = "unspecified" + + +class PageConfidenceScores(BaseModel): + parse_score: ScoreValue = np.nan + layout_score: ScoreValue = np.nan + table_score: ScoreValue = np.nan + ocr_score: ScoreValue = np.nan + + def _score_to_grade(self, score: ScoreValue) -> QualityGrade: + if score < 0.5: + return QualityGrade.POOR + elif score < 0.8: + return QualityGrade.FAIR + elif score < 0.9: + return QualityGrade.GOOD + elif score >= 0.9: + return QualityGrade.EXCELLENT + + return QualityGrade.UNSPECIFIED + + @computed_field # type: ignore + @property + def mean_grade(self) -> QualityGrade: + return self._score_to_grade(self.mean_score) + + @computed_field # type: ignore + @property + def low_grade(self) -> QualityGrade: + return self._score_to_grade(self.low_score) + + @computed_field # type: ignore + @property + def mean_score(self) -> ScoreValue: + return ScoreValue( + np.nanmean( + [ + self.ocr_score, + self.table_score, + self.layout_score, + self.parse_score, + ] + ) + ) + + @computed_field # type: ignore + @property + def low_score(self) -> ScoreValue: + return ScoreValue( + np.nanquantile( + [ + self.ocr_score, + self.table_score, + self.layout_score, + self.parse_score, + ], + q=0.05, + ) + ) + + +class ConfidenceReport(PageConfidenceScores): + pages: Dict[int, PageConfidenceScores] = Field( + default_factory=lambda: defaultdict(PageConfidenceScores) + ) + + @computed_field # type: ignore + @property + def mean_score(self) -> ScoreValue: + return ScoreValue( + np.nanmean( + [c.mean_score for c in self.pages.values()], + ) + ) + + @computed_field # type: ignore + @property + def low_score(self) -> ScoreValue: + return ScoreValue( + 
np.nanmean( + [c.low_score for c in self.pages.values()], + ) + ) diff --git a/test/data/chunker_repo/repos/docling/export.py b/test/data/chunker_repo/repos/docling/export.py new file mode 100644 index 00000000..7d5badd7 --- /dev/null +++ b/test/data/chunker_repo/repos/docling/export.py @@ -0,0 +1,146 @@ +import logging +from collections.abc import Iterable +from typing import Any, Dict, List, Tuple, Union + +from docling.datamodel.document import ConversionResult, Page + +from docling_core.types.doc import BoundingBox, CoordOrigin +from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table + +_log = logging.getLogger(__name__) + + +def generate_multimodal_pages( + doc_result: ConversionResult, +) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: + label_to_doclaynet = { + "title": "title", + "table-of-contents": "document_index", + "subtitle-level-1": "section_header", + "checkbox-selected": "checkbox_selected", + "checkbox-unselected": "checkbox_unselected", + "caption": "caption", + "page-header": "page_header", + "page-footer": "page_footer", + "footnote": "footnote", + "table": "table", + "formula": "formula", + "list-item": "list_item", + "code": "code", + "figure": "picture", + "picture": "picture", + "reference": "text", + "paragraph": "text", + "text": "text", + } + + content_text = "" + page_no = 0 + start_ix = 0 + end_ix = 0 + doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = [] + + doc = doc_result.legacy_document + + def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page): + segments = [] + + for ix, item in doc_items: + item_type = item.obj_type + label = label_to_doclaynet.get(item_type, None) + + if label is None or item.prov is None or page.size is None: + continue + + bbox = BoundingBox.from_tuple( + tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT + ) + new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized( + page_size=page.size + ) + + new_segment = { + "index_in_doc": ix, + "label": label, + "text": item.text if item.text is not None else "", + "bbox": new_bbox.as_tuple(), + "data": [], + } + + if isinstance(item, Table): + table_html = item.export_to_html() + new_segment["data"].append( + { + "html_seq": table_html, + "otsl_seq": "", + } + ) + + segments.append(new_segment) + + return segments + + def _process_page_cells(page: Page): + cells: List[dict] = [] + if page.size is None: + return cells + for cell in page.cells: + new_bbox = ( + cell.rect.to_bounding_box() + .to_top_left_origin(page_height=page.size.height) + .normalized(page_size=page.size) + ) + is_ocr = cell.from_ocr + ocr_confidence = cell.confidence + cells.append( + { + "text": cell.text, + "bbox": new_bbox.as_tuple(), + "ocr": is_ocr, + "ocr_confidence": ocr_confidence, + } + ) + return cells + + def _process_page(): + page_ix = page_no - 1 + page = doc_result.pages[page_ix] + + page_cells = _process_page_cells(page=page) + page_segments = _process_page_segments(doc_items=doc_items, page=page) + content_md = doc.export_to_markdown( + main_text_start=start_ix, main_text_stop=end_ix + ) + # No page-tagging since we only do 1 page at the time + content_dt = doc.export_to_document_tokens( + main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False + ) + + return content_text, content_md, content_dt, page_cells, page_segments, page + + if doc.main_text is None: + return + for ix, orig_item in enumerate(doc.main_text): + item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else 
orig_item
+ if item is None or item.prov is None or len(item.prov) == 0:
+ _log.debug(f"Skipping item {orig_item}")
+ continue
+
+ item_page = item.prov[0].page
+
+ # Page is complete
+ if page_no > 0 and item_page > page_no:
+ yield _process_page()
+
+ start_ix = ix
+ doc_items = []
+ content_text = ""
+
+ page_no = item_page
+ end_ix = ix
+ doc_items.append((ix, item))
+ if item.text is not None and item.text != "":
+ content_text += item.text + " "
+
+ if len(doc_items) > 0:
+ yield _process_page()
diff --git a/test/data/chunker_repo/repos/docling/jats_backend.py b/test/data/chunker_repo/repos/docling/jats_backend.py
new file mode 100755
index 00000000..def95865
--- /dev/null
+++ b/test/data/chunker_repo/repos/docling/jats_backend.py
@@ -0,0 +1,718 @@
+import logging
+import traceback
+from io import BytesIO
+from pathlib import Path
+from typing import Final, Optional, Union
+
+from bs4 import BeautifulSoup, Tag
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+from lxml import etree
+from typing_extensions import TypedDict, override
+
+from docling_core.types.doc import (
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ GroupItem,
+ GroupLabel,
+ NodeItem,
+ TextItem,
+)
+
+_log = logging.getLogger(__name__)
+
+JATS_DTD_URL: Final = ["JATS-journalpublishing", "JATS-archive"]
+DEFAULT_HEADER_ACKNOWLEDGMENTS: Final = "Acknowledgments"
+DEFAULT_HEADER_ABSTRACT: Final = "Abstract"
+DEFAULT_HEADER_REFERENCES: Final = "References"
+DEFAULT_TEXT_ETAL: Final = "et al."
+
+
+class Abstract(TypedDict):
+ label: str
+ content: str
+
+
+class Author(TypedDict):
+ name: str
+ affiliation_names: list[str]
+
+
+class Citation(TypedDict):
+ author_names: str
+ title: str
+ source: str
+ year: str
+ volume: str
+ page: str
+ pub_id: str
+ publisher_name: str
+ publisher_loc: str
+
+
+class Table(TypedDict):
+ label: str
+ caption: str
+ content: str
+
+
+class XMLComponents(TypedDict):
+ title: str
+ authors: list[Author]
+ abstract: list[Abstract]
+
+
+class JatsDocumentBackend(DeclarativeDocumentBackend):
+ """Backend to parse articles in XML format tagged according to the JATS definition.
+
+ The Journal Article Tag Suite (JATS) is a definition standard for the
+ representation of journal articles in XML format. Several publishers and journal
+ archives provide content in JATS format, including PubMed Central® (PMC), bioRxiv,
+ medRxiv, or Springer Nature.
+
+ Refer to https://jats.nlm.nih.gov for more details on JATS.
+
+ The code from this document backend has been developed by modifying parts of the
+ PubMed Parser library (version 0.5.0, released on 12.08.2024):
+ Achakulvisut et al., (2020).
+ Pubmed Parser: A Python Parser for PubMed Open-Access XML Subset and MEDLINE XML
+ Dataset. 
+ Journal of Open Source Software, 5(46), 1979, + https://doi.org/10.21105/joss.01979 + """ + + @override + def __init__( + self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] + ) -> None: + super().__init__(in_doc, path_or_stream) + self.path_or_stream = path_or_stream + + # Initialize the root of the document hierarchy + self.root: Optional[NodeItem] = None + self.hlevel: int = 0 + self.valid: bool = False + try: + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.seek(0) + self.tree: etree._ElementTree = etree.parse(self.path_or_stream) + + doc_info: etree.DocInfo = self.tree.docinfo + if doc_info.system_url and any( + kwd in doc_info.system_url for kwd in JATS_DTD_URL + ): + self.valid = True + return + for ent in doc_info.internalDTD.iterentities(): + if ent.system_url and any( + kwd in ent.system_url for kwd in JATS_DTD_URL + ): + self.valid = True + return + except Exception as exc: + raise RuntimeError( + f"Could not initialize JATS backend for file with hash {self.document_hash}." + ) from exc + + @override + def is_valid(self) -> bool: + return self.valid + + @classmethod + @override + def supports_pagination(cls) -> bool: + return False + + @override + def unload(self): + if isinstance(self.path_or_stream, BytesIO): + self.path_or_stream.close() + self.path_or_stream = None + + @classmethod + @override + def supported_formats(cls) -> set[InputFormat]: + return {InputFormat.XML_JATS} + + @override + def convert(self) -> DoclingDocument: + try: + # Create empty document + origin = DocumentOrigin( + filename=self.file.name or "file", + mimetype="application/xml", + binary_hash=self.document_hash, + ) + doc = DoclingDocument(name=self.file.stem or "file", origin=origin) + self.hlevel = 0 + + # Get metadata XML components + xml_components: XMLComponents = self._parse_metadata() + + # Add metadata to the document + self._add_metadata(doc, xml_components) + + # walk over the XML body + body = self.tree.xpath("//body") + if self.root and len(body) > 0: + self._walk_linear(doc, self.root, body[0]) + + # walk over the XML back matter + back = self.tree.xpath("//back") + if self.root and len(back) > 0: + self._walk_linear(doc, self.root, back[0]) + except Exception: + _log.error(traceback.format_exc()) + + return doc + + @staticmethod + def _get_text(node: etree._Element, sep: Optional[str] = None) -> str: + skip_tags = ["term", "disp-formula", "inline-formula"] + text: str = ( + node.text.replace("\n", " ") + if (node.tag not in skip_tags and node.text) + else "" + ) + for child in list(node): + if child.tag not in skip_tags: + # TODO: apply styling according to child.tag when supported by docling-core + text += JatsDocumentBackend._get_text(child, sep) + if sep: + text = text.rstrip(sep) + sep + text += child.tail.replace("\n", " ") if child.tail else "" + + return text + + def _find_metadata(self) -> Optional[etree._Element]: + meta_names: list[str] = ["article-meta", "book-part-meta"] + meta: Optional[etree._Element] = None + for name in meta_names: + node = self.tree.xpath(f".//{name}") + if len(node) > 0: + meta = node[0] + break + + return meta + + def _parse_abstract(self) -> list[Abstract]: + # TODO: address cases with multiple sections + abs_list: list[Abstract] = [] + + for abs_node in self.tree.xpath(".//abstract"): + abstract: Abstract = dict(label="", content="") + texts = [] + for abs_par in abs_node.xpath("p"): + texts.append(JatsDocumentBackend._get_text(abs_par).strip()) + abstract["content"] = " ".join(texts) + + label_node = 
abs_node.xpath("title|label") + if len(label_node) > 0: + abstract["label"] = label_node[0].text.strip() + + abs_list.append(abstract) + + return abs_list + + def _parse_authors(self) -> list[Author]: + # Get mapping between affiliation ids and names + authors: list[Author] = [] + meta: Optional[etree._Element] = self._find_metadata() + if meta is None: + return authors + + affiliation_names = [] + for affiliation_node in meta.xpath(".//aff[@id]"): + aff = ", ".join([t for t in affiliation_node.itertext() if t.strip()]) + aff = aff.replace("\n", " ") + label = affiliation_node.xpath("label") + if label: + # TODO: once superscript is supported, add label with formatting + aff = aff.removeprefix(f"{label[0].text}, ") + affiliation_names.append(aff) + affiliation_ids_names = dict( + zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) + ) + + # Get author names and affiliation names + for author_node in meta.xpath( + './/contrib-group/contrib[@contrib-type="author"]' + ): + author: Author = { + "name": "", + "affiliation_names": [], + } + + # Affiliation names + affiliation_ids = [ + a.attrib["rid"] for a in author_node.xpath('xref[@ref-type="aff"]') + ] + for id in affiliation_ids: + if id in affiliation_ids_names: + author["affiliation_names"].append(affiliation_ids_names[id]) + + # Name + author["name"] = ( + author_node.xpath("name/given-names")[0].text + + " " + + author_node.xpath("name/surname")[0].text + ) + + authors.append(author) + + return authors + + def _parse_title(self) -> str: + meta_names: list[str] = [ + "article-meta", + "collection-meta", + "book-meta", + "book-part-meta", + ] + title_names: list[str] = ["article-title", "subtitle", "title", "label"] + titles: list[str] = [ + " ".join( + elem.text.replace("\n", " ").strip() + for elem in list(title_node) + if elem.tag in title_names + ).strip() + for title_node in self.tree.xpath( + "|".join([f".//{item}/title-group" for item in meta_names]) + ) + ] + + text = " - ".join(titles) + + return text + + def _parse_metadata(self) -> XMLComponents: + """Parsing JATS document metadata.""" + xml_components: XMLComponents = { + "title": self._parse_title(), + "authors": self._parse_authors(), + "abstract": self._parse_abstract(), + } + return xml_components + + def _add_abstract( + self, doc: DoclingDocument, xml_components: XMLComponents + ) -> None: + for abstract in xml_components["abstract"]: + text: str = abstract["content"] + title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT + if not text: + continue + parent = doc.add_heading( + parent=self.root, text=title, level=self.hlevel + 1 + ) + doc.add_text( + parent=parent, + text=text, + label=DocItemLabel.TEXT, + ) + + return + + def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None: + # TODO: once docling supports text formatting, add affiliation reference to + # author names through superscripts + authors: list = [item["name"] for item in xml_components["authors"]] + authors_str = ", ".join(authors) + affiliations: list = [ + item + for author in xml_components["authors"] + for item in author["affiliation_names"] + ] + affiliations_str = "; ".join(list(dict.fromkeys(affiliations))) + if authors_str: + doc.add_text( + parent=self.root, + text=authors_str, + label=DocItemLabel.PARAGRAPH, + ) + if affiliations_str: + doc.add_text( + parent=self.root, + text=affiliations_str, + label=DocItemLabel.PARAGRAPH, + ) + + return + + def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None: + if isinstance(parent, GroupItem) 
and parent.label == GroupLabel.LIST: + doc.add_list_item(text=text, enumerated=False, parent=parent) + else: + doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent) + + return + + def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901 + citation: Citation = { + "author_names": "", + "title": "", + "source": "", + "year": "", + "volume": "", + "page": "", + "pub_id": "", + "publisher_name": "", + "publisher_loc": "", + } + + _log.debug("Citation parsing started") + + # Author names + names = [] + for name_node in node.xpath(".//name"): + name_str = ( + name_node.xpath("surname")[0].text.replace("\n", " ").strip() + + " " + + name_node.xpath("given-names")[0].text.replace("\n", " ").strip() + ) + names.append(name_str) + etal_node = node.xpath(".//etal") + if len(etal_node) > 0: + etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL + names.append(etal_text) + citation["author_names"] = ", ".join(names) + + titles: list[str] = [ + "article-title", + "chapter-title", + "data-title", + "issue-title", + "part-title", + "trans-title", + ] + title_node: Optional[etree._Element] = None + for name in titles: + name_node = node.xpath(name) + if len(name_node) > 0: + title_node = name_node[0] + break + citation["title"] = ( + JatsDocumentBackend._get_text(title_node) + if title_node is not None + else node.text.replace("\n", " ").strip() + ) + + # Journal, year, publisher name, publisher location, volume, elocation + fields: list[str] = [ + "source", + "year", + "publisher-name", + "publisher-loc", + "volume", + ] + for item in fields: + item_node = node.xpath(item) + if len(item_node) > 0: + citation[item.replace("-", "_")] = ( # type: ignore[literal-required] + item_node[0].text.replace("\n", " ").strip() + ) + + # Publication identifier + if len(node.xpath("pub-id")) > 0: + pub_id: list[str] = [] + for id_node in node.xpath("pub-id"): + id_type = id_node.get("assigning-authority") or id_node.get( + "pub-id-type" + ) + id_text = id_node.text + if id_type and id_text: + pub_id.append( + id_type.replace("\n", " ").strip().upper() + + ": " + + id_text.replace("\n", " ").strip() + ) + if pub_id: + citation["pub_id"] = ", ".join(pub_id) + + # Pages + if len(node.xpath("elocation-id")) > 0: + citation["page"] = ( + node.xpath("elocation-id")[0].text.replace("\n", " ").strip() + ) + elif len(node.xpath("fpage")) > 0: + citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip() + if len(node.xpath("lpage")) > 0: + citation["page"] += ( + "–" + + node.xpath("lpage")[0] + .text.replace("\n", " ") + .strip() # noqa: RUF001 + ) + + # Flatten the citation to string + + text = "" + if citation["author_names"]: + text += citation["author_names"].rstrip(".") + ". " + if citation["title"]: + text += citation["title"] + ". " + if citation["source"]: + text += citation["source"] + ". " + if citation["publisher_name"]: + if citation["publisher_loc"]: + text += f"{citation['publisher_loc']}: " + text += citation["publisher_name"] + ". " + if citation["volume"]: + text = text.rstrip(". ") + text += f" {citation['volume']}. " + if citation["page"]: + text = text.rstrip(". ") + if citation["volume"]: + text += ":" + text += citation["page"] + ". " + if citation["year"]: + text = text.rstrip(". ") + text += f" ({citation['year']})." + if citation["pub_id"]: + text = text.rstrip(".") + ". 
" + text += citation["pub_id"] + + _log.debug("Citation flattened") + + return text + + def _add_equation( + self, doc: DoclingDocument, parent: NodeItem, node: etree._Element + ) -> None: + math_text = node.text + math_parts = math_text.split("$$") + if len(math_parts) == 3: + math_formula = math_parts[1] + doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent) + + return + + def _add_figure_captions( + self, doc: DoclingDocument, parent: NodeItem, node: etree._Element + ) -> None: + label_node = node.xpath("label") + label: Optional[str] = ( + JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else "" + ) + + caption_node = node.xpath("caption") + caption: Optional[str] + if len(caption_node) > 0: + caption = "" + for caption_par in list(caption_node[0]): + if caption_par.xpath(".//supplementary-material"): + continue + caption += JatsDocumentBackend._get_text(caption_par).strip() + " " + caption = caption.strip() + else: + caption = None + + # TODO: format label vs caption once styling is supported + fig_text: str = f"{label}{' ' if label and caption else ''}{caption}" + fig_caption: Optional[TextItem] = ( + doc.add_text(label=DocItemLabel.CAPTION, text=fig_text) + if fig_text + else None + ) + + doc.add_picture(parent=parent, caption=fig_caption) + + return + + # TODO: add footnotes when DocItemLabel.FOOTNOTE and styling are supported + # def _add_footnote_group(self, doc: DoclingDocument, parent: NodeItem, node: etree._Element) -> None: + # new_parent = doc.add_group(label=GroupLabel.LIST, name="footnotes", parent=parent) + # for child in node.iterchildren(tag="fn"): + # text = JatsDocumentBackend._get_text(child) + # doc.add_list_item(text=text, parent=new_parent) + + def _add_metadata( + self, doc: DoclingDocument, xml_components: XMLComponents + ) -> None: + self._add_title(doc, xml_components) + self._add_authors(doc, xml_components) + self._add_abstract(doc, xml_components) + + return + + def _add_table( + self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table + ) -> None: + soup = BeautifulSoup(table_xml_component["content"], "html.parser") + table_tag = soup.find("table") + if not isinstance(table_tag, Tag): + return + + data = HTMLDocumentBackend.parse_table_data(table_tag) + + # TODO: format label vs caption once styling is supported + label = table_xml_component["label"] + caption = table_xml_component["caption"] + table_text: str = f"{label}{' ' if label and caption else ''}{caption}" + table_caption: Optional[TextItem] = ( + doc.add_text(label=DocItemLabel.CAPTION, text=table_text) + if table_text + else None + ) + + if data is not None: + doc.add_table(data=data, parent=parent, caption=table_caption) + + return + + def _add_tables( + self, doc: DoclingDocument, parent: NodeItem, node: etree._Element + ) -> None: + table: Table = {"label": "", "caption": "", "content": ""} + + # Content + if len(node.xpath("table")) > 0: + table_content_node = node.xpath("table")[0] + elif len(node.xpath("alternatives/table")) > 0: + table_content_node = node.xpath("alternatives/table")[0] + else: + table_content_node = None + if table_content_node is not None: + table["content"] = etree.tostring(table_content_node).decode("utf-8") + + # Caption + caption_node = node.xpath("caption") + caption: Optional[str] + if caption_node: + caption = "" + for caption_par in list(caption_node[0]): + if caption_par.xpath(".//supplementary-material"): + continue + caption += JatsDocumentBackend._get_text(caption_par).strip() + " " + caption = 
caption.strip() + else: + caption = None + if caption is not None: + table["caption"] = caption + + # Label + if len(node.xpath("label")) > 0: + table["label"] = node.xpath("label")[0].text + + try: + self._add_table(doc, parent, table) + except Exception: + _log.warning(f"Skipping unsupported table in {self.file!s}") + + return + + def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None: + self.root = doc.add_text( + parent=None, + text=xml_components["title"], + label=DocItemLabel.TITLE, + ) + return + + def _walk_linear( # noqa: C901 + self, doc: DoclingDocument, parent: NodeItem, node: etree._Element + ) -> str: + skip_tags = ["term"] + flush_tags = ["ack", "sec", "list", "boxed-text", "disp-formula", "fig"] + new_parent: NodeItem = parent + node_text: str = ( + node.text.replace("\n", " ") + if (node.tag not in skip_tags and node.text) + else "" + ) + + for child in list(node): + stop_walk: bool = False + + # flush text into TextItem for some tags in paragraph nodes + if node.tag == "p" and node_text.strip() and child.tag in flush_tags: + doc.add_text( + label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent + ) + node_text = "" + + # add elements and decide whether to stop walking + if child.tag in ("sec", "ack"): + header = child.xpath("title|label") + text: Optional[str] = None + if len(header) > 0: + text = JatsDocumentBackend._get_text(header[0]) + elif child.tag == "ack": + text = DEFAULT_HEADER_ACKNOWLEDGMENTS + if text: + self.hlevel += 1 + new_parent = doc.add_heading( + text=text, parent=parent, level=self.hlevel + ) + elif child.tag == "list": + new_parent = doc.add_group( + label=GroupLabel.LIST, name="list", parent=parent + ) + elif child.tag == "list-item": + # TODO: address any type of content (another list, formula,...) 
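+ # (for now, any nested content is flattened into the item's plain text)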
+ # TODO: address list type and item label
+ text = JatsDocumentBackend._get_text(child).strip()
+ new_parent = doc.add_list_item(text=text, parent=parent)
+ stop_walk = True
+ elif child.tag == "fig":
+ self._add_figure_captions(doc, parent, child)
+ stop_walk = True
+ elif child.tag == "table-wrap":
+ self._add_tables(doc, parent, child)
+ stop_walk = True
+ elif child.tag == "supplementary-material":
+ stop_walk = True
+ elif child.tag == "fn-group":
+ # header = child.xpath(".//title") or child.xpath(".//label")
+ # if header:
+ # text = JatsDocumentBackend._get_text(header[0])
+ # fn_parent = doc.add_heading(text=text, parent=new_parent)
+ # self._add_footnote_group(doc, fn_parent, child)
+ stop_walk = True
+ elif child.tag == "ref-list" and node.tag != "ref-list":
+ header = child.xpath("title|label")
+ text = (
+ JatsDocumentBackend._get_text(header[0])
+ if len(header) > 0
+ else DEFAULT_HEADER_REFERENCES
+ )
+ new_parent = doc.add_heading(text=text, parent=parent)
+ new_parent = doc.add_group(
+ parent=new_parent, label=GroupLabel.LIST, name="list"
+ )
+ elif child.tag == "element-citation":
+ text = self._parse_element_citation(child)
+ self._add_citation(doc, parent, text)
+ stop_walk = True
+ elif child.tag == "mixed-citation":
+ text = JatsDocumentBackend._get_text(child).strip()
+ self._add_citation(doc, parent, text)
+ stop_walk = True
+ elif child.tag == "tex-math":
+ self._add_equation(doc, parent, child)
+ stop_walk = True
+ elif child.tag == "inline-formula":
+ # TODO: address inline formulas when supported by docling-core
+ stop_walk = True
+
+ # step into child
+ if not stop_walk:
+ new_text = self._walk_linear(doc, new_parent, child)
+ if not (node.getparent().tag == "p" and node.tag in flush_tags):
+ node_text += new_text
+ if child.tag in ("sec", "ack") and text:
+ self.hlevel -= 1
+
+ # pick up the tail text
+ node_text += child.tail.replace("\n", " ") if child.tail else ""
+
+ # create paragraph
+ if node.tag == "p" and node_text.strip():
+ doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)
+ return ""
+ else:
+ # backpropagate the text
+ return node_text
diff --git a/test/data/chunker_repo/repos/jquery/access.js b/test/data/chunker_repo/repos/jquery/access.js
new file mode 100644
index 00000000..86bac126
--- /dev/null
+++ b/test/data/chunker_repo/repos/jquery/access.js
@@ -0,0 +1,63 @@
+import { jQuery } from "../core.js";
+import { toType } from "../core/toType.js";
+
+// Multifunctional method to get and set values of a collection
+// The value/s can optionally be executed if it's a function
+export function access( elems, fn, key, value, chainable, emptyGet, raw ) {
+ var i = 0,
+ len = elems.length,
+ bulk = key == null;
+
+ // Sets many values
+ if ( toType( key ) === "object" ) {
+ chainable = true;
+ for ( i in key ) {
+ access( elems, fn, i, key[ i ], true, emptyGet, raw );
+ }
+
+ // Sets one value
+ } else if ( value !== undefined ) {
+ chainable = true;
+
+ if ( typeof value !== "function" ) {
+ raw = true;
+ }
+
+ if ( bulk ) {
+
+ // Bulk operations run against the entire set
+ if ( raw ) {
+ fn.call( elems, value );
+ fn = null;
+
+ // ...except when executing function values
+ } else {
+ bulk = fn;
+ fn = function( elem, _key, value ) {
+ return bulk.call( jQuery( elem ), value );
+ };
+ }
+ }
+
+ if ( fn ) {
+ for ( ; i < len; i++ ) {
+ fn(
+ elems[ i ], key, raw ? 
+ value : + value.call( elems[ i ], i, fn( elems[ i ], key ) ) + ); + } + } + } + + if ( chainable ) { + return elems; + } + + // Gets + if ( bulk ) { + return fn.call( elems ); + } + + return len ? fn( elems[ 0 ], key ) : emptyGet; +} diff --git a/test/data/chunker_repo/repos/jquery/data.js b/test/data/chunker_repo/repos/jquery/data.js new file mode 100644 index 00000000..ce2813e9 --- /dev/null +++ b/test/data/chunker_repo/repos/jquery/data.js @@ -0,0 +1,175 @@ +import { jQuery } from "./core.js"; +import { access } from "./core/access.js"; +import { camelCase } from "./core/camelCase.js"; +import { dataPriv } from "./data/var/dataPriv.js"; +import { dataUser } from "./data/var/dataUser.js"; + +// Implementation Summary +// +// 1. Enforce API surface and semantic compatibility with 1.9.x branch +// 2. Improve the module's maintainability by reducing the storage +// paths to a single mechanism. +// 3. Use the same single mechanism to support "private" and "user" data. +// 4. _Never_ expose "private" data to user code (TODO: Drop _data, _removeData) +// 5. Avoid exposing implementation details on user objects (eg. expando properties) +// 6. Provide a clear path for implementation upgrade to WeakMap in 2014 + +var rbrace = /^(?:\{[\w\W]*\}|\[[\w\W]*\])$/, + rmultiDash = /[A-Z]/g; + +function getData( data ) { + if ( data === "true" ) { + return true; + } + + if ( data === "false" ) { + return false; + } + + if ( data === "null" ) { + return null; + } + + // Only convert to a number if it doesn't change the string + if ( data === +data + "" ) { + return +data; + } + + if ( rbrace.test( data ) ) { + return JSON.parse( data ); + } + + return data; +} + +function dataAttr( elem, key, data ) { + var name; + + // If nothing was found internally, try to fetch any + // data from the HTML5 data-* attribute + if ( data === undefined && elem.nodeType === 1 ) { + name = "data-" + key.replace( rmultiDash, "-$&" ).toLowerCase(); + data = elem.getAttribute( name ); + + if ( typeof data === "string" ) { + try { + data = getData( data ); + } catch ( e ) {} + + // Make sure we set the data so it isn't changed later + dataUser.set( elem, key, data ); + } else { + data = undefined; + } + } + return data; +} + +jQuery.extend( { + hasData: function( elem ) { + return dataUser.hasData( elem ) || dataPriv.hasData( elem ); + }, + + data: function( elem, name, data ) { + return dataUser.access( elem, name, data ); + }, + + removeData: function( elem, name ) { + dataUser.remove( elem, name ); + }, + + // TODO: Now that all calls to _data and _removeData have been replaced + // with direct calls to dataPriv methods, these can be deprecated. 
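+ // New code should call dataPriv.access and dataPriv.remove directly instead.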
+ _data: function( elem, name, data ) { + return dataPriv.access( elem, name, data ); + }, + + _removeData: function( elem, name ) { + dataPriv.remove( elem, name ); + } +} ); + +jQuery.fn.extend( { + data: function( key, value ) { + var i, name, data, + elem = this[ 0 ], + attrs = elem && elem.attributes; + + // Gets all values + if ( key === undefined ) { + if ( this.length ) { + data = dataUser.get( elem ); + + if ( elem.nodeType === 1 && !dataPriv.get( elem, "hasDataAttrs" ) ) { + i = attrs.length; + while ( i-- ) { + + // Support: IE 11+ + // The attrs elements can be null (trac-14894) + if ( attrs[ i ] ) { + name = attrs[ i ].name; + if ( name.indexOf( "data-" ) === 0 ) { + name = camelCase( name.slice( 5 ) ); + dataAttr( elem, name, data[ name ] ); + } + } + } + dataPriv.set( elem, "hasDataAttrs", true ); + } + } + + return data; + } + + // Sets multiple values + if ( typeof key === "object" ) { + return this.each( function() { + dataUser.set( this, key ); + } ); + } + + return access( this, function( value ) { + var data; + + // The calling jQuery object (element matches) is not empty + // (and therefore has an element appears at this[ 0 ]) and the + // `value` parameter was not undefined. An empty jQuery object + // will result in `undefined` for elem = this[ 0 ] which will + // throw an exception if an attempt to read a data cache is made. + if ( elem && value === undefined ) { + + // Attempt to get data from the cache + // The key will always be camelCased in Data + data = dataUser.get( elem, key ); + if ( data !== undefined ) { + return data; + } + + // Attempt to "discover" the data in + // HTML5 custom data-* attrs + data = dataAttr( elem, key ); + if ( data !== undefined ) { + return data; + } + + // We tried really hard, but the data doesn't exist. + return; + } + + // Set the data... + this.each( function() { + + // We always store the camelCased key + dataUser.set( this, key, value ); + } ); + }, null, value, arguments.length > 1, null, true ); + }, + + removeData: function( key ) { + return this.each( function() { + dataUser.remove( this, key ); + } ); + } +} ); + +export { jQuery, jQuery as $ }; diff --git a/test/data/chunker_repo/repos/jquery/serialize.js b/test/data/chunker_repo/repos/jquery/serialize.js new file mode 100644 index 00000000..704fe09b --- /dev/null +++ b/test/data/chunker_repo/repos/jquery/serialize.js @@ -0,0 +1,129 @@ +import { jQuery } from "./core.js"; +import { toType } from "./core/toType.js"; +import { rcheckableType } from "./var/rcheckableType.js"; + +import "./core/init.js"; +import "./traversing.js"; // filter +import "./attributes/prop.js"; + +var + rbracket = /\[\]$/, + rCRLF = /\r?\n/g, + rsubmitterTypes = /^(?:submit|button|image|reset|file)$/i, + rsubmittable = /^(?:input|select|textarea|keygen)/i; + +function buildParams( prefix, obj, traditional, add ) { + var name; + + if ( Array.isArray( obj ) ) { + + // Serialize array item. + jQuery.each( obj, function( i, v ) { + if ( traditional || rbracket.test( prefix ) ) { + + // Treat each array item as a scalar. + add( prefix, v ); + + } else { + + // Item is non-scalar (array or object), encode its numeric index. + buildParams( + prefix + "[" + ( typeof v === "object" && v != null ? i : "" ) + "]", + v, + traditional, + add + ); + } + } ); + + } else if ( !traditional && toType( obj ) === "object" ) { + + // Serialize object item. + for ( name in obj ) { + buildParams( prefix + "[" + name + "]", obj[ name ], traditional, add ); + } + + } else { + + // Serialize scalar item. 
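+		// Base case of the recursion: e.g. buildParams( "a[b]", 1, false, add )
+		// lands here and emits a single key/value pair via add().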
+ add( prefix, obj ); + } +} + +// Serialize an array of form elements or a set of +// key/values into a query string +jQuery.param = function( a, traditional ) { + var prefix, + s = [], + add = function( key, valueOrFunction ) { + + // If value is a function, invoke it and use its return value + var value = typeof valueOrFunction === "function" ? + valueOrFunction() : + valueOrFunction; + + s[ s.length ] = encodeURIComponent( key ) + "=" + + encodeURIComponent( value == null ? "" : value ); + }; + + if ( a == null ) { + return ""; + } + + // If an array was passed in, assume that it is an array of form elements. + if ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) { + + // Serialize the form elements + jQuery.each( a, function() { + add( this.name, this.value ); + } ); + + } else { + + // If traditional, encode the "old" way (the way 1.3.2 or older + // did it), otherwise encode params recursively. + for ( prefix in a ) { + buildParams( prefix, a[ prefix ], traditional, add ); + } + } + + // Return the resulting serialization + return s.join( "&" ); +}; + +jQuery.fn.extend( { + serialize: function() { + return jQuery.param( this.serializeArray() ); + }, + serializeArray: function() { + return this.map( function() { + + // Can add propHook for "elements" to filter or add form elements + var elements = jQuery.prop( this, "elements" ); + return elements ? jQuery.makeArray( elements ) : this; + } ).filter( function() { + var type = this.type; + + // Use .is( ":disabled" ) so that fieldset[disabled] works + return this.name && !jQuery( this ).is( ":disabled" ) && + rsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) && + ( this.checked || !rcheckableType.test( type ) ); + } ).map( function( _i, elem ) { + var val = jQuery( this ).val(); + + if ( val == null ) { + return null; + } + + if ( Array.isArray( val ) ) { + return jQuery.map( val, function( val ) { + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ); + } + + return { name: elem.name, value: val.replace( rCRLF, "\r\n" ) }; + } ).get(); + } +} ); + +export { jQuery, jQuery as $ }; diff --git a/test/data/chunker_repo/repos/json-c/json_pointer.c b/test/data/chunker_repo/repos/json-c/json_pointer.c new file mode 100644 index 00000000..6e5609d7 --- /dev/null +++ b/test/data/chunker_repo/repos/json-c/json_pointer.c @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2016 Alexandru Ardelean. + * + * This is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See COPYING for details. 
+ *
+ */

+#include "config.h"
+
+#include "strerror_override.h"
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "json_object_private.h"
+#include "json_pointer.h"
+#include "json_pointer_private.h"
+#include "strdup_compat.h"
+#include "vasprintf_compat.h"
+
+/* Avoid ctype.h and locale overhead */
+#define is_plain_digit(c) ((c) >= '0' && (c) <= '9')
+
+/**
+ * JavaScript Object Notation (JSON) Pointer
+ *   RFC 6901 - https://tools.ietf.org/html/rfc6901
+ */
+
+static void string_replace_all_occurrences_with_char(char *s, const char *occur, char repl_char)
+{
+	size_t slen = strlen(s);
+	size_t skip = strlen(occur) - 1; /* length of the occurrence, minus the char we're replacing */
+	char *p = s;
+	while ((p = strstr(p, occur)))
+	{
+		*p = repl_char;
+		p++;
+		slen -= skip;
+		memmove(p, (p + skip), slen - (p - s) + 1); /* includes null char too */
+	}
+}
+
+static int is_valid_index(const char *path, size_t *idx)
+{
+	size_t i, len = strlen(path);
+	/* this code-path optimizes a bit, for when we reference the 0-9 index range
+	 * in a JSON array and because leading zeros not allowed
+	 */
+	if (len == 1)
+	{
+		if (is_plain_digit(path[0]))
+		{
+			*idx = (path[0] - '0');
+			return 1;
+		}
+		errno = EINVAL;
+		return 0;
+	}
+	/* leading zeros not allowed per RFC */
+	if (path[0] == '0')
+	{
+		errno = EINVAL;
+		return 0;
+	}
+	/* RFC states base-10 decimals */
+	for (i = 0; i < len; i++)
+	{
+		if (!is_plain_digit(path[i]))
+		{
+			errno = EINVAL;
+			return 0;
+		}
+	}
+
+	// We know it's all digits, so the only error case here is overflow,
+	// but ULLONG_MAX will be longer than any array length so that's ok.
+	*idx = strtoull(path, NULL, 10);
+
+	return 1;
+}
+
+static int json_pointer_get_single_path(struct json_object *obj, char *path,
+                                        struct json_object **value, size_t *idx)
+{
+	if (json_object_is_type(obj, json_type_array))
+	{
+		if (!is_valid_index(path, idx))
+			return -1;
+		if (*idx >= json_object_array_length(obj))
+		{
+			errno = ENOENT;
+			return -1;
+		}
+
+		obj = json_object_array_get_idx(obj, *idx);
+		if (obj)
+		{
+			if (value)
+				*value = obj;
+			return 0;
+		}
+		/* Entry not found */
+		errno = ENOENT;
+		return -1;
+	}
+
+	/* RFC states that we first must eval all ~1 then all ~0 */
+	string_replace_all_occurrences_with_char(path, "~1", '/');
+	string_replace_all_occurrences_with_char(path, "~0", '~');
+
+	if (!json_object_object_get_ex(obj, path, value))
+	{
+		errno = ENOENT;
+		return -1;
+	}
+
+	return 0;
+}
+
+static int json_object_array_put_idx_cb(struct json_object *parent, size_t idx,
+                                        struct json_object *value, void *priv)
+{
+	return json_object_array_put_idx(parent, idx, value);
+}
+
+static int json_pointer_set_single_path(struct json_object *parent, const char *path,
+                                        struct json_object *value,
+                                        json_pointer_array_set_cb array_set_cb, void *priv)
+{
+	if (json_object_is_type(parent, json_type_array))
+	{
+		size_t idx;
+		/* RFC (Chapter 4) states that '-' may be used to add new elements to an array */
+		if (path[0] == '-' && path[1] == '\0')
+			return json_object_array_add(parent, value);
+		if (!is_valid_index(path, &idx))
+			return -1;
+		return array_set_cb(parent, idx, value, priv);
+	}
+
+	/* path replacements should have been done in json_pointer_get_single_path(),
+	 * and we should still be good here
+	 */
+	if (json_object_is_type(parent, json_type_object))
+		return json_object_object_add(parent, path, value);
+
+	/* Getting here means that we tried to "dereference" a primitive JSON type
+	 * (like string, int, bool).i.e.
add a sub-object to it + */ + errno = ENOENT; + return -1; +} + +static int json_pointer_result_get_recursive(struct json_object *obj, char *path, + struct json_pointer_get_result *res) +{ + struct json_object *parent_obj = obj; + size_t idx = 0; + char *endp; + int rc; + + /* All paths (on each recursion level must have a leading '/' */ + if (path[0] != '/') + { + errno = EINVAL; + return -1; + } + path++; + + endp = strchr(path, '/'); + if (endp) + *endp = '\0'; + + /* If we err-ed here, return here */ + if ((rc = json_pointer_get_single_path(obj, path, &obj, &idx))) + return rc; + + if (endp) + { + /* Put the slash back, so that the sanity check passes on next recursion level */ + *endp = '/'; + return json_pointer_result_get_recursive(obj, endp, res); + } + + /* We should be at the end of the recursion here */ + if (res) { + res->parent = parent_obj; + res->obj = obj; + if (json_object_is_type(res->parent, json_type_array)) + res->index_in_parent = idx; + else + res->key_in_parent = path; + } + + return 0; +} + +static int json_pointer_object_get_recursive(struct json_object *obj, char *path, + struct json_object **value) +{ + struct json_pointer_get_result res; + int rc; + + rc = json_pointer_result_get_recursive(obj, path, &res); + if (rc) + return rc; + + if (value) + *value = res.obj; + + return 0; +} + +int json_pointer_get_internal(struct json_object *obj, const char *path, + struct json_pointer_get_result *res) +{ + char *path_copy = NULL; + int rc; + + if (!obj || !path) + { + errno = EINVAL; + return -1; + } + + if (path[0] == '\0') + { + res->parent = NULL; + res->obj = obj; + res->key_in_parent = NULL; + res->index_in_parent = UINT32_MAX; + return 0; + } + + /* pass a working copy to the recursive call */ + if (!(path_copy = strdup(path))) + { + errno = ENOMEM; + return -1; + } + rc = json_pointer_result_get_recursive(obj, path_copy, res); + /* re-map the path string to the const-path string */ + if (rc == 0 && json_object_is_type(res->parent, json_type_object) && res->key_in_parent) + res->key_in_parent = path + (res->key_in_parent - path_copy); + free(path_copy); + + return rc; +} + +int json_pointer_get(struct json_object *obj, const char *path, struct json_object **res) +{ + struct json_pointer_get_result jpres; + int rc; + + rc = json_pointer_get_internal(obj, path, &jpres); + if (rc) + return rc; + + if (res) + *res = jpres.obj; + + return 0; +} + +int json_pointer_getf(struct json_object *obj, struct json_object **res, const char *path_fmt, ...) 
+{ + char *path_copy = NULL; + int rc = 0; + va_list args; + + if (!obj || !path_fmt) + { + errno = EINVAL; + return -1; + } + + va_start(args, path_fmt); + rc = vasprintf(&path_copy, path_fmt, args); + va_end(args); + + if (rc < 0) + return rc; + + if (path_copy[0] == '\0') + { + if (res) + *res = obj; + goto out; + } + + rc = json_pointer_object_get_recursive(obj, path_copy, res); +out: + free(path_copy); + + return rc; +} + +int json_pointer_set_with_array_cb(struct json_object **obj, const char *path, + struct json_object *value, + json_pointer_array_set_cb array_set_cb, void *priv) +{ + const char *endp; + char *path_copy = NULL; + struct json_object *set = NULL; + int rc; + + if (!obj || !path) + { + errno = EINVAL; + return -1; + } + + if (path[0] == '\0') + { + json_object_put(*obj); + *obj = value; + return 0; + } + + if (path[0] != '/') + { + errno = EINVAL; + return -1; + } + + /* If there's only 1 level to set, stop here */ + if ((endp = strrchr(path, '/')) == path) + { + path++; + return json_pointer_set_single_path(*obj, path, value, array_set_cb, priv); + } + + /* pass a working copy to the recursive call */ + if (!(path_copy = strdup(path))) + { + errno = ENOMEM; + return -1; + } + path_copy[endp - path] = '\0'; + rc = json_pointer_object_get_recursive(*obj, path_copy, &set); + free(path_copy); + + if (rc) + return rc; + + endp++; + return json_pointer_set_single_path(set, endp, value, array_set_cb, priv); +} + +int json_pointer_set(struct json_object **obj, const char *path, struct json_object *value) +{ + return json_pointer_set_with_array_cb(obj, path, value, json_object_array_put_idx_cb, NULL); +} + +int json_pointer_setf(struct json_object **obj, struct json_object *value, const char *path_fmt, + ...) +{ + char *endp; + char *path_copy = NULL; + struct json_object *set = NULL; + va_list args; + int rc = 0; + + if (!obj || !path_fmt) + { + errno = EINVAL; + return -1; + } + + /* pass a working copy to the recursive call */ + va_start(args, path_fmt); + rc = vasprintf(&path_copy, path_fmt, args); + va_end(args); + + if (rc < 0) + return rc; + + if (path_copy[0] == '\0') + { + json_object_put(*obj); + *obj = value; + goto out; + } + + if (path_copy[0] != '/') + { + errno = EINVAL; + rc = -1; + goto out; + } + + /* If there's only 1 level to set, stop here */ + if ((endp = strrchr(path_copy, '/')) == path_copy) + { + set = *obj; + goto set_single_path; + } + + *endp = '\0'; + rc = json_pointer_object_get_recursive(*obj, path_copy, &set); + + if (rc) + goto out; + +set_single_path: + endp++; + rc = json_pointer_set_single_path(set, endp, value, + json_object_array_put_idx_cb, NULL); +out: + free(path_copy); + return rc; +} diff --git a/test/data/chunker_repo/repos/json-c/linkhash.c b/test/data/chunker_repo/repos/json-c/linkhash.c new file mode 100644 index 00000000..58e13130 --- /dev/null +++ b/test/data/chunker_repo/repos/json-c/linkhash.c @@ -0,0 +1,718 @@ +/* + * $Id: linkhash.c,v 1.4 2006/01/26 02:16:28 mclark Exp $ + * + * Copyright (c) 2004, 2005 Metaparadigm Pte. Ltd. + * Michael Clark + * Copyright (c) 2009 Hewlett-Packard Development Company, L.P. + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the MIT license. See COPYING for details. 
+ *
+ */

+#include "config.h"
+
+#include <assert.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef HAVE_ENDIAN_H
+#include <endian.h> /* attempt to define endianness */
+#endif
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h> /* Get InterlockedCompareExchange */
+#endif
+
+#include "linkhash.h"
+#include "random_seed.h"
+
+/* hash functions */
+static unsigned long lh_char_hash(const void *k);
+static unsigned long lh_perllike_str_hash(const void *k);
+static lh_hash_fn *char_hash_fn = lh_char_hash;
+
+/* comparison functions */
+int lh_char_equal(const void *k1, const void *k2);
+int lh_ptr_equal(const void *k1, const void *k2);
+
+int json_global_set_string_hash(const int h)
+{
+	switch (h)
+	{
+	case JSON_C_STR_HASH_DFLT: char_hash_fn = lh_char_hash; break;
+	case JSON_C_STR_HASH_PERLLIKE: char_hash_fn = lh_perllike_str_hash; break;
+	default: return -1;
+	}
+	return 0;
+}
+
+static unsigned long lh_ptr_hash(const void *k)
+{
+	/* CAW: refactored to be 64bit nice */
+	return (unsigned long)((((ptrdiff_t)k * LH_PRIME) >> 4) & ULONG_MAX);
+}
+
+int lh_ptr_equal(const void *k1, const void *k2)
+{
+	return (k1 == k2);
+}
+
+/*
+ * hashlittle from lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+ * https://burtleburtle.net/bob/c/lookup3.c
+ * minor modifications to make functions static so no symbols are exported
+ * minor modifications to compile with -Werror
+ */
+
+/*
+-------------------------------------------------------------------------------
+lookup3.c, by Bob Jenkins, May 2006, Public Domain.
+
+These are functions for producing 32-bit hashes for hash table lookup.
+hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
+are externally useful functions.  Routines to test the hash are included
+if SELF_TEST is defined.  You can use this free for any purpose.  It's in
+the public domain.  It has no warranty.
+
+You probably want to use hashlittle().  hashlittle() and hashbig()
+hash byte arrays.  hashlittle() is faster than hashbig() on
+little-endian machines.  Intel and AMD are little-endian machines.
+On second thought, you probably want hashlittle2(), which is identical to
+hashlittle() except it returns two 32-bit hashes for the price of one.
+You could implement hashbig2() if you wanted but I haven't bothered here.
+
+If you want to find a hash of, say, exactly 7 integers, do
+  a = i1;  b = i2;  c = i3;
+  mix(a,b,c);
+  a += i4; b += i5; c += i6;
+  mix(a,b,c);
+  a += i7;
+  final(a,b,c);
+then use c as the hash value.  If you have a variable length array of
+4-byte integers to hash, use hashword().  If you have a byte array (like
+a character string), use hashlittle().  If you have several byte arrays, or
+a mix of things, see the comments above hashlittle().
+
+Why is this so big?  I read 12 bytes at a time into 3 4-byte integers,
+then mix those integers.  This is fast (you can do a lot more thorough
+mixing with 12*3 instructions on 3 integers than you can with 3 instructions
+on 1 byte), but shoehorning those bytes into integers efficiently is messy.
+-------------------------------------------------------------------------------
+*/
+
+/*
+ * My best guess at if you are big-endian or little-endian.  This may
+ * need adjustment.
+ */ +#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \ + (defined(i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || \ + defined(__i686__) || defined(vax) || defined(MIPSEL)) +#define HASH_LITTLE_ENDIAN 1 +#define HASH_BIG_ENDIAN 0 +#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \ + (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel)) +#define HASH_LITTLE_ENDIAN 0 +#define HASH_BIG_ENDIAN 1 +#else +#define HASH_LITTLE_ENDIAN 0 +#define HASH_BIG_ENDIAN 0 +#endif + +#define hashsize(n) ((uint32_t)1 << (n)) +#define hashmask(n) (hashsize(n) - 1) +#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k)))) + +/* +------------------------------------------------------------------------------- +mix -- mix 3 32-bit values reversibly. + +This is reversible, so any information in (a,b,c) before mix() is +still in (a,b,c) after mix(). + +If four pairs of (a,b,c) inputs are run through mix(), or through +mix() in reverse, there are at least 32 bits of the output that +are sometimes the same for one pair and different for another pair. +This was tested for: +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. For + and -, I transformed + the output delta to a Gray code (a^(a>>1)) so a string of 1's (as + is commonly produced by subtraction) look like a single 1-bit + difference. +* the base values were pseudorandom, all zero but one bit set, or + all zero plus a counter that starts at zero. + +Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that +satisfy this are + 4 6 8 16 19 4 + 9 15 3 18 27 15 + 14 9 3 7 17 3 +Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing +for "differ" defined as + with a one-bit base and a two-bit delta. I +used https://burtleburtle.net/bob/hash/avalanche.html to choose +the operations, constants, and arrangements of the variables. + +This does not achieve avalanche. There are input bits of (a,b,c) +that fail to affect some output bits of (a,b,c), especially of a. The +most thoroughly mixed value is c, but it doesn't really even achieve +avalanche in c. + +This allows some parallelism. Read-after-writes are good at doubling +the number of bits affected, so the goal of mixing pulls in the opposite +direction as the goal of parallelism. I did what I could. Rotates +seem to cost as much as shifts on every machine I could lay my hands +on, and rotates are much kinder to the top and bottom bits, so I used +rotates. +------------------------------------------------------------------------------- +*/ +/* clang-format off */ +#define mix(a,b,c) \ +{ \ + a -= c; a ^= rot(c, 4); c += b; \ + b -= a; b ^= rot(a, 6); a += c; \ + c -= b; c ^= rot(b, 8); b += a; \ + a -= c; a ^= rot(c,16); c += b; \ + b -= a; b ^= rot(a,19); a += c; \ + c -= b; c ^= rot(b, 4); b += a; \ +} +/* clang-format on */ + +/* +------------------------------------------------------------------------------- +final -- final mixing of 3 32-bit values (a,b,c) into c + +Pairs of (a,b,c) values differing in only a few bits will usually +produce values of c that look totally different. This was tested for +* pairs that differed by one bit, by two bits, in any combination + of top bits of (a,b,c), or in any combination of bottom bits of + (a,b,c). +* "differ" is defined as +, -, ^, or ~^. 
For + and -, I transformed
+  the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
+  is commonly produced by subtraction) look like a single 1-bit
+  difference.
+* the base values were pseudorandom, all zero but one bit set, or
+  all zero plus a counter that starts at zero.
+
+These constants passed:
+ 14 11 25 16 4 14 24
+ 12 14 25 16 4 14 24
+and these came close:
+  4  8 15 26 3 22 24
+ 10  8 15 26 3 22 24
+ 11  8 15 26 3 22 24
+-------------------------------------------------------------------------------
+*/
+/* clang-format off */
+#define final(a,b,c) \
+{ \
+	c ^= b; c -= rot(b,14); \
+	a ^= c; a -= rot(c,11); \
+	b ^= a; b -= rot(a,25); \
+	c ^= b; c -= rot(b,16); \
+	a ^= c; a -= rot(c,4);  \
+	b ^= a; b -= rot(a,14); \
+	c ^= b; c -= rot(b,24); \
+}
+/* clang-format on */
+
+/*
+-------------------------------------------------------------------------------
+hashlittle() -- hash a variable-length key into a 32-bit value
+  k       : the key (the unaligned variable-length array of bytes)
+  length  : the length of the key, counting by bytes
+  initval : can be any 4-byte value
+Returns a 32-bit value.  Every bit of the key affects every bit of
+the return value.  Two keys differing by one or two bits will have
+totally different hash values.
+
+The best hash table sizes are powers of 2.  There is no need to do
+mod a prime (mod is sooo slow!).  If you need less than 32 bits,
+use a bitmask.  For example, if you need only 10 bits, do
+  h = (h & hashmask(10));
+In which case, the hash table should have hashsize(10) elements.
+
+If you are hashing n strings (uint8_t **)k, do it like this:
+  for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
+
+By Bob Jenkins, 2006.  bob_jenkins@burtleburtle.net.  You may use this
+code any way you wish, private, educational, or commercial.  It's free.
+
+Use for hash table lookup, or anything where one collision in 2^^32 is
+acceptable.  Do NOT use for cryptographic purposes.
+-------------------------------------------------------------------------------
+*/
+
+/* clang-format off */
+static uint32_t hashlittle(const void *key, size_t length, uint32_t initval)
+{
+	uint32_t a,b,c; /* internal state */
+	union
+	{
+		const void *ptr;
+		size_t i;
+	} u; /* needed for Mac Powerbook G4 */
+
+	/* Set up the internal state */
+	a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
+
+	u.ptr = key;
+	if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0))
+	{
+		const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
+
+		/*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
+		while (length > 12)
+		{
+			a += k[0];
+			b += k[1];
+			c += k[2];
+			mix(a,b,c);
+			length -= 12;
+			k += 3;
+		}
+
+		/*----------------------------- handle the last (probably partial) block */
+		/*
+		 * "k[2]&0xffffff" actually reads beyond the end of the string, but
+		 * then masks off the part it's not allowed to read.  Because the
+		 * string is aligned, the masked-off tail is in the same word as the
+		 * rest of the string.  Every machine with memory protection I've seen
+		 * does it on word boundaries, so is OK with this.  But VALGRIND will
+		 * still catch it and complain.  The masking trick does make the hash
+		 * noticeably faster for short strings (like English words).
+		 * AddressSanitizer is similarly picky about overrunning
+		 * the buffer.
(https://clang.llvm.org/docs/AddressSanitizer.html) + */ +#ifdef VALGRIND +#define PRECISE_MEMORY_ACCESS 1 +#elif defined(__SANITIZE_ADDRESS__) /* GCC's ASAN */ +#define PRECISE_MEMORY_ACCESS 1 +#elif defined(__has_feature) +#if __has_feature(address_sanitizer) /* Clang's ASAN */ +#define PRECISE_MEMORY_ACCESS 1 +#endif +#endif +#ifndef PRECISE_MEMORY_ACCESS + + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break; + case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break; + case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break; + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=k[1]&0xffffff; a+=k[0]; break; + case 6 : b+=k[1]&0xffff; a+=k[0]; break; + case 5 : b+=k[1]&0xff; a+=k[0]; break; + case 4 : a+=k[0]; break; + case 3 : a+=k[0]&0xffffff; break; + case 2 : a+=k[0]&0xffff; break; + case 1 : a+=k[0]&0xff; break; + case 0 : return c; /* zero length strings require no mixing */ + } + +#else /* make valgrind happy */ + + const uint8_t *k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[2]; b+=k[1]; a+=k[0]; break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=((uint32_t)k8[9])<<8; /* fall through */ + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[1]; a+=k[0]; break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */ + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]; break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */ + case 1 : a+=k8[0]; break; + case 0 : return c; + } + +#endif /* !valgrind */ + + } + else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) + { + const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */ + const uint8_t *k8; + + /*--------------- all but last block: aligned reads and different mixing */ + while (length > 12) + { + a += k[0] + (((uint32_t)k[1])<<16); + b += k[2] + (((uint32_t)k[3])<<16); + c += k[4] + (((uint32_t)k[5])<<16); + mix(a,b,c); + length -= 12; + k += 6; + } + + /*----------------------------- handle the last (probably partial) block */ + k8 = (const uint8_t *)k; + switch(length) + { + case 12: c+=k[4]+(((uint32_t)k[5])<<16); + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 11: c+=((uint32_t)k8[10])<<16; /* fall through */ + case 10: c+=k[4]; + b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 9 : c+=k8[8]; /* fall through */ + case 8 : b+=k[2]+(((uint32_t)k[3])<<16); + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */ + case 6 : b+=k[2]; + a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 5 : b+=k8[4]; /* fall through */ + case 4 : a+=k[0]+(((uint32_t)k[1])<<16); + break; + case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */ + case 2 : a+=k[0]; + break; + case 1 : a+=k8[0]; + break; + case 0 : return c; /* zero length requires no mixing */ + } + + } + else + { + /* need to read the key one byte at a time */ + const uint8_t *k = (const uint8_t *)key; + + /*--------------- all but the last block: affect some 32 bits of (a,b,c) */ + while (length > 12) + { + a += k[0]; + a += ((uint32_t)k[1])<<8; + a += ((uint32_t)k[2])<<16; + a += ((uint32_t)k[3])<<24; + b += k[4]; + b += ((uint32_t)k[5])<<8; + b += ((uint32_t)k[6])<<16; + b += ((uint32_t)k[7])<<24; + c += k[8]; + c += ((uint32_t)k[9])<<8; + c += ((uint32_t)k[10])<<16; + c += ((uint32_t)k[11])<<24; + mix(a,b,c); + length -= 12; + k += 12; + } + + 
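+		/* Unlike the aligned paths above, the tail below is consumed one byte
+		 * at a time, so there is no over-read to mask off here. */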
/*-------------------------------- last block: affect all 32 bits of (c) */ + switch(length) /* all the case statements fall through */ + { + case 12: c+=((uint32_t)k[11])<<24; /* FALLTHRU */ + case 11: c+=((uint32_t)k[10])<<16; /* FALLTHRU */ + case 10: c+=((uint32_t)k[9])<<8; /* FALLTHRU */ + case 9 : c+=k[8]; /* FALLTHRU */ + case 8 : b+=((uint32_t)k[7])<<24; /* FALLTHRU */ + case 7 : b+=((uint32_t)k[6])<<16; /* FALLTHRU */ + case 6 : b+=((uint32_t)k[5])<<8; /* FALLTHRU */ + case 5 : b+=k[4]; /* FALLTHRU */ + case 4 : a+=((uint32_t)k[3])<<24; /* FALLTHRU */ + case 3 : a+=((uint32_t)k[2])<<16; /* FALLTHRU */ + case 2 : a+=((uint32_t)k[1])<<8; /* FALLTHRU */ + case 1 : a+=k[0]; + break; + case 0 : return c; + } + } + + final(a,b,c); + return c; +} +/* clang-format on */ + +/* a simple hash function similar to what perl does for strings. + * for good results, the string should not be excessively large. + */ +static unsigned long lh_perllike_str_hash(const void *k) +{ + const char *rkey = (const char *)k; + unsigned hashval = 1; + + while (*rkey) + hashval = hashval * 33 + *rkey++; + + return hashval; +} + +static unsigned long lh_char_hash(const void *k) +{ +#if defined _MSC_VER || defined __MINGW32__ +#define RANDOM_SEED_TYPE LONG +#else +#define RANDOM_SEED_TYPE int +#endif + static volatile RANDOM_SEED_TYPE random_seed = -1; + + if (random_seed == -1) + { + RANDOM_SEED_TYPE seed; + /* we can't use -1 as it is the uninitialized sentinel */ + while ((seed = json_c_get_random_seed()) == -1) {} +#if SIZEOF_INT == 8 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 +#define USE_SYNC_COMPARE_AND_SWAP 1 +#endif +#if SIZEOF_INT == 4 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 +#define USE_SYNC_COMPARE_AND_SWAP 1 +#endif +#if SIZEOF_INT == 2 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 +#define USE_SYNC_COMPARE_AND_SWAP 1 +#endif +#if defined USE_SYNC_COMPARE_AND_SWAP + (void)__sync_val_compare_and_swap(&random_seed, -1, seed); +#elif defined _MSC_VER || defined __MINGW32__ + InterlockedCompareExchange(&random_seed, seed, -1); +#else + //#warning "racy random seed initialization if used by multiple threads" + random_seed = seed; /* potentially racy */ +#endif + } + + return hashlittle((const char *)k, strlen((const char *)k), (uint32_t)random_seed); +} + +int lh_char_equal(const void *k1, const void *k2) +{ + return (strcmp((const char *)k1, (const char *)k2) == 0); +} + +struct lh_table *lh_table_new(int size, lh_entry_free_fn *free_fn, lh_hash_fn *hash_fn, + lh_equal_fn *equal_fn) +{ + int i; + struct lh_table *t; + + /* Allocate space for elements to avoid divisions by zero. 
*/ + assert(size > 0); + t = (struct lh_table *)calloc(1, sizeof(struct lh_table)); + if (!t) + return NULL; + + t->count = 0; + t->size = size; + t->table = (struct lh_entry *)calloc(size, sizeof(struct lh_entry)); + if (!t->table) + { + free(t); + return NULL; + } + t->free_fn = free_fn; + t->hash_fn = hash_fn; + t->equal_fn = equal_fn; + for (i = 0; i < size; i++) + t->table[i].k = LH_EMPTY; + return t; +} + +struct lh_table *lh_kchar_table_new(int size, lh_entry_free_fn *free_fn) +{ + return lh_table_new(size, free_fn, char_hash_fn, lh_char_equal); +} + +struct lh_table *lh_kptr_table_new(int size, lh_entry_free_fn *free_fn) +{ + return lh_table_new(size, free_fn, lh_ptr_hash, lh_ptr_equal); +} + +int lh_table_resize(struct lh_table *t, int new_size) +{ + struct lh_table *new_t; + struct lh_entry *ent; + + new_t = lh_table_new(new_size, NULL, t->hash_fn, t->equal_fn); + if (new_t == NULL) + return -1; + + for (ent = t->head; ent != NULL; ent = ent->next) + { + unsigned long h = lh_get_hash(new_t, ent->k); + unsigned int opts = 0; + if (ent->k_is_constant) + opts = JSON_C_OBJECT_ADD_CONSTANT_KEY; + if (lh_table_insert_w_hash(new_t, ent->k, ent->v, h, opts) != 0) + { + lh_table_free(new_t); + return -1; + } + } + free(t->table); + t->table = new_t->table; + t->size = new_size; + t->head = new_t->head; + t->tail = new_t->tail; + free(new_t); + + return 0; +} + +void lh_table_free(struct lh_table *t) +{ + struct lh_entry *c; + if (t->free_fn) + { + for (c = t->head; c != NULL; c = c->next) + t->free_fn(c); + } + free(t->table); + free(t); +} + +int lh_table_insert_w_hash(struct lh_table *t, const void *k, const void *v, const unsigned long h, + const unsigned opts) +{ + unsigned long n; + + if (t->count >= t->size * LH_LOAD_FACTOR) + { + /* Avoid signed integer overflow with large tables. */ + int new_size = (t->size > INT_MAX / 2) ? 
INT_MAX : (t->size * 2); + if (t->size == INT_MAX || lh_table_resize(t, new_size) != 0) + return -1; + } + + n = h % t->size; + + while (1) + { + if (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED) + break; + if ((int)++n == t->size) + n = 0; + } + + t->table[n].k = k; + t->table[n].k_is_constant = (opts & JSON_C_OBJECT_ADD_CONSTANT_KEY); + t->table[n].v = v; + t->count++; + + if (t->head == NULL) + { + t->head = t->tail = &t->table[n]; + t->table[n].next = t->table[n].prev = NULL; + } + else + { + t->tail->next = &t->table[n]; + t->table[n].prev = t->tail; + t->table[n].next = NULL; + t->tail = &t->table[n]; + } + + return 0; +} +int lh_table_insert(struct lh_table *t, const void *k, const void *v) +{ + return lh_table_insert_w_hash(t, k, v, lh_get_hash(t, k), 0); +} + +struct lh_entry *lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k, + const unsigned long h) +{ + unsigned long n = h % t->size; + int count = 0; + + while (count < t->size) + { + if (t->table[n].k == LH_EMPTY) + return NULL; + if (t->table[n].k != LH_FREED && t->equal_fn(t->table[n].k, k)) + return &t->table[n]; + if ((int)++n == t->size) + n = 0; + count++; + } + return NULL; +} + +struct lh_entry *lh_table_lookup_entry(struct lh_table *t, const void *k) +{ + return lh_table_lookup_entry_w_hash(t, k, lh_get_hash(t, k)); +} + +json_bool lh_table_lookup_ex(struct lh_table *t, const void *k, void **v) +{ + struct lh_entry *e = lh_table_lookup_entry(t, k); + if (e != NULL) + { + if (v != NULL) + *v = lh_entry_v(e); + return 1; /* key found */ + } + if (v != NULL) + *v = NULL; + return 0; /* key not found */ +} + +int lh_table_delete_entry(struct lh_table *t, struct lh_entry *e) +{ + /* CAW: fixed to be 64bit nice, still need the crazy negative case... */ + ptrdiff_t n = (ptrdiff_t)(e - t->table); + + /* CAW: this is bad, really bad, maybe stack goes other direction on this machine... */ + if (n < 0) + { + return -2; + } + + if (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED) + return -1; + t->count--; + if (t->free_fn) + t->free_fn(e); + t->table[n].v = NULL; + t->table[n].k = LH_FREED; + if (t->tail == &t->table[n] && t->head == &t->table[n]) + { + t->head = t->tail = NULL; + } + else if (t->head == &t->table[n]) + { + t->head->next->prev = NULL; + t->head = t->head->next; + } + else if (t->tail == &t->table[n]) + { + t->tail->prev->next = NULL; + t->tail = t->tail->prev; + } + else + { + t->table[n].prev->next = t->table[n].next; + t->table[n].next->prev = t->table[n].prev; + } + t->table[n].next = t->table[n].prev = NULL; + return 0; +} + +int lh_table_delete(struct lh_table *t, const void *k) +{ + struct lh_entry *e = lh_table_lookup_entry(t, k); + if (!e) + return -1; + return lh_table_delete_entry(t, e); +} + +int lh_table_length(struct lh_table *t) +{ + return t->count; +} diff --git a/test/data/chunker_repo/repos/json-c/strerror_override.c b/test/data/chunker_repo/repos/json-c/strerror_override.c new file mode 100644 index 00000000..a3dd377a --- /dev/null +++ b/test/data/chunker_repo/repos/json-c/strerror_override.c @@ -0,0 +1,110 @@ +#define STRERROR_OVERRIDE_IMPL 1 +#include "strerror_override.h" + +/* + * Override strerror() to get consistent output across platforms. 
+ */ + +static struct +{ + int errno_value; + const char *errno_str; +} errno_list[] = { +/* clang-format off */ +#define STRINGIFY(x) #x +#define ENTRY(x) {x, &STRINGIFY(undef_ ## x)[6]} + ENTRY(EPERM), + ENTRY(ENOENT), + ENTRY(ESRCH), + ENTRY(EINTR), + ENTRY(EIO), + ENTRY(ENXIO), + ENTRY(E2BIG), +#ifdef ENOEXEC + ENTRY(ENOEXEC), +#endif + ENTRY(EBADF), + ENTRY(ECHILD), + ENTRY(EDEADLK), + ENTRY(ENOMEM), + ENTRY(EACCES), + ENTRY(EFAULT), +#ifdef ENOTBLK + ENTRY(ENOTBLK), +#endif + ENTRY(EBUSY), + ENTRY(EEXIST), + ENTRY(EXDEV), + ENTRY(ENODEV), + ENTRY(ENOTDIR), + ENTRY(EISDIR), + ENTRY(EINVAL), + ENTRY(ENFILE), + ENTRY(EMFILE), + ENTRY(ENOTTY), +#ifdef ETXTBSY + ENTRY(ETXTBSY), +#endif + ENTRY(EFBIG), + ENTRY(ENOSPC), + ENTRY(ESPIPE), + ENTRY(EROFS), + ENTRY(EMLINK), + ENTRY(EPIPE), + ENTRY(EDOM), + ENTRY(ERANGE), + ENTRY(EAGAIN), + { 0, (char *)0 } +}; +/* clang-format on */ + +// Enabled during tests +static int _json_c_strerror_enable = 0; +extern char *getenv(const char *name); // Avoid including stdlib.h + +#define PREFIX "ERRNO=" +static char errno_buf[128] = PREFIX; +char *_json_c_strerror(int errno_in) +{ + int start_idx; + char digbuf[20]; + int ii, jj; + + if (!_json_c_strerror_enable) + _json_c_strerror_enable = (getenv("_JSON_C_STRERROR_ENABLE") == NULL) ? -1 : 1; + if (_json_c_strerror_enable == -1) + return strerror(errno_in); + + // Avoid standard functions, so we don't need to include any + // headers, or guess at signatures. + + for (ii = 0; errno_list[ii].errno_str != (char *)0; ii++) + { + const char *errno_str = errno_list[ii].errno_str; + if (errno_list[ii].errno_value != errno_in) + continue; + + for (start_idx = sizeof(PREFIX) - 1, jj = 0; errno_str[jj] != '\0'; + jj++, start_idx++) + { + errno_buf[start_idx] = errno_str[jj]; + } + errno_buf[start_idx] = '\0'; + return errno_buf; + } + + // It's not one of the known errno values, return the numeric value. + for (ii = 0; errno_in >= 10; errno_in /= 10, ii++) + { + digbuf[ii] = "0123456789"[(errno_in % 10)]; + } + digbuf[ii] = "0123456789"[(errno_in % 10)]; + + // Reverse the digits + for (start_idx = sizeof(PREFIX) - 1; ii >= 0; ii--, start_idx++) + { + errno_buf[start_idx] = digbuf[ii]; + } + errno_buf[start_idx] = '\0'; + return errno_buf; +} diff --git a/test/data/chunker_repo/repos/outline/Comment.ts b/test/data/chunker_repo/repos/outline/Comment.ts new file mode 100644 index 00000000..b16783bf --- /dev/null +++ b/test/data/chunker_repo/repos/outline/Comment.ts @@ -0,0 +1,278 @@ +import invariant from "invariant"; +import uniq from "lodash/uniq"; +import { action, computed, observable } from "mobx"; +import { Pagination } from "@shared/constants"; +import type { ProsemirrorData, ReactionSummary } from "@shared/types"; +import User from "~/models/User"; +import { client } from "~/utils/ApiClient"; +import Document from "./Document"; +import Model from "./base/Model"; +import Field from "./decorators/Field"; +import Relation from "./decorators/Relation"; + +class Comment extends Model { + static modelName = "Comment"; + + /** + * The Prosemirror data representing the comment content + */ + @Field + @observable.shallow + data: ProsemirrorData; + + /** + * If this comment is a reply then the parent comment will be set, otherwise + * it is a top thread. + */ + @Field + @observable + parentCommentId: string | null; + + /** + * The comment that this comment is a reply to. + */ + @Relation(() => Comment, { onDelete: "cascade" }) + parentComment?: Comment; + + /** + * The document ID to which this comment belongs. 
+ */ + @Field + @observable + documentId: string; + + /** + * The document that this comment belongs to. + */ + @Relation(() => Document, { onDelete: "cascade" }) + document: Document; + + /** + * The user who created this comment. + */ + @Relation(() => User) + createdBy: User; + + /** + * The ID of the user who created this comment. + */ + createdById: string; + + /** + * The date and time that this comment was resolved, if it has been resolved. + */ + @observable + resolvedAt: string; + + /** + * The user who resolved this comment, if it has been resolved. + */ + @Relation(() => User) + resolvedBy: User | null; + + /** + * The ID of the user who resolved this comment, if it has been resolved. + */ + resolvedById: string | null; + + /** + * Active reactions for this comment. + * + * Note: This contains just the emoji with the associated user-ids. + */ + @observable + reactions: ReactionSummary[]; + + /** + * Denotes whether the user data for the active reactions are loaded. + */ + @observable + reactedUsersLoaded: boolean = false; + + /** + * Denotes whether there is an in-flight request for loading reacted users. + */ + private reactedUsersLoading = false; + + /** + * Whether the comment is resolved + */ + @computed + public get isResolved(): boolean { + return !!this.resolvedAt || !!this.parentComment?.isResolved; + } + + /** + * Whether the comment is a reply to another comment. + */ + @computed + public get isReply() { + return !!this.parentCommentId; + } + + /** + * Resolve the comment + */ + public resolve() { + return this.store.rootStore.comments.resolve(this.id); + } + + /** + * Unresolve the comment + */ + public unresolve() { + return this.store.rootStore.comments.unresolve(this.id); + } + + /** + * Add an emoji as a reaction to this comment. + * + * Optimistically updates the `reactions` cache and invokes the backend API. + * + * @param {Object} reaction - The reaction data. + * @param {string} reaction.emoji - The emoji to add as a reaction. + * @param {string} reaction.user - The user who added this reaction. + */ + @action + public addReaction = async ({ + emoji, + user, + }: { + emoji: string; + user: User; + }) => { + this.updateReaction({ type: "add", emoji, user }); + try { + await client.post("/comments.add_reaction", { + id: this.id, + emoji, + }); + } catch { + this.updateReaction({ type: "remove", emoji, user }); + } + }; + + /** + * Remove an emoji as a reaction from this comment. + * + * Optimistically updates the `reactions` cache and invokes the backend API. + * + * @param {Object} reaction - The reaction data. + * @param {string} reaction.emoji - The emoji to remove as a reaction. + * @param {string} reaction.user - The user who removed this reaction. + */ + @action + public removeReaction = async ({ + emoji, + user, + }: { + emoji: string; + user: User; + }) => { + this.updateReaction({ type: "remove", emoji, user }); + try { + await client.post("/comments.remove_reaction", { + id: this.id, + emoji, + }); + } catch { + this.updateReaction({ type: "add", emoji, user }); + } + }; + + /** + * Update the `reactions` cache. + * + * @param {Object} reaction - The reaction data. + * @param {string} reaction.type - The type of the action. + * @param {string} reaction.emoji - The emoji to update as a reaction. + * @param {string} reaction.user - The user who performed this action. 
+ */ + @action + public updateReaction = ({ + type, + emoji, + user, + }: { + type: "add" | "remove"; + emoji: string; + user: User; + }) => { + const reaction = this.reactions.find((r) => r.emoji === emoji); + + // Step 1: Update the reactions cache. + + if (type === "add") { + if (!reaction) { + this.reactions.push({ emoji, userIds: [user.id] }); + } else { + reaction.userIds = uniq([...reaction.userIds, user.id]); + } + } else { + if (reaction) { + reaction.userIds = reaction.userIds.filter((id) => id !== user.id); + } + + if (reaction?.userIds.length === 0) { + this.reactions = this.reactions.filter( + (r) => r.emoji !== reaction.emoji + ); + } + } + + // Step 2: Add the user to the store. + this.store.rootStore.users.add(user); + }; + + /** + * Load the users for the active reactions. + * + * + * @param {Object} options - Options for loading the data. + * @param {string} options.limit - Per request limit for pagination. + */ + @action + loadReactedUsersData = async ( + { limit }: { limit: number } = { limit: Pagination.defaultLimit } + ) => { + if (this.reactedUsersLoading || this.reactedUsersLoaded) { + return; + } + + this.reactedUsersLoading = true; + + try { + const fetchPage = async (offset: number = 0) => { + const res = await client.post("/reactions.list", { + commentId: this.id, + offset, + limit, + }); + + invariant(res?.data, "Data not available"); + // @ts-expect-error reaction from server response + res.data.map((reaction) => + this.store.rootStore.users.add(reaction.user) + ); + + return res.pagination; + }; + + const { total } = await fetchPage(); + + const pages = Math.ceil(total / limit); + const fetchPages = []; + for (let page = 1; page < pages; page++) { + fetchPages.push(fetchPage(page * limit)); + } + + await Promise.all(fetchPages); + + this.reactedUsersLoaded = true; + } finally { + this.reactedUsersLoading = false; + } + }; +} + +export default Comment; diff --git a/test/data/chunker_repo/repos/outline/GroupUser.ts b/test/data/chunker_repo/repos/outline/GroupUser.ts new file mode 100644 index 00000000..de61ce7b --- /dev/null +++ b/test/data/chunker_repo/repos/outline/GroupUser.ts @@ -0,0 +1,27 @@ +import Group from "./Group"; +import User from "./User"; +import Model from "./base/Model"; +import Relation from "./decorators/Relation"; + +/** + * Represents a user's membership to a group. + */ +class GroupUser extends Model { + static modelName = "GroupUser"; + + /** The ID of the user. */ + userId: string; + + /** The user that belongs to the group. */ + @Relation(() => User, { onDelete: "cascade" }) + user: User; + + /** The ID of the group. */ + groupId: string; + + /** The group that the user belongs to. 
*/
+  @Relation(() => Group, { onDelete: "cascade" })
+  group: Group;
+}
+
+export default GroupUser;
diff --git a/test/data/chunker_repo/repos/outline/index.ts b/test/data/chunker_repo/repos/outline/index.ts
new file mode 100644
index 00000000..81687d3e
--- /dev/null
+++ b/test/data/chunker_repo/repos/outline/index.ts
@@ -0,0 +1,365 @@
+import flattenDeep from "lodash/flattenDeep";
+import { toast } from "sonner";
+import { Optional } from "utility-types";
+import { v4 as uuidv4 } from "uuid";
+import {
+  Action,
+  ActionContext,
+  ActionV2,
+  ActionV2Group,
+  ActionV2Separator as TActionV2Separator,
+  ActionV2Variant,
+  ActionV2WithChildren,
+  CommandBarAction,
+  ExternalLinkActionV2,
+  InternalLinkActionV2,
+  MenuExternalLink,
+  MenuInternalLink,
+  MenuItem,
+  MenuItemButton,
+  MenuItemWithChildren,
+} from "~/types";
+import Analytics from "~/utils/Analytics";
+import history from "~/utils/history";
+
+function resolve<T>(value: any, context: ActionContext): T {
+  return typeof value === "function" ? value(context) : value;
+}
+
+export function createAction(definition: Optional<Action, "id">): Action {
+  return {
+    ...definition,
+    perform: definition.perform
+      ? (context) => {
+          // We must use the specific analytics name here as the action name is
+          // translated and potentially contains user strings.
+          if (definition.analyticsName) {
+            Analytics.track("perform_action", definition.analyticsName, {
+              context: context.isButton
+                ? "button"
+                : context.isCommandBar
+                  ? "commandbar"
+                  : "contextmenu",
+            });
+          }
+          return definition.perform?.(context);
+        }
+      : undefined,
+    id: definition.id ?? uuidv4(),
+  };
+}
+
+export function actionToMenuItem(
+  action: Action,
+  context: ActionContext
+): MenuItemButton | MenuExternalLink | MenuInternalLink | MenuItemWithChildren {
+  const resolvedIcon = resolve<React.ReactElement<any>>(action.icon, context);
+  const resolvedChildren = resolve(action.children, context);
+  const visible = action.visible ? action.visible(context) : true;
+  const title = resolve(action.name, context);
+  const icon =
+    resolvedIcon && action.iconInContextMenu !== false
+      ? resolvedIcon
+      : undefined;
+
+  if (resolvedChildren) {
+    const items = resolvedChildren
+      .map((a) => actionToMenuItem(a, context))
+      .filter(Boolean)
+      .filter((a) => a.visible);
+
+    return {
+      type: "submenu",
+      title,
+      icon,
+      items,
+      visible: visible && items.length > 0,
+    };
+  }
+
+  if (action.to) {
+    return typeof action.to === "string"
+      ? {
+          type: "route",
+          title,
+          icon,
+          visible,
+          to: action.to,
+          selected: action.selected?.(context),
+        }
+      : {
+          type: "link",
+          title,
+          icon,
+          visible,
+          href: action.to,
+          selected: action.selected?.(context),
+        };
+  }
+
+  return {
+    type: "button",
+    title,
+    icon,
+    visible,
+    dangerous: action.dangerous,
+    onClick: () => performAction(action, context),
+    selected: action.selected?.(context),
+  };
+}
+
+export function actionToKBar(
+  action: Action,
+  context: ActionContext
+): CommandBarAction[] {
+  if (typeof action.visible === "function" && !action.visible(context)) {
+    return [];
+  }
+
+  const resolvedIcon = resolve(action.icon, context);
+  const resolvedChildren = resolve(action.children, context);
+  const resolvedSection = resolve(action.section, context);
+  const resolvedName = resolve(action.name, context);
+  const resolvedPlaceholder = resolve(action.placeholder, context);
+  const children = resolvedChildren
+    ? flattenDeep(resolvedChildren.map((a) => actionToKBar(a, context))).filter(
+        (a) => !!a
+      )
+    : [];
+
+  const sectionPriority =
+    typeof action.section !== "string" && "priority" in action.section
+      ? ((action.section.priority as number) ?? 0)
+      : 0;
+
+  return [
+    {
+      id: action.id,
+      name: resolvedName,
+      analyticsName: action.analyticsName,
+      section: resolvedSection,
+      placeholder: resolvedPlaceholder,
+      keywords: action.keywords ?? "",
+      shortcut: action.shortcut || [],
+      icon: resolvedIcon,
+      priority: (1 + (action.priority ?? 0)) * (1 + (sectionPriority ?? 0)),
+      perform:
+        action.perform || action.to
+          ? () => performAction(action, context)
+          : undefined,
+    },
+  ].concat(
+    // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.
+    children.map((child) => ({ ...child, parent: child.parent ?? action.id }))
+  );
+}
+
+export async function performAction(action: Action, context: ActionContext) {
+  const result = action.perform
+    ? action.perform(context)
+    : action.to
+      ? typeof action.to === "string"
+        ? history.push(action.to)
+        : window.open(action.to.url, action.to.target)
+      : undefined;
+
+  if (result instanceof Promise) {
+    return result.catch((err: Error) => {
+      toast.error(err.message);
+    });
+  }
+
+  return result;
+}
+
+/** Actions V2 */
+
+export const ActionV2Separator: TActionV2Separator = {
+  type: "action_separator",
+};
+
+export function createActionV2(
+  definition: Optional<Omit<ActionV2, "type" | "variant">, "id">
+): ActionV2 {
+  return {
+    ...definition,
+    type: "action",
+    variant: "action",
+    perform: definition.perform
+      ? (context) => {
+          // We must use the specific analytics name here as the action name is
+          // translated and potentially contains user strings.
+          if (definition.analyticsName) {
+            Analytics.track("perform_action", definition.analyticsName, {
+              context: context.isButton
+                ? "button"
+                : context.isCommandBar
+                  ? "commandbar"
+                  : "contextmenu",
+            });
+          }
+          return definition.perform(context);
+        }
+      : () => {},
+    id: definition.id ?? uuidv4(),
+  };
+}
+
+export function createInternalLinkActionV2(
+  definition: Optional<Omit<InternalLinkActionV2, "type" | "variant">, "id">
+): InternalLinkActionV2 {
+  return {
+    ...definition,
+    type: "action",
+    variant: "internal_link",
+    id: definition.id ?? uuidv4(),
+  };
+}
+
+export function createExternalLinkActionV2(
+  definition: Optional<Omit<ExternalLinkActionV2, "type" | "variant">, "id">
+): ExternalLinkActionV2 {
+  return {
+    ...definition,
+    type: "action",
+    variant: "external_link",
+    id: definition.id ?? uuidv4(),
+  };
+}
+
+export function createActionV2WithChildren(
+  definition: Optional<Omit<ActionV2WithChildren, "type" | "variant">, "id">
+): ActionV2WithChildren {
+  return {
+    ...definition,
+    type: "action",
+    variant: "action_with_children",
+    id: definition.id ?? uuidv4(),
+  };
+}
+
+export function createActionV2Group(
+  definition: Omit<ActionV2Group, "type">
+): ActionV2Group {
+  return {
+    ...definition,
+    type: "action_group",
+  };
+}
+
+export function createRootMenuAction(
+  actions: (ActionV2Variant | ActionV2Group | TActionV2Separator)[]
+): ActionV2WithChildren {
+  return {
+    id: uuidv4(),
+    type: "action",
+    variant: "action_with_children",
+    name: "root_action",
+    section: "Root",
+    children: actions,
+  };
+}
+
+export function actionV2ToMenuItem(
+  action: ActionV2Variant | ActionV2Group | TActionV2Separator,
+  context: ActionContext
+): MenuItem {
+  switch (action.type) {
+    case "action": {
+      const title = resolve(action.name, context);
+      const visible = resolve(action.visible, context);
+      const icon =
+        !!action.icon && action.iconInContextMenu !== false
+          ?
action.icon + : undefined; + + switch (action.variant) { + case "action": + return { + type: "button", + title, + icon, + visible, + dangerous: action.dangerous, + onClick: () => performActionV2(action, context), + }; + + case "internal_link": + return { + type: "route", + title, + icon, + visible, + to: action.to, + }; + + case "external_link": + return { + type: "link", + title, + icon, + visible, + href: action.target + ? { url: action.url, target: action.target } + : action.url, + }; + + case "action_with_children": { + const children = resolve< + (ActionV2Variant | ActionV2Group | TActionV2Separator)[] + >(action.children, context); + const subMenuItems = children.map((a) => + actionV2ToMenuItem(a, context) + ); + return { + type: "submenu", + title, + icon, + items: subMenuItems, + visible: visible && hasVisibleItems(subMenuItems), + }; + } + + default: + throw Error("invalid action variant"); + } + } + + case "action_group": { + const groupItems = action.actions.map((a) => + actionV2ToMenuItem(a, context) + ); + return { + type: "group", + title: resolve(action.name, context), + visible: hasVisibleItems(groupItems), + items: groupItems, + }; + } + + case "action_separator": + return { type: "separator" }; + } +} + +export async function performActionV2( + action: ActionV2, + context: ActionContext +) { + const result = action.perform(context); + + if (result instanceof Promise) { + return result.catch((err: Error) => { + toast.error(err.message); + }); + } + + return result; +} + +function hasVisibleItems(items: MenuItem[]) { + const applicableTypes = ["button", "link", "route", "group", "submenu"]; + return items.some( + (item) => applicableTypes.includes(item.type) && item.visible + ); +} diff --git a/test/test_code_chunker.py b/test/test_code_chunker.py new file mode 100644 index 00000000..91abd8d2 --- /dev/null +++ b/test/test_code_chunker.py @@ -0,0 +1,103 @@ +import json +import os +import pathlib + +import pytest + +from docling_core.transforms.chunker.base_code_chunker import CodeChunk +from docling_core.transforms.chunker.language_code_chunkers import ( + CFunctionChunker, + JavaFunctionChunker, + JavaScriptFunctionChunker, + PythonFunctionChunker, + TypeScriptFunctionChunker, +) +from docling_core.types.doc.labels import DocItemLabel + +from .test_data_gen_flag import GEN_TEST_DATA +from .test_utils_repo_ds import create_ds, language_to_extension + +HERE = pathlib.Path(__file__).parent +DATA = HERE / "data" / "chunker_repo" +DATA.mkdir(parents=True, exist_ok=True) + +REPO_SPECS = [ + ( + "Java", + "/test/data/chunker_repo/repos/acmeair", + "https://github.com/acmeair/acmeair", + lambda: JavaFunctionChunker(max_tokens=5000), + ), + ( + "TypeScript", + "/test/data/chunker_repo/repos/outline", + "https://github.com/outline/outline", + lambda: TypeScriptFunctionChunker(max_tokens=5000), + ), + ( + "JavaScript", + "/test/data/chunker_repo/repos/jquery", + "https://github.com/jquery/jquery", + lambda: JavaScriptFunctionChunker(max_tokens=5000), + ), + ( + "Python", + "/test/data/chunker_repo/repos/docling", + "https://github.com/docling-project/docling", + lambda: PythonFunctionChunker(max_tokens=5000), + ), + ( + "C", + "/test/data/chunker_repo/repos/json-c", + "https://github.com/json-c/json-c", + lambda: CFunctionChunker(max_tokens=5000), + ), +] + + +def _dump_or_assert(act_data: dict, out_path: pathlib.Path): + out_path.parent.mkdir(parents=True, exist_ok=True) + if GEN_TEST_DATA: + with out_path.open("w", encoding="utf-8") as f: + json.dump(act_data, fp=f, indent=4) + 
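+            # terminate the generated fixture with a newline so diffs stay clean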
f.write("\n") + else: + with out_path.open(encoding="utf-8") as f: + exp_data = json.load(fp=f) + assert exp_data == act_data + + +@pytest.mark.parametrize("name,local_path,repo_url,chunker_factory", REPO_SPECS) +def test_function_chunkers_repo(name, local_path, repo_url, chunker_factory): + + local_path_full = os.getcwd() + local_path + + if not os.path.isdir(local_path_full): + pytest.skip(f"Missing repo at {local_path_full}; skipping {name} test.") + + docs = create_ds(local_path_full, repo_url, commit_id="abc123def456") + docs = [ + doc + for doc in docs + if any(text.label == DocItemLabel.CODE and text.text for text in doc.texts) + ] + docs = [doc for doc in docs if doc.name.endswith(language_to_extension[name])] + if not docs: + pytest.skip(f"No documents found in {local_path_full} for {name}.") + + sample = docs[:3] + + chunker = chunker_factory() + all_chunks = [] + for doc in sample: + chunks_iter = chunker.chunk(dl_doc=doc) + + chunks = [CodeChunk.model_validate(n) for n in chunks_iter] + all_chunks.extend(chunks) + assert chunks, f"Expected chunks for {doc.name}" + for c in chunks: + assert c.text and isinstance(c.text, str) + + act_data = {"root": [c.export_json_dict() for c in all_chunks]} + out_path = DATA / name / "repo_out_chunks.json" + _dump_or_assert(act_data, out_path) diff --git a/test/test_utils_repo_ds.py b/test/test_utils_repo_ds.py new file mode 100644 index 00000000..88141f01 --- /dev/null +++ b/test/test_utils_repo_ds.py @@ -0,0 +1,140 @@ +import fnmatch +import glob +import os +from typing import List + +import git + +from docling_core.types.doc import DoclingDocument, DocumentOrigin +from docling_core.types.doc.labels import CodeLanguageLabel +from docling_core.utils.legacy import _create_hash + +language_to_extension = { + "Python": ".py", + "Java": ".java", + "C": ".c", + "TypeScript": ".ts", + "JavaScript": ".js", +} + +language_to_enum = { + "Python": CodeLanguageLabel.PYTHON, + "Java": CodeLanguageLabel.JAVA, + "C": CodeLanguageLabel.C, + "TypeScript": CodeLanguageLabel.TYPESCRIPT, + "JavaScript": CodeLanguageLabel.JAVASCRIPT, +} + + +def get_latest_commit_id(file_dir: str) -> str: + """ + Returns the hexadecimal SHA-1 ID of the latest commit in the given Git repository directory. + + Parameters: + file_dir (str): The path to the Git repository directory. + + Returns: + str: The hexadecimal SHA-1 ID of the latest commit, or an empty string if an error occurs. + """ + try: + repo = git.Repo(file_dir, search_parent_directories=True) + return repo.head.commit.hexsha + except Exception: + return "" + + +def load_ignore_patterns(ignore_file_path: str) -> list: + """ + Load ignore patterns from a file. + + This function reads a file containing ignore patterns (one per line) and returns a list of patterns, + excluding empty lines and lines starting with '#'. If the file does not exist, it returns an empty list. + + Args: + ignore_file_path (str): The path to the ignore file. + + Returns: + list: A list of ignore patterns. + """ + if not os.path.exists(ignore_file_path): + return [] + with open(ignore_file_path, "r", encoding="utf-8") as file: + return [ + line.strip() for line in file if line.strip() and not line.startswith("#") + ] + + +def is_ignored(file_path: str, ignore_patterns: List[str]) -> bool: + """ + Check if a file path matches any of the given ignore patterns. + + This function takes a file path and a list of ignore patterns, and returns True if the file path matches any of the patterns, + indicating that the file should be ignored. 
Otherwise, it returns False.
+
+    Args:
+        file_path (str): The path of the file to check.
+        ignore_patterns (list of str): A list of patterns to check against the file path.
+
+    Returns:
+        bool: True if the file path matches any ignore pattern, False otherwise.
+    """
+    for pattern in ignore_patterns:
+        if fnmatch.fnmatch(file_path, pattern):
+            return True
+    return False
+
+
+def create_ds(
+    file_dir: str, repo_url: str, commit_id: str = None
+) -> List[DoclingDocument]:
+    """
+    Build DoclingDocument objects from a local checkout, one per code file.
+    Deterministic ordering and hashes for use in tests.
+
+    Args:
+        file_dir: Directory containing the repository
+        repo_url: URL of the repository
+        commit_id: Specific commit ID to use; defaults to the checkout's latest commit when omitted
+    """
+    documents: List[DoclingDocument] = []
+    if commit_id is None:
+        commit_id = get_latest_commit_id(file_dir)
+    ignore_file = os.path.join(file_dir, ".ragignore")
+    ignore_patterns = load_ignore_patterns(ignore_file)
+
+    for language, extension in language_to_extension.items():
+        files = [
+            f
+            for f in sorted(glob.glob(f"{file_dir}/**/*{extension}", recursive=True))
+            if not is_ignored(f, ignore_patterns)
+        ]
+        files.sort()
+        for file_path in files:
+            try:
+                with open(file_path, "r", encoding="utf-8") as f:
+                    file_content = f.read()
+
+                file_relative = os.path.relpath(file_path, start=file_dir).replace(
+                    "\\", "/"
+                )
+
+                origin = DocumentOrigin(
+                    filename=file_relative,
+                    uri=(
+                        f"{repo_url}/blob/{commit_id}/{file_relative}"
+                        if commit_id
+                        else f"{repo_url}/{file_relative}"
+                    ),
+                    mimetype="text/plain",
+                    binary_hash=_create_hash(file_content),
+                )
+
+                doc = DoclingDocument(name=file_relative, origin=origin)
+                doc.add_code(
+                    text=file_content, code_language=language_to_enum[language]
+                )
+                documents.append(doc)
+            except Exception:
+                continue
+
+    return documents
diff --git a/uv.lock b/uv.lock
index 160450ab..921be35a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 3
+revision = 2
 requires-python = ">=3.9, <4.0"
 resolution-markers = [
     "python_full_version >= '3.12'",
@@ -737,6 +737,12 @@ dependencies = [
     { name = "pydantic" },
     { name = "pyyaml" },
     { name = "tabulate" },
+    { name = "tree-sitter" },
+    { name = "tree-sitter-c" },
+    { name = "tree-sitter-java" },
+    { name = "tree-sitter-javascript" },
+    { name = "tree-sitter-python" },
+    { name = "tree-sitter-typescript" },
     { name = "typer" },
     { name = "typing-extensions" },
 ]
@@ -788,7 +794,13 @@ requires-dist = [
     { name = "tabulate", specifier = ">=0.9.0,<0.10.0" },
     { name = "tiktoken", marker = "extra == 'chunking-openai'", specifier = ">=0.9.0,<0.13.0" },
     { name = "transformers", marker = "extra == 'chunking'", specifier = ">=4.34.0,<5.0.0" },
-    { name = "typer", specifier = ">=0.12.5,<0.20.0" },
+    { name = "tree-sitter", specifier = "==0.23.2" },
+    { name = "tree-sitter-c", specifier = "==0.23.4" },
+    { name = "tree-sitter-java", specifier = "==0.23.5" },
+    { name = "tree-sitter-javascript", specifier = "==0.23.1" },
+    { name = "tree-sitter-python", specifier = "==0.23.6" },
+    { name = "tree-sitter-typescript", specifier = "==0.23.2" },
+    { name = "typer", specifier = ">=0.12.5,<0.17.0" },
     { name = "typing-extensions", specifier = ">=4.12.2,<5.0.0" },
 ]
 provides-extras = ["chunking", "chunking-openai"]
@@ -3438,6 +3450,129 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/71/d3/c16c3b3cf7655a67db1144da94b021c200ac1303f82428f2beef6c2e72bb/transformers-4.57.1-py3-none-any.whl", hash = 
"sha256:b10d05da8fa67dc41644dbbf9bc45a44cb86ae33da6f9295f5fbf5b7890bd267", size = 11990925, upload-time = "2025-10-14T15:39:23.085Z" }, ] +[[package]] +name = "tree-sitter" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0f/50/fd5fafa42b884f741b28d9e6fd366c3f34e15d2ed3aa9633b34e388379e2/tree-sitter-0.23.2.tar.gz", hash = "sha256:66bae8dd47f1fed7bdef816115146d3a41c39b5c482d7bad36d9ba1def088450", size = 166800, upload-time = "2024-10-24T15:31:02.238Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/04/2068a7b725265ecfcbf63ecdae038f1d4124ebccd55b8a7ce145b70e2b6a/tree_sitter-0.23.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3a937f5d8727bc1c74c4bf2a9d1c25ace049e8628273016ad0d45914ae904e10", size = 139289, upload-time = "2024-10-24T15:29:59.27Z" }, + { url = "https://files.pythonhosted.org/packages/a8/07/a5b943121f674fe1ac77694a698e71ce95353830c1f3f4ce45da7ef3e406/tree_sitter-0.23.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2c7eae7fe2af215645a38660d2d57d257a4c461fe3ec827cca99a79478284e80", size = 132379, upload-time = "2024-10-24T15:30:01.437Z" }, + { url = "https://files.pythonhosted.org/packages/d4/96/fcc72c33d464a2d722db1e95b74a53ced771a47b3cfde60aced29764a783/tree_sitter-0.23.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3a71d607595270b6870eaf778a1032d146b2aa79bfcfa60f57a82a7b7584a4c7", size = 552884, upload-time = "2024-10-24T15:30:02.672Z" }, + { url = "https://files.pythonhosted.org/packages/d0/af/b0e787a52767155b4643a55d6de03c1e4ae77abb61e1dc1629ad983e0a40/tree_sitter-0.23.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fe9b9ea7a0aa23b52fd97354da95d1b2580065bc12a4ac868f9164a127211d6", size = 566561, upload-time = "2024-10-24T15:30:04.073Z" }, + { url = "https://files.pythonhosted.org/packages/65/fd/05e966b5317b1c6679c071c5b0203f28af9d26c9363700cb9682e1bcf343/tree_sitter-0.23.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d74d00a8021719eae14d10d1b1e28649e15d8b958c01c2b2c3dad7a2ebc4dbae", size = 558273, upload-time = "2024-10-24T15:30:06.177Z" }, + { url = "https://files.pythonhosted.org/packages/60/bc/19145efdf3f47711aa3f1bf06f0b50593f97f1108550d38694841fd97b7c/tree_sitter-0.23.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6de18d8d8a7f67ab71f472d1fcb01cc506e080cbb5e13d52929e4b6fdce6bbee", size = 569176, upload-time = "2024-10-24T15:30:07.902Z" }, + { url = "https://files.pythonhosted.org/packages/32/08/3553d8e488ae9284a0762effafb7d2639a306e184963b7f99853923084d6/tree_sitter-0.23.2-cp310-cp310-win_amd64.whl", hash = "sha256:12b60dca70d2282af942b650a6d781be487485454668c7c956338a367b98cdee", size = 117902, upload-time = "2024-10-24T15:30:09.675Z" }, + { url = "https://files.pythonhosted.org/packages/1d/39/836fa485e985c33e8aa1cc3abbf7a84be1c2c382e69547a765631fdd7ce3/tree_sitter-0.23.2-cp310-cp310-win_arm64.whl", hash = "sha256:3346a4dd0447a42aabb863443b0fd8c92b909baf40ed2344fae4b94b625d5955", size = 102644, upload-time = "2024-10-24T15:30:11.484Z" }, + { url = "https://files.pythonhosted.org/packages/55/8d/2d4fb04408772be0919441d66f700673ce7cb76b9ab6682e226d740fb88d/tree_sitter-0.23.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91fda41d4f8824335cc43c64e2c37d8089c8c563bd3900a512d2852d075af719", size = 139142, upload-time = "2024-10-24T15:30:12.627Z" }, + { url = 
"https://files.pythonhosted.org/packages/32/52/b8a44bfff7b0203256e5dbc8d3a372ee8896128b8ed7d3a89e1ef17b2065/tree_sitter-0.23.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:92b2b489d5ce54b41f94c6f23fbaf592bd6e84dc2877048fd1cb060480fa53f7", size = 132198, upload-time = "2024-10-24T15:30:13.893Z" }, + { url = "https://files.pythonhosted.org/packages/5d/54/746f2ee5acf6191a4a0be7f5843329f0d713bfe5196f5fc6fe2ea69cb44c/tree_sitter-0.23.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:64859bd4aa1567d0d6016a811b2b49c59d4a4427d096e3d8c84b2521455f62b7", size = 554303, upload-time = "2024-10-24T15:30:15.334Z" }, + { url = "https://files.pythonhosted.org/packages/2f/5a/3169d9933be813776a9b4b3f2e671d3d50fa27e589dee5578f6ecef7ff6d/tree_sitter-0.23.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:614590611636044e071d3a0b748046d52676dbda3bc9fa431216231e11dd98f7", size = 567626, upload-time = "2024-10-24T15:30:17.12Z" }, + { url = "https://files.pythonhosted.org/packages/32/0d/23f363b3b0bc3fa0e7a4a294bf119957ac1ab02737d57815e1e8b7b3e196/tree_sitter-0.23.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:08466953c78ae57be61057188fb88c89791b0a562856010228e0ccf60e2ac453", size = 559803, upload-time = "2024-10-24T15:30:18.921Z" }, + { url = "https://files.pythonhosted.org/packages/6f/b3/1ffba0f17a7ff2c9114d91a1ecc15e0748f217817797564d31fbb61d7458/tree_sitter-0.23.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8a33f03a562de91f7fd05eefcedd8994a06cd44c62f7aabace811ad82bc11cbd", size = 570987, upload-time = "2024-10-24T15:30:21.116Z" }, + { url = "https://files.pythonhosted.org/packages/59/4b/085bcb8a11ea18003aacc4dbc91c301d1536c5e2deedb95393e8ef26f1f7/tree_sitter-0.23.2-cp311-cp311-win_amd64.whl", hash = "sha256:03b70296b569ef64f7b92b42ca5da9bf86d81bee2afd480bea35092687f51dae", size = 117771, upload-time = "2024-10-24T15:30:22.38Z" }, + { url = "https://files.pythonhosted.org/packages/4b/e5/90adc4081f49ccb6bea89a800dc9b0dcc5b6953b0da423e8eff28f63fddf/tree_sitter-0.23.2-cp311-cp311-win_arm64.whl", hash = "sha256:7cb4bb953ea7c0b50eeafc4454783e030357179d2a93c3dd5ebed2da5588ddd0", size = 102555, upload-time = "2024-10-24T15:30:23.534Z" }, + { url = "https://files.pythonhosted.org/packages/07/a7/57e0fe87b49a78c670a7b4483f70e44c000c65c29b138001096b22e7dd87/tree_sitter-0.23.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a014498b6a9e6003fae8c6eb72f5927d62da9dcb72b28b3ce8cd15c6ff6a6572", size = 139259, upload-time = "2024-10-24T15:30:24.941Z" }, + { url = "https://files.pythonhosted.org/packages/b4/b9/bc8513d818ffb54993a017a36c8739300bc5739a13677acf90b54995e7db/tree_sitter-0.23.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:04f8699b131d4bcbe3805c37e4ef3d159ee9a82a0e700587625623999ba0ea53", size = 131951, upload-time = "2024-10-24T15:30:26.176Z" }, + { url = "https://files.pythonhosted.org/packages/d7/6a/eab01bb6b1ce3c9acf16d72922ffc29a904af485eb3e60baf3a3e04edd30/tree_sitter-0.23.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4471577df285059c71686ecb208bc50fb472099b38dcc8e849b0e86652891e87", size = 557952, upload-time = "2024-10-24T15:30:27.389Z" }, + { url = "https://files.pythonhosted.org/packages/bd/95/f2f73332623cf63200d57800f85273170bc5f99d28ea3f234afd5b0048df/tree_sitter-0.23.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f342c925290dd4e20ecd5787ef7ae8749981597ab364783a1eb73173efe65226", size = 571199, upload-time = "2024-10-24T15:30:28.879Z" }, + { url = 
"https://files.pythonhosted.org/packages/04/ac/bd6e6cfdd0421156e86f5c93848629af1c7323083077e1a95b27d32d5811/tree_sitter-0.23.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a4e9e53d07dd076bede72e4f7d3a0173d7b9ad6576572dd86da008a740a9bb22", size = 562129, upload-time = "2024-10-24T15:30:30.199Z" }, + { url = "https://files.pythonhosted.org/packages/7b/bd/8a9edcbcf8a76b0bf58e3b927ed291e3598e063d56667367762833cc8709/tree_sitter-0.23.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:8caebe65bc358759dac2500d8f8feed3aed939c4ade9a684a1783fe07bc7d5db", size = 574307, upload-time = "2024-10-24T15:30:32.085Z" }, + { url = "https://files.pythonhosted.org/packages/0c/c2/3fb2c6c0ae2f59a7411dc6d3e7945e3cb6f34c8552688708acc8b2b13f83/tree_sitter-0.23.2-cp312-cp312-win_amd64.whl", hash = "sha256:fc5a72eb50d43485000dbbb309acb350467b7467e66dc747c6bb82ce63041582", size = 117858, upload-time = "2024-10-24T15:30:33.353Z" }, + { url = "https://files.pythonhosted.org/packages/e2/18/4ca2c0f4a0c802ebcb3a92264cc436f1d54b394fa24dfa76bf57cdeaca9e/tree_sitter-0.23.2-cp312-cp312-win_arm64.whl", hash = "sha256:a0320eb6c7993359c5f7b371d22719ccd273f440d41cf1bd65dac5e9587f2046", size = 102496, upload-time = "2024-10-24T15:30:34.782Z" }, + { url = "https://files.pythonhosted.org/packages/ba/c6/4ead9ce3113a7c27f37a2bdef163c09757efbaa85adbdfe7b3fbf0317c57/tree_sitter-0.23.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:eff630dddee7ba05accb439b17e559e15ce13f057297007c246237ceb6306332", size = 139266, upload-time = "2024-10-24T15:30:35.946Z" }, + { url = "https://files.pythonhosted.org/packages/76/c9/b4197c5b0c1d6ba648202a547846ac910a53163b69a459504b2aa6cdb76e/tree_sitter-0.23.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4780ba8f3894f2dea869fad2995c2aceab3fd5ab9e6a27c45475d2acd7f7e84e", size = 131959, upload-time = "2024-10-24T15:30:37.646Z" }, + { url = "https://files.pythonhosted.org/packages/99/94/0f7c5580d2adff3b57d36f1998725b0caf6cf1af50ceafc00c6cdbc2fef6/tree_sitter-0.23.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0b609460b8e3e256361fb12e94fae5b728cb835b16f0f9d590b5aadbf9d109b", size = 557582, upload-time = "2024-10-24T15:30:39.019Z" }, + { url = "https://files.pythonhosted.org/packages/97/8a/f73ff06959d43fd47fc283cbcc4d8efa6550b2cc431d852b184504992447/tree_sitter-0.23.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d070d8eaeaeb36cf535f55e5578fddbfc3bf53c1980f58bf1a99d57466b3b5", size = 570891, upload-time = "2024-10-24T15:30:40.432Z" }, + { url = "https://files.pythonhosted.org/packages/b8/86/bbda5ad09b88051ff7bf3275622a2f79bc4f728b4c283ff8b93b8fcdf36d/tree_sitter-0.23.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:878580b2ad5054c410ba3418edca4d34c81cc26706114d8f5b5541688bc2d785", size = 562343, upload-time = "2024-10-24T15:30:43.045Z" }, + { url = "https://files.pythonhosted.org/packages/ca/55/b404fa49cb5c2926ad6fe1cac033dd486ef69f1afeb7828452d21e1e05c1/tree_sitter-0.23.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:29224bdc2a3b9af535b7725e249d3ee291b2e90708e82832e73acc175e40dc48", size = 574407, upload-time = "2024-10-24T15:30:45.018Z" }, + { url = "https://files.pythonhosted.org/packages/c2/c8/eea2104443ab973091107ef3e730683bd8e6cb51dd025cef853d3fff9dae/tree_sitter-0.23.2-cp313-cp313-win_amd64.whl", hash = "sha256:c58d89348162fbc3aea1fe6511a66ee189fc0e4e4bbe937026f29e4ecef17763", size = 117854, upload-time = "2024-10-24T15:30:47.817Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/4d/1728d9ce32a1d851081911b7e47830f5e740431f2bb920f54bb8c26175bc/tree_sitter-0.23.2-cp313-cp313-win_arm64.whl", hash = "sha256:0ff2037be5edab7801de3f6a721b9cf010853f612e2008ee454e0e0badb225a6", size = 102492, upload-time = "2024-10-24T15:30:48.892Z" }, + { url = "https://files.pythonhosted.org/packages/cb/ab/b39173a47d498cc6276e303c865f4a222134ceae890bd3c1b29427489805/tree_sitter-0.23.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a5db8e585205faef8bf219da77d8993e2ef04d08eda2e3c8ad7e4df8297ee344", size = 139550, upload-time = "2024-10-24T15:30:50.516Z" }, + { url = "https://files.pythonhosted.org/packages/4c/34/fa8f5b862dd7a6014fd5578810178e8f7601830cabb6d65d2aba050c2df1/tree_sitter-0.23.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9dbd110a30cf28be5da734ae4cd0e9031768228dbf6a79f2973962aa51de4ec7", size = 132686, upload-time = "2024-10-24T15:30:51.779Z" }, + { url = "https://files.pythonhosted.org/packages/98/b9/ccdddf35705fc23395caa71557f767e0753d38afe4b5bb99efddbf62bb22/tree_sitter-0.23.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:569514b9a996a0fd458b3a891c46ca125298be0c03cf82f2b6f0c13d5d8f25dc", size = 554958, upload-time = "2024-10-24T15:30:53.327Z" }, + { url = "https://files.pythonhosted.org/packages/8d/ba/20ae9079bdfc5cfac28b39d945a6c354c8e1385e73aec8142db6c53b635c/tree_sitter-0.23.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a357ed98a74e47787b812df99a74a2c35c0fe11e55c2095cc01d1cad144ef552", size = 568162, upload-time = "2024-10-24T15:30:54.667Z" }, + { url = "https://files.pythonhosted.org/packages/40/00/b16bf6cf88c47c1b6c8e1cce1eb9e90badb5db9e5252ae0970d858d02592/tree_sitter-0.23.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c2dfb8e8f760f4cc67888d03ef9e2dbd3353245f67f5efba375c2a14d944ac0e", size = 560278, upload-time = "2024-10-24T15:30:56.49Z" }, + { url = "https://files.pythonhosted.org/packages/7a/8f/27ab9b96cc0261af78b080ec8a9846a38e216360ec38774ea27eba35bd3c/tree_sitter-0.23.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3ead958df87a21d706903987e665e9e0e5df7b2c5021ff69ea349826840adc6a", size = 571255, upload-time = "2024-10-24T15:30:58.254Z" }, + { url = "https://files.pythonhosted.org/packages/44/e0/95a3d66a7e5bb229574484ab10c6dc99d1c7a32972b890d194076e30dc4f/tree_sitter-0.23.2-cp39-cp39-win_amd64.whl", hash = "sha256:611cae16be332213c0e6ece72c0bfca202e30ff320a8b309b1526c6cb79ee4ba", size = 118232, upload-time = "2024-10-24T15:30:59.965Z" }, + { url = "https://files.pythonhosted.org/packages/10/b5/9eaf794fc71490573ab14a366affca415bc1ddbf86a14d78e54583db4254/tree_sitter-0.23.2-cp39-cp39-win_arm64.whl", hash = "sha256:b848e0fdd522fbb8888cdb4f4d93f8fad97ae10d70c122fb922e51363c7febcd", size = 102787, upload-time = "2024-10-24T15:31:01.084Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.23.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/27/27/254ebffa4066b3073dddee00c1915893794f5cbf938335c1cc926cd32385/tree_sitter_c-0.23.4.tar.gz", hash = "sha256:9215c7888dd019038f162ea5646178f6e129cd2b49fc506d14becf5e426121d7", size = 223089, upload-time = "2024-12-15T22:24:42.833Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/a9/41e5177fd9309bf142d6772f6885e6a93baa0ad40f17c7a4144ba1275c9c/tree_sitter_c-0.23.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2c92c0571b36b6da06f8882f34151dc11e67a493e9101cc0026a16da27709c05", size = 80812, upload-time = 
"2024-12-15T22:24:26.318Z" }, + { url = "https://files.pythonhosted.org/packages/90/99/cf0a3a8a661fffc7f6843cafbbc1887c47e1a79f751cf9c88002008c8eae/tree_sitter_c-0.23.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:98c285a23bf4fb6fb34140d6ea0f0d25d0a93e0d93692f9dffe3db6d1fe08534", size = 85813, upload-time = "2024-12-15T22:24:28.438Z" }, + { url = "https://files.pythonhosted.org/packages/01/c1/d346a08e05223bff3cea08a8f96d685d19bc2c022fde719bfd3e9f6aaaac/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e42a3519825ca59c91b2b7aec08dd3c89e02690c7b315d54a1e1743f9be3f15", size = 110085, upload-time = "2024-12-15T22:24:30.823Z" }, + { url = "https://files.pythonhosted.org/packages/a8/88/b7d395038b109d42a4682b9f3d72f8e02de8f7c7caf9ad2b289991f1ac19/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c15c7588c3d95872328019073a8d5eaf7c2691b4d4ef0393a0168399b2ad2356", size = 98075, upload-time = "2024-12-15T22:24:32.946Z" }, + { url = "https://files.pythonhosted.org/packages/e8/12/754a8166d3860cdd614bf7d117c94a740ce1ab1ab2ba766321249909e7b1/tree_sitter_c-0.23.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:013403e74765d74e523f380f9df8f3d99e9fe94132a3fc0c8b29cba538a7b2bf", size = 94071, upload-time = "2024-12-15T22:24:34.974Z" }, + { url = "https://files.pythonhosted.org/packages/14/da/2f97b96f081d6ac9b37c87c9d8e5c0ff5948802562ae28b1a58afd8dec1d/tree_sitter_c-0.23.4-cp39-abi3-win_amd64.whl", hash = "sha256:a4d7bdeaca8f1da72352a945853f56aa5d34e7bc22569ec5bda5d7c1a04e5b0f", size = 84483, upload-time = "2024-12-15T22:24:37.052Z" }, + { url = "https://files.pythonhosted.org/packages/d9/33/0d3b72634e2f34e64b07aaf100207cf3d01e32d814e72e144af0a0e785ad/tree_sitter_c-0.23.4-cp39-abi3-win_arm64.whl", hash = "sha256:edd36e12cc79b8b5bbc81fc336ff7d2577d0fe16afd18163c9aff7ae3ff69e15", size = 82482, upload-time = "2024-12-15T22:24:40.758Z" }, +] + +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fa/dc/eb9c8f96304e5d8ae1663126d89967a622a80937ad2909903569ccb7ec8f/tree_sitter_java-0.23.5.tar.gz", hash = "sha256:f5cd57b8f1270a7f0438878750d02ccc79421d45cca65ff284f1527e9ef02e38", size = 138121, upload-time = "2024-12-21T18:24:26.936Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/21/b3399780b440e1567a11d384d0ebb1aea9b642d0d98becf30fa55c0e3a3b/tree_sitter_java-0.23.5-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:355ce0308672d6f7013ec913dee4a0613666f4cda9044a7824240d17f38209df", size = 58926, upload-time = "2024-12-21T18:24:12.53Z" }, + { url = "https://files.pythonhosted.org/packages/57/ef/6406b444e2a93bc72a04e802f4107e9ecf04b8de4a5528830726d210599c/tree_sitter_java-0.23.5-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:24acd59c4720dedad80d548fe4237e43ef2b7a4e94c8549b0ca6e4c4d7bf6e69", size = 62288, upload-time = "2024-12-21T18:24:14.634Z" }, + { url = "https://files.pythonhosted.org/packages/4e/6c/74b1c150d4f69c291ab0b78d5dd1b59712559bbe7e7daf6d8466d483463f/tree_sitter_java-0.23.5-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9401e7271f0b333df39fc8a8336a0caf1b891d9a2b89ddee99fae66b794fc5b7", size = 85533, upload-time = "2024-12-21T18:24:16.695Z" }, + { url = 
"https://files.pythonhosted.org/packages/29/09/e0d08f5c212062fd046db35c1015a2621c2631bc8b4aae5740d7adb276ad/tree_sitter_java-0.23.5-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:370b204b9500b847f6d0c5ad584045831cee69e9a3e4d878535d39e4a7e4c4f1", size = 84033, upload-time = "2024-12-21T18:24:18.758Z" }, + { url = "https://files.pythonhosted.org/packages/43/56/7d06b23ddd09bde816a131aa504ee11a1bbe87c6b62ab9b2ed23849a3382/tree_sitter_java-0.23.5-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:aae84449e330363b55b14a2af0585e4e0dae75eb64ea509b7e5b0e1de536846a", size = 82564, upload-time = "2024-12-21T18:24:20.493Z" }, + { url = "https://files.pythonhosted.org/packages/da/d6/0528c7e1e88a18221dbd8ccee3825bf274b1fa300f745fd74eb343878043/tree_sitter_java-0.23.5-cp39-abi3-win_amd64.whl", hash = "sha256:1ee45e790f8d31d416bc84a09dac2e2c6bc343e89b8a2e1d550513498eedfde7", size = 60650, upload-time = "2024-12-21T18:24:22.902Z" }, + { url = "https://files.pythonhosted.org/packages/72/57/5bab54d23179350356515526fff3cc0f3ac23bfbc1a1d518a15978d4880e/tree_sitter_java-0.23.5-cp39-abi3-win_arm64.whl", hash = "sha256:402efe136104c5603b429dc26c7e75ae14faaca54cfd319ecc41c8f2534750f4", size = 59059, upload-time = "2024-12-21T18:24:24.934Z" }, +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.23.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/dc/1c55c33cc6bbe754359b330534cf9f261c1b9b2c26ddf23aef3c5fa67759/tree_sitter_javascript-0.23.1.tar.gz", hash = "sha256:b2059ce8b150162cda05a457ca3920450adbf915119c04b8c67b5241cd7fcfed", size = 110058, upload-time = "2024-11-10T05:40:42.357Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/20/d3/c67d7d49967344b51208ad19f105233be1afdf07d3dcb35b471900265227/tree_sitter_javascript-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6ca583dad4bd79d3053c310b9f7208cd597fd85f9947e4ab2294658bb5c11e35", size = 59333, upload-time = "2024-11-10T05:40:31.988Z" }, + { url = "https://files.pythonhosted.org/packages/a5/db/ea0ee1547679d1750e80a0c4bc60b3520b166eeaf048764cfdd1ba3fd5e5/tree_sitter_javascript-0.23.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:94100e491a6a247aa4d14caf61230c171b6376c863039b6d9cd71255c2d815ec", size = 61071, upload-time = "2024-11-10T05:40:33.458Z" }, + { url = "https://files.pythonhosted.org/packages/67/6e/07c4857e08be37bfb55bfb269863df8ec908b2f6a3f1893cd852b893ecab/tree_sitter_javascript-0.23.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a6bc1055b061c5055ec58f39ee9b2e9efb8e6e0ae970838af74da0afb811f0a", size = 96999, upload-time = "2024-11-10T05:40:34.869Z" }, + { url = "https://files.pythonhosted.org/packages/5f/f5/4de730afe8b9422845bc2064020a8a8f49ebd1695c04261c38d1b3e3edec/tree_sitter_javascript-0.23.1-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:056dc04fb6b24293f8c5fec43c14e7e16ba2075b3009c643abf8c85edc4c7c3c", size = 94020, upload-time = "2024-11-10T05:40:35.735Z" }, + { url = "https://files.pythonhosted.org/packages/77/0a/f980520da86c4eff8392867840a945578ef43372c9d4a37922baa6b121fe/tree_sitter_javascript-0.23.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a11ca1c0f736da42967586b568dff8a465ee148a986c15ebdc9382806e0ce871", size = 92927, upload-time = "2024-11-10T05:40:37.92Z" }, + { url = 
"https://files.pythonhosted.org/packages/ff/5c/36a98d512aa1d1082409d6b7eda5d26b820bd4477a54100ad9f62212bc55/tree_sitter_javascript-0.23.1-cp39-abi3-win_amd64.whl", hash = "sha256:041fa22b34250ea6eb313d33104d5303f79504cb259d374d691e38bbdc49145b", size = 58824, upload-time = "2024-11-10T05:40:39.903Z" }, + { url = "https://files.pythonhosted.org/packages/dc/79/ceb21988e6de615355a63eebcf806cd2a0fe875bec27b429d58b63e7fb5f/tree_sitter_javascript-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:eb28130cd2fb30d702d614cbf61ef44d1c7f6869e7d864a9cc17111e370be8f7", size = 57027, upload-time = "2024-11-10T05:40:40.841Z" }, +] + +[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1c/30/6766433b31be476fda6569a3a374c2220e45ffee0bff75460038a57bf23b/tree_sitter_python-0.23.6.tar.gz", hash = "sha256:354bfa0a2f9217431764a631516f85173e9711af2c13dbd796a8815acfe505d9", size = 155868, upload-time = "2024-12-22T23:09:55.918Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/67/577a02acae5f776007c924ca86ef14c19c12e71de0aa9d2a036f3c248e7b/tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb", size = 74361, upload-time = "2024-12-22T23:09:42.37Z" }, + { url = "https://files.pythonhosted.org/packages/d2/a6/194b3625a7245c532ad418130d63077ce6cd241152524152f533e4d6edb0/tree_sitter_python-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:680b710051b144fedf61c95197db0094f2245e82551bf7f0c501356333571f7a", size = 76436, upload-time = "2024-12-22T23:09:43.566Z" }, + { url = "https://files.pythonhosted.org/packages/d0/62/1da112689d6d282920e62c40e67ab39ea56463b0e7167bfc5e81818a770e/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a9dcef55507b6567207e8ee0a6b053d0688019b47ff7f26edc1764b7f4dc0a4", size = 112060, upload-time = "2024-12-22T23:09:44.721Z" }, + { url = "https://files.pythonhosted.org/packages/5d/62/c9358584c96e38318d69b6704653684fd8467601f7b74e88aa44f4e6903f/tree_sitter_python-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:29dacdc0cd2f64e55e61d96c6906533ebb2791972bec988450c46cce60092f5d", size = 112338, upload-time = "2024-12-22T23:09:48.323Z" }, + { url = "https://files.pythonhosted.org/packages/1a/58/c5e61add45e34fb8ecbf057c500bae9d96ed7c9ca36edb7985da8ae45526/tree_sitter_python-0.23.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:7e048733c36f564b379831689006801feb267d8194f9e793fbb395ef1723335d", size = 109382, upload-time = "2024-12-22T23:09:49.49Z" }, + { url = "https://files.pythonhosted.org/packages/e9/f3/9b30893cae9b3811fe652dc6f90aaadfda12ae0b2757f5722fc7266f423c/tree_sitter_python-0.23.6-cp39-abi3-win_amd64.whl", hash = "sha256:a24027248399fb41594b696f929f9956828ae7cc85596d9f775e6c239cd0c2be", size = 75904, upload-time = "2024-12-22T23:09:51.597Z" }, + { url = "https://files.pythonhosted.org/packages/87/cb/ce35a65f83a47b510d8a2f1eddf3bdbb0d57aabc87351c8788caf3309f76/tree_sitter_python-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:71334371bd73d5fe080aed39fbff49ed8efb9506edebe16795b0c7567ed6a272", size = 73649, upload-time = "2024-12-22T23:09:53.71Z" }, +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/1e/fc/bb52958f7e399250aee093751e9373a6311cadbe76b6e0d109b853757f35/tree_sitter_typescript-0.23.2.tar.gz", hash = "sha256:7b167b5827c882261cb7a50dfa0fb567975f9b315e87ed87ad0a0a3aedb3834d", size = 773053, upload-time = "2024-11-11T02:36:11.396Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/28/95/4c00680866280e008e81dd621fd4d3f54aa3dad1b76b857a19da1b2cc426/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:3cd752d70d8e5371fdac6a9a4df9d8924b63b6998d268586f7d374c9fba2a478", size = 286677, upload-time = "2024-11-11T02:35:58.839Z" }, + { url = "https://files.pythonhosted.org/packages/8f/2f/1f36fda564518d84593f2740d5905ac127d590baf5c5753cef2a88a89c15/tree_sitter_typescript-0.23.2-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:c7cc1b0ff5d91bac863b0e38b1578d5505e718156c9db577c8baea2557f66de8", size = 302008, upload-time = "2024-11-11T02:36:00.733Z" }, + { url = "https://files.pythonhosted.org/packages/96/2d/975c2dad292aa9994f982eb0b69cc6fda0223e4b6c4ea714550477d8ec3a/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4b1eed5b0b3a8134e86126b00b743d667ec27c63fc9de1b7bb23168803879e31", size = 351987, upload-time = "2024-11-11T02:36:02.669Z" }, + { url = "https://files.pythonhosted.org/packages/49/d1/a71c36da6e2b8a4ed5e2970819b86ef13ba77ac40d9e333cb17df6a2c5db/tree_sitter_typescript-0.23.2-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e96d36b85bcacdeb8ff5c2618d75593ef12ebaf1b4eace3477e2bdb2abb1752c", size = 344960, upload-time = "2024-11-11T02:36:04.443Z" }, + { url = "https://files.pythonhosted.org/packages/7f/cb/f57b149d7beed1a85b8266d0c60ebe4c46e79c9ba56bc17b898e17daf88e/tree_sitter_typescript-0.23.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:8d4f0f9bcb61ad7b7509d49a1565ff2cc363863644a234e1e0fe10960e55aea0", size = 340245, upload-time = "2024-11-11T02:36:06.473Z" }, + { url = "https://files.pythonhosted.org/packages/8b/ab/dd84f0e2337296a5f09749f7b5483215d75c8fa9e33738522e5ed81f7254/tree_sitter_typescript-0.23.2-cp39-abi3-win_amd64.whl", hash = "sha256:3f730b66396bc3e11811e4465c41ee45d9e9edd6de355a58bbbc49fa770da8f9", size = 278015, upload-time = "2024-11-11T02:36:07.631Z" }, + { url = "https://files.pythonhosted.org/packages/9f/e4/81f9a935789233cf412a0ed5fe04c883841d2c8fb0b7e075958a35c65032/tree_sitter_typescript-0.23.2-cp39-abi3-win_arm64.whl", hash = "sha256:05db58f70b95ef0ea126db5560f3775692f609589ed6f8dd0af84b7f19f1cbb7", size = 274052, upload-time = "2024-11-11T02:36:09.514Z" }, +] + [[package]] name = "twine" version = "3.8.0" @@ -3461,7 +3596,7 @@ wheels = [ [[package]] name = "typer" -version = "0.19.2" +version = "0.16.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -3470,9 +3605,9 @@ dependencies = [ { name = "shellingham" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/21/ca/950278884e2ca20547ff3eb109478c6baf6b8cf219318e6bc4f666fad8e8/typer-0.19.2.tar.gz", hash = "sha256:9ad824308ded0ad06cc716434705f691d4ee0bfd0fb081839d2e426860e7fdca", size = 104755, upload-time = "2025-09-23T09:47:48.256Z" } +sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = 
"sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/00/22/35617eee79080a5d071d0f14ad698d325ee6b3bf824fc0467c03b30e7fa8/typer-0.19.2-py3-none-any.whl", hash = "sha256:755e7e19670ffad8283db353267cb81ef252f595aa6834a0d1ca9312d9326cb9", size = 46748, upload-time = "2025-09-23T09:47:46.777Z" }, + { url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, ] [[package]] From 38ed69a85418e741ef5c8e5486d96959edd9140d Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Fri, 3 Oct 2025 09:28:33 -0400 Subject: [PATCH 02/12] DCO Remediation Commit for Bridget McGinn I, Bridget McGinn , hereby add my Signed-off-by to this commit: 334811a4edf0c9a8511a0e3cd31acda2df335d62 Signed-off-by: Bridget McGinn From 0266c6387066cd8e4d45ea02fcb417ade5eedd18 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Fri, 10 Oct 2025 12:35:45 -0400 Subject: [PATCH 03/12] include language detections, add code chunking into hierarchical chunker --- docling_core/transforms/chunker/__init__.py | 14 +- .../transforms/chunker/base_code_chunker.py | 2 +- .../chunker/code_chunk_utils/chunk_utils.py | 2 +- .../chunker/code_chunk_utils/types.py | 35 - .../chunker/code_chunk_utils/utils.py | 12 + .../chunker/code_chunking_strategy.py | 287 ++++++ .../chunker/hierarchical_chunker.py | 78 +- .../chunker/language_code_chunkers.py | 1 + docling_core/types/doc/document.py | 22 +- docling_core/types/doc/labels.py | 13 + test/data/chunker_repo/C/repo_out_chunks.json | 500 +++++---- .../chunker_repo/Java/repo_out_chunks.json | 70 +- .../JavaScript/repo_out_chunks.json | 66 +- .../chunker_repo/Python/repo_out_chunks.json | 946 ++++++++++-------- .../TypeScript/repo_out_chunks.json | 122 ++- test/data/repo_chunking/sample.c | 22 + test/data/repo_chunking/sample.go | 23 + test/data/repo_chunking/sample.java | 25 + test/data/repo_chunking/sample.js | 23 + test/data/repo_chunking/sample.md | 23 + test/data/repo_chunking/sample.py | 25 + test/data/repo_chunking/sample.ts | 20 + test/test_code_chunker.py | 77 +- test/test_code_chunking_strategy.py | 296 ++++++ test/test_utils_repo_ds.py | 140 --- 25 files changed, 1930 insertions(+), 914 deletions(-) delete mode 100644 docling_core/transforms/chunker/code_chunk_utils/types.py create mode 100644 docling_core/transforms/chunker/code_chunking_strategy.py create mode 100644 test/data/repo_chunking/sample.c create mode 100644 test/data/repo_chunking/sample.go create mode 100644 test/data/repo_chunking/sample.java create mode 100644 test/data/repo_chunking/sample.js create mode 100644 test/data/repo_chunking/sample.md create mode 100644 test/data/repo_chunking/sample.py create mode 100644 test/data/repo_chunking/sample.ts create mode 100644 test/test_code_chunking_strategy.py delete mode 100644 test/test_utils_repo_ds.py diff --git a/docling_core/transforms/chunker/__init__.py b/docling_core/transforms/chunker/__init__.py index 8522e75c..a218fb8a 100644 --- a/docling_core/transforms/chunker/__init__.py +++ b/docling_core/transforms/chunker/__init__.py @@ -13,15 +13,21 @@ ChunkSizeProcessor, RangeTracker, ) -from docling_core.transforms.chunker.code_chunk_utils.types import ( +from docling_core.transforms.chunker.code_chunk_utils.utils import 
Language +from docling_core.transforms.chunker.code_chunking_strategy import ( + CodeChunkingStrategyFactory, + DefaultCodeChunkingStrategy, + LanguageDetector, + NoOpCodeChunkingStrategy, +) +from docling_core.transforms.chunker.hierarchical_chunker import ( ChunkType, CodeChunk, + CodeChunkingStrategy, CodeDocMeta, -) -from docling_core.transforms.chunker.code_chunk_utils.utils import Language -from docling_core.transforms.chunker.hierarchical_chunker import ( DocChunk, DocMeta, HierarchicalChunker, ) +from docling_core.transforms.chunker.hybrid_chunker import HybridChunker from docling_core.transforms.chunker.page_chunker import PageChunker diff --git a/docling_core/transforms/chunker/base_code_chunker.py b/docling_core/transforms/chunker/base_code_chunker.py index 75643d64..7c9a8cc2 100644 --- a/docling_core/transforms/chunker/base_code_chunker.py +++ b/docling_core/transforms/chunker/base_code_chunker.py @@ -8,12 +8,12 @@ ChunkSizeProcessor, RangeTracker, ) -from docling_core.transforms.chunker.code_chunk_utils.types import CodeChunk from docling_core.transforms.chunker.code_chunk_utils.utils import ( Language, get_children, to_str, ) +from docling_core.transforms.chunker.hierarchical_chunker import CodeChunk from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer from docling_core.types import DoclingDocument as DLDocument from docling_core.types.doc.labels import DocItemLabel diff --git a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py index e8a6daf3..4756c82e 100644 --- a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py +++ b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py @@ -3,7 +3,7 @@ from tree_sitter import Node -from docling_core.transforms.chunker.code_chunk_utils.types import ( +from docling_core.transforms.chunker.hierarchical_chunker import ( ChunkType, CodeChunk, CodeDocMeta, diff --git a/docling_core/transforms/chunker/code_chunk_utils/types.py b/docling_core/transforms/chunker/code_chunk_utils/types.py deleted file mode 100644 index a272f226..00000000 --- a/docling_core/transforms/chunker/code_chunk_utils/types.py +++ /dev/null @@ -1,35 +0,0 @@ -from enum import Enum -from typing import Optional - -from pydantic import Field - -from docling_core.transforms.chunker.base import BaseChunk, BaseMeta -from docling_core.types.doc.document import DocumentOrigin - - -class CodeDocMeta(BaseMeta): - """Data model for CodeChunker metadata.""" - - part_name: Optional[str] = Field(default=None) - docstring: Optional[str] = Field(default=None) - sha256: Optional[int] = Field(default=None) - start_line: Optional[int] = Field(default=None) - end_line: Optional[int] = Field(default=None) - end_line_signature: Optional[int] = Field(default=None) - origin: Optional[DocumentOrigin] = Field(default=None) - chunk_type: Optional[str] = Field(default=None) - - -class ChunkType(str, Enum): - """Chunk type""" - - FUNCTION = "function" - METHOD = "method" - PREAMBLE = "preamble" - CLASS = "class" - - -class CodeChunk(BaseChunk): - """Data model for code chunks.""" - - meta: CodeDocMeta diff --git a/docling_core/transforms/chunker/code_chunk_utils/utils.py b/docling_core/transforms/chunker/code_chunk_utils/utils.py index 409893f6..58301e19 100644 --- a/docling_core/transforms/chunker/code_chunk_utils/utils.py +++ b/docling_core/transforms/chunker/code_chunk_utils/utils.py @@ -10,6 +10,7 @@ from tree_sitter import Node, Tree from 
docling_core.transforms.chunker.tokenizer.base import BaseTokenizer
+from docling_core.types.doc.labels import CodeLanguageLabel
 
 
 class Language(str, Enum):
@@ -47,6 +48,17 @@ def get_tree_sitter_language(self):
         else:
             return None
 
+    def to_code_language_label(self):
+
+        mapping = {
+            Language.PYTHON: CodeLanguageLabel.PYTHON,
+            Language.JAVA: CodeLanguageLabel.JAVA,
+            Language.C: CodeLanguageLabel.C,
+            Language.TYPESCRIPT: CodeLanguageLabel.TYPESCRIPT,
+            Language.JAVASCRIPT: CodeLanguageLabel.JAVASCRIPT,
+        }
+        return mapping.get(self, CodeLanguageLabel.UNKNOWN)
+
     def get_import_query(self) -> Optional[str]:
         if self == Language.PYTHON:
             return """
diff --git a/docling_core/transforms/chunker/code_chunking_strategy.py b/docling_core/transforms/chunker/code_chunking_strategy.py
new file mode 100644
index 00000000..f78dedfa
--- /dev/null
+++ b/docling_core/transforms/chunker/code_chunking_strategy.py
@@ -0,0 +1,287 @@
+from typing import Any, Dict, Iterator, Optional
+
+from docling_core.transforms.chunker.base_code_chunker import CodeChunker
+from docling_core.transforms.chunker.code_chunk_utils.utils import Language
+from docling_core.transforms.chunker.hierarchical_chunker import (
+    ChunkType,
+    CodeChunk,
+    CodeDocMeta,
+)
+from docling_core.transforms.chunker.language_code_chunkers import (
+    CFunctionChunker,
+    JavaFunctionChunker,
+    JavaScriptFunctionChunker,
+    PythonFunctionChunker,
+    TypeScriptFunctionChunker,
+)
+from docling_core.types.doc.base import Size
+from docling_core.types.doc.document import (
+    CodeItem,
+    DoclingDocument,
+    DocumentOrigin,
+    PageItem,
+)
+from docling_core.utils.legacy import _create_hash
+
+
+class LanguageDetector:
+    """Utility class for detecting programming languages from code content and file extensions."""
+
+    @staticmethod
+    def detect_from_extension(filename: Optional[str]) -> Optional[Language]:
+        """Detect language from file extension."""
+
+        if not filename:
+            return None
+
+        filename_lower = filename.lower()
+
+        for language in Language:
+            for ext in language.file_extensions():
+                if filename_lower.endswith(ext):
+                    return language
+        return None
+
+    @staticmethod
+    def detect_from_content(code_text: str) -> Optional[Language]:
+        """Detect language from code content using heuristics."""
+
+        if not code_text:
+            return None
+
+        code_lower = code_text.lower().strip()
+
+        if any(
+            pattern in code_lower
+            for pattern in [
+                "def ",
+                "import ",
+                "from ",
+                'if __name__ == "__main__"',
+                "print(",
+                "lambda ",
+                "yield ",
+                "async def",
+            ]
+        ) and not any(
+            pattern in code_lower
+            for pattern in ["public class", "private ", "protected ", "package "]
+        ):
+            return Language.PYTHON
+
+        if any(
+            pattern in code_lower
+            for pattern in [
+                "package main",
+                "func main()",
+                'import "fmt"',
+                'import "os"',
+                "chan ",
+                "interface{}",
+                "go func",
+                "defer ",
+                ":= ",
+            ]
+        ) and not any(
+            pattern in code_lower
+            for pattern in [
+                "public class",
+                "import java.",
+                "system.out.println",
+                "extends ",
+                "implements ",
+            ]
+        ):
+            # Looks like Go, for which no chunker exists here; report no detection
+            return None
+
+        if any(
+            pattern in code_lower
+            for pattern in [
+                "public class",
+                "package ",
+                "import java.",
+                "public static void main",
+                "extends ",
+                "implements ",
+                "String[]",
+                "system.out.println",
+            ]
+        ) and not any(
+            pattern in code_lower
+            for pattern in ["package main", "func main()", "chan ", "interface{}"]
+        ):
+            return Language.JAVA
+
"protected ", + ] + ): + return Language.TYPESCRIPT + + if any( + pattern in code_lower + for pattern in [ + "function ", + "const ", + "let ", + "var ", + "=>", + "require(", + "module.exports", + "export ", + "import ", + "console.log", + ] + ): + return Language.JAVASCRIPT + + if any( + pattern in code_lower + for pattern in [ + "#include", + "int main(", + "void ", + "char ", + "float ", + "double ", + "struct ", + "#define", + "printf(", + "scanf(", + ] + ): + return Language.C + + return None + + @staticmethod + def detect_language( + code_text: str, filename: Optional[str] = None + ) -> Optional[Language]: + """Detect language from both filename and content.""" + + if filename: + lang = LanguageDetector.detect_from_extension(filename) + if lang: + return lang + return None + + return LanguageDetector.detect_from_content(code_text) + + +class CodeChunkingStrategyFactory: + """Factory for creating language-specific code chunking strategies.""" + + @staticmethod + def create_chunker(language: Language, **kwargs: Any) -> CodeChunker: + """Create a language-specific code chunker.""" + + chunker_map = { + Language.PYTHON: PythonFunctionChunker, + Language.TYPESCRIPT: TypeScriptFunctionChunker, + Language.JAVASCRIPT: JavaScriptFunctionChunker, + Language.C: CFunctionChunker, + Language.JAVA: JavaFunctionChunker, + } + + chunker_class = chunker_map.get(language) + if not chunker_class: + raise ValueError(f"No chunker available for language: {language}") + + return chunker_class(**kwargs) + + +class DefaultCodeChunkingStrategy: + """Default implementation of CodeChunkingStrategy that uses language detection and appropriate chunkers.""" + + def __init__(self, **chunker_kwargs: Any): + """Initialize the strategy with optional chunker parameters.""" + + self.chunker_kwargs = chunker_kwargs + self._chunker_cache: Dict[Language, CodeChunker] = {} + + def _get_chunker(self, language: Language) -> CodeChunker: + """Get or create a chunker for the given language.""" + + if language not in self._chunker_cache: + self._chunker_cache[language] = CodeChunkingStrategyFactory.create_chunker( + language, **self.chunker_kwargs + ) + return self._chunker_cache[language] + + def chunk_code_item( + self, + code_text: str, + language: Language, + original_doc=None, + original_item=None, + **kwargs: Any, + ) -> Iterator[CodeChunk]: + """Chunk a single code item using the appropriate language chunker.""" + + if not code_text.strip(): + return + + chunker = self._get_chunker(language) + + if original_doc and original_doc.origin: + filename = original_doc.origin.filename or "code_chunk" + mimetype = original_doc.origin.mimetype or "text/plain" + binary_hash = _create_hash(code_text) + else: + filename = "code_chunk" + mimetype = "text/plain" + binary_hash = _create_hash(code_text) + + if original_item and hasattr(original_item, "self_ref"): + self_ref = original_item.self_ref + else: + self_ref = "#/texts/0" + + code_item = CodeItem(text=code_text, self_ref=self_ref, orig=code_text) + + doc = DoclingDocument( + name=filename, + texts=[code_item], + pages={0: PageItem(page_no=0, size=Size(width=612.0, height=792.0))}, + origin=DocumentOrigin( + filename=filename, mimetype=mimetype, binary_hash=binary_hash + ), + ) + + yield from chunker.chunk(doc, **kwargs) + + +class NoOpCodeChunkingStrategy: + """No-operation code chunking strategy that returns the original code as a single chunk.""" + + def chunk_code_item( + self, + code_text: str, + language: Language, + original_doc=None, + original_item=None, + **kwargs: Any, 
+class NoOpCodeChunkingStrategy:
+    """No-operation code chunking strategy that returns the original code as a single chunk."""
+
+    def chunk_code_item(
+        self,
+        code_text: str,
+        language: Language,
+        original_doc=None,
+        original_item=None,
+        **kwargs: Any,
+    ) -> Iterator[CodeChunk]:
+        """Return the code as a single chunk without further processing."""
+
+        if not code_text.strip():
+            return
+
+        meta = CodeDocMeta(
+            chunk_type=ChunkType.CODE_BLOCK,
+            start_line=1,
+            end_line=len(code_text.splitlines()),
+        )
+
+        yield CodeChunk(text=code_text, meta=meta)
diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py
index b429a51b..be0e6ad4 100644
--- a/docling_core/transforms/chunker/hierarchical_chunker.py
+++ b/docling_core/transforms/chunker/hierarchical_chunker.py
@@ -9,7 +9,8 @@
 import logging
 import re
-from typing import Any, ClassVar, Final, Iterator, Literal, Optional
+from enum import Enum
+from typing import Any, ClassVar, Final, Iterator, Literal, Optional, Protocol
 
 from pydantic import ConfigDict, Field, StringConstraints, field_validator
 from typing_extensions import Annotated, override
@@ -40,6 +41,7 @@
     TableItem,
     TitleItem,
 )
+from docling_core.types.doc.labels import DocItemLabel
 
 _VERSION: Final = "1.0.0"
 
@@ -116,6 +118,45 @@ def check_version_is_compatible(cls, v: str) -> str:
         return _VERSION
 
 
+class CodeDocMeta(DocMeta):
+    """Data model for CodeChunker metadata."""
+
+    doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS)
+    part_name: Optional[str] = Field(default=None)
+    docstring: Optional[str] = Field(default=None)
+    sha256: Optional[int] = Field(default=None)
+    start_line: Optional[int] = Field(default=None)
+    end_line: Optional[int] = Field(default=None)
+    end_line_signature: Optional[int] = Field(default=None)
+    chunk_type: Optional[str] = Field(default=None)
+
+
+class CodeChunk(BaseChunk):
+    """Data model for code chunks."""
+
+    meta: CodeDocMeta
+
+
+class ChunkType(str, Enum):
+    """Chunk type"""
+
+    FUNCTION = "function"
+    METHOD = "method"
+    PREAMBLE = "preamble"
+    CLASS = "class"
+    CODE_BLOCK = "code_block"
+
+
+class CodeChunkingStrategy(Protocol):
+    """Protocol for code chunking strategies that can be plugged into HierarchicalChunker."""
+
+    def chunk_code_item(
+        self, code_text: str, language: Any, **kwargs: Any
+    ) -> Iterator[CodeChunk]:
+        """Chunk a single code item."""
+        ...
+
+
 class DocChunk(BaseChunk):
     """Data model for document chunks."""
 
@@ -199,11 +240,15 @@ class HierarchicalChunker(BaseChunker):
         merge_list_items (bool): Whether to merge successive list items.
             Defaults to True.
         delim (str): Delimiter to use for merging text. Defaults to "\n".
+        code_chunking_strategy (CodeChunkingStrategy): Optional strategy for chunking code items.
+            If provided, code items will be processed using this strategy instead of being
+            treated as regular text. Defaults to None (no special code processing).
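+
+        A minimal usage sketch (illustrative only; assumes the tree-sitter
+        dependencies are installed and reuses the max_tokens value from the
+        test suite):
+
+            from docling_core.transforms.chunker.code_chunking_strategy import (
+                DefaultCodeChunkingStrategy,
+            )
+
+            chunker = HierarchicalChunker(
+                code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
+            )
+            code_chunks = list(chunker.chunk(dl_doc=doc))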
""" model_config = ConfigDict(arbitrary_types_allowed=True) serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() + code_chunking_strategy: Optional[Any] = Field(default=None) # deprecated: merge_list_items: Annotated[bool, Field(deprecated=True)] = True @@ -242,6 +287,37 @@ def chunk( isinstance(item, (ListGroup, InlineGroup, DocItem)) and item.self_ref not in visited ): + if ( + isinstance(item, DocItem) + and hasattr(item, "label") + and item.label == DocItemLabel.CODE + and self.code_chunking_strategy is not None + ): + + from docling_core.transforms.chunker.code_chunking_strategy import ( + LanguageDetector, + ) + + language = LanguageDetector.detect_language( + item.text, + ( + getattr(dl_doc.origin, "filename", None) + if dl_doc.origin + else None + ), + ) + + if language: + for code_chunk in self.code_chunking_strategy.chunk_code_item( + item.text, + language, + original_doc=dl_doc, + original_item=item, + **kwargs, + ): + yield code_chunk + continue + ser_res = my_doc_ser.serialize(item=item, visited=visited) else: continue diff --git a/docling_core/transforms/chunker/language_code_chunkers.py b/docling_core/transforms/chunker/language_code_chunkers.py index c97430e5..45c488ea 100644 --- a/docling_core/transforms/chunker/language_code_chunkers.py +++ b/docling_core/transforms/chunker/language_code_chunkers.py @@ -450,6 +450,7 @@ def __init__(self, **data): @override def _file_prefix(self, root_node: Node) -> Tuple[str, List[Tuple[int, int]]]: used_ranges = [] + prefix = "" for child in root_node.children: if child.type == self.package_declaration: prefix = to_str(child).strip() + "\n" diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 45d8611b..344c0d73 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -2827,7 +2827,7 @@ def add_code( """add_code. 
:param text: str: - :param code_language: Optional[str]: (Default value = None) + :param code_language: Optional[CodeLanguageLabel]: (Default value = None) :param orig: Optional[str]: (Default value = None) :param caption: Optional[Union[TextItem: :param RefItem]]: (Default value = None) @@ -2840,6 +2840,23 @@ def add_code( if not orig: orig = text + if code_language is None: + try: + from docling_core.transforms.chunker.code_chunking_strategy import ( + LanguageDetector, + ) + + detected_language = LanguageDetector.detect_language( + text, + getattr(self.origin, "filename", None) if self.origin else None, + ) + if detected_language: + code_language = detected_language.to_code_language_label() + else: + code_language = CodeLanguageLabel.UNKNOWN + except ImportError: + code_language = CodeLanguageLabel.UNKNOWN + text_index = len(self.texts) cref = f"#/texts/{text_index}" code_item = CodeItem( @@ -2850,8 +2867,7 @@ def add_code( formatting=formatting, hyperlink=hyperlink, ) - if code_language: - code_item.code_language = code_language + code_item.code_language = code_language if content_layer: code_item.content_layer = content_layer if prov: diff --git a/docling_core/types/doc/labels.py b/docling_core/types/doc/labels.py index e5884bcb..b387004e 100644 --- a/docling_core/types/doc/labels.py +++ b/docling_core/types/doc/labels.py @@ -264,3 +264,16 @@ class CodeLanguageLabel(str, Enum): def __str__(self): """Get string value.""" return str(self.value) + + def to_language(self): + """Convert CodeLanguageLabel to Language enum.""" + from docling_core.transforms.chunker.code_chunk_utils.utils import Language + + mapping = { + CodeLanguageLabel.PYTHON: Language.PYTHON, + CodeLanguageLabel.JAVA: Language.JAVA, + CodeLanguageLabel.C: Language.C, + CodeLanguageLabel.TYPESCRIPT: Language.TYPESCRIPT, + CodeLanguageLabel.JAVASCRIPT: Language.JAVASCRIPT, + } + return mapping.get(self, None) diff --git a/test/data/chunker_repo/C/repo_out_chunks.json b/test/data/chunker_repo/C/repo_out_chunks.json index f8628120..ced1e8e6 100644 --- a/test/data/chunker_repo/C/repo_out_chunks.json +++ b/test/data/chunker_repo/C/repo_out_chunks.json @@ -3,657 +3,731 @@ { "text": "\nstatic void string_replace_all_occurrences_with_char(char *s, const char *occur, char repl_char)\n{\n\tsize_t slen = strlen(s);\n\tsize_t skip = strlen(occur) - 1; /* length of the occurrence, minus the char we're replacing */\n\tchar *p = s;\n\twhile ((p = strstr(p, occur)))\n\t{\n\t\t*p = repl_char;\n\t\tp++;\n\t\tslen -= skip;\n\t\tmemmove(p, (p + skip), slen - (p - s) + 1); /* includes null char too */\n\t}\n}", "meta": { - "part_name": "string_replace_all_occurrences_with_char", - "docstring": "/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */", - "sha256": 1117482735928585729815737415012422172962871245598, - "start_line": 31, - "end_line": 43, - "end_line_signature": 32, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "string_replace_all_occurrences_with_char", + "docstring": "/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */", + "sha256": 1117482735928585729815737415012422172962871245598, + "start_line": 31, + "end_line": 43, + "end_line_signature": 32, "chunk_type": "function" } }, { "text": "/* Avoid 
ctype.h and locale overhead */\n#define is_plain_digit(c) ((c) >= '0' && (c) <= '9')\nstatic int is_valid_index(const char *path, size_t *idx)\n{\n\tsize_t i, len = strlen(path);\n\t/* this code-path optimizes a bit, for when we reference the 0-9 index range\n\t * in a JSON array and because leading zeros not allowed\n\t */\n\tif (len == 1)\n\t{\n\t\tif (is_plain_digit(path[0]))\n\t\t{\n\t\t\t*idx = (path[0] - '0');\n\t\t\treturn 1;\n\t\t}\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* leading zeros not allowed per RFC */\n\tif (path[0] == '0')\n\t{\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* RFC states base-10 decimals */\n\tfor (i = 0; i < len; i++)\n\t{\n\t\tif (!is_plain_digit(path[i]))\n\t\t{\n\t\t\terrno = EINVAL;\n\t\t\treturn 0;\n\t\t}\n\t}\n\n\t// We know it's all digits, so the only error case here is overflow,\n\t// but ULLONG_MAX will be longer than any array length so that's ok.\n\t*idx = strtoull(path, NULL, 10);\n\n\treturn 1;\n}", "meta": { - "part_name": "is_valid_index", - "docstring": "", - "sha256": 234354095953395323597807168380238510580195482334, - "start_line": 45, - "end_line": 82, - "end_line_signature": 46, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "is_valid_index", + "docstring": "", + "sha256": 234354095953395323597807168380238510580195482334, + "start_line": 45, + "end_line": 82, + "end_line_signature": 46, "chunk_type": "function" } }, { "text": "\nstatic int json_pointer_get_single_path(struct json_object *obj, char *path,\n struct json_object **value, size_t *idx)\n{\n\tif (json_object_is_type(obj, json_type_array))\n\t{\n\t\tif (!is_valid_index(path, idx))\n\t\t\treturn -1;\n\t\tif (*idx >= json_object_array_length(obj))\n\t\t{\n\t\t\terrno = ENOENT;\n\t\t\treturn -1;\n\t\t}\n\n\t\tobj = json_object_array_get_idx(obj, *idx);\n\t\tif (obj)\n\t\t{\n\t\t\tif (value)\n\t\t\t\t*value = obj;\n\t\t\treturn 0;\n\t\t}\n\t\t/* Entry not found */\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\t/* RFC states that we first must eval all ~1 then all ~0 */\n\tstring_replace_all_occurrences_with_char(path, \"~1\", '/');\n\tstring_replace_all_occurrences_with_char(path, \"~0\", '~');\n\n\tif (!json_object_object_get_ex(obj, path, value))\n\t{\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\treturn 0;\n}", "meta": { - "part_name": "json_pointer_get_single_path", - "docstring": "", - "sha256": 85913314315132048628912722197929586436214235955, - "start_line": 84, - "end_line": 120, - "end_line_signature": 86, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_get_single_path", + "docstring": "", + "sha256": 85913314315132048628912722197929586436214235955, + "start_line": 84, + "end_line": 120, + "end_line_signature": 86, "chunk_type": "function" } }, { "text": "\nstatic int json_object_array_put_idx_cb(struct json_object *parent, size_t idx,\n\t\t\t\t\tstruct json_object *value, void *priv)\n{\n\treturn json_object_array_put_idx(parent, idx, value);\n}", "meta": { - "part_name": "json_object_array_put_idx_cb", - "docstring": "", - "sha256": 515670096298758350505203262066130806756892931374, - "start_line": 
122, - "end_line": 126, - "end_line_signature": 124, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_object_array_put_idx_cb", + "docstring": "", + "sha256": 515670096298758350505203262066130806756892931374, + "start_line": 122, + "end_line": 126, + "end_line_signature": 124, "chunk_type": "function" } }, { "text": "\nstatic int json_pointer_set_single_path(struct json_object *parent, const char *path,\n struct json_object *value,\n\t\t\t\t\tjson_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tif (json_object_is_type(parent, json_type_array))\n\t{\n\t\tsize_t idx;\n\t\t/* RFC (Chapter 4) states that '-' may be used to add new elements to an array */\n\t\tif (path[0] == '-' && path[1] == '\\0')\n\t\t\treturn json_object_array_add(parent, value);\n\t\tif (!is_valid_index(path, &idx))\n\t\t\treturn -1;\n\t\treturn array_set_cb(parent, idx, value, priv);\n\t}\n\n\t/* path replacements should have been done in json_pointer_get_single_path(),\n\t * and we should still be good here\n\t */\n\tif (json_object_is_type(parent, json_type_object))\n\t\treturn json_object_object_add(parent, path, value);\n\n\t/* Getting here means that we tried to \"dereference\" a primitive JSON type\n\t * (like string, int, bool).i.e. add a sub-object to it\n\t */\n\terrno = ENOENT;\n\treturn -1;\n}", "meta": { - "part_name": "json_pointer_set_single_path", - "docstring": "", - "sha256": 744226804185536688172092186408538018610881378934, - "start_line": 128, - "end_line": 154, - "end_line_signature": 131, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_set_single_path", + "docstring": "", + "sha256": 744226804185536688172092186408538018610881378934, + "start_line": 128, + "end_line": 154, + "end_line_signature": 131, "chunk_type": "function" } }, { "text": "\nstatic int json_pointer_result_get_recursive(struct json_object *obj, char *path,\n struct json_pointer_get_result *res)\n{\n\tstruct json_object *parent_obj = obj;\n\tsize_t idx = 0;\n\tchar *endp;\n\tint rc;\n\n\t/* All paths (on each recursion level must have a leading '/' */\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\tpath++;\n\n\tendp = strchr(path, '/');\n\tif (endp)\n\t\t*endp = '\\0';\n\n\t/* If we err-ed here, return here */\n\tif ((rc = json_pointer_get_single_path(obj, path, &obj, &idx)))\n\t\treturn rc;\n\n\tif (endp)\n\t{\n\t\t/* Put the slash back, so that the sanity check passes on next recursion level */\n\t\t*endp = '/';\n\t\treturn json_pointer_result_get_recursive(obj, endp, res);\n\t}\n\n\t/* We should be at the end of the recursion here */\n\tif (res) {\n\t\tres->parent = parent_obj;\n\t\tres->obj = obj;\n\t\tif (json_object_is_type(res->parent, json_type_array))\n\t\t\tres->index_in_parent = idx;\n\t\telse\n\t\t\tres->key_in_parent = path;\n\t}\n\n\treturn 0;\n}", "meta": { - "part_name": "json_pointer_result_get_recursive", - "docstring": "", - "sha256": 518298991245464116417798779750096461462494486587, - "start_line": 156, - "end_line": 198, - "end_line_signature": 158, + "schema_name": "docling_core.transforms.chunker.DocMeta", + 
"version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_result_get_recursive", + "docstring": "", + "sha256": 518298991245464116417798779750096461462494486587, + "start_line": 156, + "end_line": 198, + "end_line_signature": 158, "chunk_type": "function" } }, { "text": "\nstatic int json_pointer_object_get_recursive(struct json_object *obj, char *path,\n struct json_object **value)\n{\n\tstruct json_pointer_get_result res;\n\tint rc;\n\n\trc = json_pointer_result_get_recursive(obj, path, &res);\n\tif (rc)\n\t\treturn rc;\n\n\tif (value)\n\t\t*value = res.obj;\n\n\treturn 0;\n}", "meta": { - "part_name": "json_pointer_object_get_recursive", - "docstring": "", - "sha256": 1217293748232453207346015288718037001737705783321, - "start_line": 200, - "end_line": 214, - "end_line_signature": 202, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_object_get_recursive", + "docstring": "", + "sha256": 1217293748232453207346015288718037001737705783321, + "start_line": 200, + "end_line": 214, + "end_line_signature": 202, "chunk_type": "function" } }, { "text": "\nint json_pointer_get_internal(struct json_object *obj, const char *path,\n struct json_pointer_get_result *res)\n{\n\tchar *path_copy = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == '\\0')\n\t{\n\t\tres->parent = NULL;\n\t\tres->obj = obj;\n\t\tres->key_in_parent = NULL;\n\t\tres->index_in_parent = UINT32_MAX;\n\t\treturn 0;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\trc = json_pointer_result_get_recursive(obj, path_copy, res);\n\t/* re-map the path string to the const-path string */\n\tif (rc == 0 && json_object_is_type(res->parent, json_type_object) && res->key_in_parent)\n\t\tres->key_in_parent = path + (res->key_in_parent - path_copy);\n\tfree(path_copy);\n\n\treturn rc;\n}", "meta": { - "part_name": "json_pointer_get_internal", - "docstring": "", - "sha256": 196996869167588750666460162571361715333822997162, - "start_line": 216, - "end_line": 250, - "end_line_signature": 218, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_get_internal", + "docstring": "", + "sha256": 196996869167588750666460162571361715333822997162, + "start_line": 216, + "end_line": 250, + "end_line_signature": 218, "chunk_type": "function" } }, { "text": "\nint json_pointer_get(struct json_object *obj, const char *path, struct json_object **res)\n{\n\tstruct json_pointer_get_result jpres;\n\tint rc;\n\n\trc = json_pointer_get_internal(obj, path, &jpres);\n\tif (rc)\n\t\treturn rc;\n\n\tif (res)\n\t\t*res = jpres.obj;\n\n\treturn 0;\n}", "meta": { - "part_name": "json_pointer_get", - "docstring": "", - "sha256": 463273473259540096316239720380761023977777440343, - "start_line": 252, - "end_line": 265, - "end_line_signature": 253, + "schema_name": 
"docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_get", + "docstring": "", + "sha256": 463273473259540096316239720380761023977777440343, + "start_line": 252, + "end_line": 265, + "end_line_signature": 253, "chunk_type": "function" } }, { "text": "\nint json_pointer_getf(struct json_object *obj, struct json_object **res, const char *path_fmt, ...)\n{\n\tchar *path_copy = NULL;\n\tint rc = 0;\n\tva_list args;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tif (res)\n\t\t\t*res = obj;\n\t\tgoto out;\n\t}\n\n\trc = json_pointer_object_get_recursive(obj, path_copy, res);\nout:\n\tfree(path_copy);\n\n\treturn rc;\n}", "meta": { - "part_name": "json_pointer_getf", - "docstring": "", - "sha256": 924347282411192461265505339007126264782988122151, - "start_line": 267, - "end_line": 298, - "end_line_signature": 268, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_getf", + "docstring": "", + "sha256": 924347282411192461265505339007126264782988122151, + "start_line": 267, + "end_line": 298, + "end_line_signature": 268, "chunk_type": "function" } }, { "text": "\nint json_pointer_set_with_array_cb(struct json_object **obj, const char *path,\n\t\t\t\t struct json_object *value,\n\t\t\t\t json_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tconst char *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\treturn 0;\n\t}\n\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path, '/')) == path)\n\t{\n\t\tpath++;\n\t\treturn json_pointer_set_single_path(*obj, path, value, array_set_cb, priv);\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\tpath_copy[endp - path] = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\tfree(path_copy);\n\n\tif (rc)\n\t\treturn rc;\n\n\tendp++;\n\treturn json_pointer_set_single_path(set, endp, value, array_set_cb, priv);\n}", "meta": { - "part_name": "json_pointer_set_with_array_cb", - "docstring": "", - "sha256": 262882567182967450570864701845081576454846403316, - "start_line": 300, - "end_line": 350, - "end_line_signature": 303, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_set_with_array_cb", + "docstring": "", + "sha256": 262882567182967450570864701845081576454846403316, + "start_line": 300, + "end_line": 350, + "end_line_signature": 303, "chunk_type": "function" } }, { "text": "\nint 
json_pointer_set(struct json_object **obj, const char *path, struct json_object *value)\n{\n\treturn json_pointer_set_with_array_cb(obj, path, value, json_object_array_put_idx_cb, NULL);\n}", "meta": { - "part_name": "json_pointer_set", - "docstring": "", - "sha256": 23353041941238655187424843169018755147284430949, - "start_line": 352, - "end_line": 355, - "end_line_signature": 353, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_set", + "docstring": "", + "sha256": 23353041941238655187424843169018755147284430949, + "start_line": 352, + "end_line": 355, + "end_line_signature": 353, "chunk_type": "function" } }, { "text": "\nint json_pointer_setf(struct json_object **obj, struct json_object *value, const char *path_fmt,\n ...)\n{\n\tchar *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tva_list args;\n\tint rc = 0;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\tgoto out;\n\t}\n\n\tif (path_copy[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\trc = -1;\n\t\tgoto out;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path_copy, '/')) == path_copy)\n\t{\n\t\tset = *obj;\n\t\tgoto set_single_path;\n\t}\n\n\t*endp = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\n\tif (rc)\n\t\tgoto out;\n\nset_single_path:\n\tendp++;\n\trc = json_pointer_set_single_path(set, endp, value,\n\t\t\t\t\t json_object_array_put_idx_cb, NULL);\nout:\n\tfree(path_copy);\n\treturn rc;\n}", "meta": { - "part_name": "json_pointer_setf", - "docstring": "", - "sha256": 278409402010463874805413705333555052224985193220, - "start_line": 357, - "end_line": 414, - "end_line_signature": 359, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "part_name": "json_pointer_setf", + "docstring": "", + "sha256": 278409402010463874805413705333555052224985193220, + "start_line": 357, + "end_line": 414, + "end_line_signature": 359, "chunk_type": "function" } }, { "text": "#include \"config.h\"\n\n#include \"strerror_override.h\"\n\n#include \n#include \n#include \n#include \n\n#include \"json_object_private.h\"\n#include \"json_pointer.h\"\n#include \"json_pointer_private.h\"\n#include \"strdup_compat.h\"\n#include \"vasprintf_compat.h\"\n\n/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */", "meta": { - "sha256": 1217234116973748366829093199878078246801755936207, - "start_line": 7, - "end_line": 31, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 3389072908273760774, "filename": "json_pointer.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/json_pointer.c" }, + "sha256": 1217234116973748366829093199878078246801755936207, + "start_line": 7, + "end_line": 31, "chunk_type": 
"preamble" } }, { "text": "/* hash functions */\nstatic unsigned long lh_char_hash(const void *k);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_perllike_str_hash(const void *k);\nint json_global_set_string_hash(const int h)\n{\n\tswitch (h)\n\t{\n\tcase JSON_C_STR_HASH_DFLT: char_hash_fn = lh_char_hash; break;\n\tcase JSON_C_STR_HASH_PERLLIKE: char_hash_fn = lh_perllike_str_hash; break;\n\tdefault: return -1;\n\t}\n\treturn 0;\n}", "meta": { - "part_name": "json_global_set_string_hash", - "docstring": "", - "sha256": 998221257334549775068212743280296040491437343457, - "start_line": 45, - "end_line": 54, - "end_line_signature": 46, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "json_global_set_string_hash", + "docstring": "", + "sha256": 998221257334549775068212743280296040491437343457, + "start_line": 45, + "end_line": 54, + "end_line_signature": 46, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_ptr_hash(const void *k)\n{\n\t/* CAW: refactored to be 64bit nice */\n\treturn (unsigned long)((((ptrdiff_t)k * LH_PRIME) >> 4) & ULONG_MAX);\n}", "meta": { - "part_name": "lh_ptr_hash", - "docstring": "", - "sha256": 1293894620828796812611104590645246395957873389975, - "start_line": 56, - "end_line": 60, - "end_line_signature": 57, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_ptr_hash", + "docstring": "", + "sha256": 1293894620828796812611104590645246395957873389975, + "start_line": 56, + "end_line": 60, + "end_line_signature": 57, "chunk_type": "function" } }, { "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_ptr_equal(const void *k1, const void *k2)\n{\n\treturn (k1 == k2);\n}", "meta": { - "part_name": "lh_ptr_equal", - "docstring": "", - "sha256": 561855950266729137986207467028016863357001140681, - "start_line": 62, - "end_line": 65, - "end_line_signature": 63, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_ptr_equal", + "docstring": "", + "sha256": 561855950266729137986207467028016863357001140681, + "start_line": 62, + "end_line": 65, + "end_line_signature": 63, "chunk_type": "function" } }, { "text": "#define HASH_LITTLE_ENDIAN 0\n/*\n-------------------------------------------------------------------------------\nmix -- mix 3 32-bit values reversibly.\n\nThis is reversible, so any information in (a,b,c) before mix() is\nstill in (a,b,c) after mix().\n\nIf four pairs of (a,b,c) inputs are run through mix(), or through\nmix() in reverse, there are at least 32 bits of the output that\nare sometimes the same for one pair and different for another pair.\nThis was tested for:\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. 
For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nSome k values for my \"a-=c; a^=rot(c,k); c+=b;\" arrangement that\nsatisfy this are\n 4 6 8 16 19 4\n 9 15 3 18 27 15\n 14 9 3 7 17 3\nWell, \"9 15 3 18 27 15\" didn't quite get 32 bits diffing\nfor \"differ\" defined as + with a one-bit base and a two-bit delta. I\nused https://burtleburtle.net/bob/hash/avalanche.html to choose\nthe operations, constants, and arrangements of the variables.\n\nThis does not achieve avalanche. There are input bits of (a,b,c)\nthat fail to affect some output bits of (a,b,c), especially of a. The\nmost thoroughly mixed value is c, but it doesn't really even achieve\navalanche in c.\n\nThis allows some parallelism. Read-after-writes are good at doubling\nthe number of bits affected, so the goal of mixing pulls in the opposite\ndirection as the goal of parallelism. I did what I could. Rotates\nseem to cost as much as shifts on every machine I could lay my hands\non, and rotates are much kinder to the top and bottom bits, so I used\nrotates.\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define mix(a,b,c) \\\n{ \\\n\ta -= c; a ^= rot(c, 4); c += b; \\\n\tb -= a; b ^= rot(a, 6); a += c; \\\n\tc -= b; c ^= rot(b, 8); b += a; \\\n\ta -= c; a ^= rot(c,16); c += b; \\\n\tb -= a; b ^= rot(a,19); a += c; \\\n\tc -= b; c ^= rot(b, 4); b += a; \\\n}\n/* clang-format on *//*\n-------------------------------------------------------------------------------\nfinal -- final mixing of 3 32-bit values (a,b,c) into c\n\nPairs of (a,b,c) values differing in only a few bits will usually\nproduce values of c that look totally different. This was tested for\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. 
For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nThese constants passed:\n 14 11 25 16 4 14 24\n 12 14 25 16 4 14 24\nand these came close:\n 4 8 15 26 3 22 24\n 10 8 15 26 3 22 24\n 11 8 15 26 3 22 24\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define final(a,b,c) \\\n{ \\\n\tc ^= b; c -= rot(b,14); \\\n\ta ^= c; a -= rot(c,11); \\\n\tb ^= a; b -= rot(a,25); \\\n\tc ^= b; c -= rot(b,16); \\\n\ta ^= c; a -= rot(c,4); \\\n\tb ^= a; b -= rot(a,14); \\\n\tc ^= b; c -= rot(b,24); \\\n}\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic uint32_t hashlittle(const void *key, size_t length, uint32_t initval)\n{\n\tuint32_t a,b,c; /* internal state */\n\tunion\n\t{\n\t\tconst void *ptr;\n\t\tsize_t i;\n\t} u; /* needed for Mac Powerbook G4 */\n\n\t/* Set up the internal state */\n\ta = b = c = 0xdeadbeef + ((uint32_t)length) + initval;\n\n\tu.ptr = key;\n\tif (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {\n\t\tconst uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */\n\n\t\t/*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\tb += k[1];\n\t\t\tc += k[2];\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 3;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\t/*\n\t\t * \"k[2]&0xffffff\" actually reads beyond the end of the string, but\n\t\t * then masks off the part it's not allowed to read. Because the\n\t\t * string is aligned, the masked-off tail is in the same word as the\n\t\t * rest of the string. Every machine with memory protection I've seen\n\t\t * does it on word boundaries, so is OK with this. But VALGRIND will\n\t\t * still catch it and complain. The masking trick does make the hash\n\t\t * noticeably faster for short strings (like English words).\n\t\t * AddressSanitizer is similarly picky about overrunning\n\t\t * the buffer. 
(https://clang.llvm.org/docs/AddressSanitizer.html)\n\t\t */\n#ifdef VALGRIND\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__SANITIZE_ADDRESS__) /* GCC's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__has_feature)\n#if __has_feature(address_sanitizer) /* Clang's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#endif\n#endif\n#ifndef PRECISE_MEMORY_ACCESS\n\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=k[1]&0xffffff; a+=k[0]; break;\n\t\tcase 6 : b+=k[1]&0xffff; a+=k[0]; break;\n\t\tcase 5 : b+=k[1]&0xff; a+=k[0]; break;\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=k[0]&0xffffff; break;\n\t\tcase 2 : a+=k[0]&0xffff; break;\n\t\tcase 1 : a+=k[0]&0xff; break;\n\t\tcase 0 : return c; /* zero length strings require no mixing */\n\t\t}\n\n#else /* make valgrind happy */\n\n\t\tconst uint8_t *k8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=((uint32_t)k8[9])<<8; /* fall through */\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=((uint32_t)k8[5])<<8; /* fall through */\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=((uint32_t)k8[1])<<8; /* fall through */\n\t\tcase 1 : a+=k8[0]; break;\n\t\tcase 0 : return c;\n\t\t}\n\n#endif /* !valgrind */\n\n\t}\n\telse if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0))\n\t{\n\t\tconst uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */\n\t\tconst uint8_t *k8;\n\n\t\t/*--------------- all but last block: aligned reads and different mixing */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0] + (((uint32_t)k[1])<<16);\n\t\t\tb += k[2] + (((uint32_t)k[3])<<16);\n\t\t\tc += k[4] + (((uint32_t)k[5])<<16);\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 6;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\tk8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[4]+(((uint32_t)k[5])<<16);\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=k[4];\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=k[2];\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=k[0];\n\t\t\t break;\n\t\tcase 1 : a+=k8[0];\n\t\t\t break;\n\t\tcase 0 : return c; /* zero length requires no mixing */\n\t\t}\n\n\t}\n\telse\n\t{\n\t\t/* need to read the key one byte at a time */\n\t\tconst uint8_t *k = (const uint8_t *)key;\n\n\t\t/*--------------- all but the last block: affect some 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\ta += 
((uint32_t)k[1])<<8;\n\t\t\ta += ((uint32_t)k[2])<<16;\n\t\t\ta += ((uint32_t)k[3])<<24;\n\t\t\tb += k[4];\n\t\t\tb += ((uint32_t)k[5])<<8;\n\t\t\tb += ((uint32_t)k[6])<<16;\n\t\t\tb += ((uint32_t)k[7])<<24;\n\t\t\tc += k[8];\n\t\t\tc += ((uint32_t)k[9])<<8;\n\t\t\tc += ((uint32_t)k[10])<<16;\n\t\t\tc += ((uint32_t)k[11])<<24;\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 12;\n\t\t}\n\n\t\t/*-------------------------------- last block: affect all 32 bits of (c) */\n\t\tswitch(length) /* all the case statements fall through */\n\t\t{\n\t\tcase 12: c+=((uint32_t)k[11])<<24; /* FALLTHRU */\n\t\tcase 11: c+=((uint32_t)k[10])<<16; /* FALLTHRU */\n\t\tcase 10: c+=((uint32_t)k[9])<<8; /* FALLTHRU */\n\t\tcase 9 : c+=k[8]; /* FALLTHRU */\n\t\tcase 8 : b+=((uint32_t)k[7])<<24; /* FALLTHRU */\n\t\tcase 7 : b+=((uint32_t)k[6])<<16; /* FALLTHRU */\n\t\tcase 6 : b+=((uint32_t)k[5])<<8; /* FALLTHRU */\n\t\tcase 5 : b+=k[4]; /* FALLTHRU */\n\t\tcase 4 : a+=((uint32_t)k[3])<<24; /* FALLTHRU */\n\t\tcase 3 : a+=((uint32_t)k[2])<<16; /* FALLTHRU */\n\t\tcase 2 : a+=((uint32_t)k[1])<<8; /* FALLTHRU */\n\t\tcase 1 : a+=k[0];\n\t\t\t break;\n\t\tcase 0 : return c;\n\t\t}\n\t}\n\n\tfinal(a,b,c);\n\treturn c;\n}", "meta": { - "part_name": "hashlittle", - "docstring": "/* clang-format off *//*\n-------------------------------------------------------------------------------\nhashlittle() -- hash a variable-length key into a 32-bit value\n k : the key (the unaligned variable-length array of bytes)\n length : the length of the key, counting by bytes\n initval : can be any 4-byte value\nReturns a 32-bit value. Every bit of the key affects every bit of\nthe return value. Two keys differing by one or two bits will have\ntotally different hash values.\n\nThe best hash table sizes are powers of 2. There is no need to do\nmod a prime (mod is sooo slow!). If you need less than 32 bits,\nuse a bitmask. 
For example, if you need only 10 bits, do\n h = (h & hashmask(10));\nIn which case, the hash table should have hashsize(10) elements.\n\nIf you are hashing n strings (uint8_t **)k, do it like this:\n for (i=0, h=0; i 0);\n\tt = (struct lh_table *)calloc(1, sizeof(struct lh_table));\n\tif (!t)\n\t\treturn NULL;\n\n\tt->count = 0;\n\tt->size = size;\n\tt->table = (struct lh_entry *)calloc(size, sizeof(struct lh_entry));\n\tif (!t->table)\n\t{\n\t\tfree(t);\n\t\treturn NULL;\n\t}\n\tt->free_fn = free_fn;\n\tt->hash_fn = hash_fn;\n\tt->equal_fn = equal_fn;\n\tfor (i = 0; i < size; i++)\n\t\tt->table[i].k = LH_EMPTY;\n\treturn t;\n}", "meta": { - "part_name": "lh_table_new(int size, lh_entry_free_fn *free_fn, lh_hash_fn *hash_fn,\n lh_equal_fn *equal_fn)", - "docstring": "", - "sha256": 1353978832949898607839900715285314979449043361401, - "start_line": 498, - "end_line": 524, - "end_line_signature": 500, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_new(int size, lh_entry_free_fn *free_fn, lh_hash_fn *hash_fn,\n lh_equal_fn *equal_fn)", + "docstring": "", + "sha256": 1353978832949898607839900715285314979449043361401, + "start_line": 498, + "end_line": 524, + "end_line_signature": 500, "chunk_type": "function" } }, { "text": "/* comparison functions */\nint lh_char_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, char_hash_fn, lh_char_equal);\n}", "meta": { - "part_name": "lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)", - "docstring": "", - "sha256": 1143372083340073080177483187079554398021595080815, - "start_line": 526, - "end_line": 529, - "end_line_signature": 527, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)", + "docstring": "", + "sha256": 1143372083340073080177483187079554398021595080815, + "start_line": 526, + "end_line": 529, + "end_line_signature": 527, "chunk_type": "function" } }, { "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, lh_ptr_hash, lh_ptr_equal);\n}", "meta": { - "part_name": "lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)", - "docstring": "", - "sha256": 5889006246199618862907451444252129237803738713, - "start_line": 531, - "end_line": 534, - "end_line_signature": 532, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)", + "docstring": "", + "sha256": 5889006246199618862907451444252129237803738713, + "start_line": 531, + "end_line": 534, + "end_line_signature": 532, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = 
lh_char_hash;\nint lh_table_resize(struct lh_table *t, int new_size)\n{\n\tstruct lh_table *new_t;\n\tstruct lh_entry *ent;\n\n\tnew_t = lh_table_new(new_size, NULL, t->hash_fn, t->equal_fn);\n\tif (new_t == NULL)\n\t\treturn -1;\n\n\tfor (ent = t->head; ent != NULL; ent = ent->next)\n\t{\n\t\tunsigned long h = lh_get_hash(new_t, ent->k);\n\t\tunsigned int opts = 0;\n\t\tif (ent->k_is_constant)\n\t\t\topts = JSON_C_OBJECT_ADD_CONSTANT_KEY;\n\t\tif (lh_table_insert_w_hash(new_t, ent->k, ent->v, h, opts) != 0)\n\t\t{\n\t\t\tlh_table_free(new_t);\n\t\t\treturn -1;\n\t\t}\n\t}\n\tfree(t->table);\n\tt->table = new_t->table;\n\tt->size = new_size;\n\tt->head = new_t->head;\n\tt->tail = new_t->tail;\n\tfree(new_t);\n\n\treturn 0;\n}", "meta": { - "part_name": "lh_table_resize", - "docstring": "", - "sha256": 513675985135766203593761374923281677204896804938, - "start_line": 536, - "end_line": 565, - "end_line_signature": 537, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_resize", + "docstring": "", + "sha256": 513675985135766203593761374923281677204896804938, + "start_line": 536, + "end_line": 565, + "end_line_signature": 537, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nvoid lh_table_free(struct lh_table *t)\n{\n\tstruct lh_entry *c;\n\tif (t->free_fn)\n\t{\n\t\tfor (c = t->head; c != NULL; c = c->next)\n\t\t\tt->free_fn(c);\n\t}\n\tfree(t->table);\n\tfree(t);\n}", "meta": { - "part_name": "lh_table_free", - "docstring": "", - "sha256": 1461287464285525024180362726694195219608876587842, - "start_line": 567, - "end_line": 577, - "end_line_signature": 568, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_free", + "docstring": "", + "sha256": 1461287464285525024180362726694195219608876587842, + "start_line": 567, + "end_line": 577, + "end_line_signature": 568, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert_w_hash(struct lh_table *t, const void *k, const void *v, const unsigned long h,\n const unsigned opts)\n{\n\tunsigned long n;\n\n\tif (t->count >= t->size * LH_LOAD_FACTOR)\n\t{\n\t\t/* Avoid signed integer overflow with large tables. */\n\t\tint new_size = (t->size > INT_MAX / 2) ? 
INT_MAX : (t->size * 2);\n\t\tif (t->size == INT_MAX || lh_table_resize(t, new_size) != 0)\n\t\t\treturn -1;\n\t}\n\n\tn = h % t->size;\n\n\twhile (1)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\t\tbreak;\n\t\tif ((int)++n == t->size)\n\t\t\tn = 0;\n\t}\n\n\tt->table[n].k = k;\n\tt->table[n].k_is_constant = (opts & JSON_C_OBJECT_ADD_CONSTANT_KEY);\n\tt->table[n].v = v;\n\tt->count++;\n\n\tif (t->head == NULL)\n\t{\n\t\tt->head = t->tail = &t->table[n];\n\t\tt->table[n].next = t->table[n].prev = NULL;\n\t}\n\telse\n\t{\n\t\tt->tail->next = &t->table[n];\n\t\tt->table[n].prev = t->tail;\n\t\tt->table[n].next = NULL;\n\t\tt->tail = &t->table[n];\n\t}\n\n\treturn 0;\n}", "meta": { - "part_name": "lh_table_insert_w_hash", - "docstring": "", - "sha256": 1457272684322346273275672024449638738347377045483, - "start_line": 579, - "end_line": 621, - "end_line_signature": 581, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_insert_w_hash", + "docstring": "", + "sha256": 1457272684322346273275672024449638738347377045483, + "start_line": 579, + "end_line": 621, + "end_line_signature": 581, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert(struct lh_table *t, const void *k, const void *v)\n{\n\treturn lh_table_insert_w_hash(t, k, v, lh_get_hash(t, k), 0);\n}", "meta": { - "part_name": "lh_table_insert", - "docstring": "", - "sha256": 914976466412251973023999660063657611060463868578, - "start_line": 622, - "end_line": 625, - "end_line_signature": 623, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_insert", + "docstring": "", + "sha256": 914976466412251973023999660063657611060463868578, + "start_line": 622, + "end_line": 625, + "end_line_signature": 623, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)\n{\n\tunsigned long n = h % t->size;\n\tint count = 0;\n\n\twhile (count < t->size)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY)\n\t\t\treturn NULL;\n\t\tif (t->table[n].k != LH_FREED && t->equal_fn(t->table[n].k, k))\n\t\t\treturn &t->table[n];\n\t\tif ((int)++n == t->size)\n\t\t\tn = 0;\n\t\tcount++;\n\t}\n\treturn NULL;\n}", "meta": { - "part_name": "lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)", - "docstring": "", - "sha256": 1235017991348899387013216690716133251889003151179, - "start_line": 627, - "end_line": 644, - "end_line_signature": 629, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)", + "docstring": "", + "sha256": 1235017991348899387013216690716133251889003151179, + "start_line": 627, + "end_line": 644, + "end_line_signature": 629, "chunk_type": "function" 
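The fixture entries in these expected-output files all share one shape: a chunk "text" plus CodeDocMeta fields (origin, part_name, docstring, sha256, start_line/end_line, end_line_signature, chunk_type). A small sketch of how such a file can be inspected; the path is one of the fixtures added here, and the top-level container is handled generically since the hunk starts at line 3 and the enclosing structure is not shown:

    import json
    from collections import Counter
    from pathlib import Path

    # Fixture path from this patch; layout assumption hedged below.
    data = json.loads(Path("test/data/chunker_repo/C/repo_out_chunks.json").read_text())
    entries = data if isinstance(data, list) else next(iter(data.values()))

    # Each entry pairs the chunk text with its CodeDocMeta fields.
    print(Counter(e["meta"]["chunk_type"] for e in entries))
    print(sorted({e["meta"]["origin"]["filename"] for e in entries}))
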
} }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry(struct lh_table *t, const void *k)\n{\n\treturn lh_table_lookup_entry_w_hash(t, k, lh_get_hash(t, k));\n}", "meta": { - "part_name": "lh_table_lookup_entry(struct lh_table *t, const void *k)", - "docstring": "", - "sha256": 1030652463340488651179217356281066519111633079656, - "start_line": 646, - "end_line": 649, - "end_line_signature": 647, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_lookup_entry(struct lh_table *t, const void *k)", + "docstring": "", + "sha256": 1030652463340488651179217356281066519111633079656, + "start_line": 646, + "end_line": 649, + "end_line_signature": 647, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\njson_bool lh_table_lookup_ex(struct lh_table *t, const void *k, void **v)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (e != NULL)\n\t{\n\t\tif (v != NULL)\n\t\t\t*v = lh_entry_v(e);\n\t\treturn 1; /* key found */\n\t}\n\tif (v != NULL)\n\t\t*v = NULL;\n\treturn 0; /* key not found */\n}", "meta": { - "part_name": "lh_table_lookup_ex", - "docstring": "", - "sha256": 634848787249761171541292303028190495411132430955, - "start_line": 651, - "end_line": 663, - "end_line_signature": 652, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_lookup_ex", + "docstring": "", + "sha256": 634848787249761171541292303028190495411132430955, + "start_line": 651, + "end_line": 663, + "end_line_signature": 652, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete_entry(struct lh_table *t, struct lh_entry *e)\n{\n\t/* CAW: fixed to be 64bit nice, still need the crazy negative case... */\n\tptrdiff_t n = (ptrdiff_t)(e - t->table);\n\n\t/* CAW: this is bad, really bad, maybe stack goes other direction on this machine... 
*/\n\tif (n < 0)\n\t{\n\t\treturn -2;\n\t}\n\n\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\treturn -1;\n\tt->count--;\n\tif (t->free_fn)\n\t\tt->free_fn(e);\n\tt->table[n].v = NULL;\n\tt->table[n].k = LH_FREED;\n\tif (t->tail == &t->table[n] && t->head == &t->table[n])\n\t{\n\t\tt->head = t->tail = NULL;\n\t}\n\telse if (t->head == &t->table[n])\n\t{\n\t\tt->head->next->prev = NULL;\n\t\tt->head = t->head->next;\n\t}\n\telse if (t->tail == &t->table[n])\n\t{\n\t\tt->tail->prev->next = NULL;\n\t\tt->tail = t->tail->prev;\n\t}\n\telse\n\t{\n\t\tt->table[n].prev->next = t->table[n].next;\n\t\tt->table[n].next->prev = t->table[n].prev;\n\t}\n\tt->table[n].next = t->table[n].prev = NULL;\n\treturn 0;\n}", "meta": { - "part_name": "lh_table_delete_entry", - "docstring": "", - "sha256": 951281510321322595326628350604221188298138063502, - "start_line": 665, - "end_line": 704, - "end_line_signature": 666, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_delete_entry", + "docstring": "", + "sha256": 951281510321322595326628350604221188298138063502, + "start_line": 665, + "end_line": 704, + "end_line_signature": 666, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete(struct lh_table *t, const void *k)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (!e)\n\t\treturn -1;\n\treturn lh_table_delete_entry(t, e);\n}", "meta": { - "part_name": "lh_table_delete", - "docstring": "", - "sha256": 44220499716621050420359222788414516841562449761, - "start_line": 706, - "end_line": 712, - "end_line_signature": 707, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_delete", + "docstring": "", + "sha256": 44220499716621050420359222788414516841562449761, + "start_line": 706, + "end_line": 712, + "end_line_signature": 707, "chunk_type": "function" } }, { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_length(struct lh_table *t)\n{\n\treturn t->count;\n}", "meta": { - "part_name": "lh_table_length", - "docstring": "", - "sha256": 719864955613574534766865380227650980669430095114, - "start_line": 714, - "end_line": 717, - "end_line_signature": 715, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12737018040358436176, "filename": "linkhash.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/linkhash.c" }, + "part_name": "lh_table_length", + "docstring": "", + "sha256": 719864955613574534766865380227650980669430095114, + "start_line": 714, + "end_line": 717, + "end_line_signature": 715, "chunk_type": "function" } }, { "text": "#include \"config.h\"\n\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n\n#ifdef HAVE_ENDIAN_H\n#include /* attempt to define endianness */\n#endif\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\n#ifndef WIN32_LEAN_AND_MEAN\n#define WIN32_LEAN_AND_MEAN\n#endif\n#include /* Get InterlockedCompareExchange */\n#endif\n\n#include \"linkhash.h\"\n#include \"random_seed.h\"\n\n/*\n * hashlittle from 
lookup3.c, by Bob Jenkins, May 2006, Public Domain.\n * https://burtleburtle.net/bob/c/lookup3.c\n * minor modifications to make functions static so no symbols are exported\n * minor modifications to compile with -Werror\n */\n\n/*\n-------------------------------------------------------------------------------\nlookup3.c, by Bob Jenkins, May 2006, Public Domain.\n\nThese are functions for producing 32-bit hashes for hash table lookup.\nhashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()\nare externally useful functions. Routines to test the hash are included\nif SELF_TEST is defined. You can use this free for any purpose. It's in\nthe public domain. It has no warranty.\n\nYou probably want to use hashlittle(). hashlittle() and hashbig()\nhash byte arrays. hashlittle() is faster than hashbig() on\nlittle-endian machines. Intel and AMD are little-endian machines.\nOn second thought, you probably want hashlittle2(), which is identical to\nhashlittle() except it returns two 32-bit hashes for the price of one.\nYou could implement hashbig2() if you wanted but I haven't bothered here.\n\nIf you want to find a hash of, say, exactly 7 integers, do\n a = i1; b = i2; c = i3;\n mix(a,b,c);\n a += i4; b += i5; c += i6;\n mix(a,b,c);\n a += i7;\n final(a,b,c);\nthen use c as the hash value. If you have a variable length array of\n4-byte integers to hash, use hashword(). If you have a byte array (like\na character string), use hashlittle(). If you have several byte arrays, or\na mix of things, see the comments above hashlittle().\n\nWhy is this so big? I read 12 bytes at a time into 3 4-byte integers,\nthen mix those integers. This is fast (you can do a lot more thorough\nmixing with 12*3 instructions on 3 integers than you can with 3 instructions\non 1 byte), but shoehorning those bytes into integers efficiently is messy.\n-------------------------------------------------------------------------------\n*/\n\n/*\n * My best guess at if you are big-endian or little-endian. This may\n * need adjustment.\n */\n#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \\\n (defined(i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || \\\n defined(__i686__) || defined(vax) || defined(MIPSEL))\n#define HASH_LITTLE_ENDIAN 1\n#define HASH_BIG_ENDIAN 0\n#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \\\n (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))\n#define HASH_LITTLE_ENDIAN 0\n#define HASH_BIG_ENDIAN 1\n#else\n\n#define HASH_BIG_ENDIAN 0\n#endif\n\n#define hashsize(n) ((uint32_t)1 << (n))\n#define hashmask(n) (hashsize(n) - 1)\n#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))\n\n/* clang-format on */\n\n/*\n-------------------------------------------------------------------------------\nhashlittle() -- hash a variable-length key into a 32-bit value\n k : the key (the unaligned variable-length array of bytes)\n length : the length of the key, counting by bytes\n initval : can be any 4-byte value\nReturns a 32-bit value. Every bit of the key affects every bit of\nthe return value. Two keys differing by one or two bits will have\ntotally different hash values.\n\nThe best hash table sizes are powers of 2. There is no need to do\nmod a prime (mod is sooo slow!). If you need less than 32 bits,\nuse a bitmask. 
For example, if you need only 10 bits, do\n h = (h & hashmask(10));\nIn which case, the hash table should have hashsize(10) elements.\n\nIf you are hashing n strings (uint8_t **)k, do it like this:\n for (i=0, h=0; i= 10; errno_in /= 10, ii++)\n\t{\n\t\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\t}\n\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\n\t// Reverse the digits\n\tfor (start_idx = sizeof(PREFIX) - 1; ii >= 0; ii--, start_idx++)\n\t{\n\t\terrno_buf[start_idx] = digbuf[ii];\n\t}\n\terrno_buf[start_idx] = '\\0';\n\treturn errno_buf;\n}", "meta": { - "part_name": "_json_c_strerror(int errno_in)", - "docstring": "", - "sha256": 70696874837601163637337209327435673270874963588, - "start_line": 66, - "end_line": 109, - "end_line_signature": 67, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 14386364040007058020, "filename": "strerror_override.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/strerror_override.c" }, + "part_name": "_json_c_strerror(int errno_in)", + "docstring": "", + "sha256": 70696874837601163637337209327435673270874963588, + "start_line": 66, + "end_line": 109, + "end_line_signature": 67, "chunk_type": "function" } }, { "text": "#define STRERROR_OVERRIDE_IMPL 1\n#include \"strerror_override.h\"", "meta": { - "sha256": 1284269608364859541196511107996937073581407786389, - "start_line": 1, - "end_line": 3, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 14386364040007058020, "filename": "strerror_override.c", "uri": "https://github.com/json-c/json-c/blob/abc123def456/strerror_override.c" }, + "sha256": 1284269608364859541196511107996937073581407786389, + "start_line": 1, + "end_line": 3, "chunk_type": "preamble" } } diff --git a/test/data/chunker_repo/Java/repo_out_chunks.json b/test/data/chunker_repo/Java/repo_out_chunks.json index 693630e4..55e0125c 100644 --- a/test/data/chunker_repo/Java/repo_out_chunks.json +++ b/test/data/chunker_repo/Java/repo_out_chunks.json @@ -3,90 +3,100 @@ { "text": "package com.acmeair;\n\npublic interface AcmeAirConstants {\n\n\t\n}", "meta": { - "part_name": "AcmeAirConstants", - "docstring": "", - "sha256": 802233348002430704981298093765030369309512619867, - "start_line": 2, - "end_line": 5, - "end_line_signature": 5, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 4721786809665574388, "filename": "AcmeAirConstants.java", "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/AcmeAirConstants.java" }, + "part_name": "AcmeAirConstants", + "docstring": "", + "sha256": 802233348002430704981298093765030369309512619867, + "start_line": 2, + "end_line": 5, + "end_line_signature": 5, "chunk_type": "class" } }, { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.Customer.PhoneType;\nimport com.acmeair.entities.Customer;\nimport com.acmeair.entities.CustomerAddress;\nimport com.acmeair.service.CustomerService;\nimport com.acmeair.service.ServiceLocator;\npublic class CustomerLoader {\n\n private CustomerService customerService = ServiceLocator.instance().getService(CustomerService.class);} public void loadCustomers(long numCustomers) {\n\t\tCustomerAddress address = customerService.createAddress(\"123 Main St.\", null, \"Anytown\", \"NC\", \"USA\", \"27617\");\n\t\tfor (long ii = 0; ii < numCustomers; ii++) 
{\n\t\t\tcustomerService.createCustomer(\"uid\"+ii+\"@email.com\", \"password\", Customer.MemberShipStatus.GOLD, 1000000, 1000, \"919-123-4567\", PhoneType.BUSINESS, address);\n\t\t}\n\t}", "meta": { - "part_name": "loadCustomers", - "docstring": "", - "sha256": 216694729768327235646074424271588504079937437501, - "start_line": 29, - "end_line": 34, - "end_line_signature": 29, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 16652446628586613798, "filename": "CustomerLoader.java", "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/CustomerLoader.java" }, + "part_name": "loadCustomers", + "docstring": "", + "sha256": 216694729768327235646074424271588504079937437501, + "start_line": 29, + "end_line": 34, + "end_line_signature": 29, "chunk_type": "function" } }, { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.io.InputStream;\nimport java.io.InputStreamReader;\nimport java.io.LineNumberReader;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} public void loadFlights() throws Exception {\n\t\tInputStream csvInputStream = FlightLoader.class.getResourceAsStream(\"/mileage.csv\");\n\t\t\n\t\tLineNumberReader lnr = new LineNumberReader(new InputStreamReader(csvInputStream));\n\t\tString line1 = lnr.readLine();\n\t\tStringTokenizer st = new StringTokenizer(line1, \",\");\n\t\tArrayList airports = new ArrayList();\n\t\t\n\t\t// read the first line which are airport names\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(null, st.nextToken());\n\t\t//\tacm.setAirportName(st.nextToken());\n\t\t\tairports.add(acm);\n\t\t}\n\t\t// read the second line which contains matching airport codes for the first line\n\t\tString line2 = lnr.readLine();\n\t\tst = new StringTokenizer(line2, \",\");\n\t\tint ii = 0;\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tString airportCode = st.nextToken();\n\t\t\tairports.get(ii).setAirportCode(airportCode);\n\t\t\tii++;\n\t\t}\n\t\t// read the other lines which are of format:\n\t\t// airport name, aiport code, distance from this airport to whatever airport is in the column from lines one and two\n\t\tString line;\n\t\tint flightNumber = 0;\n\t\twhile (true) {\n\t\t\tline = lnr.readLine();\n\t\t\tif (line == null || line.trim().equals(\"\")) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tst = new StringTokenizer(line, \",\");\n\t\t\tString airportName = st.nextToken();\n\t\t\tString airportCode = st.nextToken();\n\t\t\tif (!alreadyInCollection(airportCode, airports)) {\n\t\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(airportCode, airportName);\n\t\t\t\tairports.add(acm);\n\t\t\t}\n\t\t\tint indexIntoTopLine = 0;\n\t\t\twhile (st.hasMoreTokens()) {\n\t\t\t\tString milesString = st.nextToken();\n\t\t\t\tif (milesString.equals(\"NA\")) {\n\t\t\t\t\tindexIntoTopLine++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\tint miles = Integer.parseInt(milesString);\n\t\t\t\tString toAirport = airports.get(indexIntoTopLine).getAirportCode();\n\t\t\t\tString flightId = \"AA\" + flightNumber;\t\t\t\n\t\t\t\tflightService.storeFlightSegment(flightId, airportCode, toAirport, miles);\n\t\t\t\tDate now = new Date();\n\t\t\t\tfor (int 
daysFromNow = 0; daysFromNow < MAX_FLIGHTS_PER_SEGMENT; daysFromNow++) {\n\t\t\t\t\tCalendar c = Calendar.getInstance();\n\t\t\t\t\tc.setTime(now);\n\t\t\t\t\tc.set(Calendar.HOUR_OF_DAY, 0);\n\t\t\t\t c.set(Calendar.MINUTE, 0);\n\t\t\t\t c.set(Calendar.SECOND, 0);\n\t\t\t\t c.set(Calendar.MILLISECOND, 0);\n\t\t\t\t\tc.add(Calendar.DATE, daysFromNow);\n\t\t\t\t\tDate departureTime = c.getTime();\n\t\t\t\t\tDate arrivalTime = getArrivalTime(departureTime, miles);\n\t\t\t\t\tflightService.createNewFlight(flightId, departureTime, arrivalTime, new BigDecimal(500), new BigDecimal(200), 10, 200, \"B747\");\n\t\t\t\t\t\n\t\t\t\t}\n\t\t\t\tflightNumber++;\n\t\t\t\tindexIntoTopLine++;\n\t\t\t}\n\t\t}\n\t\t\n\t\tfor (int jj = 0; jj < airports.size(); jj++) {\n\t\t\tflightService.storeAirportMapping(airports.get(jj));\n\t\t}\n\t\tlnr.close();\n\t}", "meta": { - "part_name": "loadFlights", - "docstring": "", - "sha256": 1402513010551547847601046795871134853016900337764, - "start_line": 37, - "end_line": 110, - "end_line_signature": 37, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13929779701984022643, "filename": "FlightLoader.java", "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" }, + "part_name": "loadFlights", + "docstring": "", + "sha256": 1402513010551547847601046795871134853016900337764, + "start_line": 37, + "end_line": 110, + "end_line_signature": 37, "chunk_type": "function" } }, { "text": "package com.acmeair.loader;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} private static Date getArrivalTime(Date departureTime, int mileage) {\n\t\tdouble averageSpeed = 600.0; // 600 miles/hours\n\t\tdouble hours = (double) mileage / averageSpeed; // miles / miles/hour = hours\n\t\tdouble partsOfHour = hours % 1.0;\n\t\tint minutes = (int)(60.0 * partsOfHour);\n\t\tCalendar c = Calendar.getInstance();\n\t\tc.setTime(departureTime);\n\t\tc.add(Calendar.HOUR, (int)hours);\n\t\tc.add(Calendar.MINUTE, minutes);\n\t\treturn c.getTime();\n\t}", "meta": { - "part_name": "getArrivalTime", - "docstring": "", - "sha256": 498682806925350255052209347840878724203772545481, - "start_line": 112, - "end_line": 122, - "end_line_signature": 112, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13929779701984022643, "filename": "FlightLoader.java", "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" }, + "part_name": "getArrivalTime", + "docstring": "", + "sha256": 498682806925350255052209347840878724203772545481, + "start_line": 112, + "end_line": 122, + "end_line_signature": 112, "chunk_type": "function" } }, { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} static private boolean alreadyInCollection(String airportCode, ArrayList airports) {\n\t\tfor (int ii = 0; ii < 
airports.size(); ii++) {\n\t\t\tif (airports.get(ii).getAirportCode().equals(airportCode)) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\t\treturn false;\n\t}", "meta": { - "part_name": "alreadyInCollection", - "docstring": "", - "sha256": 846281520348793854081127747386039365962060314516, - "start_line": 124, - "end_line": 131, - "end_line_signature": 124, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13929779701984022643, "filename": "FlightLoader.java", "uri": "https://github.com/acmeair/acmeair/blob/abc123def456/FlightLoader.java" }, + "part_name": "alreadyInCollection", + "docstring": "", + "sha256": 846281520348793854081127747386039365962060314516, + "start_line": 124, + "end_line": 131, + "end_line_signature": 124, "chunk_type": "function" } }
diff --git a/test/data/chunker_repo/JavaScript/repo_out_chunks.json b/test/data/chunker_repo/JavaScript/repo_out_chunks.json
index 20024a15..c6cdf6b4 100644
--- a/test/data/chunker_repo/JavaScript/repo_out_chunks.json
+++ b/test/data/chunker_repo/JavaScript/repo_out_chunks.json
@@ -3,99 +3,111 @@
{ "text": "import { jQuery } from \"../core.js\";\nimport { toType } from \"../core/toType.js\";\n\n// Multifunctional method to get and set values of a collection\n// The value/s can optionally be executed if it's a function\nexport function access( elems, fn, key, value, chainable, emptyGet, raw ) {\n\tvar i = 0,\n\t\tlen = elems.length,\n\t\tbulk = key == null;\n\n\t// Sets many values\n\tif ( toType( key ) === \"object\" ) {\n\t\tchainable = true;\n\t\tfor ( i in key ) {\n\t\t\taccess( elems, fn, i, key[ i ], true, emptyGet, raw );\n\t\t}\n\n\t// Sets one value\n\t} else if ( value !== undefined ) {\n\t\tchainable = true;\n\n\t\tif ( typeof value !== \"function\" ) {\n\t\t\traw = true;\n\t\t}\n\n\t\tif ( bulk ) {\n\n\t\t\t// Bulk operations run against the entire set\n\t\t\tif ( raw ) {\n\t\t\t\tfn.call( elems, value );\n\t\t\t\tfn = null;\n\n\t\t\t// ...except when executing function values\n\t\t\t} else {\n\t\t\t\tbulk = fn;\n\t\t\t\tfn = function( elem, _key, value ) {\n\t\t\t\t\treturn bulk.call( jQuery( elem ), value );\n\t\t\t\t};\n\t\t\t}\n\t\t}\n\n\t\tif ( fn ) {\n\t\t\tfor ( ; i < len; i++ ) {\n\t\t\t\tfn(\n\t\t\t\t\telems[ i ], key, raw ?\n\t\t\t\t\t\tvalue :\n\t\t\t\t\t\tvalue.call( elems[ i ], i, fn( elems[ i ], key ) )\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tif ( chainable ) {\n\t\treturn elems;\n\t}\n\n\t// Gets\n\tif ( bulk ) {\n\t\treturn fn.call( elems );\n\t}\n\n\treturn len ?
fn( elems[ 0 ], key ) : emptyGet;\n}", "meta": { - "sha256": 19281888941792979874208112177048718444947121672, - "start_line": 1, - "end_line": 64, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 6135287906716252438, "filename": "access.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/access.js" }, + "sha256": 19281888941792979874208112177048718444947121672, + "start_line": 1, + "end_line": 64, "chunk_type": "preamble" } }, { "text": "\nfunction getData( data ) {\n\tif ( data === \"true\" ) {\n\t\treturn true;\n\t}\n\n\tif ( data === \"false\" ) {\n\t\treturn false;\n\t}\n\n\tif ( data === \"null\" ) {\n\t\treturn null;\n\t}\n\n\t// Only convert to a number if it doesn't change the string\n\tif ( data === +data + \"\" ) {\n\t\treturn +data;\n\t}\n\n\tif ( rbrace.test( data ) ) {\n\t\treturn JSON.parse( data );\n\t}\n\n\treturn data;\n}", "meta": { - "part_name": "getData", - "docstring": "", - "sha256": 726798474764155913807876762001398681472967415464, - "start_line": 19, - "end_line": 42, - "end_line_signature": 42, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 15216584529958576692, "filename": "data.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" }, + "part_name": "getData", + "docstring": "", + "sha256": 726798474764155913807876762001398681472967415464, + "start_line": 19, + "end_line": 42, + "end_line_signature": 42, "chunk_type": "function" } }, { "text": "import { dataUser } from \"./data/var/dataUser.js\";\nfunction dataAttr( elem, key, data ) {\n\tvar name;\n\n\t// If nothing was found internally, try to fetch any\n\t// data from the HTML5 data-* attribute\n\tif ( data === undefined && elem.nodeType === 1 ) {\n\t\tname = \"data-\" + key.replace( rmultiDash, \"-$&\" ).toLowerCase();\n\t\tdata = elem.getAttribute( name );\n\n\t\tif ( typeof data === \"string\" ) {\n\t\t\ttry {\n\t\t\t\tdata = getData( data );\n\t\t\t} catch ( e ) {}\n\n\t\t\t// Make sure we set the data so it isn't changed later\n\t\t\tdataUser.set( elem, key, data );\n\t\t} else {\n\t\t\tdata = undefined;\n\t\t}\n\t}\n\treturn data;\n}", "meta": { - "part_name": "dataAttr", - "docstring": "", - "sha256": 1201089615638546656833156594995163567755610461195, - "start_line": 44, - "end_line": 65, - "end_line_signature": 65, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 15216584529958576692, "filename": "data.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" }, + "part_name": "dataAttr", + "docstring": "", + "sha256": 1201089615638546656833156594995163567755610461195, + "start_line": 44, + "end_line": 65, + "end_line_signature": 65, "chunk_type": "function" } }, { "text": "import { jQuery } from \"./core.js\";\nimport { access } from \"./core/access.js\";\nimport { camelCase } from \"./core/camelCase.js\";\nimport { dataPriv } from \"./data/var/dataPriv.js\";\n\n//\tImplementation Summary\n//\n//\t1. Enforce API surface and semantic compatibility with 1.9.x branch\n//\t2. Improve the module's maintainability by reducing the storage\n//\t\tpaths to a single mechanism.\n//\t3. Use the same single mechanism to support \"private\" and \"user\" data.\n//\t4. _Never_ expose \"private\" data to user code (TODO: Drop _data, _removeData)\n//\t5. Avoid exposing implementation details on user objects (eg. 
expando properties)\n//\t6. Provide a clear path for implementation upgrade to WeakMap in 2014\n\nvar rbrace = /^(?:\\{[\\w\\W]*\\}|\\[[\\w\\W]*\\])$/,\n\trmultiDash = /[A-Z]/g;\n\njQuery.extend( {\n\thasData: function( elem ) {\n\t\treturn dataUser.hasData( elem ) || dataPriv.hasData( elem );\n\t},\n\n\tdata: function( elem, name, data ) {\n\t\treturn dataUser.access( elem, name, data );\n\t},\n\n\tremoveData: function( elem, name ) {\n\t\tdataUser.remove( elem, name );\n\t},\n\n\t// TODO: Now that all calls to _data and _removeData have been replaced\n\t// with direct calls to dataPriv methods, these can be deprecated.\n\t_data: function( elem, name, data ) {\n\t\treturn dataPriv.access( elem, name, data );\n\t},\n\n\t_removeData: function( elem, name ) {\n\t\tdataPriv.remove( elem, name );\n\t}\n} );\n\njQuery.fn.extend( {\n\tdata: function( key, value ) {\n\t\tvar i, name, data,\n\t\t\telem = this[ 0 ],\n\t\t\tattrs = elem && elem.attributes;\n\n\t\t// Gets all values\n\t\tif ( key === undefined ) {\n\t\t\tif ( this.length ) {\n\t\t\t\tdata = dataUser.get( elem );\n\n\t\t\t\tif ( elem.nodeType === 1 && !dataPriv.get( elem, \"hasDataAttrs\" ) ) {\n\t\t\t\t\ti = attrs.length;\n\t\t\t\t\twhile ( i-- ) {\n\n\t\t\t\t\t\t// Support: IE 11+\n\t\t\t\t\t\t// The attrs elements can be null (trac-14894)\n\t\t\t\t\t\tif ( attrs[ i ] ) {\n\t\t\t\t\t\t\tname = attrs[ i ].name;\n\t\t\t\t\t\t\tif ( name.indexOf( \"data-\" ) === 0 ) {\n\t\t\t\t\t\t\t\tname = camelCase( name.slice( 5 ) );\n\t\t\t\t\t\t\t\tdataAttr( elem, name, data[ name ] );\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\tdataPriv.set( elem, \"hasDataAttrs\", true );\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn data;\n\t\t}\n\n\t\t// Sets multiple values\n\t\tif ( typeof key === \"object\" ) {\n\t\t\treturn this.each( function() {\n\t\t\t\tdataUser.set( this, key );\n\t\t\t} );\n\t\t}\n\n\t\treturn access( this, function( value ) {\n\t\t\tvar data;\n\n\t\t\t// The calling jQuery object (element matches) is not empty\n\t\t\t// (and therefore has an element appears at this[ 0 ]) and the\n\t\t\t// `value` parameter was not undefined. 
An empty jQuery object\n\t\t\t// will result in `undefined` for elem = this[ 0 ] which will\n\t\t\t// throw an exception if an attempt to read a data cache is made.\n\t\t\tif ( elem && value === undefined ) {\n\n\t\t\t\t// Attempt to get data from the cache\n\t\t\t\t// The key will always be camelCased in Data\n\t\t\t\tdata = dataUser.get( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// Attempt to \"discover\" the data in\n\t\t\t\t// HTML5 custom data-* attrs\n\t\t\t\tdata = dataAttr( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// We tried really hard, but the data doesn't exist.\n\t\t\t\treturn;\n\t\t\t}\n\n\t\t\t// Set the data...\n\t\t\tthis.each( function() {\n\n\t\t\t\t// We always store the camelCased key\n\t\t\t\tdataUser.set( this, key, value );\n\t\t\t} );\n\t\t}, null, value, arguments.length > 1, null, true );\n\t},\n\n\tremoveData: function( key ) {\n\t\treturn this.each( function() {\n\t\t\tdataUser.remove( this, key );\n\t\t} );\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", "meta": { - "sha256": 541558141920421205501086138012356951496054039953, - "start_line": 1, - "end_line": 176, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 15216584529958576692, "filename": "data.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/data.js" }, + "sha256": 541558141920421205501086138012356951496054039953, + "start_line": 1, + "end_line": 176, "chunk_type": "preamble" } }, { "text": "import { jQuery } from \"./core.js\";\nimport { toType } from \"./core/toType.js\";\nfunction buildParams( prefix, obj, traditional, add ) {\n\tvar name;\n\n\tif ( Array.isArray( obj ) ) {\n\n\t\t// Serialize array item.\n\t\tjQuery.each( obj, function( i, v ) {\n\t\t\tif ( traditional || rbracket.test( prefix ) ) {\n\n\t\t\t\t// Treat each array item as a scalar.\n\t\t\t\tadd( prefix, v );\n\n\t\t\t} else {\n\n\t\t\t\t// Item is non-scalar (array or object), encode its numeric index.\n\t\t\t\tbuildParams(\n\t\t\t\t\tprefix + \"[\" + ( typeof v === \"object\" && v != null ? 
i : \"\" ) + \"]\",\n\t\t\t\t\tv,\n\t\t\t\t\ttraditional,\n\t\t\t\t\tadd\n\t\t\t\t);\n\t\t\t}\n\t\t} );\n\n\t} else if ( !traditional && toType( obj ) === \"object\" ) {\n\n\t\t// Serialize object item.\n\t\tfor ( name in obj ) {\n\t\t\tbuildParams( prefix + \"[\" + name + \"]\", obj[ name ], traditional, add );\n\t\t}\n\n\t} else {\n\n\t\t// Serialize scalar item.\n\t\tadd( prefix, obj );\n\t}\n}", "meta": { - "part_name": "buildParams", - "docstring": "", - "sha256": 1111988990908705094703814693986724759434620677873, - "start_line": 14, - "end_line": 50, - "end_line_signature": 50, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 7904055776319460817, "filename": "serialize.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/serialize.js" }, + "part_name": "buildParams", + "docstring": "", + "sha256": 1111988990908705094703814693986724759434620677873, + "start_line": 14, + "end_line": 50, + "end_line_signature": 50, "chunk_type": "function" } }, { "text": "import { rcheckableType } from \"./var/rcheckableType.js\";\n\nimport \"./core/init.js\";\nimport \"./traversing.js\"; // filter\nimport \"./attributes/prop.js\";\n\nvar\n\trbracket = /\\[\\]$/,\n\trCRLF = /\\r?\\n/g,\n\trsubmitterTypes = /^(?:submit|button|image|reset|file)$/i,\n\trsubmittable = /^(?:input|select|textarea|keygen)/i;\n\n// Serialize an array of form elements or a set of\n// key/values into a query string\njQuery.param = function( a, traditional ) {\n\tvar prefix,\n\t\ts = [],\n\t\tadd = function( key, valueOrFunction ) {\n\n\t\t\t// If value is a function, invoke it and use its return value\n\t\t\tvar value = typeof valueOrFunction === \"function\" ?\n\t\t\t\tvalueOrFunction() :\n\t\t\t\tvalueOrFunction;\n\n\t\t\ts[ s.length ] = encodeURIComponent( key ) + \"=\" +\n\t\t\t\tencodeURIComponent( value == null ? \"\" : value );\n\t\t};\n\n\tif ( a == null ) {\n\t\treturn \"\";\n\t}\n\n\t// If an array was passed in, assume that it is an array of form elements.\n\tif ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) {\n\n\t\t// Serialize the form elements\n\t\tjQuery.each( a, function() {\n\t\t\tadd( this.name, this.value );\n\t\t} );\n\n\t} else {\n\n\t\t// If traditional, encode the \"old\" way (the way 1.3.2 or older\n\t\t// did it), otherwise encode params recursively.\n\t\tfor ( prefix in a ) {\n\t\t\tbuildParams( prefix, a[ prefix ], traditional, add );\n\t\t}\n\t}\n\n\t// Return the resulting serialization\n\treturn s.join( \"&\" );\n};\n\njQuery.fn.extend( {\n\tserialize: function() {\n\t\treturn jQuery.param( this.serializeArray() );\n\t},\n\tserializeArray: function() {\n\t\treturn this.map( function() {\n\n\t\t\t// Can add propHook for \"elements\" to filter or add form elements\n\t\t\tvar elements = jQuery.prop( this, \"elements\" );\n\t\t\treturn elements ? 
jQuery.makeArray( elements ) : this;\n\t\t} ).filter( function() {\n\t\t\tvar type = this.type;\n\n\t\t\t// Use .is( \":disabled\" ) so that fieldset[disabled] works\n\t\t\treturn this.name && !jQuery( this ).is( \":disabled\" ) &&\n\t\t\t\trsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) &&\n\t\t\t\t( this.checked || !rcheckableType.test( type ) );\n\t\t} ).map( function( _i, elem ) {\n\t\t\tvar val = jQuery( this ).val();\n\n\t\t\tif ( val == null ) {\n\t\t\t\treturn null;\n\t\t\t}\n\n\t\t\tif ( Array.isArray( val ) ) {\n\t\t\t\treturn jQuery.map( val, function( val ) {\n\t\t\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t\t\t} );\n\t\t\t}\n\n\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t} ).get();\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", "meta": { - "sha256": 1122560629718129408618766425038972734060154979936, - "start_line": 2, - "end_line": 130, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 7904055776319460817, "filename": "serialize.js", "uri": "https://github.com/jquery/jquery/blob/abc123def456/serialize.js" }, + "sha256": 1122560629718129408618766425038972734060154979936, + "start_line": 2, + "end_line": 130, "chunk_type": "preamble" } }
diff --git a/test/data/chunker_repo/Python/repo_out_chunks.json b/test/data/chunker_repo/Python/repo_out_chunks.json
index 386b99ad..139dd5dd 100644
--- a/test/data/chunker_repo/Python/repo_out_chunks.json
+++ b/test/data/chunker_repo/Python/repo_out_chunks.json
@@ -3,1221 +3,1357 @@
{ "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = []\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", "meta": { - "part_name": "_serialize", - "docstring": "", - "sha256": 1370311415977656221876886741900648971627414401247, - "start_line": 150, - "end_line": 151, - "end_line_signature": 151, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "_serialize", + "docstring": "", + "sha256": 1370311415977656221876886741900648971627414401247, + "start_line": 150, + "end_line": 151, + "end_line_signature": 151, "chunk_type": "function" } }, { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional,
Union\nclass FigureElement(BasePageElement):\n\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: Optional[float], info: FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", "meta": { - "part_name": "_serialize", - "docstring": "", - "sha256": 548765170194758372904020338821756398576566540703, - "start_line": 206, - "end_line": 213, - "end_line_signature": 209, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "_serialize", + "docstring": "", + "sha256": 548765170194758372904020338821756398576566540703, + "start_line": 206, + "end_line": 213, + "end_line_signature": 209, "chunk_type": "function" } }, { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def cells(self) -> List[TextCell]:\n \"\"\"Return text cells as a read-only view of parsed_page.textline_cells.\"\"\"\n if self.parsed_page is not None:\n return self.parsed_page.textline_cells\n else:\n return []", "meta": { - "part_name": "cells", - "docstring": "", - "sha256": 808130656114478424554213379229194132787588082937, - "start_line": 269, - "end_line": 274, - "end_line_signature": 270, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "cells", + "docstring": "", + "sha256": 808130656114478424554213379229194132787588082937, + "start_line": 269, + "end_line": 274, + "end_line_signature": 270, "chunk_type": "function" } }, { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n def get_image(\n self,\n scale: float = 1.0,\n max_size: Optional[int] = None,\n cropbox: Optional[BoundingBox] = None,\n ) -> Optional[Image]:\n if self._backend is None:\n return self._image_cache.get(scale, None)\n\n if max_size:\n assert self.size is not None\n scale = min(scale, max_size / max(self.size.as_tuple()))\n\n if scale not in self._image_cache:\n if cropbox is None:\n self._image_cache[scale] = self._backend.get_page_image(scale=scale)\n else:\n return self._backend.get_page_image(scale=scale, cropbox=cropbox)\n\n if cropbox is None:\n return self._image_cache[scale]\n else:\n page_im = self._image_cache[scale]\n assert self.size is not None\n return page_im.crop(\n cropbox.to_top_left_origin(page_height=self.size.height)\n .scaled(scale=scale)\n .as_tuple()\n )", "meta": { - "part_name": "get_image", - "docstring": "", - "sha256": 730486712684958979505969494702830191562221418826, - "start_line": 276, - "end_line": 304, - "end_line_signature": 282, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "get_image", + "docstring": "", + "sha256": 730486712684958979505969494702830191562221418826, + "start_line": 276, + "end_line": 304, + "end_line_signature": 282, "chunk_type": "function" } }, { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def image(self) -> Optional[Image]:\n return self.get_image(scale=self._default_image_scale)", "meta": { - "part_name": "image", - "docstring": "", - "sha256": 411118430431318207465607893315291238177524289712, - "start_line": 307, - "end_line": 308, - "end_line_signature": 308, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "image", + "docstring": "", + "sha256": 411118430431318207465607893315291238177524289712, + "start_line": 307, + "end_line": 308, + "end_line_signature": 308, "chunk_type": "function" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n def _score_to_grade(self, score: ScoreValue) -> QualityGrade:\n if score < 0.5:\n return QualityGrade.POOR\n elif score < 0.8:\n return QualityGrade.FAIR\n elif score < 0.9:\n return QualityGrade.GOOD\n elif score >= 0.9:\n return QualityGrade.EXCELLENT\n\n return QualityGrade.UNSPECIFIED", "meta": { - "part_name": "_score_to_grade", - "docstring": "", - "sha256": 1226961229592084659714241042350075927273793479169, - "start_line": 361, - "end_line": 371, - "end_line_signature": 362, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "_score_to_grade", + "docstring": "", + "sha256": 1226961229592084659714241042350075927273793479169, + "start_line": 361, + "end_line": 371, + "end_line_signature": 362, "chunk_type": "function" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_grade(self) -> QualityGrade:\n return self._score_to_grade(self.mean_score)", "meta": { - "part_name": "mean_grade", - "docstring": "", - "sha256": 970148436571335637993437576490782463715252886019, - "start_line": 375, - "end_line": 376, - "end_line_signature": 376, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "mean_grade", + "docstring": "", + "sha256": 970148436571335637993437576490782463715252886019, + "start_line": 375, + "end_line": 376, + "end_line_signature": 376, "chunk_type": "function" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass 
PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_grade(self) -> QualityGrade:\n return self._score_to_grade(self.low_score)", "meta": { - "part_name": "low_grade", - "docstring": "", - "sha256": 1414417851083571439151429300774211251904833950620, - "start_line": 380, - "end_line": 381, - "end_line_signature": 381, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "low_grade", + "docstring": "", + "sha256": 1414417851083571439151429300774211251904833950620, + "start_line": 380, + "end_line": 381, + "end_line_signature": 381, "chunk_type": "function" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ]\n )\n )", "meta": { - "part_name": "mean_score", - "docstring": "", - "sha256": 1258375186580609407958319910033845627450381020082, - "start_line": 385, - "end_line": 395, - "end_line_signature": 386, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "mean_score", + "docstring": "", + "sha256": 1258375186580609407958319910033845627450381020082, + "start_line": 385, + "end_line": 395, + "end_line_signature": 386, "chunk_type": "function" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanquantile(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ],\n q=0.05,\n )\n )", "meta": { - "part_name": "low_score", - "docstring": "", - "sha256": 530920199340573617576130514840888087666895770482, - "start_line": 399, - "end_line": 410, - "end_line_signature": 400, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "low_score", + "docstring": "", + "sha256": 530920199340573617576130514840888087666895770482, + "start_line": 399, + "end_line": 410, + "end_line_signature": 400, "chunk_type": "function" } }, { "text": "from collections import defaultdict\nfrom pydantic import 
(\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in self.pages.values()],\n )\n )", "meta": { - "part_name": "mean_score", - "docstring": "", - "sha256": 132450849266989335217771535733536814236612441736, - "start_line": 420, - "end_line": 425, - "end_line_signature": 421, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "mean_score", + "docstring": "", + "sha256": 132450849266989335217771535733536814236612441736, + "start_line": 420, + "end_line": 425, + "end_line_signature": 421, "chunk_type": "function" } }, { "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", "meta": { - "part_name": "low_score", - "docstring": "", - "sha256": 970153367204825604327172702664272609484373919390, - "start_line": 429, - "end_line": 434, - "end_line_signature": 430, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "low_score", + "docstring": "", + "sha256": 970153367204825604327172702664272609484373919390, + "start_line": 429, + "end_line": 434, + "end_line_signature": 430, "chunk_type": "function" } }, { "text": "from enum import Enum\nclass ConversionStatus(str, Enum):\n PENDING = \"pending\"\n STARTED = \"started\"\n FAILURE = \"failure\"\n SUCCESS = \"success\"\n PARTIAL_SUCCESS = \"partial_success\"\n SKIPPED = \"skipped\"", "meta": { - "part_name": "ConversionStatus", - "docstring": "", - "sha256": 620766103743608450410859564155193221612617787030, - "start_line": 32, - "end_line": 38, - "end_line_signature": 38, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "ConversionStatus", + "docstring": "", + "sha256": 620766103743608450410859564155193221612617787030, + "start_line": 32, + "end_line": 38, + "end_line_signature": 38, "chunk_type": "class" } }, { "text": "from enum import Enum\nimport numpy as np\nclass InputFormat(str, Enum):\n \"\"\"A document format supported by document backend 
parsers.\"\"\"\n\n DOCX = \"docx\"\n PPTX = \"pptx\"\n HTML = \"html\"\n IMAGE = \"image\"\n PDF = \"pdf\"\n ASCIIDOC = \"asciidoc\"\n MD = \"md\"\n CSV = \"csv\"\n XLSX = \"xlsx\"\n XML_USPTO = \"xml_uspto\"\n XML_JATS = \"xml_jats\"\n JSON_DOCLING = \"json_docling\"\n AUDIO = \"audio\"", "meta": { - "part_name": "InputFormat", - "docstring": "", - "sha256": 892216703579506331579469699340486405594949995133, - "start_line": 41, - "end_line": 56, - "end_line_signature": 56, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "InputFormat", + "docstring": "", + "sha256": 892216703579506331579469699340486405594949995133, + "start_line": 41, + "end_line": 56, + "end_line_signature": 56, "chunk_type": "class" } }, { "text": "from enum import Enum\nclass OutputFormat(str, Enum):\n MARKDOWN = \"md\"\n JSON = \"json\"\n HTML = \"html\"\n HTML_SPLIT_PAGE = \"html_split_page\"\n TEXT = \"text\"\n DOCTAGS = \"doctags\"", "meta": { - "part_name": "OutputFormat", - "docstring": "", - "sha256": 1347846176447908013052242174477254615853046589425, - "start_line": 59, - "end_line": 65, - "end_line_signature": 65, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "OutputFormat", + "docstring": "", + "sha256": 1347846176447908013052242174477254615853046589425, + "start_line": 59, + "end_line": 65, + "end_line_signature": 65, "chunk_type": "class" } }, { "text": "from enum import Enum\nimport numpy as np\nclass DocInputType(str, Enum):\n PATH = \"path\"\n STREAM = \"stream\"", "meta": { - "part_name": "DocInputType", - "docstring": "", - "sha256": 1223714591888346503494526053642460574025115616826, - "start_line": 123, - "end_line": 125, - "end_line_signature": 125, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "DocInputType", + "docstring": "", + "sha256": 1223714591888346503494526053642460574025115616826, + "start_line": 123, + "end_line": 125, + "end_line_signature": 125, "chunk_type": "class" } }, { "text": "from enum import Enum\nimport numpy as np\nclass DoclingComponentType(str, Enum):\n DOCUMENT_BACKEND = \"document_backend\"\n MODEL = \"model\"\n DOC_ASSEMBLER = \"doc_assembler\"\n USER_INPUT = \"user_input\"", "meta": { - "part_name": "DoclingComponentType", - "docstring": "", - "sha256": 1044618506138011322142671349003262389011658625241, - "start_line": 128, - "end_line": 132, - "end_line_signature": 132, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "DoclingComponentType", + "docstring": "", + "sha256": 1044618506138011322142671349003262389011658625241, + "start_line": 128, + "end_line": 132, + "end_line_signature": 132, "chunk_type": "class" } }, { 
"text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass ErrorItem(BaseModel):\n component_type: DoclingComponentType\n module_name: str\n error_message: str", "meta": { - "part_name": "ErrorItem", - "docstring": "", - "sha256": 1086117531920474775755100836319966340621867981803, - "start_line": 135, - "end_line": 138, - "end_line_signature": 138, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "ErrorItem", + "docstring": "", + "sha256": 1086117531920474775755100836319966340621867981803, + "start_line": 135, + "end_line": 138, + "end_line_signature": 138, "chunk_type": "class" } }, { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = [] # Add child cluster support\n\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", "meta": { - "part_name": "Cluster", - "docstring": "", - "sha256": 1265293438447400808420129430722899990479108904146, - "start_line": 141, - "end_line": 151, - "end_line_signature": 151, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "Cluster", + "docstring": "", + "sha256": 1265293438447400808420129430722899990479108904146, + "start_line": 141, + "end_line": 151, + "end_line_signature": 151, "chunk_type": "class" } }, { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass BasePageElement(BaseModel):\n label: DocItemLabel\n id: int\n page_no: int\n cluster: Cluster\n text: Optional[str] = None", "meta": { - "part_name": "BasePageElement", - "docstring": "", - "sha256": 27686403131898710443755657765582374638708518770, - "start_line": 154, - "end_line": 159, - "end_line_signature": 159, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "BasePageElement", + "docstring": "", + "sha256": 27686403131898710443755657765582374638708518770, + "start_line": 154, + "end_line": 159, + "end_line_signature": 
159, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass LayoutPrediction(BaseModel):\n clusters: List[Cluster] = []", "meta": { - "part_name": "LayoutPrediction", - "docstring": "", - "sha256": 987561170389338380550072794621774259794654494830, - "start_line": 162, - "end_line": 163, - "end_line_signature": 163, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "LayoutPrediction", + "docstring": "", + "sha256": 987561170389338380550072794621774259794654494830, + "start_line": 162, + "end_line": 163, + "end_line_signature": 163, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPredictionToken(BaseModel):\n text: str = \"\"\n token: int = -1\n logprob: float = -1", "meta": { - "part_name": "VlmPredictionToken", - "docstring": "", - "sha256": 65603519381706971863039126377096035311378651150, - "start_line": 166, - "end_line": 169, - "end_line_signature": 169, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "VlmPredictionToken", + "docstring": "", + "sha256": 65603519381706971863039126377096035311378651150, + "start_line": 166, + "end_line": 169, + "end_line_signature": 169, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPrediction(BaseModel):\n text: str = \"\"\n generated_tokens: list[VlmPredictionToken] = []\n generation_time: float = -1", "meta": { - "part_name": "VlmPrediction", - "docstring": "", - "sha256": 1001170426609364857440748112674472207227386338107, - "start_line": 172, - "end_line": 175, - "end_line_signature": 175, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "VlmPrediction", + "docstring": "", + "sha256": 1001170426609364857440748112674472207227386338107, + "start_line": 172, + "end_line": 175, + "end_line_signature": 175, "chunk_type": "class" } }, { "text": "from typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ContainerElement(\n BasePageElement\n): # Used for Form and Key-Value-Regions, only for typing.\n pass", "meta": { - "part_name": "ContainerElement", - "docstring": "", - "sha256": 595661713802144347628508983545186540799570992038, - "start_line": 178, - "end_line": 181, - "end_line_signature": 181, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "ContainerElement", + 
"docstring": "", + "sha256": 595661713802144347628508983545186540799570992038, + "start_line": 178, + "end_line": 181, + "end_line_signature": 181, "chunk_type": "class" } }, { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Table(BasePageElement):\n otsl_seq: List[str]\n num_rows: int = 0\n num_cols: int = 0\n table_cells: List[TableCell]", "meta": { - "part_name": "Table", - "docstring": "", - "sha256": 899700098549855605262894705349504270338610789745, - "start_line": 184, - "end_line": 188, - "end_line_signature": 188, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "Table", + "docstring": "", + "sha256": 899700098549855605262894705349504270338610789745, + "start_line": 184, + "end_line": 188, + "end_line_signature": 188, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass TableStructurePrediction(BaseModel):\n table_map: Dict[int, Table] = {}", "meta": { - "part_name": "TableStructurePrediction", - "docstring": "", - "sha256": 10390869097689903408310238897062292721124859701, - "start_line": 191, - "end_line": 192, - "end_line_signature": 192, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "TableStructurePrediction", + "docstring": "", + "sha256": 10390869097689903408310238897062292721124859701, + "start_line": 191, + "end_line": 192, + "end_line_signature": 192, "chunk_type": "class" } }, { "text": "\nclass TextElement(BasePageElement):\n text: str", "meta": { - "part_name": "TextElement", - "docstring": "", - "sha256": 910599684044725278502405469110289841532247426179, - "start_line": 195, - "end_line": 196, - "end_line_signature": 196, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "TextElement", + "docstring": "", + "sha256": 910599684044725278502405469110289841532247426179, + "start_line": 195, + "end_line": 196, + "end_line_signature": 196, "chunk_type": "class" } }, { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureElement(BasePageElement):\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: 
Optional[float], info: FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", "meta": { - "part_name": "FigureElement", - "docstring": "", - "sha256": 49150437793556841766787782882109128052965382281, - "start_line": 199, - "end_line": 213, - "end_line_signature": 213, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "FigureElement", + "docstring": "", + "sha256": 49150437793556841766787782882109128052965382281, + "start_line": 199, + "end_line": 213, + "end_line_signature": 213, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureClassificationPrediction(BaseModel):\n figure_count: int = 0\n figure_map: Dict[int, FigureElement] = {}", "meta": { - "part_name": "FigureClassificationPrediction", - "docstring": "", - "sha256": 393470850502283580013047324181028663675467213132, - "start_line": 216, - "end_line": 218, - "end_line_signature": 218, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "FigureClassificationPrediction", + "docstring": "", + "sha256": 393470850502283580013047324181028663675467213132, + "start_line": 216, + "end_line": 218, + "end_line_signature": 218, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass EquationPrediction(BaseModel):\n equation_count: int = 0\n equation_map: Dict[int, TextElement] = {}", "meta": { - "part_name": "EquationPrediction", - "docstring": "", - "sha256": 283019580808330812046385053785548970241882616440, - "start_line": 221, - "end_line": 223, - "end_line_signature": 223, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "EquationPrediction", + "docstring": "", + "sha256": 283019580808330812046385053785548970241882616440, + "start_line": 221, + "end_line": 223, + "end_line_signature": 223, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass PagePredictions(BaseModel):\n layout: Optional[LayoutPrediction] = None\n tablestructure: Optional[TableStructurePrediction] = None\n figures_classification: Optional[FigureClassificationPrediction] = None\n equations_prediction: Optional[EquationPrediction] = None\n vlm_response: Optional[VlmPrediction] = None", "meta": { - "part_name": "PagePredictions", - "docstring": "", - "sha256": 1048734808182693909263635311132078575228024326882, - 
"start_line": 226, - "end_line": 231, - "end_line_signature": 231, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "PagePredictions", + "docstring": "", + "sha256": 1048734808182693909263635311132078575228024326882, + "start_line": 226, + "end_line": 231, + "end_line_signature": 231, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass AssembledUnit(BaseModel):\n elements: List[PageElement] = []\n body: List[PageElement] = []\n headers: List[PageElement] = []", "meta": { - "part_name": "AssembledUnit", - "docstring": "", - "sha256": 1094387419928066226155485436524594727573943352114, - "start_line": 237, - "end_line": 240, - "end_line_signature": 240, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "AssembledUnit", + "docstring": "", + "sha256": 1094387419928066226155485436524594727573943352114, + "start_line": 237, + "end_line": 240, + "end_line_signature": 240, "chunk_type": "class" } }, { "text": "# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ItemAndImageEnrichmentElement(BaseModel):\n model_config = ConfigDict(arbitrary_types_allowed=True)\n\n item: NodeItem\n image: Image", "meta": { - "part_name": "ItemAndImageEnrichmentElement", - "docstring": "", - "sha256": 30748452496409606175686443467765939596665570803, - "start_line": 243, - "end_line": 247, - "end_line_signature": 247, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "ItemAndImageEnrichmentElement", + "docstring": "", + "sha256": 30748452496409606175686443467765939596665570803, + "start_line": 243, + "end_line": 247, + "end_line_signature": 247, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\n## OpenAI API Request / Response Models ##\nclass OpenAiChatMessage(BaseModel):\n role: str\n content: str", "meta": { - "part_name": "OpenAiChatMessage", - "docstring": "", - "sha256": 515012574841107792563852565513091992302046287434, - "start_line": 314, - "end_line": 316, - "end_line_signature": 316, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + 
"part_name": "OpenAiChatMessage", + "docstring": "", + "sha256": 515012574841107792563852565513091992302046287434, + "start_line": 314, + "end_line": 316, + "end_line_signature": 316, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiResponseChoice(BaseModel):\n index: int\n message: OpenAiChatMessage\n finish_reason: Optional[str]", "meta": { - "part_name": "OpenAiResponseChoice", - "docstring": "", - "sha256": 337899610582669912657797333467719843029840509833, - "start_line": 319, - "end_line": 322, - "end_line_signature": 322, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "OpenAiResponseChoice", + "docstring": "", + "sha256": 337899610582669912657797333467719843029840509833, + "start_line": 319, + "end_line": 322, + "end_line_signature": 322, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass OpenAiResponseUsage(BaseModel):\n prompt_tokens: int\n completion_tokens: int\n total_tokens: int", "meta": { - "part_name": "OpenAiResponseUsage", - "docstring": "", - "sha256": 130433137267720616513765827820400685577191918977, - "start_line": 325, - "end_line": 328, - "end_line_signature": 328, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "OpenAiResponseUsage", + "docstring": "", + "sha256": 130433137267720616513765827820400685577191918977, + "start_line": 325, + "end_line": 328, + "end_line_signature": 328, "chunk_type": "class" } }, { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiApiResponse(BaseModel):\n model_config = ConfigDict(\n protected_namespaces=(),\n )\n\n id: str\n model: Optional[str] = None # returned by openai\n choices: List[OpenAiResponseChoice]\n created: int\n usage: OpenAiResponseUsage", "meta": { - "part_name": "OpenAiApiResponse", - "docstring": "", - "sha256": 891404258682255341223304052944928949828473242768, - "start_line": 331, - "end_line": 340, - "end_line_signature": 340, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "OpenAiApiResponse", + "docstring": "", + "sha256": 891404258682255341223304052944928949828473242768, + "start_line": 331, + "end_line": 340, + "end_line_signature": 340, "chunk_type": "class" } }, { "text": "from enum import Enum\nclass QualityGrade(str, Enum):\n POOR = \"poor\"\n FAIR = \"fair\"\n GOOD = \"good\"\n EXCELLENT = \"excellent\"\n UNSPECIFIED = \"unspecified\"", "meta": { - "part_name": "QualityGrade", - "docstring": "", - "sha256": 
193399221256625706292721394797930754087225969626, - "start_line": 347, - "end_line": 352, - "end_line_signature": 352, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "QualityGrade", + "docstring": "", + "sha256": 193399221256625706292721394797930754087225969626, + "start_line": 347, + "end_line": 352, + "end_line_signature": 352, "chunk_type": "class" } }, { "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nclass ConfidenceReport(PageConfidenceScores):\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in self.pages.values()],\n )\n )\n\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", "meta": { - "part_name": "ConfidenceReport", - "docstring": "", - "sha256": 1446615132461763223157668983795322811472146148315, - "start_line": 413, - "end_line": 434, - "end_line_signature": 434, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "part_name": "ConfidenceReport", + "docstring": "", + "sha256": 1446615132461763223157668983795322811472146148315, + "start_line": 413, + "end_line": 434, + "end_line_signature": 434, "chunk_type": "class" } }, { "text": "if TYPE_CHECKING:\n\nFormatToExtensions: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\"docx\", \"dotx\", \"docm\", \"dotm\"],\n InputFormat.PPTX: [\"pptx\", \"potx\", \"ppsx\", \"pptm\", \"potm\", \"ppsm\"],\n InputFormat.PDF: [\"pdf\"],\n InputFormat.MD: [\"md\"],\n InputFormat.HTML: [\"html\", \"htm\", \"xhtml\"],\n InputFormat.XML_JATS: [\"xml\", \"nxml\"],\n InputFormat.IMAGE: [\"jpg\", \"jpeg\", \"png\", \"tif\", \"tiff\", \"bmp\", \"webp\"],\n InputFormat.ASCIIDOC: [\"adoc\", \"asciidoc\", \"asc\"],\n InputFormat.CSV: [\"csv\"],\n InputFormat.XLSX: [\"xlsx\", \"xlsm\"],\n InputFormat.XML_USPTO: [\"xml\", \"txt\"],\n InputFormat.JSON_DOCLING: [\"json\"],\n InputFormat.AUDIO: [\"wav\", \"mp3\"],\n}\n\nFormatToMimeType: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.template\",\n ],\n InputFormat.PPTX: [\n \"application/vnd.openxmlformats-officedocument.presentationml.template\",\n \"application/vnd.openxmlformats-officedocument.presentationml.slideshow\",\n \"application/vnd.openxmlformats-officedocument.presentationml.presentation\",\n ],\n InputFormat.HTML: [\"text/html\", \"application/xhtml+xml\"],\n InputFormat.XML_JATS: [\"application/xml\"],\n InputFormat.IMAGE: [\n \"image/png\",\n \"image/jpeg\",\n \"image/tiff\",\n \"image/gif\",\n \"image/bmp\",\n \"image/webp\",\n ],\n InputFormat.PDF: 
[\"application/pdf\"],\n InputFormat.ASCIIDOC: [\"text/asciidoc\"],\n InputFormat.MD: [\"text/markdown\", \"text/x-markdown\"],\n InputFormat.CSV: [\"text/csv\"],\n InputFormat.XLSX: [\n \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\"\n ],\n InputFormat.XML_USPTO: [\"application/xml\", \"text/plain\"],\n InputFormat.JSON_DOCLING: [\"application/json\"],\n InputFormat.AUDIO: [\"audio/x-wav\", \"audio/mpeg\", \"audio/wav\", \"audio/mp3\"],\n}\n\nMimeTypeToFormat: dict[str, list[InputFormat]] = {\n mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]\n for value in FormatToMimeType.values()\n for mime in value\n}\n\nPageElement = Union[TextElement, Table, FigureElement, ContainerElement]", "meta": { - "sha256": 937534938268631177739242095765995242760409532040, - "start_line": 27, - "end_line": 237, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 17127733993255342652, "filename": "base_models.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/base_models.py" }, + "sha256": 937534938268631177739242095765995242760409532040, + "start_line": 27, + "end_line": 237, "chunk_type": "preamble" } }, { "text": "from collections.abc import Iterable\nfrom docling.datamodel.document import ConversionResult, Page\nfrom docling_core.types.doc import BoundingBox, CoordOrigin\nfrom docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table\nfrom typing import Any, Dict, List, Tuple, Union\n_log = logging.getLogger(__name__)\ndef generate_multimodal_pages(\n doc_result: ConversionResult,\n) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:\n label_to_doclaynet = {\n \"title\": \"title\",\n \"table-of-contents\": \"document_index\",\n \"subtitle-level-1\": \"section_header\",\n \"checkbox-selected\": \"checkbox_selected\",\n \"checkbox-unselected\": \"checkbox_unselected\",\n \"caption\": \"caption\",\n \"page-header\": \"page_header\",\n \"page-footer\": \"page_footer\",\n \"footnote\": \"footnote\",\n \"table\": \"table\",\n \"formula\": \"formula\",\n \"list-item\": \"list_item\",\n \"code\": \"code\",\n \"figure\": \"picture\",\n \"picture\": \"picture\",\n \"reference\": \"text\",\n \"paragraph\": \"text\",\n \"text\": \"text\",\n }\n\n content_text = \"\"\n page_no = 0\n start_ix = 0\n end_ix = 0\n doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []\n\n doc = doc_result.legacy_document\n\n def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):\n segments = []\n\n for ix, item in doc_items:\n item_type = item.obj_type\n label = label_to_doclaynet.get(item_type, None)\n\n if label is None or item.prov is None or page.size is None:\n continue\n\n bbox = BoundingBox.from_tuple(\n tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT\n )\n new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(\n page_size=page.size\n )\n\n new_segment = {\n \"index_in_doc\": ix,\n \"label\": label,\n \"text\": item.text if item.text is not None else \"\",\n \"bbox\": new_bbox.as_tuple(),\n \"data\": [],\n }\n\n if isinstance(item, Table):\n table_html = item.export_to_html()\n new_segment[\"data\"].append(\n {\n \"html_seq\": table_html,\n \"otsl_seq\": \"\",\n }\n )\n\n segments.append(new_segment)\n\n return segments\n\n def _process_page_cells(page: Page):\n cells: List[dict] = []\n if page.size is None:\n return cells\n for cell in page.cells:\n new_bbox = (\n 
cell.rect.to_bounding_box()\n .to_top_left_origin(page_height=page.size.height)\n .normalized(page_size=page.size)\n )\n is_ocr = cell.from_ocr\n ocr_confidence = cell.confidence\n cells.append(\n {\n \"text\": cell.text,\n \"bbox\": new_bbox.as_tuple(),\n \"ocr\": is_ocr,\n \"ocr_confidence\": ocr_confidence,\n }\n )\n return cells\n\n def _process_page():\n page_ix = page_no - 1\n page = doc_result.pages[page_ix]\n\n page_cells = _process_page_cells(page=page)\n page_segments = _process_page_segments(doc_items=doc_items, page=page)\n content_md = doc.export_to_markdown(\n main_text_start=start_ix, main_text_stop=end_ix\n )\n # No page-tagging since we only do 1 page at the time\n content_dt = doc.export_to_document_tokens(\n main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False\n )\n\n return content_text, content_md, content_dt, page_cells, page_segments, page\n\n if doc.main_text is None:\n return\n for ix, orig_item in enumerate(doc.main_text):\n item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item\n if item is None or item.prov is None or len(item.prov) == 0:\n _log.debug(f\"Skipping item {orig_item}\")\n continue\n\n item_page = item.prov[0].page\n\n # Page is complete\n if page_no > 0 and item_page > page_no:\n yield _process_page()\n\n start_ix = ix\n doc_items = []\n content_text = \"\"\n\n page_no = item_page\n end_ix = ix\n doc_items.append((ix, item))\n if item.text is not None and item.text != \"\":\n content_text += item.text + \" \"\n\n if len(doc_items) > 0:\n yield _process_page()", "meta": { - "part_name": "generate_multimodal_pages", - "docstring": "", - "sha256": 1004790262158132739538587728436826033595593751761, - "start_line": 12, - "end_line": 145, - "end_line_signature": 15, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 11028592083014135829, "filename": "export.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/export.py" }, + "part_name": "generate_multimodal_pages", + "docstring": "", + "sha256": 1004790262158132739538587728436826033595593751761, + "start_line": 12, + "end_line": 145, + "end_line_signature": 15, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n 
):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def is_valid(self) -> bool:\n return self.valid", "meta": { - "part_name": "is_valid", - "docstring": "", - "sha256": 1389299177428647533914300122685171886284474960790, - "start_line": 120, - "end_line": 121, - "end_line_signature": 121, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "is_valid", + "docstring": "", + "sha256": 1389299177428647533914300122685171886284474960790, + "start_line": 120, + "end_line": 121, + "end_line_signature": 121, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supports_pagination(cls) -> bool:\n return False", "meta": { - "part_name": "supports_pagination", - "docstring": "", - "sha256": 189133244729867257087740036829509886476419053207, - "start_line": 125, - "end_line": 126, - "end_line_signature": 126, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "supports_pagination", + "docstring": "", + "sha256": 189133244729867257087740036829509886476419053207, + "start_line": 125, + "end_line": 126, + "end_line_signature": 126, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, 
Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def unload(self):\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.close()\n self.path_or_stream = None", "meta": { - "part_name": "unload", - "docstring": "", - "sha256": 19733788426265514145027761479429042000417200591, - "start_line": 129, - "end_line": 132, - "end_line_signature": 130, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "unload", + "docstring": "", + "sha256": 19733788426265514145027761479429042000417200591, + "start_line": 129, + "end_line": 132, + "end_line_signature": 130, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.base_models import InputFormat\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supported_formats(cls) -> set[InputFormat]:\n return {InputFormat.XML_JATS}", "meta": { - "part_name": 
"supported_formats", - "docstring": "", - "sha256": 95992898799884786951251283661078353705554191150, - "start_line": 136, - "end_line": 137, - "end_line_signature": 137, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "supported_formats", + "docstring": "", + "sha256": 95992898799884786951251283661078353705554191150, + "start_line": 136, + "end_line": 137, + "end_line_signature": 137, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nimport traceback\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def convert(self) -> DoclingDocument:\n try:\n # Create empty document\n origin = DocumentOrigin(\n filename=self.file.name or \"file\",\n mimetype=\"application/xml\",\n binary_hash=self.document_hash,\n )\n doc = DoclingDocument(name=self.file.stem or \"file\", origin=origin)\n self.hlevel = 0\n\n # Get metadata XML components\n xml_components: XMLComponents = self._parse_metadata()\n\n # Add metadata to the document\n self._add_metadata(doc, xml_components)\n\n # walk over the XML body\n body = self.tree.xpath(\"//body\")\n if self.root and len(body) > 0:\n self._walk_linear(doc, self.root, body[0])\n\n # walk over the XML back matter\n back = self.tree.xpath(\"//back\")\n if self.root and len(back) > 0:\n self._walk_linear(doc, self.root, back[0])\n except Exception:\n _log.error(traceback.format_exc())\n\n return doc", "meta": { - "part_name": "convert", - "docstring": "", - "sha256": 401268109311259531613842418991708895011320755673, - "start_line": 140, - "end_line": 169, - "end_line_signature": 141, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "convert", + "docstring": "", + 
"sha256": 401268109311259531613842418991708895011320755673, + "start_line": 140, + "end_line": 169, + "end_line_signature": 141, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @staticmethod\n def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:\n skip_tags = [\"term\", \"disp-formula\", \"inline-formula\"]\n text: str = (\n node.text.replace(\"\\n\", \" \")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n for child in list(node):\n if child.tag not in skip_tags:\n # TODO: apply styling according to child.tag when supported by docling-core\n text += JatsDocumentBackend._get_text(child, sep)\n if sep:\n text = text.rstrip(sep) + sep\n text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n return text", "meta": { - "part_name": "_get_text", - "docstring": "", - "sha256": 766714162982515447138884963963637052165120920700, - "start_line": 172, - "end_line": 187, - "end_line_signature": 173, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_get_text", + "docstring": "", + "sha256": 766714162982515447138884963963637052165120920700, + "start_line": 172, + "end_line": 187, + "end_line_signature": 173, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, 
path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _find_metadata(self) -> Optional[etree._Element]:\n meta_names: list[str] = [\"article-meta\", \"book-part-meta\"]\n meta: Optional[etree._Element] = None\n for name in meta_names:\n node = self.tree.xpath(f\".//{name}\")\n if len(node) > 0:\n meta = node[0]\n break\n\n return meta", "meta": { - "part_name": "_find_metadata", - "docstring": "", - "sha256": 864269816803865464490166888555034068213906212092, - "start_line": 189, - "end_line": 198, - "end_line_signature": 190, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_find_metadata", + "docstring": "", + "sha256": 864269816803865464490166888555034068213906212092, + "start_line": 189, + "end_line": 198, + "end_line_signature": 190, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_abstract(self) -> list[Abstract]:\n # TODO: address cases with multiple sections\n abs_list: list[Abstract] = []\n\n for abs_node in self.tree.xpath(\".//abstract\"):\n abstract: Abstract = dict(label=\"\", content=\"\")\n texts = []\n for abs_par in abs_node.xpath(\"p\"):\n 
texts.append(JatsDocumentBackend._get_text(abs_par).strip())\n abstract[\"content\"] = \" \".join(texts)\n\n label_node = abs_node.xpath(\"title|label\")\n if len(label_node) > 0:\n abstract[\"label\"] = label_node[0].text.strip()\n\n abs_list.append(abstract)\n\n return abs_list", "meta": { - "part_name": "_parse_abstract", - "docstring": "", - "sha256": 270615732461062541428727022585950585711717687697, - "start_line": 200, - "end_line": 217, - "end_line_signature": 202, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_parse_abstract", + "docstring": "", + "sha256": 270615732461062541428727022585950585711717687697, + "start_line": 200, + "end_line": 217, + "end_line_signature": 202, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_authors(self) -> list[Author]:\n # Get mapping between affiliation ids and names\n authors: list[Author] = []\n meta: Optional[etree._Element] = self._find_metadata()\n if meta is None:\n return authors\n\n affiliation_names = []\n for affiliation_node in meta.xpath(\".//aff[@id]\"):\n aff = \", \".join([t for t in affiliation_node.itertext() if t.strip()])\n aff = aff.replace(\"\\n\", \" \")\n label = affiliation_node.xpath(\"label\")\n if label:\n # TODO: once superscript is supported, add label with formatting\n aff = aff.removeprefix(f\"{label[0].text}, \")\n affiliation_names.append(aff)\n affiliation_ids_names = dict(\n zip(meta.xpath(\".//aff[@id]/@id\"), affiliation_names)\n )\n\n # Get author names and affiliation names\n for author_node in meta.xpath(\n './/contrib-group/contrib[@contrib-type=\"author\"]'\n ):\n author: Author = {\n \"name\": \"\",\n \"affiliation_names\": [],\n }\n\n # Affiliation names\n affiliation_ids = [\n a.attrib[\"rid\"] for a in author_node.xpath('xref[@ref-type=\"aff\"]')\n ]\n for id in affiliation_ids:\n if id in affiliation_ids_names:\n 
author[\"affiliation_names\"].append(affiliation_ids_names[id])\n\n # Name\n author[\"name\"] = (\n author_node.xpath(\"name/given-names\")[0].text\n + \" \"\n + author_node.xpath(\"name/surname\")[0].text\n )\n\n authors.append(author)\n\n return authors", "meta": { - "part_name": "_parse_authors", - "docstring": "", - "sha256": 285578325238635728078594781795591481429703102959, - "start_line": 219, - "end_line": 265, - "end_line_signature": 221, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_parse_authors", + "docstring": "", + "sha256": 285578325238635728078594781795591481429703102959, + "start_line": 219, + "end_line": 265, + "end_line_signature": 221, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_title(self) -> str:\n meta_names: list[str] = [\n \"article-meta\",\n \"collection-meta\",\n \"book-meta\",\n \"book-part-meta\",\n ]\n title_names: list[str] = [\"article-title\", \"subtitle\", \"title\", \"label\"]\n titles: list[str] = [\n \" \".join(\n elem.text.replace(\"\\n\", \" \").strip()\n for elem in list(title_node)\n if elem.tag in title_names\n ).strip()\n for title_node in self.tree.xpath(\n \"|\".join([f\".//{item}/title-group\" for item in meta_names])\n )\n ]\n\n text = \" - \".join(titles)\n\n return text", "meta": { - "part_name": "_parse_title", - "docstring": "", - "sha256": 211107707856227464327571216206163890165368281377, - "start_line": 267, - "end_line": 288, - "end_line_signature": 268, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_parse_title", + "docstring": "", + "sha256": 211107707856227464327571216206163890165368281377, + "start_line": 
267, + "end_line": 288, + "end_line_signature": 268, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_metadata(self) -> XMLComponents:\n \"\"\"Parsing JATS document metadata.\"\"\"\n xml_components: XMLComponents = {\n \"title\": self._parse_title(),\n \"authors\": self._parse_authors(),\n \"abstract\": self._parse_abstract(),\n }\n return xml_components", "meta": { - "part_name": "_parse_metadata", - "docstring": "", - "sha256": 6721807935708893971952263073653597579122214521, - "start_line": 290, - "end_line": 297, - "end_line_signature": 291, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_parse_metadata", + "docstring": "", + "sha256": 6721807935708893971952263073653597579122214521, + "start_line": 290, + "end_line": 297, + "end_line_signature": 291, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ABSTRACT: Final = \"Abstract\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n 
doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_abstract(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n for abstract in xml_components[\"abstract\"]:\n text: str = abstract[\"content\"]\n title: str = abstract[\"label\"] or DEFAULT_HEADER_ABSTRACT\n if not text:\n continue\n parent = doc.add_heading(\n parent=self.root, text=title, level=self.hlevel + 1\n )\n doc.add_text(\n parent=parent,\n text=text,\n label=DocItemLabel.TEXT,\n )\n\n return", "meta": { - "part_name": "_add_abstract", - "docstring": "", - "sha256": 532581450273390935568634548645819886174208360025, - "start_line": 299, - "end_line": 316, - "end_line_signature": 302, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_abstract", + "docstring": "", + "sha256": 532581450273390935568634548645819886174208360025, + "start_line": 299, + "end_line": 316, + "end_line_signature": 302, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n # TODO: once docling supports text formatting, add affiliation reference to\n # author names through superscripts\n authors: list = [item[\"name\"] for item in xml_components[\"authors\"]]\n authors_str = \", \".join(authors)\n affiliations: list = [\n item\n for author in xml_components[\"authors\"]\n for item in author[\"affiliation_names\"]\n ]\n affiliations_str = \"; 
\".join(list(dict.fromkeys(affiliations)))\n if authors_str:\n doc.add_text(\n parent=self.root,\n text=authors_str,\n label=DocItemLabel.PARAGRAPH,\n )\n if affiliations_str:\n doc.add_text(\n parent=self.root,\n text=affiliations_str,\n label=DocItemLabel.PARAGRAPH,\n )\n\n return", "meta": { - "part_name": "_add_authors", - "docstring": "", - "sha256": 1338802139689470366285923911940829378081971981022, - "start_line": 318, - "end_line": 342, - "end_line_signature": 321, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_authors", + "docstring": "", + "sha256": 1338802139689470366285923911940829378081971981022, + "start_line": 318, + "end_line": 342, + "end_line_signature": 321, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:\n if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:\n doc.add_list_item(text=text, enumerated=False, parent=parent)\n else:\n doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)\n\n return", "meta": { - "part_name": "_add_citation", - "docstring": "", - "sha256": 886159930357800932034845911444780765912823069638, - "start_line": 344, - "end_line": 350, - "end_line_signature": 345, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_citation", + "docstring": "", + "sha256": 886159930357800932034845911444780765912823069638, + "start_line": 344, + "end_line": 350, + "end_line_signature": 345, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import 
InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_TEXT_ETAL: Final = \"et al.\"\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901\n citation: Citation = {\n \"author_names\": \"\",\n \"title\": \"\",\n \"source\": \"\",\n \"year\": \"\",\n \"volume\": \"\",\n \"page\": \"\",\n \"pub_id\": \"\",\n \"publisher_name\": \"\",\n \"publisher_loc\": \"\",\n }\n\n _log.debug(\"Citation parsing started\")\n\n # Author names\n names = []\n for name_node in node.xpath(\".//name\"):\n name_str = (\n name_node.xpath(\"surname\")[0].text.replace(\"\\n\", \" \").strip()\n + \" \"\n + name_node.xpath(\"given-names\")[0].text.replace(\"\\n\", \" \").strip()\n )\n names.append(name_str)\n etal_node = node.xpath(\".//etal\")\n if len(etal_node) > 0:\n etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL\n names.append(etal_text)\n citation[\"author_names\"] = \", \".join(names)\n\n titles: list[str] = [\n \"article-title\",\n \"chapter-title\",\n \"data-title\",\n \"issue-title\",\n \"part-title\",\n \"trans-title\",\n ]\n title_node: Optional[etree._Element] = None\n for name in titles:\n name_node = node.xpath(name)\n if len(name_node) > 0:\n title_node = name_node[0]\n break\n citation[\"title\"] = (\n JatsDocumentBackend._get_text(title_node)\n if title_node is not None\n else node.text.replace(\"\\n\", \" \").strip()\n )\n\n # Journal, year, publisher name, publisher location, volume, elocation\n fields: list[str] = [\n \"source\",\n \"year\",\n \"publisher-name\",\n \"publisher-loc\",\n \"volume\",\n ]\n for item in fields:\n item_node = node.xpath(item)\n if len(item_node) > 0:\n citation[item.replace(\"-\", \"_\")] = ( # type: ignore[literal-required]\n item_node[0].text.replace(\"\\n\", \" \").strip()\n )\n\n # Publication identifier\n if len(node.xpath(\"pub-id\")) > 0:\n pub_id: list[str] = []\n for id_node in node.xpath(\"pub-id\"):\n id_type = id_node.get(\"assigning-authority\") or id_node.get(\n \"pub-id-type\"\n )\n id_text = id_node.text\n if id_type and id_text:\n pub_id.append(\n id_type.replace(\"\\n\", \" \").strip().upper()\n + \": \"\n + id_text.replace(\"\\n\", \" \").strip()\n )\n if pub_id:\n citation[\"pub_id\"] = \", \".join(pub_id)\n\n 
# Pages\n if len(node.xpath(\"elocation-id\")) > 0:\n citation[\"page\"] = (\n node.xpath(\"elocation-id\")[0].text.replace(\"\\n\", \" \").strip()\n )\n elif len(node.xpath(\"fpage\")) > 0:\n citation[\"page\"] = node.xpath(\"fpage\")[0].text.replace(\"\\n\", \" \").strip()\n if len(node.xpath(\"lpage\")) > 0:\n citation[\"page\"] += (\n \"\u2013\"\n + node.xpath(\"lpage\")[0]\n .text.replace(\"\\n\", \" \")\n .strip() # noqa: RUF001\n )\n\n # Flatten the citation to string\n\n text = \"\"\n if citation[\"author_names\"]:\n text += citation[\"author_names\"].rstrip(\".\") + \". \"\n if citation[\"title\"]:\n text += citation[\"title\"] + \". \"\n if citation[\"source\"]:\n text += citation[\"source\"] + \". \"\n if citation[\"publisher_name\"]:\n if citation[\"publisher_loc\"]:\n text += f\"{citation['publisher_loc']}: \"\n text += citation[\"publisher_name\"] + \". \"\n if citation[\"volume\"]:\n text = text.rstrip(\". \")\n text += f\" {citation['volume']}. \"\n if citation[\"page\"]:\n text = text.rstrip(\". \")\n if citation[\"volume\"]:\n text += \":\"\n text += citation[\"page\"] + \". \"\n if citation[\"year\"]:\n text = text.rstrip(\". \")\n text += f\" ({citation['year']}).\"\n if citation[\"pub_id\"]:\n text = text.rstrip(\".\") + \". \"\n text += citation[\"pub_id\"]\n\n _log.debug(\"Citation flattened\")\n\n return text", "meta": { - "part_name": "_parse_element_citation", - "docstring": "", - "sha256": 270746555936847738057126604179455710215169149513, - "start_line": 352, - "end_line": 479, - "end_line_signature": 353, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_parse_element_citation", + "docstring": "", + "sha256": 270746555936847738057126604179455710215169149513, + "start_line": 352, + "end_line": 479, + "end_line_signature": 353, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_equation(\n 
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n math_text = node.text\n math_parts = math_text.split(\"$$\")\n if len(math_parts) == 3:\n math_formula = math_parts[1]\n doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)\n\n return", "meta": { - "part_name": "_add_equation", - "docstring": "", - "sha256": 1414534615925307980331912092067693530565141108001, - "start_line": 481, - "end_line": 490, - "end_line_signature": 484, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_equation", + "docstring": "", + "sha256": 1414534615925307980331912092067693530565141108001, + "start_line": 481, + "end_line": 490, + "end_line_signature": 484, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_figure_captions(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n label_node = node.xpath(\"label\")\n label: Optional[str] = (\n JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else \"\"\n )\n\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if len(caption_node) > 0:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n\n # TODO: format label vs caption once styling is supported\n fig_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n fig_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)\n if fig_text\n else None\n )\n\n doc.add_picture(parent=parent, caption=fig_caption)\n\n return", "meta": { - "part_name": "_add_figure_captions", - "docstring": "", - "sha256": 835512377408024386136672483548752387568193882073, - "start_line": 492, - "end_line": 522, - 
"end_line_signature": 495, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_figure_captions", + "docstring": "", + "sha256": 835512377408024386136672483548752387568193882073, + "start_line": 492, + "end_line": 522, + "end_line_signature": 495, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_metadata(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n self._add_title(doc, xml_components)\n self._add_authors(doc, xml_components)\n self._add_abstract(doc, xml_components)\n\n return", "meta": { - "part_name": "_add_metadata", - "docstring": "", - "sha256": 153982142573938397014666471275555982493113402407, - "start_line": 531, - "end_line": 538, - "end_line_signature": 534, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_metadata", + "docstring": "", + "sha256": 153982142573938397014666471275555982493113402407, + "start_line": 531, + "end_line": 538, + "end_line_signature": 534, "chunk_type": "function" } }, { "text": "from bs4 import BeautifulSoup, Tag\nfrom docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.backend.html_backend import HTMLDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, 
in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_table(\n self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table\n ) -> None:\n soup = BeautifulSoup(table_xml_component[\"content\"], \"html.parser\")\n table_tag = soup.find(\"table\")\n if not isinstance(table_tag, Tag):\n return\n\n data = HTMLDocumentBackend.parse_table_data(table_tag)\n\n # TODO: format label vs caption once styling is supported\n label = table_xml_component[\"label\"]\n caption = table_xml_component[\"caption\"]\n table_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n table_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=table_text)\n if table_text\n else None\n )\n\n if data is not None:\n doc.add_table(data=data, parent=parent, caption=table_caption)\n\n return", "meta": { - "part_name": "_add_table", - "docstring": "", - "sha256": 1342802968890476187190364473245592459773739883169, - "start_line": 540, - "end_line": 563, - "end_line_signature": 543, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_table", + "docstring": "", + "sha256": 1342802968890476187190364473245592459773739883169, + "start_line": 540, + "end_line": 563, + "end_line_signature": 543, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n 
):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_tables(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n table: Table = {\"label\": \"\", \"caption\": \"\", \"content\": \"\"}\n\n # Content\n if len(node.xpath(\"table\")) > 0:\n table_content_node = node.xpath(\"table\")[0]\n elif len(node.xpath(\"alternatives/table\")) > 0:\n table_content_node = node.xpath(\"alternatives/table\")[0]\n else:\n table_content_node = None\n if table_content_node is not None:\n table[\"content\"] = etree.tostring(table_content_node).decode(\"utf-8\")\n\n # Caption\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if caption_node:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n if caption is not None:\n table[\"caption\"] = caption\n\n # Label\n if len(node.xpath(\"label\")) > 0:\n table[\"label\"] = node.xpath(\"label\")[0].text\n\n try:\n self._add_table(doc, parent, table)\n except Exception:\n _log.warning(f\"Skipping unsupported table in {self.file!s}\")\n\n return", "meta": { - "part_name": "_add_tables", - "docstring": "", - "sha256": 1033621645055163687215987453641177660800797256694, - "start_line": 565, - "end_line": 604, - "end_line_signature": 568, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_tables", + "docstring": "", + "sha256": 1033621645055163687215987453641177660800797256694, + "start_line": 565, + "end_line": 604, + "end_line_signature": 568, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as 
exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n self.root = doc.add_text(\n parent=None,\n text=xml_components[\"title\"],\n label=DocItemLabel.TITLE,\n )\n return", "meta": { - "part_name": "_add_title", - "docstring": "", - "sha256": 974332749105219020038962392126855200985294853221, - "start_line": 606, - "end_line": 612, - "end_line_signature": 607, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_add_title", + "docstring": "", + "sha256": 974332749105219020038962392126855200985294853221, + "start_line": 606, + "end_line": 612, + "end_line_signature": 607, "chunk_type": "function" } }, { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ACKNOWLEDGMENTS: Final = \"Acknowledgments\"\nDEFAULT_HEADER_REFERENCES: Final = \"References\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _walk_linear( # noqa: C901\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> str:\n skip_tags = [\"term\"]\n flush_tags = [\"ack\", \"sec\", \"list\", \"boxed-text\", \"disp-formula\", \"fig\"]\n new_parent: NodeItem = parent\n node_text: str = (\n node.text.replace(\"\\n\", \" \")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n\n for child in list(node):\n stop_walk: bool = False\n\n # flush text into TextItem for some tags in paragraph nodes\n if node.tag == \"p\" and node_text.strip() and child.tag in flush_tags:\n doc.add_text(\n label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent\n )\n node_text = \"\"\n\n # add elements and decide whether to stop walking\n if child.tag in (\"sec\", \"ack\"):\n header = child.xpath(\"title|label\")\n text: Optional[str] = None\n if len(header) > 0:\n text = JatsDocumentBackend._get_text(header[0])\n elif child.tag == \"ack\":\n text = 
DEFAULT_HEADER_ACKNOWLEDGMENTS\n if text:\n self.hlevel += 1\n new_parent = doc.add_heading(\n text=text, parent=parent, level=self.hlevel\n )\n elif child.tag == \"list\":\n new_parent = doc.add_group(\n label=GroupLabel.LIST, name=\"list\", parent=parent\n )\n elif child.tag == \"list-item\":\n # TODO: address any type of content (another list, formula,...)\n # TODO: address list type and item label\n text = JatsDocumentBackend._get_text(child).strip()\n new_parent = doc.add_list_item(text=text, parent=parent)\n stop_walk = True\n elif child.tag == \"fig\":\n self._add_figure_captions(doc, parent, child)\n stop_walk = True\n elif child.tag == \"table-wrap\":\n self._add_tables(doc, parent, child)\n stop_walk = True\n elif child.tag == \"suplementary-material\":\n stop_walk = True\n elif child.tag == \"fn-group\":\n # header = child.xpath(\".//title\") or child.xpath(\".//label\")\n # if header:\n # text = JatsDocumentBackend._get_text(header[0])\n # fn_parent = doc.add_heading(text=text, parent=new_parent)\n # self._add_footnote_group(doc, fn_parent, child)\n stop_walk = True\n elif child.tag == \"ref-list\" and node.tag != \"ref-list\":\n header = child.xpath(\"title|label\")\n text = (\n JatsDocumentBackend._get_text(header[0])\n if len(header) > 0\n else DEFAULT_HEADER_REFERENCES\n )\n new_parent = doc.add_heading(text=text, parent=parent)\n new_parent = doc.add_group(\n parent=new_parent, label=GroupLabel.LIST, name=\"list\"\n )\n elif child.tag == \"element-citation\":\n text = self._parse_element_citation(child)\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"mixed-citation\":\n text = JatsDocumentBackend._get_text(child).strip()\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"tex-math\":\n self._add_equation(doc, parent, child)\n stop_walk = True\n elif child.tag == \"inline-formula\":\n # TODO: address inline formulas when supported by docling-core\n stop_walk = True\n\n # step into child\n if not stop_walk:\n new_text = self._walk_linear(doc, new_parent, child)\n if not (node.getparent().tag == \"p\" and node.tag in flush_tags):\n node_text += new_text\n if child.tag in (\"sec\", \"ack\") and text:\n self.hlevel -= 1\n\n # pick up the tail text\n node_text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n # create paragraph\n if node.tag == \"p\" and node_text.strip():\n doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)\n return \"\"\n else:\n # backpropagate the text\n return node_text", "meta": { - "part_name": "_walk_linear", - "docstring": "", - "sha256": 1396734027815047329843267767563675798505988021776, - "start_line": 614, - "end_line": 717, - "end_line_signature": 617, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "_walk_linear", + "docstring": "", + "sha256": 1396734027815047329843267767563675798505988021776, + "start_line": 614, + "end_line": 717, + "end_line_signature": 617, "chunk_type": "function" } }, { "text": "from typing_extensions import TypedDict, override\nclass Abstract(TypedDict):\n label: str\n content: str", "meta": { - "part_name": "Abstract", - "docstring": "", - "sha256": 794614712107844233737490895996588109287861339811, - "start_line": 33, - "end_line": 35, - "end_line_signature": 35, + "schema_name": 
"docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "Abstract", + "docstring": "", + "sha256": 794614712107844233737490895996588109287861339811, + "start_line": 33, + "end_line": 35, + "end_line_signature": 35, "chunk_type": "class" } }, { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Author(TypedDict):\n name: str\n affiliation_names: list[str]", "meta": { - "part_name": "Author", - "docstring": "", - "sha256": 618469300419808735784045889717450654715997143657, - "start_line": 38, - "end_line": 40, - "end_line_signature": 40, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "Author", + "docstring": "", + "sha256": 618469300419808735784045889717450654715997143657, + "start_line": 38, + "end_line": 40, + "end_line_signature": 40, "chunk_type": "class" } }, { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Citation(TypedDict):\n author_names: str\n title: str\n source: str\n year: str\n volume: str\n page: str\n pub_id: str\n publisher_name: str\n publisher_loc: str", "meta": { - "part_name": "Citation", - "docstring": "", - "sha256": 47216956481538603575192296942081985433567090375, - "start_line": 43, - "end_line": 52, - "end_line_signature": 52, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "Citation", + "docstring": "", + "sha256": 47216956481538603575192296942081985433567090375, + "start_line": 43, + "end_line": 52, + "end_line_signature": 52, "chunk_type": "class" } }, { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Table(TypedDict):\n label: str\n caption: str\n content: str", "meta": { - "part_name": "Table", - "docstring": "", - "sha256": 652205560496743097978957542262426472701689171417, - "start_line": 55, - "end_line": 58, - "end_line_signature": 58, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": "https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "Table", + "docstring": "", + "sha256": 652205560496743097978957542262426472701689171417, + "start_line": 55, + "end_line": 58, + "end_line_signature": 58, "chunk_type": "class" } }, { "text": "from typing_extensions import TypedDict, override\nclass XMLComponents(TypedDict):\n title: str\n authors: list[Author]\n abstract: list[Abstract]", "meta": { - "part_name": "XMLComponents", - "docstring": "", - "sha256": 1130452765636835800645360676517087324676223612005, - "start_line": 61, - "end_line": 64, - "end_line_signature": 64, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 12767849390864590006, "filename": "jats_backend.py", "uri": 
"https://github.com/docling-project/docling/blob/abc123def456/jats_backend.py" }, + "part_name": "XMLComponents", + "docstring": "", + "sha256": 1130452765636835800645360676517087324676223612005, + "start_line": 61, + "end_line": 64, + "end_line_signature": 64, "chunk_type": "class" } } diff --git a/test/data/chunker_repo/TypeScript/repo_out_chunks.json b/test/data/chunker_repo/TypeScript/repo_out_chunks.json index a192ed53..bfa97bc2 100644 --- a/test/data/chunker_repo/TypeScript/repo_out_chunks.json +++ b/test/data/chunker_repo/TypeScript/repo_out_chunks.json @@ -3,171 +3,191 @@ { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isResolved(): boolean {\n return !!this.resolvedAt || !!this.parentComment?.isResolved;\n }", "meta": { - "part_name": "isResolved", - "docstring": "", - "sha256": 1268395403700592019784717617222283727541873921424, - "start_line": 100, - "end_line": 102, - "end_line_signature": 102, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13313267827846711454, "filename": "Comment.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" }, + "part_name": "isResolved", + "docstring": "", + "sha256": 1268395403700592019784717617222283727541873921424, + "start_line": 100, + "end_line": 102, + "end_line_signature": 102, "chunk_type": "function" } }, { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isReply() {\n return !!this.parentCommentId;\n }", "meta": { - "part_name": "isReply", - "docstring": "", - "sha256": 1365364938419899639010891183359481253853232355963, - "start_line": 108, - "end_line": 110, - "end_line_signature": 110, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13313267827846711454, "filename": "Comment.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" }, + "part_name": "isReply", + "docstring": "", + "sha256": 1365364938419899639010891183359481253853232355963, + "start_line": 108, + "end_line": 110, + "end_line_signature": 110, "chunk_type": "function" } }, { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public resolve() {\n return this.store.rootStore.comments.resolve(this.id);\n }", "meta": { - "part_name": "resolve", - "docstring": "/**\n * Resolve the comment\n */", - "sha256": 991119951853749619459124936919291768908369832281, - "start_line": 115, - "end_line": 117, - "end_line_signature": 117, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13313267827846711454, "filename": "Comment.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" }, + "part_name": "resolve", + "docstring": "/**\n * Resolve the comment\n */", + "sha256": 991119951853749619459124936919291768908369832281, + "start_line": 115, + "end_line": 117, + "end_line_signature": 117, "chunk_type": "function" } }, { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public unresolve() {\n return this.store.rootStore.comments.unresolve(this.id);\n }", "meta": { - "part_name": "unresolve", - "docstring": "/**\n * Unresolve the comment\n */", - "sha256": 737181169666352833175846995267642651601564437701, - "start_line": 122, - "end_line": 124, - "end_line_signature": 124, + "schema_name": "docling_core.transforms.chunker.DocMeta", + 
"version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13313267827846711454, "filename": "Comment.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" }, + "part_name": "unresolve", + "docstring": "/**\n * Unresolve the comment\n */", + "sha256": 737181169666352833175846995267642651601564437701, + "start_line": 122, + "end_line": 124, + "end_line_signature": 124, "chunk_type": "function" } }, { "text": "import invariant from \"invariant\";\nimport uniq from \"lodash/uniq\";\nimport { action, computed, observable } from \"mobx\";\nimport { Pagination } from \"@shared/constants\";\nimport type { ProsemirrorData, ReactionSummary } from \"@shared/types\";\nimport User from \"~/models/User\";\nimport { client } from \"~/utils/ApiClient\";\nimport Document from \"./Document\";\n\nimport Field from \"./decorators/Field\";\nimport Relation from \"./decorators/Relation\";\n\nexport default Comment;", "meta": { - "sha256": 127587344566918131981664969548384712413573188523, - "start_line": 1, - "end_line": 279, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 13313267827846711454, "filename": "Comment.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/Comment.ts" }, + "sha256": 127587344566918131981664969548384712413573188523, + "start_line": 1, + "end_line": 279, "chunk_type": "preamble" } }, { "text": "import Group from \"./Group\";\nimport Model from \"./base/Model\";\nimport Relation from \"./decorators/Relation\";\nimport User from \"./User\";\n\nclass GroupUser extends Model {\n static modelName = \"GroupUser\";\n\n /** The ID of the user. */\n userId: string;\n\n /** The user that belongs to the group. */\n @Relation(() => User, { onDelete: \"cascade\" })\n user: User;\n\n /** The ID of the group. */\n groupId: string;\n\n /** The group that the user belongs to. 
*/\n @Relation(() => Group, { onDelete: \"cascade\" })\n group: Group;\n}", "meta": { - "part_name": "GroupUser", - "docstring": "/**\n * Represents a user's membership to a group.\n */", - "sha256": 819039209099366519772307112685515925657900275191, - "start_line": 8, - "end_line": 24, - "end_line_signature": 24, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 1986469258069411733, "filename": "GroupUser.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/GroupUser.ts" }, + "part_name": "GroupUser", + "docstring": "/**\n * Represents a user's membership to a group.\n */", + "sha256": 819039209099366519772307112685515925657900275191, + "start_line": 8, + "end_line": 24, + "end_line_signature": 24, "chunk_type": "class" } }, { "text": "export default GroupUser;", "meta": { - "sha256": 1202573002644555545724623497903246581585285637847, - "start_line": 25, - "end_line": 28, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 1986469258069411733, "filename": "GroupUser.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/GroupUser.ts" }, + "sha256": 1202573002644555545724623497903246581585285637847, + "start_line": 25, + "end_line": 28, "chunk_type": "preamble" } }, { "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction resolve<T>(value: any, context: ActionContext): T {\n return typeof value === \"function\" ? 
value(context) : value;\n}", "meta": { - "part_name": "resolve", - "docstring": "", - "sha256": 1289291728661617648625599715448098966958049316632, - "start_line": 24, - "end_line": 26, - "end_line_signature": 26, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 16803020185603763773, "filename": "index.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" }, + "part_name": "resolve", + "docstring": "", + "sha256": 1289291728661617648625599715448098966958049316632, + "start_line": 24, + "end_line": 26, + "end_line_signature": 26, "chunk_type": "function" } }, { "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction hasVisibleItems(items: MenuItem[]) {\n const applicableTypes = [\"button\", \"link\", \"route\", \"group\", \"submenu\"];\n return items.some(\n (item) => applicableTypes.includes(item.type) && item.visible\n );\n}", "meta": { - "part_name": "hasVisibleItems", - "docstring": "", - "sha256": 1279869349240065760172944255546797254943234495037, - "start_line": 359, - "end_line": 364, - "end_line_signature": 364, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 16803020185603763773, "filename": "index.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" }, + "part_name": "hasVisibleItems", + "docstring": "", + "sha256": 1279869349240065760172944255546797254943234495037, + "start_line": 359, + "end_line": 364, + "end_line_signature": 364, "chunk_type": "function" } }, { "text": "import flattenDeep from \"lodash/flattenDeep\";\nimport { toast } from \"sonner\";\nimport { Optional } from \"utility-types\";\nimport { v4 as uuidv4 } from \"uuid\";\n\nimport Analytics from \"~/utils/Analytics\";\nimport history from \"~/utils/history\";\n\nexport function createAction(definition: Optional<Action, \"id\">): Action {\n return {\n ...definition,\n perform: definition.perform\n ? (context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? \"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform?.(context);\n }\n : undefined,\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function actionToMenuItem(\n action: Action,\n context: ActionContext\n): MenuItemButton | MenuExternalLink | MenuInternalLink | MenuItemWithChildren {\n const resolvedIcon = resolve(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const visible = action.visible ? action.visible(context) : true;\n const title = resolve(action.name, context);\n const icon =\n resolvedIcon && action.iconInContextMenu !== false\n ? 
resolvedIcon\n : undefined;\n\n if (resolvedChildren) {\n const items = resolvedChildren\n .map((a) => actionToMenuItem(a, context))\n .filter(Boolean)\n .filter((a) => a.visible);\n\n return {\n type: \"submenu\",\n title,\n icon,\n items,\n visible: visible && items.length > 0,\n };\n }\n\n if (action.to) {\n return typeof action.to === \"string\"\n ? {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n selected: action.selected?.(context),\n }\n : {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.to,\n selected: action.selected?.(context),\n };\n }\n\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performAction(action, context),\n selected: action.selected?.(context),\n };\n}\n\nexport function actionToKBar(\n action: Action,\n context: ActionContext\n): CommandBarAction[] {\n if (typeof action.visible === \"function\" && !action.visible(context)) {\n return [];\n }\n\n const resolvedIcon = resolve(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const resolvedSection = resolve(action.section, context);\n const resolvedName = resolve(action.name, context);\n const resolvedPlaceholder = resolve(action.placeholder, context);\n const children = resolvedChildren\n ? flattenDeep(resolvedChildren.map((a) => actionToKBar(a, context))).filter(\n (a) => !!a\n )\n : [];\n\n const sectionPriority =\n typeof action.section !== \"string\" && \"priority\" in action.section\n ? ((action.section.priority as number) ?? 0)\n : 0;\n\n return [\n {\n id: action.id,\n name: resolvedName,\n analyticsName: action.analyticsName,\n section: resolvedSection,\n placeholder: resolvedPlaceholder,\n keywords: action.keywords ?? \"\",\n shortcut: action.shortcut || [],\n icon: resolvedIcon,\n priority: (1 + (action.priority ?? 0)) * (1 + (sectionPriority ?? 0)),\n perform:\n action.perform || action.to\n ? () => performAction(action, context)\n : undefined,\n },\n ].concat(\n // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.\n children.map((child) => ({ ...child, parent: child.parent ?? action.id }))\n );\n}\n\nexport async function performAction(action: Action, context: ActionContext) {\n const result = action.perform\n ? action.perform(context)\n : action.to\n ? typeof action.to === \"string\"\n ? history.push(action.to)\n : window.open(action.to.url, action.to.target)\n : undefined;\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}\n\n/** Actions V2 */\n\nexport const ActionV2Separator: TActionV2Separator = {\n type: \"action_separator\",\n};\n\nexport function createActionV2(\n definition: Optional<ActionV2, \"id\">\n): ActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"action\",\n perform: definition.perform\n ? (context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? \"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform(context);\n }\n : () => {},\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createInternalLinkActionV2(\n definition: Optional<InternalLinkActionV2, \"id\">\n): InternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"internal_link\",\n id: definition.id ?? 
uuidv4(),\n };\n}\n\nexport function createExternalLinkActionV2(\n definition: Optional<ExternalLinkActionV2, \"id\">\n): ExternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"external_link\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2WithChildren(\n definition: Optional<ActionV2WithChildren, \"id\">\n): ActionV2WithChildren {\n return {\n ...definition,\n type: \"action\",\n variant: \"action_with_children\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2Group(\n definition: Omit<ActionV2Group, \"type\">\n): ActionV2Group {\n return {\n ...definition,\n type: \"action_group\",\n };\n}\n\nexport function createRootMenuAction(\n actions: (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n): ActionV2WithChildren {\n return {\n id: uuidv4(),\n type: \"action\",\n variant: \"action_with_children\",\n name: \"root_action\",\n section: \"Root\",\n children: actions,\n };\n}\n\nexport function actionV2ToMenuItem(\n action: ActionV2Variant | ActionV2Group | TActionV2Separator,\n context: ActionContext\n): MenuItem {\n switch (action.type) {\n case \"action\": {\n const title = resolve(action.name, context);\n const visible = resolve(action.visible, context);\n const icon =\n !!action.icon && action.iconInContextMenu !== false\n ? action.icon\n : undefined;\n\n switch (action.variant) {\n case \"action\":\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performActionV2(action, context),\n };\n\n case \"internal_link\":\n return {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n };\n\n case \"external_link\":\n return {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.target\n ? { url: action.url, target: action.target }\n : action.url,\n };\n\n case \"action_with_children\": {\n const children = resolve<\n (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n >(action.children, context);\n const subMenuItems = children.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"submenu\",\n title,\n icon,\n items: subMenuItems,\n visible: visible && hasVisibleItems(subMenuItems),\n };\n }\n\n default:\n throw Error(\"invalid action variant\");\n }\n }\n\n case \"action_group\": {\n const groupItems = action.actions.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"group\",\n title: resolve(action.name, context),\n visible: hasVisibleItems(groupItems),\n items: groupItems,\n };\n }\n\n case \"action_separator\":\n return { type: \"separator\" };\n }\n}\n\nexport async function performActionV2(\n action: ActionV2,\n context: ActionContext\n) {\n const result = action.perform(context);\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}", "meta": { - "sha256": 1201606939402032044701817936630056161927504135324, - "start_line": 1, - "end_line": 359, + "schema_name": "docling_core.transforms.chunker.DocMeta", + "version": "1.0.0", "origin": { "mimetype": "text/plain", "binary_hash": 16803020185603763773, "filename": "index.ts", "uri": "https://github.com/outline/outline/blob/abc123def456/index.ts" }, + "sha256": 1201606939402032044701817936630056161927504135324, + "start_line": 1, + "end_line": 359, "chunk_type": "preamble" } } diff --git a/test/data/repo_chunking/sample.c b/test/data/repo_chunking/sample.c new file mode 100644 index 00000000..8832ef48 --- /dev/null +++ b/test/data/repo_chunking/sample.c @@ -0,0 +1,22 @@ +#include <stdio.h> +#include <math.h> + +double circle_area(double
radius) { + return M_PI * radius * radius; +} + +double circle_circumference(double radius) { + return 2 * M_PI * radius; +} + +int fibonacci(int n) { + if (n <= 1) { + return n; + } + return fibonacci(n - 1) + fibonacci(n - 2); +} + +int main() { + printf("Hello, World!\n"); + return 0; +} diff --git a/test/data/repo_chunking/sample.go b/test/data/repo_chunking/sample.go new file mode 100644 index 00000000..9e36e6cf --- /dev/null +++ b/test/data/repo_chunking/sample.go @@ -0,0 +1,23 @@ +package main + +import "fmt" + +func main() { + fmt.Println("Hello, World!") +} + +func fibonacci(n int) int { + if n <= 1 { + return n + } + return fibonacci(n-1) + fibonacci(n-2) +} + +type Person struct { + Name string + Age int +} + +func (p Person) String() string { + return fmt.Sprintf("%s is %d years old", p.Name, p.Age) +} diff --git a/test/data/repo_chunking/sample.java b/test/data/repo_chunking/sample.java new file mode 100644 index 00000000..c65a4925 --- /dev/null +++ b/test/data/repo_chunking/sample.java @@ -0,0 +1,25 @@ +public class Calculator { + private double result; + + public Calculator() { + this.result = 0.0; + } + + public double add(double a, double b) { + this.result = a + b; + return this.result; + } + + public double subtract(double a, double b) { + this.result = a - b; + return this.result; + } + + public double getResult() { + return this.result; + } + + public void reset() { + this.result = 0.0; + } +} diff --git a/test/data/repo_chunking/sample.js b/test/data/repo_chunking/sample.js new file mode 100644 index 00000000..65474c68 --- /dev/null +++ b/test/data/repo_chunking/sample.js @@ -0,0 +1,23 @@ +function multiply(a, b) { + return a * b; +} + +function divide(a, b) { + return b !== 0 ? a / b : 0; +} + +function greet(name) { + return `Hello, ${name}!`; +} + +class Calculator { + constructor() { + this.history = []; + } + + add(a, b) { + const result = a + b; + this.history.push(`${a} + ${b} = ${result}`); + return result; + } +} diff --git a/test/data/repo_chunking/sample.md b/test/data/repo_chunking/sample.md new file mode 100644 index 00000000..64f2bf0b --- /dev/null +++ b/test/data/repo_chunking/sample.md @@ -0,0 +1,23 @@ +# Sample Markdown File + +This is a sample markdown file for testing repository chunking. + +## Code Example + +Here's some Python code: + +```python +def hello(): + print("Hello, World!") +``` + +## Features + +- Lists +- **Bold text** +- *Italic text* +- [Links](https://example.com) + +## Conclusion + +This file should be processed as text content, not as code. diff --git a/test/data/repo_chunking/sample.py b/test/data/repo_chunking/sample.py new file mode 100644 index 00000000..ea1addd5 --- /dev/null +++ b/test/data/repo_chunking/sample.py @@ -0,0 +1,25 @@ +def fibonacci(n): + """Calculate the nth Fibonacci number using recursion.""" + if n <= 1: + return n + return fibonacci(n - 1) + fibonacci(n - 2) + + +class MathUtils: + """Utility class for mathematical operations.""" + + def __init__(self): + self.pi = 3.14159 + + def circle_area(self, radius): + """Calculate the area of a circle.""" + return self.pi * radius**2 + + def circle_circumference(self, radius): + """Calculate the circumference of a circle.""" + return 2 * self.pi * radius + + +def hello(): + """A simple hello function.""" + return "Hello, World!" 
diff --git a/test/data/repo_chunking/sample.ts b/test/data/repo_chunking/sample.ts new file mode 100644 index 00000000..3bc890a8 --- /dev/null +++ b/test/data/repo_chunking/sample.ts @@ -0,0 +1,20 @@ +interface Point { + x: number; + y: number; +} + +function distance(p1: Point, p2: Point): number { + return Math.sqrt((p2.x - p1.x) ** 2 + (p2.y - p1.y) ** 2); +} + +class Vector { + constructor(public x: number, public y: number) {} + + magnitude(): number { + return Math.sqrt(this.x ** 2 + this.y ** 2); + } + + add(other: Vector): Vector { + return new Vector(this.x + other.x, this.y + other.y); + } +} diff --git a/test/test_code_chunker.py b/test/test_code_chunker.py index 91abd8d2..51095761 100644 --- a/test/test_code_chunker.py +++ b/test/test_code_chunker.py @@ -1,10 +1,14 @@ +import glob import json import os import pathlib +from typing import List, Optional +import git import pytest from docling_core.transforms.chunker.base_code_chunker import CodeChunk +from docling_core.transforms.chunker.code_chunk_utils.utils import Language from docling_core.transforms.chunker.language_code_chunkers import ( CFunctionChunker, JavaFunctionChunker, @@ -12,10 +16,76 @@ PythonFunctionChunker, TypeScriptFunctionChunker, ) +from docling_core.types.doc import DoclingDocument, DocumentOrigin from docling_core.types.doc.labels import DocItemLabel +from docling_core.utils.legacy import _create_hash from .test_data_gen_flag import GEN_TEST_DATA -from .test_utils_repo_ds import create_ds, language_to_extension + + +def get_latest_commit_id(file_dir: str) -> str: + """Returns the latest commit ID in the given Git repository directory.""" + try: + repo = git.Repo(file_dir, search_parent_directories=True) + return repo.head.commit.hexsha + except Exception: + return "" + + +def create_documents_from_repository( + file_dir: str, repo_url: str, commit_id: Optional[str] = None +) -> List[DoclingDocument]: + """Build DoclingDocument objects from a local checkout, one per code file.""" + + documents: List[DoclingDocument] = [] + if commit_id is None: + commit_id = get_latest_commit_id(file_dir) + + all_extensions = set() + for language in Language: + all_extensions.update(language.file_extensions()) + + all_files = [] + for extension in all_extensions: + all_files.extend( + [ + f + for f in sorted( + glob.glob(f"{file_dir}/**/*{extension}", recursive=True) + ) + ] + ) + + all_files = sorted(list(set(all_files))) + + for file_path in all_files: + try: + with open(file_path, "r", encoding="utf-8") as f: + file_content = f.read() + + file_relative = os.path.relpath(file_path, start=file_dir).replace( + "\\", "/" + ) + + origin = DocumentOrigin( + filename=file_relative, + uri=( + f"{repo_url}/blob/{commit_id}/{file_relative}" + if commit_id + else f"{repo_url}/{file_relative}" + ), + mimetype="text/plain", + binary_hash=_create_hash(file_content), + ) + + doc = DoclingDocument(name=file_relative, origin=origin) + doc.add_code(text=file_content) + documents.append(doc) + except Exception: + continue + + return documents + HERE = pathlib.Path(__file__).parent DATA = HERE / "data" / "chunker_repo" @@ -75,13 +145,14 @@ def test_function_chunkers_repo(name, local_path, repo_url, chunker_factory): if not os.path.isdir(local_path_full): pytest.skip(f"Missing repo at {local_path_full}; skipping {name} test.") - docs = create_ds(local_path_full, repo_url, commit_id="abc123def456") + docs = create_documents_from_repository( + local_path_full, repo_url, commit_id="abc123def456" + ) docs = [ doc for doc in docs if any(text.label == DocItemLabel.CODE
and text.text for text in doc.texts) ] - docs = [doc for doc in docs if doc.name.endswith(language_to_extension[name])] if not docs: pytest.skip(f"No documents found in {local_path_full} for {name}.") diff --git a/test/test_code_chunking_strategy.py b/test/test_code_chunking_strategy.py new file mode 100644 index 00000000..94d4c30c --- /dev/null +++ b/test/test_code_chunking_strategy.py @@ -0,0 +1,296 @@ +from pathlib import Path + +import pytest + +from docling_core.transforms.chunker import ( + DefaultCodeChunkingStrategy, + HierarchicalChunker, + HybridChunker, + Language, + LanguageDetector, + NoOpCodeChunkingStrategy, +) +from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer +from docling_core.types.doc.base import Size +from docling_core.types.doc.document import DoclingDocument, DocumentOrigin +from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel + + +@pytest.fixture +def test_data_dir(): + """Path to test data directory.""" + return Path(__file__).parent / "data" / "repo_chunking" + + +def test_language_detection_from_filename(): + """Test language detection from file extensions.""" + test_cases = [ + ("test.py", Language.PYTHON), + ("test.js", Language.JAVASCRIPT), + ("test.ts", Language.TYPESCRIPT), + ("test.java", Language.JAVA), + ("test.c", Language.C), + ("test.go", None), + ("test.md", None), + ] + + for filename, expected in test_cases: + detected = LanguageDetector.detect_from_extension(filename) + assert ( + detected == expected + ), f"Failed for {filename}: expected {expected}, got {detected}" + + +def test_language_detection_from_content(): + """Test language detection from code content.""" + test_cases = [ + ("def hello(): pass\nimport os", Language.PYTHON), + ("function test() { return 1; }\nconst x = 5;", Language.JAVASCRIPT), + ("interface User { name: string; }", Language.TYPESCRIPT), + ('public class Test { }\nSystem.out.println("Hello");', Language.JAVA), + ('#include <stdio.h>\nint main() { printf("hello"); }', Language.C), + ("package main\nfunc main() { }", None), + ] + + for code, expected in test_cases: + detected = LanguageDetector.detect_from_content(code) + assert ( + detected == expected + ), f"Failed for code: expected {expected}, got {detected}" + + +def test_language_detection_integration(test_data_dir): + """Test language detection with real files.""" + test_files = { + "sample.py": CodeLanguageLabel.PYTHON, + "sample.js": CodeLanguageLabel.JAVASCRIPT, + "sample.ts": CodeLanguageLabel.TYPESCRIPT, + "sample.java": CodeLanguageLabel.JAVA, + "sample.c": CodeLanguageLabel.C, + "sample.go": CodeLanguageLabel.UNKNOWN, + "sample.md": CodeLanguageLabel.UNKNOWN, + } + + for filename, expected_language in test_files.items(): + file_path = test_data_dir / filename + if not file_path.exists(): + pytest.skip(f"Test file {filename} not found") + + doc = DoclingDocument(name=filename) + doc.origin = DocumentOrigin( + filename=filename, mimetype="text/plain", binary_hash=12345 + ) + + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + doc.add_code(text=content) + + assert len(doc.texts) == 1 + assert doc.texts[0].code_language == expected_language + assert doc.texts[0].text == content + + +def test_code_chunking_strategies(): + """Test different code chunking strategies.""" + python_code = ''' +def fibonacci(n): + """Calculate the nth Fibonacci number.""" + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) + +def factorial(n): + """Calculate factorial of n.""" + if n <= 1: + return 1 +
return n * factorial(n-1) +''' + + strategy = DefaultCodeChunkingStrategy(min_chunk_size=10, max_tokens=100) + language = Language.PYTHON + chunks = list(strategy.chunk_code_item(python_code, language)) + + assert len(chunks) > 0 + for chunk in chunks: + assert hasattr(chunk, "text") + assert hasattr(chunk, "meta") + assert hasattr(chunk.meta, "chunk_type") + + noop_strategy = NoOpCodeChunkingStrategy() + chunks = list(noop_strategy.chunk_code_item(python_code, language)) + + assert len(chunks) == 1 + chunk = chunks[0] + assert chunk.text == python_code + assert chunk.meta.chunk_type == "code_block" + + +def test_hierarchical_chunker_integration(): + """Test HierarchicalChunker with and without code chunking strategy.""" + python_code = ''' +def fibonacci(n): + """Calculate the nth Fibonacci number.""" + if n <= 1: + return n + return fibonacci(n-1) + fibonacci(n-2) +''' + + doc = DoclingDocument(name="test") + doc.add_page(page_no=0, size=Size(width=612.0, height=792.0)) + doc.add_text( + label=DocItemLabel.TEXT, + text="Here's some Python code:", + orig="Here's some Python code:", + ) + doc.add_code( + text=python_code, code_language=CodeLanguageLabel.PYTHON, orig=python_code + ) + doc.origin = DocumentOrigin( + filename="test.py", mimetype="text/x-python", binary_hash=12345 + ) + + strategy = DefaultCodeChunkingStrategy(min_chunk_size=50, max_tokens=1000) + chunker_with_strategy = HierarchicalChunker(code_chunking_strategy=strategy) + chunks_with_strategy = list(chunker_with_strategy.chunk(doc)) + + assert len(chunks_with_strategy) > 0 + for chunk in chunks_with_strategy: + assert hasattr(chunk, "text") + assert hasattr(chunk, "meta") + + chunker_without_strategy = HierarchicalChunker() + chunks_without_strategy = list(chunker_without_strategy.chunk(doc)) + + assert len(chunks_without_strategy) > 0 + for chunk in chunks_without_strategy: + assert hasattr(chunk, "text") + assert hasattr(chunk, "meta") + + +def test_hybrid_chunker_with_code_files(test_data_dir): + """Test that HybridChunker can process code files.""" + tokenizer = HuggingFaceTokenizer.from_pretrained( + model_name="sentence-transformers/all-MiniLM-L6-v2", max_tokens=512 + ) + chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) + + python_file = test_data_dir / "sample.py" + if not python_file.exists(): + pytest.skip("Python test file not found") + + doc = DoclingDocument(name="sample.py") + doc.origin = DocumentOrigin( + filename="sample.py", mimetype="text/x-python", binary_hash=12345 + ) + + with open(python_file, "r", encoding="utf-8") as f: + content = f.read() + doc.add_code(text=content) + + chunks = list(chunker.chunk(dl_doc=doc)) + + assert len(chunks) > 0 + for chunk in chunks: + assert hasattr(chunk, "text") + assert hasattr(chunk, "meta") + assert len(chunk.text) > 0 + + +def test_unsupported_language_fallback(test_data_dir): + """Test that unsupported languages fall back to regular text chunking.""" + tokenizer = HuggingFaceTokenizer.from_pretrained( + model_name="sentence-transformers/all-MiniLM-L6-v2", max_tokens=512 + ) + chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) + + go_file = test_data_dir / "sample.go" + if go_file.exists(): + doc = DoclingDocument(name="sample.go") + doc.origin = DocumentOrigin( + filename="sample.go", mimetype="text/plain", binary_hash=12345 + ) + + with open(go_file, "r", encoding="utf-8") as f: + content = f.read() + doc.add_code(text=content) + + assert len(doc.texts) == 1 + assert doc.texts[0].code_language == CodeLanguageLabel.UNKNOWN + + chunks = 
list(chunker.chunk(dl_doc=doc)) + assert len(chunks) > 0 + + all_text = " ".join(chunk.text for chunk in chunks) + assert "package main" in all_text + assert "func fibonacci" in all_text + + md_file = test_data_dir / "sample.md" + if md_file.exists(): + doc = DoclingDocument(name="sample.md") + doc.origin = DocumentOrigin( + filename="sample.md", mimetype="text/plain", binary_hash=12345 + ) + + with open(md_file, "r", encoding="utf-8") as f: + content = f.read() + doc.add_code(text=content) + + assert len(doc.texts) == 1 + assert doc.texts[0].code_language == CodeLanguageLabel.UNKNOWN + + chunks = list(chunker.chunk(dl_doc=doc)) + assert len(chunks) > 0 + + all_text = " ".join(chunk.text for chunk in chunks) + assert "Sample Markdown File" in all_text + assert "def hello()" in all_text + + +def test_repository_processing(test_data_dir): + """Test processing multiple files from a repository.""" + tokenizer = HuggingFaceTokenizer.from_pretrained( + model_name="sentence-transformers/all-MiniLM-L6-v2", max_tokens=512 + ) + chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) + + all_chunks = [] + for file_path in test_data_dir.glob("sample.*"): + doc = DoclingDocument(name=file_path.name) + doc.origin = DocumentOrigin( + filename=file_path.name, mimetype="text/plain", binary_hash=12345 + ) + + with open(file_path, "r", encoding="utf-8") as f: + content = f.read() + doc.add_code(text=content) + + chunks = list(chunker.chunk(dl_doc=doc)) + all_chunks.extend(chunks) + + assert len(all_chunks) > 0 + + for chunk in all_chunks: + assert hasattr(chunk, "text") + assert hasattr(chunk, "meta") + assert len(chunk.text) > 0 + + +def test_language_enum_mappings(): + """Test language enum values and mappings.""" + assert Language.PYTHON.value == "python" + assert Language.JAVASCRIPT.value == "javascript" + assert Language.TYPESCRIPT.value == "typescript" + assert Language.JAVA.value == "java" + assert Language.C.value == "c" + + assert Language.PYTHON.to_code_language_label() == CodeLanguageLabel.PYTHON + assert Language.JAVASCRIPT.to_code_language_label() == CodeLanguageLabel.JAVASCRIPT + assert Language.TYPESCRIPT.to_code_language_label() == CodeLanguageLabel.TYPESCRIPT + assert Language.JAVA.to_code_language_label() == CodeLanguageLabel.JAVA + assert Language.C.to_code_language_label() == CodeLanguageLabel.C + + assert CodeLanguageLabel.PYTHON.to_language() == Language.PYTHON + assert CodeLanguageLabel.JAVASCRIPT.to_language() == Language.JAVASCRIPT + assert CodeLanguageLabel.TYPESCRIPT.to_language() == Language.TYPESCRIPT + assert CodeLanguageLabel.JAVA.to_language() == Language.JAVA + assert CodeLanguageLabel.C.to_language() == Language.C diff --git a/test/test_utils_repo_ds.py b/test/test_utils_repo_ds.py deleted file mode 100644 index 88141f01..00000000 --- a/test/test_utils_repo_ds.py +++ /dev/null @@ -1,140 +0,0 @@ -import fnmatch -import glob -import os -from typing import List - -import git - -from docling_core.types.doc import DoclingDocument, DocumentOrigin -from docling_core.types.doc.labels import CodeLanguageLabel -from docling_core.utils.legacy import _create_hash - -language_to_extension = { - "Python": ".py", - "Java": ".java", - "C": ".c", - "TypeScript": ".ts", - "JavaScript": ".js", -} - -language_to_enum = { - "Python": CodeLanguageLabel.PYTHON, - "Java": CodeLanguageLabel.JAVA, - "C": CodeLanguageLabel.C, - "TypeScript": CodeLanguageLabel.TYPESCRIPT, - "JavaScript": CodeLanguageLabel.JAVASCRIPT, -} - - -def get_latest_commit_id(file_dir: str) -> str: - """ - Returns the 
hexadecimal SHA-1 ID of the latest commit in the given Git repository directory. - - Parameters: - file_dir (str): The path to the Git repository directory. - - Returns: - str: The hexadecimal SHA-1 ID of the latest commit, or an empty string if an error occurs. - """ - try: - repo = git.Repo(file_dir, search_parent_directories=True) - return repo.head.commit.hexsha - except Exception: - return "" - - -def load_ignore_patterns(ignore_file_path: str) -> list: - """ - Load ignore patterns from a file. - - This function reads a file containing ignore patterns (one per line) and returns a list of patterns, - excluding empty lines and lines starting with '#'. If the file does not exist, it returns an empty list. - - Args: - ignore_file_path (str): The path to the ignore file. - - Returns: - list: A list of ignore patterns. - """ - if not os.path.exists(ignore_file_path): - return [] - with open(ignore_file_path, "r", encoding="utf-8") as file: - return [ - line.strip() for line in file if line.strip() and not line.startswith("#") - ] - - -def is_ignored(file_path: str, ignore_patterns: List[str]) -> bool: - """ - Check if a file path matches any of the given ignore patterns. - - This function takes a file path and a list of ignore patterns, and returns True if the file path matches any of the patterns, - indicating that the file should be ignored. Otherwise, it returns False. - - Args: - file_path (str): The path of the file to check. - ignore_patterns (list of str): A list of patterns to check against the file path. - - Returns: - bool: True if the file path matches any ignore pattern, False otherwise. - """ - for pattern in ignore_patterns: - if fnmatch.fnmatch(file_path, pattern): - return True - return False - - -def create_ds( - file_dir: str, repo_url: str, commit_id: str = None -) -> List[DoclingDocument]: - """ - Build DoclingDocument objects from a local checkout, one per code file. - Deterministic ordering and hashes for use in tests. 
- - Args: - file_dir: Directory containing the repository - repo_url: URL of the repository - commit_id: Specific commit ID to use (defaults to "main" for deterministic testing) - """ - documents: List[DoclingDocument] = [] - if commit_id is None: - commit_id = get_latest_commit_id(file_dir) - ignore_file = os.path.join(file_dir, ".ragignore") - ignore_patterns = load_ignore_patterns(ignore_file) - - for language, extension in language_to_extension.items(): - files = [ - f - for f in sorted(glob.glob(f"{file_dir}/**/*{extension}", recursive=True)) - if not is_ignored(f, ignore_patterns) - ] - files.sort() - for file_path in files: - try: - with open(file_path, "r", encoding="utf-8") as f: - file_content = f.read() - - file_relative = os.path.relpath(file_path, start=file_dir).replace( - "\\", "/" - ) - - origin = DocumentOrigin( - filename=file_relative, - uri=( - f"{repo_url}/blob/{commit_id}/{file_relative}" - if commit_id - else f"{repo_url}/{file_relative}" - ), - mimetype="text/plain", - binary_hash=_create_hash(file_content), - ) - - doc = DoclingDocument(name=file_relative, origin=origin) - doc.add_code( - text=file_content, code_language=language_to_enum[language] - ) - documents.append(doc) - except Exception: - continue - - return documents From 336dd6a2078272dd30ade58b4fc6392e7a2edf86 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Wed, 22 Oct 2025 16:39:21 -0400 Subject: [PATCH 04/12] add serializer, internal marking of chunkers, typing --- docling_core/transforms/chunker/__init__.py | 3 +- .../transforms/chunker/base_code_chunker.py | 2 +- .../chunker/code_chunk_utils/chunk_utils.py | 8 +-- .../chunker/code_chunking_strategy.py | 36 ++++++------ .../chunker/hierarchical_chunker.py | 57 +++++++++++++++---- .../chunker/language_code_chunkers.py | 12 ++-- test/test_code_chunker.py | 29 ++++++---- 7 files changed, 94 insertions(+), 53 deletions(-) diff --git a/docling_core/transforms/chunker/__init__.py b/docling_core/transforms/chunker/__init__.py index a218fb8a..29f0ab3a 100644 --- a/docling_core/transforms/chunker/__init__.py +++ b/docling_core/transforms/chunker/__init__.py @@ -6,7 +6,6 @@ """Define the chunker types.""" from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta -from docling_core.transforms.chunker.base_code_chunker import CodeChunker from docling_core.transforms.chunker.code_chunk_utils.chunk_utils import ( ChunkBuilder, ChunkMetadataBuilder, @@ -21,9 +20,9 @@ NoOpCodeChunkingStrategy, ) from docling_core.transforms.chunker.hierarchical_chunker import ( - ChunkType, CodeChunk, CodeChunkingStrategy, + CodeChunkType, CodeDocMeta, DocChunk, DocMeta, diff --git a/docling_core/transforms/chunker/base_code_chunker.py b/docling_core/transforms/chunker/base_code_chunker.py index 7c9a8cc2..3433ab51 100644 --- a/docling_core/transforms/chunker/base_code_chunker.py +++ b/docling_core/transforms/chunker/base_code_chunker.py @@ -19,7 +19,7 @@ from docling_core.types.doc.labels import DocItemLabel -class CodeChunker(BaseChunker): +class _CodeChunker(BaseChunker): """Data model for code chunker.""" language: Language diff --git a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py index 4756c82e..403ec4fc 100644 --- a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py +++ b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py @@ -4,8 +4,8 @@ from tree_sitter import Node from docling_core.transforms.chunker.hierarchical_chunker import ( - 
ChunkType, CodeChunk, + CodeChunkType, CodeDocMeta, ) from docling_core.types.doc.document import DocumentOrigin @@ -99,7 +99,7 @@ def build_function_metadata( end_line=end_line, end_line_signature=signature_end_line, origin=self.origin, - chunk_type=ChunkType.FUNCTION, + chunk_type=CodeChunkType.FUNCTION, ) def build_class_metadata( @@ -119,7 +119,7 @@ def build_class_metadata( end_line=end_line, end_line_signature=end_line, origin=self.origin, - chunk_type=ChunkType.CLASS, + chunk_type=CodeChunkType.CLASS, ) def build_preamble_metadata( @@ -131,7 +131,7 @@ def build_preamble_metadata( start_line=start_line, end_line=end_line, origin=self.origin, - chunk_type=ChunkType.PREAMBLE, + chunk_type=CodeChunkType.PREAMBLE, ) def calculate_line_numbers( diff --git a/docling_core/transforms/chunker/code_chunking_strategy.py b/docling_core/transforms/chunker/code_chunking_strategy.py index f78dedfa..d7485d57 100644 --- a/docling_core/transforms/chunker/code_chunking_strategy.py +++ b/docling_core/transforms/chunker/code_chunking_strategy.py @@ -1,18 +1,18 @@ from typing import Any, Dict, Iterator, Optional -from docling_core.transforms.chunker.base_code_chunker import CodeChunker +from docling_core.transforms.chunker.base_code_chunker import _CodeChunker from docling_core.transforms.chunker.code_chunk_utils.utils import Language from docling_core.transforms.chunker.hierarchical_chunker import ( - ChunkType, CodeChunk, + CodeChunkType, CodeDocMeta, ) from docling_core.transforms.chunker.language_code_chunkers import ( - CFunctionChunker, - JavaFunctionChunker, - JavaScriptFunctionChunker, - PythonFunctionChunker, - TypeScriptFunctionChunker, + _CFunctionChunker, + _JavaFunctionChunker, + _JavaScriptFunctionChunker, + _PythonFunctionChunker, + _TypeScriptFunctionChunker, ) from docling_core.types.doc.base import Size from docling_core.types.doc.document import ( @@ -183,15 +183,15 @@ class CodeChunkingStrategyFactory: """Factory for creating language-specific code chunking strategies.""" @staticmethod - def create_chunker(language: Language, **kwargs: Any) -> CodeChunker: + def create_chunker(language: Language, **kwargs: Any) -> _CodeChunker: """Create a language-specific code chunker.""" chunker_map = { - Language.PYTHON: PythonFunctionChunker, - Language.TYPESCRIPT: TypeScriptFunctionChunker, - Language.JAVASCRIPT: JavaScriptFunctionChunker, - Language.C: CFunctionChunker, - Language.JAVA: JavaFunctionChunker, + Language.PYTHON: _PythonFunctionChunker, + Language.TYPESCRIPT: _TypeScriptFunctionChunker, + Language.JAVASCRIPT: _JavaScriptFunctionChunker, + Language.C: _CFunctionChunker, + Language.JAVA: _JavaFunctionChunker, } chunker_class = chunker_map.get(language) @@ -208,9 +208,9 @@ def __init__(self, **chunker_kwargs: Any): """Initialize the strategy with optional chunker parameters.""" self.chunker_kwargs = chunker_kwargs - self._chunker_cache: Dict[Language, CodeChunker] = {} + self._chunker_cache: Dict[Language, _CodeChunker] = {} - def _get_chunker(self, language: Language) -> CodeChunker: + def _get_chunker(self, language: Language) -> _CodeChunker: """Get or create a chunker for the given language.""" if language not in self._chunker_cache: @@ -238,10 +238,12 @@ def chunk_code_item( filename = original_doc.origin.filename or "code_chunk" mimetype = original_doc.origin.mimetype or "text/plain" binary_hash = _create_hash(code_text) + uri = getattr(original_doc.origin, "uri", None) else: filename = "code_chunk" mimetype = "text/plain" binary_hash = _create_hash(code_text) + uri = None if 
original_item and hasattr(original_item, "self_ref"): self_ref = original_item.self_ref @@ -255,7 +257,7 @@ def chunk_code_item( texts=[code_item], pages={0: PageItem(page_no=0, size=Size(width=612.0, height=792.0))}, origin=DocumentOrigin( - filename=filename, mimetype=mimetype, binary_hash=binary_hash + filename=filename, mimetype=mimetype, binary_hash=binary_hash, uri=uri ), ) @@ -279,7 +281,7 @@ def chunk_code_item( return meta = CodeDocMeta( - chunk_type=ChunkType.CODE_BLOCK, + chunk_type=CodeChunkType.CODE_BLOCK, start_line=1, end_line=len(code_text.splitlines()), ) diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index be0e6ad4..3e17002b 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -10,7 +10,16 @@ import logging import re from enum import Enum -from typing import Any, ClassVar, Final, Iterator, Literal, Optional, Protocol +from typing import ( + TYPE_CHECKING, + Any, + ClassVar, + Final, + Iterator, + Literal, + Optional, + Protocol, +) from pydantic import ConfigDict, Field, StringConstraints, field_validator from typing_extensions import Annotated, override @@ -119,7 +128,7 @@ def check_version_is_compatible(cls, v: str) -> str: class CodeDocMeta(DocMeta): - """Data model for CodeChunker metadata.""" + """Data model for code chunk metadata.""" doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) part_name: Optional[str] = Field(default=None) @@ -137,7 +146,7 @@ class CodeChunk(BaseChunk): meta: CodeDocMeta -class ChunkType(str, Enum): +class CodeChunkType(str, Enum): """Chunk type""" FUNCTION = "function" @@ -157,6 +166,12 @@ def chunk_code_item( ... 
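
Note: because CodeChunkingStrategy is a typing.Protocol, a custom strategy only needs a compatible chunk_code_item method; no subclassing is required. A minimal sketch follows (the class name is hypothetical; the single-chunk behavior mirrors the NoOpCodeChunkingStrategy in this patch series, and the parameters follow the call site in HierarchicalChunker.chunk):

from typing import Any, Iterator, Optional

from docling_core.transforms.chunker.code_chunk_utils.utils import Language
from docling_core.transforms.chunker.hierarchical_chunker import (
    CodeChunk,
    CodeChunkType,
    CodeDocMeta,
)


class WholeFileChunkingStrategy:
    """Hypothetical strategy: emit each code item as one chunk, no parsing."""

    def chunk_code_item(
        self,
        code_text: str,
        language: Language,
        original_doc: Optional[Any] = None,
        original_item: Optional[Any] = None,
        **kwargs: Any,
    ) -> Iterator[CodeChunk]:
        # Skip empty code items, as the built-in strategies do.
        if not code_text.strip():
            return
        # Metadata construction mirrors the CODE_BLOCK path shown above.
        meta = CodeDocMeta(
            chunk_type=CodeChunkType.CODE_BLOCK,
            start_line=1,
            end_line=len(code_text.splitlines()),
        )
        yield CodeChunk(text=code_text, meta=meta)
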
+if TYPE_CHECKING: + CodeChunkingStrategyType = CodeChunkingStrategy +else: + CodeChunkingStrategyType = Any + + class DocChunk(BaseChunk): """Data model for document chunks.""" @@ -248,7 +263,7 @@ class HierarchicalChunker(BaseChunker): model_config = ConfigDict(arbitrary_types_allowed=True) serializer_provider: BaseSerializerProvider = ChunkingSerializerProvider() - code_chunking_strategy: Optional[Any] = Field(default=None) + code_chunking_strategy: Optional[CodeChunkingStrategyType] = Field(default=None) # deprecated: merge_list_items: Annotated[bool, Field(deprecated=True)] = True @@ -308,14 +323,21 @@ def chunk( ) if language: - for code_chunk in self.code_chunking_strategy.chunk_code_item( - item.text, - language, - original_doc=dl_doc, - original_item=item, - **kwargs, - ): - yield code_chunk + ser_res = my_doc_ser.serialize(item=item, visited=visited) + if ser_res.text: + code_text = self._strip_markdown_code_formatting( + ser_res.text + ) + for ( + code_chunk + ) in self.code_chunking_strategy.chunk_code_item( + code_text, + language, + original_doc=dl_doc, + original_item=item, + **kwargs, + ): + yield code_chunk continue ser_res = my_doc_ser.serialize(item=item, visited=visited) @@ -335,3 +357,14 @@ def chunk( ), ) yield c + + def _strip_markdown_code_formatting(self, text: str) -> str: + """Strip markdown code block formatting from text.""" + if not text.startswith("```") or not text.endswith("```"): + return text + + lines = text.split("\n") + if len(lines) >= 3 and lines[0].startswith("```") and lines[-1] == "```": + return "\n".join(lines[1:-1]) + + return text diff --git a/docling_core/transforms/chunker/language_code_chunkers.py b/docling_core/transforms/chunker/language_code_chunkers.py index 45c488ea..8f8e2e4d 100644 --- a/docling_core/transforms/chunker/language_code_chunkers.py +++ b/docling_core/transforms/chunker/language_code_chunkers.py @@ -4,7 +4,7 @@ from tree_sitter import Node, Tree from typing_extensions import override -from docling_core.transforms.chunker.base_code_chunker import CodeChunker +from docling_core.transforms.chunker.base_code_chunker import _CodeChunker from docling_core.transforms.chunker.code_chunk_utils.utils import ( Language, _get_default_tokenizer, @@ -16,7 +16,7 @@ from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer -class PythonFunctionChunker(CodeChunker): +class _PythonFunctionChunker(_CodeChunker): language: Language = Language.PYTHON ts_language: Any = Field(default=None) @@ -151,7 +151,7 @@ def _is_local_assignment(self, identifier_node: Node) -> bool: return False -class TypeScriptFunctionChunker(CodeChunker): +class _TypeScriptFunctionChunker(_CodeChunker): language: Language = Language.TYPESCRIPT ts_language: Any = Field(default=None) parser: Any = Field(default=None) @@ -232,12 +232,12 @@ def _get_module_variables(self, tree: Tree) -> Dict[str, Node]: return {} -class JavaScriptFunctionChunker(TypeScriptFunctionChunker): +class _JavaScriptFunctionChunker(_TypeScriptFunctionChunker): def __init__(self, **data): super().__init__(language=Language.JAVASCRIPT) -class CFunctionChunker(CodeChunker): +class _CFunctionChunker(_CodeChunker): language: Language = Language.C ts_language: Any = Field(default=None) parser: Any = Field(default=None) @@ -389,7 +389,7 @@ def collect_identifiers(node, depth=0): return used_macros -class JavaFunctionChunker(CodeChunker): +class _JavaFunctionChunker(_CodeChunker): language: Language = Language.JAVA ts_language: Any = Field(default=None) diff --git a/test/test_code_chunker.py 
b/test/test_code_chunker.py index 51095761..60bd044a 100644 --- a/test/test_code_chunker.py +++ b/test/test_code_chunker.py @@ -9,13 +9,10 @@ from docling_core.transforms.chunker.base_code_chunker import CodeChunk from docling_core.transforms.chunker.code_chunk_utils.utils import Language -from docling_core.transforms.chunker.language_code_chunkers import ( - CFunctionChunker, - JavaFunctionChunker, - JavaScriptFunctionChunker, - PythonFunctionChunker, - TypeScriptFunctionChunker, +from docling_core.transforms.chunker.code_chunking_strategy import ( + DefaultCodeChunkingStrategy, ) +from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker from docling_core.types.doc import DoclingDocument, DocumentOrigin from docling_core.types.doc.labels import DocItemLabel from docling_core.utils.legacy import _create_hash @@ -96,31 +93,41 @@ def create_documents_from_repository( "Java", "/test/data/chunker_repo/repos/acmeair", "https://github.com/acmeair/acmeair", - lambda: JavaFunctionChunker(max_tokens=5000), + lambda: HierarchicalChunker( + code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000) + ), ), ( "TypeScript", "/test/data/chunker_repo/repos/outline", "https://github.com/outline/outline", - lambda: TypeScriptFunctionChunker(max_tokens=5000), + lambda: HierarchicalChunker( + code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000) + ), ), ( "JavaScript", "/test/data/chunker_repo/repos/jquery", "https://github.com/jquery/jquery", - lambda: JavaScriptFunctionChunker(max_tokens=5000), + lambda: HierarchicalChunker( + code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000) + ), ), ( "Python", "/test/data/chunker_repo/repos/docling", "https://github.com/docling-project/docling", - lambda: PythonFunctionChunker(max_tokens=5000), + lambda: HierarchicalChunker( + code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000) + ), ), ( "C", "/test/data/chunker_repo/repos/json-c", "https://github.com/json-c/json-c", - lambda: CFunctionChunker(max_tokens=5000), + lambda: HierarchicalChunker( + code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000) + ), ), ] From 9344d8e143c25d3ba5f473a25408d83765767ebe Mon Sep 17 00:00:00 2001 From: Bridget Date: Thu, 23 Oct 2025 13:21:39 -0400 Subject: [PATCH 05/12] Update pyproject.toml Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Signed-off-by: Bridget --- pyproject.toml | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 01b03326..44a5c402 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,15 +48,14 @@ dependencies = [ 'pillow (>=10.0.0,<13.0.0)', 'pyyaml (>=5.1,<7.0.0)', 'typing-extensions (>=4.12.2,<5.0.0)', - 'typer (>=0.12.5,<0.17.0)', + 'typer (>=0.12.5,<0.20.0)', 'latex2mathml (>=3.77.0,<4.0.0)', - "tree-sitter==0.23.2", - "tree-sitter-python==0.23.6", - "tree-sitter-c==0.23.4", - "tree-sitter-java==0.23.5", - "tree-sitter-javascript==0.23.1", - "tree-sitter-typescript==0.23.2", - + "tree-sitter (>=0.23.2,<1.0.0)", + "tree-sitter-python (>=0.23.6,<1.0.0)", + "tree-sitter-c (>=0.23.4,<1.0.0)", + "tree-sitter-java (>=0.23.5,<1.0.0)", + "tree-sitter-javascript (>=0.23.1,<1.0.0)", + "tree-sitter-typescript (>=0.23.2,<1.0.0)", ] [project.urls] From bed80ffb8577781c74055b68490f66073c261f5b Mon Sep 17 00:00:00 2001 From: Bridget Date: Thu, 23 Oct 2025 13:21:58 -0400 Subject: [PATCH 06/12] Update docling_core/transforms/chunker/hierarchical_chunker.py Co-authored-by: Panos Vagenas 
<35837085+vagenas@users.noreply.github.com> Signed-off-by: Bridget --- docling_core/transforms/chunker/hierarchical_chunker.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 3e17002b..de249651 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -130,6 +130,10 @@ def check_version_is_compatible(cls, v: str) -> str: class CodeDocMeta(DocMeta): """Data model for code chunk metadata.""" + schema_name: Literal["docling_core.transforms.chunker.CodeDocMeta"] = Field( # type: ignore[assignment] + default="docling_core.transforms.chunker.CodeDocMeta", + alias=_KEY_SCHEMA_NAME, + ) doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) part_name: Optional[str] = Field(default=None) docstring: Optional[str] = Field(default=None) From 68890e9f48e6f310e27eb20eeb2d98d45c90bd36 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Thu, 23 Oct 2025 21:09:54 -0400 Subject: [PATCH 07/12] run all pre-commit less pytest --- .../transforms/chunker/base_code_chunker.py | 17 ++++++------- .../chunker/code_chunk_utils/chunk_utils.py | 8 +++++- .../chunker/code_chunk_utils/utils.py | 15 ++++++++++- .../chunker/code_chunking_strategy.py | 25 ++++++++++++------- .../chunker/hierarchical_chunker.py | 8 +++--- .../chunker/language_code_chunkers.py | 6 ++++- pyproject.toml | 1 + test/test_code_chunker.py | 4 +-- 8 files changed, 58 insertions(+), 26 deletions(-) diff --git a/docling_core/transforms/chunker/base_code_chunker.py b/docling_core/transforms/chunker/base_code_chunker.py index 3433ab51..a0db9e95 100644 --- a/docling_core/transforms/chunker/base_code_chunker.py +++ b/docling_core/transforms/chunker/base_code_chunker.py @@ -1,3 +1,5 @@ +"""Base code chunker implementation for parsing and chunking code files.""" + from typing import Any, Dict, Iterator, List, Optional, Tuple from tree_sitter import Node, Parser, Tree @@ -58,13 +60,8 @@ def __init__(self, **data): if self.parser is None: self.parser = Parser(self.ts_language) - @property - def max_tokens(self) -> int: - """Get maximum number of tokens allowed.""" - return self.tokenizer.get_max_tokens() - def parse_code(self, code: str) -> Tree: - """Get tree sitter parser""" + """Get tree sitter parser.""" return self.parser.parse(bytes(code, self.utf8_encoding)) def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[CodeChunk]: @@ -199,7 +196,10 @@ def _yield_function_chunks_with_ranges( function_content.replace(docstring, "") if docstring else function_content ) - base_content = f"{prefix}{imports}{module_variable_definitions}{additional_context_no_docstring}{function_no_docstring}" + base_content = ( + f"{prefix}{imports}{module_variable_definitions}" + f"{additional_context_no_docstring}{function_no_docstring}" + ) if chunk_builder: yield chunk_builder.build_function_chunk( @@ -437,7 +437,6 @@ def find_used_imports(node): def _get_node_with_comments(self, node: Node) -> str: """Get node text including any preceding comments.""" - current = node.prev_sibling comment_parts: List[str] = [] @@ -516,7 +515,7 @@ def _build_additional_context( return context, context_no_docstring def _is_docstring(self, node: Node) -> bool: - """Determines if a node is a docstring""" + """Determines if a node is a docstring.""" return bool( node.type == self.expression_statement and node.named_children diff --git 
a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py index 403ec4fc..d2efb655 100644 --- a/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py +++ b/docling_core/transforms/chunker/code_chunk_utils/chunk_utils.py @@ -1,3 +1,5 @@ +"""Utility classes for code chunking operations.""" + import hashlib from typing import Iterator, List, Tuple @@ -20,6 +22,7 @@ class RangeTracker: """Handles tracking and management of used byte ranges in code.""" def __init__(self): + """Initialize the range tracker with an empty list of used ranges.""" self.used_ranges: List[Tuple[int, int]] = [] def mark_used(self, start_byte: int, end_byte: int) -> None: @@ -79,6 +82,7 @@ class ChunkMetadataBuilder: """Builds metadata for code chunks.""" def __init__(self, origin: DocumentOrigin): + """Initialize the metadata builder with document origin.""" self.origin = origin def build_function_metadata( @@ -152,6 +156,7 @@ class ChunkBuilder: """Builds code chunks from nodes and content.""" def __init__(self, origin: DocumentOrigin): + """Initialize the chunk builder with document origin.""" self.metadata_builder = ChunkMetadataBuilder(origin) def build_function_chunk( @@ -231,6 +236,7 @@ class ChunkSizeProcessor: def __init__( self, tokenizer, max_tokens: int, min_chunk_size: int = 300, chunker=None ): + """Initialize the chunk size processor with tokenizer and size constraints.""" self.tokenizer = tokenizer self.max_tokens = max_tokens self.min_chunk_size = min_chunk_size @@ -322,7 +328,7 @@ def _split_function_chunk( new_meta = chunk.meta.model_copy() new_meta.part_name = ( - f"{chunk.meta.part_name}_part_{i+1}" + f"{chunk.meta.part_name}_part_{i + 1}" if len(chunks) > 1 else chunk.meta.part_name ) diff --git a/docling_core/transforms/chunker/code_chunk_utils/utils.py b/docling_core/transforms/chunker/code_chunk_utils/utils.py index 58301e19..73c2e70a 100644 --- a/docling_core/transforms/chunker/code_chunk_utils/utils.py +++ b/docling_core/transforms/chunker/code_chunk_utils/utils.py @@ -1,3 +1,5 @@ +"""Utility functions and classes for code language detection and processing.""" + from enum import Enum from typing import List, Optional @@ -14,6 +16,8 @@ class Language(str, Enum): + """Supported programming languages for code chunking.""" + PYTHON = "python" JAVASCRIPT = "javascript" TYPESCRIPT = "typescript" @@ -21,6 +25,7 @@ class Language(str, Enum): C = "c" def file_extensions(self) -> List[str]: + """Get the file extensions associated with this language.""" if self == Language.PYTHON: return [".py"] elif self == Language.TYPESCRIPT: @@ -35,6 +40,7 @@ def file_extensions(self) -> List[str]: return [] def get_tree_sitter_language(self): + """Get the tree-sitter language object for this language.""" if self == Language.PYTHON: return Lang(ts_python.language()) elif self == Language.TYPESCRIPT: @@ -49,7 +55,7 @@ def get_tree_sitter_language(self): return None def to_code_language_label(self): - + """Convert this language to a CodeLanguageLabel.""" mapping = { Language.PYTHON: CodeLanguageLabel.PYTHON, Language.JAVA: CodeLanguageLabel.JAVA, @@ -60,6 +66,7 @@ def to_code_language_label(self): return mapping.get(self, CodeLanguageLabel.UNKNOWN) def get_import_query(self) -> Optional[str]: + """Get the tree-sitter query string for finding imports in this language.""" if self == Language.PYTHON: return """ (import_statement) @import @@ -101,6 +108,7 @@ def get_import_query(self) -> Optional[str]: return None def 
get_function_name(self, node: Node) -> Optional[str]: + """Extract the function name from a function node.""" if self == Language.C: declarator = node.child_by_field_name("declarator") if declarator: @@ -115,6 +123,7 @@ def get_function_name(self, node: Node) -> Optional[str]: return None def is_collectable_function(self, node: Node, constructor_name: str) -> bool: + """Check if a function should be collected for chunking.""" if self == Language.C: return True else: @@ -126,6 +135,7 @@ def is_collectable_function(self, node: Node, constructor_name: str) -> bool: def _get_default_tokenizer() -> "BaseTokenizer": + """Get the default tokenizer instance.""" from docling_core.transforms.chunker.tokenizer.huggingface import ( HuggingFaceTokenizer, ) @@ -136,10 +146,12 @@ def _get_default_tokenizer() -> "BaseTokenizer": def has_child(node: Node, child_name: str) -> bool: + """Check if a node has a child with the specified name.""" return bool(node and node.child_by_field_name(child_name)) def get_children(node: Node, child_types: List[str]) -> List[Node]: + """Get all children of a node that match the specified types.""" if not node.children: return [] @@ -147,6 +159,7 @@ def get_children(node: Node, child_types: List[str]) -> List[Node]: def to_str(node: Node) -> str: + """Convert a tree-sitter node to a string.""" if not node or not node.text: return "" text = node.text.decode() diff --git a/docling_core/transforms/chunker/code_chunking_strategy.py b/docling_core/transforms/chunker/code_chunking_strategy.py index d7485d57..cb7e6841 100644 --- a/docling_core/transforms/chunker/code_chunking_strategy.py +++ b/docling_core/transforms/chunker/code_chunking_strategy.py @@ -1,3 +1,5 @@ +"""Code chunking strategy implementations for different programming languages.""" + from typing import Any, Dict, Iterator, Optional from docling_core.transforms.chunker.base_code_chunker import _CodeChunker @@ -30,7 +32,6 @@ class LanguageDetector: @staticmethod def detect_from_extension(filename: Optional[str]) -> Optional[Language]: """Detect language from file extension.""" - if not filename: return None @@ -45,7 +46,6 @@ def detect_from_extension(filename: Optional[str]) -> Optional[Language]: @staticmethod def detect_from_content(code_text: str) -> Optional[Language]: """Detect language from code content using heuristics.""" - if not code_text: return None @@ -65,7 +65,20 @@ def detect_from_content(code_text: str) -> Optional[Language]: ] ) and not any( pattern in code_lower - for pattern in ["public class", "private ", "protected ", "package "] + for pattern in [ + "public class", + "private ", + "protected ", + "package ", + "package main", + "func main()", + 'import "fmt"', + "chan ", + "interface{}", + "go func", + "defer ", + ":= ", + ] ): return Language.PYTHON @@ -169,7 +182,6 @@ def detect_language( code_text: str, filename: Optional[str] = None ) -> Optional[Language]: """Detect language from both filename and content.""" - if filename: lang = LanguageDetector.detect_from_extension(filename) if lang: @@ -185,7 +197,6 @@ class CodeChunkingStrategyFactory: @staticmethod def create_chunker(language: Language, **kwargs: Any) -> _CodeChunker: """Create a language-specific code chunker.""" - chunker_map = { Language.PYTHON: _PythonFunctionChunker, Language.TYPESCRIPT: _TypeScriptFunctionChunker, @@ -206,13 +217,11 @@ class DefaultCodeChunkingStrategy: def __init__(self, **chunker_kwargs: Any): """Initialize the strategy with optional chunker parameters.""" - self.chunker_kwargs = chunker_kwargs 
self._chunker_cache: Dict[Language, _CodeChunker] = {} def _get_chunker(self, language: Language) -> _CodeChunker: """Get or create a chunker for the given language.""" - if language not in self._chunker_cache: self._chunker_cache[language] = CodeChunkingStrategyFactory.create_chunker( language, **self.chunker_kwargs @@ -228,7 +237,6 @@ def chunk_code_item( **kwargs: Any, ) -> Iterator[CodeChunk]: """Chunk a single code item using the appropriate language chunker.""" - if not code_text.strip(): return @@ -276,7 +284,6 @@ def chunk_code_item( **kwargs: Any, ) -> Iterator[CodeChunk]: """Return the code as a single chunk without further processing.""" - if not code_text.strip(): return diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index de249651..9e7a8ec3 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -19,6 +19,7 @@ Literal, Optional, Protocol, + cast, ) from pydantic import ConfigDict, Field, StringConstraints, field_validator @@ -134,7 +135,7 @@ class CodeDocMeta(DocMeta): default="docling_core.transforms.chunker.CodeDocMeta", alias=_KEY_SCHEMA_NAME, ) - doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) + doc_items: Optional[list[DocItem]] = Field(default=None, alias=_KEY_DOC_ITEMS) # type: ignore[assignment] part_name: Optional[str] = Field(default=None) docstring: Optional[str] = Field(default=None) sha256: Optional[int] = Field(default=None) @@ -151,7 +152,7 @@ class CodeChunk(BaseChunk): class CodeChunkType(str, Enum): - """Chunk type""" + """Chunk type.""" FUNCTION = "function" METHOD = "method" @@ -317,8 +318,9 @@ def chunk( LanguageDetector, ) + text_item = cast(Any, item) language = LanguageDetector.detect_language( - item.text, + text_item.text, ( getattr(dl_doc.origin, "filename", None) if dl_doc.origin diff --git a/docling_core/transforms/chunker/language_code_chunkers.py b/docling_core/transforms/chunker/language_code_chunkers.py index 8f8e2e4d..29a22d75 100644 --- a/docling_core/transforms/chunker/language_code_chunkers.py +++ b/docling_core/transforms/chunker/language_code_chunkers.py @@ -1,3 +1,5 @@ +"""Language-specific code chunker implementations.""" + from typing import Any, Dict, List, Tuple from pydantic import Field @@ -31,7 +33,7 @@ class _PythonFunctionChunker(_CodeChunker): function_body: str = "block" tokenizer: BaseTokenizer = Field(default_factory=_get_default_tokenizer) min_chunk_size: int = 300 - max_tokens: int = 50 + max_tokens: int = 5000 docs_types: List[str] = ["body", "comment"] dotted_name: str = "dotted_name" aliased_import: str = "aliased_import" @@ -112,6 +114,7 @@ def _find_used_variables(self, function_node: Node) -> set: used_vars = set() def collect_identifiers(node, depth=0): + """Collect identifiers from node.""" " " * depth if node.type in self.identifiers: var_name = node.text.decode(self.utf8_encoding) @@ -365,6 +368,7 @@ def _find_used_variables(self, function_node: Node) -> set: used_macros = set() def collect_identifiers(node, depth=0): + """Collect identifiers from node.""" " " * depth if node.type in self.identifiers: macro_name = node.text.decode(self.utf8_encoding) diff --git a/pyproject.toml b/pyproject.toml index 44a5c402..d8561f8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -136,6 +136,7 @@ namespace_packages = true show_error_codes = true python_version = "3.9" plugins = ["pydantic.mypy"] +exclude = "(^|/)test/data/.*" 
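
With the language-specific chunkers now marked internal, the supported entry point is a strategy object handed to HierarchicalChunker, as the updated tests exercise it. A usage sketch under those assumptions (the snippet and the max_tokens value are illustrative; the language is detected from the document origin's filename or, as here, from the code content itself):

from docling_core.transforms.chunker.code_chunking_strategy import (
    DefaultCodeChunkingStrategy,
)
from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types.doc import DoclingDocument
from docling_core.types.doc.labels import CodeLanguageLabel

# Illustrative document carrying a single Python code item.
doc = DoclingDocument(name="example.py")
doc.add_code(
    text='def greet(name):\n    return f"hello {name}"\n',
    code_language=CodeLanguageLabel.PYTHON,
)

chunker = HierarchicalChunker(
    code_chunking_strategy=DefaultCodeChunkingStrategy(max_tokens=5000)
)
for chunk in chunker.chunk(dl_doc=doc):
    # Each code chunk's CodeDocMeta carries the chunk type and line span.
    print(chunk.meta.chunk_type, chunk.meta.start_line, chunk.meta.end_line)
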
[[tool.mypy.overrides]] module = [ diff --git a/test/test_code_chunker.py b/test/test_code_chunker.py index 60bd044a..65d16a2f 100644 --- a/test/test_code_chunker.py +++ b/test/test_code_chunker.py @@ -2,7 +2,7 @@ import json import os import pathlib -from typing import List +from typing import List, Optional import git import pytest @@ -30,7 +30,7 @@ def get_latest_commit_id(file_dir: str) -> str: def create_documents_from_repository( - file_dir: str, repo_url: str, commit_id: str = None + file_dir: str, repo_url: str, commit_id: Optional[str] = None ) -> List[DoclingDocument]: """Build DoclingDocument objects from a local checkout, one per code file.""" From 3c65eef59e817e9bd8cd3c55e98e843e326dd7a2 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Thu, 23 Oct 2025 21:21:55 -0400 Subject: [PATCH 08/12] update test files for code ID --- examples/2408.09869v3.json | 4 +- test/data/chunker/0_inp_dl_doc.json | 4 +- test/data/chunker_repo/C/repo_out_chunks.json | 74 +++++----- .../chunker_repo/Java/repo_out_chunks.json | 10 +- .../JavaScript/repo_out_chunks.json | 12 +- .../chunker_repo/Python/repo_out_chunks.json | 136 +++++++++--------- .../TypeScript/repo_out_chunks.json | 20 +-- test/data/doc/2408.09869v3_enriched.dt | 4 +- test/data/doc/2408.09869v3_enriched.dt.json | 4 +- test/data/doc/2408.09869v3_enriched.json | 4 +- test/data/doc/2408.09869v3_enriched.out.dt | 4 +- .../doc/2408.09869v3_enriched.out.dt.json | 4 +- .../2408.09869v3_enriched_p2_p3_p5.gt.json | 4 +- test/data/doc/concatenated.json | 4 +- ...onstructed_doc.added_extracted_doc.json.gt | 4 +- .../constructed_doc.appended_child.json.gt | 4 +- ...constructed_doc.bulk_item_addition.json.gt | 4 +- ...onstructed_doc.bulk_item_insertion.json.gt | 4 +- .../doc/constructed_doc.deleted_group.json.gt | 4 +- ...onstructed_doc.deleted_items_range.json.gt | 4 +- .../constructed_doc.deleted_picture.json.gt | 4 +- .../doc/constructed_doc.deleted_table.json.gt | 4 +- .../doc/constructed_doc.deleted_text.json.gt | 4 +- test/data/doc/constructed_doc.dt | 4 +- test/data/doc/constructed_doc.dt.gt | 4 +- .../data/doc/constructed_doc.embedded.json.gt | 4 +- .../data/doc/constructed_doc.embedded.yaml.gt | 4 +- ...ructed_doc.extracted_with_deletion.json.gt | 4 +- ...tructed_doc.inserted_extracted_doc.json.gt | 4 +- ...d_doc.inserted_items_with_insert_*.json.gt | 4 +- ....inserted_list_items_with_insert_*.json.gt | 4 +- .../doc/constructed_doc.inserted_text.json.gt | 4 +- .../constructed_doc.manipulated_table.json.gt | 4 +- .../doc/constructed_doc.referenced.json.gt | 4 +- .../doc/constructed_doc.referenced.yaml.gt | 4 +- .../doc/constructed_doc.replaced_item.json.gt | 4 +- test/data/doc/constructed_document.yaml.dt | 4 +- 37 files changed, 190 insertions(+), 190 deletions(-) diff --git a/examples/2408.09869v3.json b/examples/2408.09869v3.json index 6dac6986..3901ffd0 100644 --- a/examples/2408.09869v3.json +++ b/examples/2408.09869v3.json @@ -1316,7 +1316,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", @@ -1347,7 +1347,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/24", diff --git a/test/data/chunker/0_inp_dl_doc.json b/test/data/chunker/0_inp_dl_doc.json index af6329b1..f987b308 100644 --- a/test/data/chunker/0_inp_dl_doc.json +++ b/test/data/chunker/0_inp_dl_doc.json @@ -1317,7 +1317,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": 
"unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", @@ -1348,7 +1348,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/24", diff --git a/test/data/chunker_repo/C/repo_out_chunks.json b/test/data/chunker_repo/C/repo_out_chunks.json index ced1e8e6..3aae247f 100644 --- a/test/data/chunker_repo/C/repo_out_chunks.json +++ b/test/data/chunker_repo/C/repo_out_chunks.json @@ -3,7 +3,7 @@ { "text": "\nstatic void string_replace_all_occurrences_with_char(char *s, const char *occur, char repl_char)\n{\n\tsize_t slen = strlen(s);\n\tsize_t skip = strlen(occur) - 1; /* length of the occurrence, minus the char we're replacing */\n\tchar *p = s;\n\twhile ((p = strstr(p, occur)))\n\t{\n\t\t*p = repl_char;\n\t\tp++;\n\t\tslen -= skip;\n\t\tmemmove(p, (p + skip), slen - (p - s) + 1); /* includes null char too */\n\t}\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -23,7 +23,7 @@ { "text": "/* Avoid ctype.h and locale overhead */\n#define is_plain_digit(c) ((c) >= '0' && (c) <= '9')\nstatic int is_valid_index(const char *path, size_t *idx)\n{\n\tsize_t i, len = strlen(path);\n\t/* this code-path optimizes a bit, for when we reference the 0-9 index range\n\t * in a JSON array and because leading zeros not allowed\n\t */\n\tif (len == 1)\n\t{\n\t\tif (is_plain_digit(path[0]))\n\t\t{\n\t\t\t*idx = (path[0] - '0');\n\t\t\treturn 1;\n\t\t}\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* leading zeros not allowed per RFC */\n\tif (path[0] == '0')\n\t{\n\t\terrno = EINVAL;\n\t\treturn 0;\n\t}\n\t/* RFC states base-10 decimals */\n\tfor (i = 0; i < len; i++)\n\t{\n\t\tif (!is_plain_digit(path[i]))\n\t\t{\n\t\t\terrno = EINVAL;\n\t\t\treturn 0;\n\t\t}\n\t}\n\n\t// We know it's all digits, so the only error case here is overflow,\n\t// but ULLONG_MAX will be longer than any array length so that's ok.\n\t*idx = strtoull(path, NULL, 10);\n\n\treturn 1;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -43,7 +43,7 @@ { "text": "\nstatic int json_pointer_get_single_path(struct json_object *obj, char *path,\n struct json_object **value, size_t *idx)\n{\n\tif (json_object_is_type(obj, json_type_array))\n\t{\n\t\tif (!is_valid_index(path, idx))\n\t\t\treturn -1;\n\t\tif (*idx >= json_object_array_length(obj))\n\t\t{\n\t\t\terrno = ENOENT;\n\t\t\treturn -1;\n\t\t}\n\n\t\tobj = json_object_array_get_idx(obj, *idx);\n\t\tif (obj)\n\t\t{\n\t\t\tif (value)\n\t\t\t\t*value = obj;\n\t\t\treturn 0;\n\t\t}\n\t\t/* Entry not found */\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\t/* RFC states that we first must eval all ~1 then all ~0 */\n\tstring_replace_all_occurrences_with_char(path, \"~1\", '/');\n\tstring_replace_all_occurrences_with_char(path, \"~0\", '~');\n\n\tif (!json_object_object_get_ex(obj, path, value))\n\t{\n\t\terrno = ENOENT;\n\t\treturn -1;\n\t}\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -63,7 +63,7 @@ { "text": "\nstatic int json_object_array_put_idx_cb(struct json_object *parent, size_t idx,\n\t\t\t\t\tstruct json_object *value, void 
*priv)\n{\n\treturn json_object_array_put_idx(parent, idx, value);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -83,7 +83,7 @@ { "text": "\nstatic int json_pointer_set_single_path(struct json_object *parent, const char *path,\n struct json_object *value,\n\t\t\t\t\tjson_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tif (json_object_is_type(parent, json_type_array))\n\t{\n\t\tsize_t idx;\n\t\t/* RFC (Chapter 4) states that '-' may be used to add new elements to an array */\n\t\tif (path[0] == '-' && path[1] == '\\0')\n\t\t\treturn json_object_array_add(parent, value);\n\t\tif (!is_valid_index(path, &idx))\n\t\t\treturn -1;\n\t\treturn array_set_cb(parent, idx, value, priv);\n\t}\n\n\t/* path replacements should have been done in json_pointer_get_single_path(),\n\t * and we should still be good here\n\t */\n\tif (json_object_is_type(parent, json_type_object))\n\t\treturn json_object_object_add(parent, path, value);\n\n\t/* Getting here means that we tried to \"dereference\" a primitive JSON type\n\t * (like string, int, bool).i.e. add a sub-object to it\n\t */\n\terrno = ENOENT;\n\treturn -1;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -103,7 +103,7 @@ { "text": "\nstatic int json_pointer_result_get_recursive(struct json_object *obj, char *path,\n struct json_pointer_get_result *res)\n{\n\tstruct json_object *parent_obj = obj;\n\tsize_t idx = 0;\n\tchar *endp;\n\tint rc;\n\n\t/* All paths (on each recursion level must have a leading '/' */\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\tpath++;\n\n\tendp = strchr(path, '/');\n\tif (endp)\n\t\t*endp = '\\0';\n\n\t/* If we err-ed here, return here */\n\tif ((rc = json_pointer_get_single_path(obj, path, &obj, &idx)))\n\t\treturn rc;\n\n\tif (endp)\n\t{\n\t\t/* Put the slash back, so that the sanity check passes on next recursion level */\n\t\t*endp = '/';\n\t\treturn json_pointer_result_get_recursive(obj, endp, res);\n\t}\n\n\t/* We should be at the end of the recursion here */\n\tif (res) {\n\t\tres->parent = parent_obj;\n\t\tres->obj = obj;\n\t\tif (json_object_is_type(res->parent, json_type_array))\n\t\t\tres->index_in_parent = idx;\n\t\telse\n\t\t\tres->key_in_parent = path;\n\t}\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -123,7 +123,7 @@ { "text": "\nstatic int json_pointer_object_get_recursive(struct json_object *obj, char *path,\n struct json_object **value)\n{\n\tstruct json_pointer_get_result res;\n\tint rc;\n\n\trc = json_pointer_result_get_recursive(obj, path, &res);\n\tif (rc)\n\t\treturn rc;\n\n\tif (value)\n\t\t*value = res.obj;\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -143,7 +143,7 @@ { "text": "\nint json_pointer_get_internal(struct json_object *obj, const char *path,\n struct json_pointer_get_result *res)\n{\n\tchar *path_copy = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == 
'\\0')\n\t{\n\t\tres->parent = NULL;\n\t\tres->obj = obj;\n\t\tres->key_in_parent = NULL;\n\t\tres->index_in_parent = UINT32_MAX;\n\t\treturn 0;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\trc = json_pointer_result_get_recursive(obj, path_copy, res);\n\t/* re-map the path string to the const-path string */\n\tif (rc == 0 && json_object_is_type(res->parent, json_type_object) && res->key_in_parent)\n\t\tres->key_in_parent = path + (res->key_in_parent - path_copy);\n\tfree(path_copy);\n\n\treturn rc;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -163,7 +163,7 @@ { "text": "\nint json_pointer_get(struct json_object *obj, const char *path, struct json_object **res)\n{\n\tstruct json_pointer_get_result jpres;\n\tint rc;\n\n\trc = json_pointer_get_internal(obj, path, &jpres);\n\tif (rc)\n\t\treturn rc;\n\n\tif (res)\n\t\t*res = jpres.obj;\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -183,7 +183,7 @@ { "text": "\nint json_pointer_getf(struct json_object *obj, struct json_object **res, const char *path_fmt, ...)\n{\n\tchar *path_copy = NULL;\n\tint rc = 0;\n\tva_list args;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tif (res)\n\t\t\t*res = obj;\n\t\tgoto out;\n\t}\n\n\trc = json_pointer_object_get_recursive(obj, path_copy, res);\nout:\n\tfree(path_copy);\n\n\treturn rc;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -203,7 +203,7 @@ { "text": "\nint json_pointer_set_with_array_cb(struct json_object **obj, const char *path,\n\t\t\t\t struct json_object *value,\n\t\t\t\t json_pointer_array_set_cb array_set_cb, void *priv)\n{\n\tconst char *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tint rc;\n\n\tif (!obj || !path)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\tif (path[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\treturn 0;\n\t}\n\n\tif (path[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path, '/')) == path)\n\t{\n\t\tpath++;\n\t\treturn json_pointer_set_single_path(*obj, path, value, array_set_cb, priv);\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tif (!(path_copy = strdup(path)))\n\t{\n\t\terrno = ENOMEM;\n\t\treturn -1;\n\t}\n\tpath_copy[endp - path] = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\tfree(path_copy);\n\n\tif (rc)\n\t\treturn rc;\n\n\tendp++;\n\treturn json_pointer_set_single_path(set, endp, value, array_set_cb, priv);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -223,7 +223,7 @@ { "text": "\nint json_pointer_set(struct json_object **obj, const char *path, struct json_object 
*value)\n{\n\treturn json_pointer_set_with_array_cb(obj, path, value, json_object_array_put_idx_cb, NULL);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -243,7 +243,7 @@ { "text": "\nint json_pointer_setf(struct json_object **obj, struct json_object *value, const char *path_fmt,\n ...)\n{\n\tchar *endp;\n\tchar *path_copy = NULL;\n\tstruct json_object *set = NULL;\n\tva_list args;\n\tint rc = 0;\n\n\tif (!obj || !path_fmt)\n\t{\n\t\terrno = EINVAL;\n\t\treturn -1;\n\t}\n\n\t/* pass a working copy to the recursive call */\n\tva_start(args, path_fmt);\n\trc = vasprintf(&path_copy, path_fmt, args);\n\tva_end(args);\n\n\tif (rc < 0)\n\t\treturn rc;\n\n\tif (path_copy[0] == '\\0')\n\t{\n\t\tjson_object_put(*obj);\n\t\t*obj = value;\n\t\tgoto out;\n\t}\n\n\tif (path_copy[0] != '/')\n\t{\n\t\terrno = EINVAL;\n\t\trc = -1;\n\t\tgoto out;\n\t}\n\n\t/* If there's only 1 level to set, stop here */\n\tif ((endp = strrchr(path_copy, '/')) == path_copy)\n\t{\n\t\tset = *obj;\n\t\tgoto set_single_path;\n\t}\n\n\t*endp = '\\0';\n\trc = json_pointer_object_get_recursive(*obj, path_copy, &set);\n\n\tif (rc)\n\t\tgoto out;\n\nset_single_path:\n\tendp++;\n\trc = json_pointer_set_single_path(set, endp, value,\n\t\t\t\t\t json_object_array_put_idx_cb, NULL);\nout:\n\tfree(path_copy);\n\treturn rc;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -263,7 +263,7 @@ { "text": "#include \"config.h\"\n\n#include \"strerror_override.h\"\n\n#include \n#include \n#include \n#include \n\n#include \"json_object_private.h\"\n#include \"json_pointer.h\"\n#include \"json_pointer_private.h\"\n#include \"strdup_compat.h\"\n#include \"vasprintf_compat.h\"\n\n/**\n * JavaScript Object Notation (JSON) Pointer\n * RFC 6901 - https://tools.ietf.org/html/rfc6901\n */", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -280,7 +280,7 @@ { "text": "/* hash functions */\nstatic unsigned long lh_char_hash(const void *k);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_perllike_str_hash(const void *k);\nint json_global_set_string_hash(const int h)\n{\n\tswitch (h)\n\t{\n\tcase JSON_C_STR_HASH_DFLT: char_hash_fn = lh_char_hash; break;\n\tcase JSON_C_STR_HASH_PERLLIKE: char_hash_fn = lh_perllike_str_hash; break;\n\tdefault: return -1;\n\t}\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -300,7 +300,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_ptr_hash(const void *k)\n{\n\t/* CAW: refactored to be 64bit nice */\n\treturn (unsigned long)((((ptrdiff_t)k * LH_PRIME) >> 4) & ULONG_MAX);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -320,7 +320,7 @@ { "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_ptr_equal(const void *k1, const void *k2)\n{\n\treturn (k1 
== k2);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -340,7 +340,7 @@ { "text": "#define HASH_LITTLE_ENDIAN 0\n/*\n-------------------------------------------------------------------------------\nmix -- mix 3 32-bit values reversibly.\n\nThis is reversible, so any information in (a,b,c) before mix() is\nstill in (a,b,c) after mix().\n\nIf four pairs of (a,b,c) inputs are run through mix(), or through\nmix() in reverse, there are at least 32 bits of the output that\nare sometimes the same for one pair and different for another pair.\nThis was tested for:\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nSome k values for my \"a-=c; a^=rot(c,k); c+=b;\" arrangement that\nsatisfy this are\n 4 6 8 16 19 4\n 9 15 3 18 27 15\n 14 9 3 7 17 3\nWell, \"9 15 3 18 27 15\" didn't quite get 32 bits diffing\nfor \"differ\" defined as + with a one-bit base and a two-bit delta. I\nused https://burtleburtle.net/bob/hash/avalanche.html to choose\nthe operations, constants, and arrangements of the variables.\n\nThis does not achieve avalanche. There are input bits of (a,b,c)\nthat fail to affect some output bits of (a,b,c), especially of a. The\nmost thoroughly mixed value is c, but it doesn't really even achieve\navalanche in c.\n\nThis allows some parallelism. Read-after-writes are good at doubling\nthe number of bits affected, so the goal of mixing pulls in the opposite\ndirection as the goal of parallelism. I did what I could. Rotates\nseem to cost as much as shifts on every machine I could lay my hands\non, and rotates are much kinder to the top and bottom bits, so I used\nrotates.\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define mix(a,b,c) \\\n{ \\\n\ta -= c; a ^= rot(c, 4); c += b; \\\n\tb -= a; b ^= rot(a, 6); a += c; \\\n\tc -= b; c ^= rot(b, 8); b += a; \\\n\ta -= c; a ^= rot(c,16); c += b; \\\n\tb -= a; b ^= rot(a,19); a += c; \\\n\tc -= b; c ^= rot(b, 4); b += a; \\\n}\n/* clang-format on *//*\n-------------------------------------------------------------------------------\nfinal -- final mixing of 3 32-bit values (a,b,c) into c\n\nPairs of (a,b,c) values differing in only a few bits will usually\nproduce values of c that look totally different. This was tested for\n* pairs that differed by one bit, by two bits, in any combination\n of top bits of (a,b,c), or in any combination of bottom bits of\n (a,b,c).\n* \"differ\" is defined as +, -, ^, or ~^. 
For + and -, I transformed\n the output delta to a Gray code (a^(a>>1)) so a string of 1's (as\n is commonly produced by subtraction) look like a single 1-bit\n difference.\n* the base values were pseudorandom, all zero but one bit set, or\n all zero plus a counter that starts at zero.\n\nThese constants passed:\n 14 11 25 16 4 14 24\n 12 14 25 16 4 14 24\nand these came close:\n 4 8 15 26 3 22 24\n 10 8 15 26 3 22 24\n 11 8 15 26 3 22 24\n-------------------------------------------------------------------------------\n*//* clang-format off */\n#define final(a,b,c) \\\n{ \\\n\tc ^= b; c -= rot(b,14); \\\n\ta ^= c; a -= rot(c,11); \\\n\tb ^= a; b -= rot(a,25); \\\n\tc ^= b; c -= rot(b,16); \\\n\ta ^= c; a -= rot(c,4); \\\n\tb ^= a; b -= rot(a,14); \\\n\tc ^= b; c -= rot(b,24); \\\n}\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic uint32_t hashlittle(const void *key, size_t length, uint32_t initval)\n{\n\tuint32_t a,b,c; /* internal state */\n\tunion\n\t{\n\t\tconst void *ptr;\n\t\tsize_t i;\n\t} u; /* needed for Mac Powerbook G4 */\n\n\t/* Set up the internal state */\n\ta = b = c = 0xdeadbeef + ((uint32_t)length) + initval;\n\n\tu.ptr = key;\n\tif (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {\n\t\tconst uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */\n\n\t\t/*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\tb += k[1];\n\t\t\tc += k[2];\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 3;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\t/*\n\t\t * \"k[2]&0xffffff\" actually reads beyond the end of the string, but\n\t\t * then masks off the part it's not allowed to read. Because the\n\t\t * string is aligned, the masked-off tail is in the same word as the\n\t\t * rest of the string. Every machine with memory protection I've seen\n\t\t * does it on word boundaries, so is OK with this. But VALGRIND will\n\t\t * still catch it and complain. The masking trick does make the hash\n\t\t * noticeably faster for short strings (like English words).\n\t\t * AddressSanitizer is similarly picky about overrunning\n\t\t * the buffer. 
(https://clang.llvm.org/docs/AddressSanitizer.html)\n\t\t */\n#ifdef VALGRIND\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__SANITIZE_ADDRESS__) /* GCC's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#elif defined(__has_feature)\n#if __has_feature(address_sanitizer) /* Clang's ASAN */\n#define PRECISE_MEMORY_ACCESS 1\n#endif\n#endif\n#ifndef PRECISE_MEMORY_ACCESS\n\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;\n\t\tcase 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=k[1]&0xffffff; a+=k[0]; break;\n\t\tcase 6 : b+=k[1]&0xffff; a+=k[0]; break;\n\t\tcase 5 : b+=k[1]&0xff; a+=k[0]; break;\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=k[0]&0xffffff; break;\n\t\tcase 2 : a+=k[0]&0xffff; break;\n\t\tcase 1 : a+=k[0]&0xff; break;\n\t\tcase 0 : return c; /* zero length strings require no mixing */\n\t\t}\n\n#else /* make valgrind happy */\n\n\t\tconst uint8_t *k8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[2]; b+=k[1]; a+=k[0]; break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=((uint32_t)k8[9])<<8; /* fall through */\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[1]; a+=k[0]; break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=((uint32_t)k8[5])<<8; /* fall through */\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]; break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=((uint32_t)k8[1])<<8; /* fall through */\n\t\tcase 1 : a+=k8[0]; break;\n\t\tcase 0 : return c;\n\t\t}\n\n#endif /* !valgrind */\n\n\t}\n\telse if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0))\n\t{\n\t\tconst uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */\n\t\tconst uint8_t *k8;\n\n\t\t/*--------------- all but last block: aligned reads and different mixing */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0] + (((uint32_t)k[1])<<16);\n\t\t\tb += k[2] + (((uint32_t)k[3])<<16);\n\t\t\tc += k[4] + (((uint32_t)k[5])<<16);\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 6;\n\t\t}\n\n\t\t/*----------------------------- handle the last (probably partial) block */\n\t\tk8 = (const uint8_t *)k;\n\t\tswitch(length)\n\t\t{\n\t\tcase 12: c+=k[4]+(((uint32_t)k[5])<<16);\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 11: c+=((uint32_t)k8[10])<<16; /* fall through */\n\t\tcase 10: c+=k[4];\n\t\t\t b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 9 : c+=k8[8]; /* fall through */\n\t\tcase 8 : b+=k[2]+(((uint32_t)k[3])<<16);\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 7 : b+=((uint32_t)k8[6])<<16; /* fall through */\n\t\tcase 6 : b+=k[2];\n\t\t\t a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 5 : b+=k8[4]; /* fall through */\n\t\tcase 4 : a+=k[0]+(((uint32_t)k[1])<<16);\n\t\t\t break;\n\t\tcase 3 : a+=((uint32_t)k8[2])<<16; /* fall through */\n\t\tcase 2 : a+=k[0];\n\t\t\t break;\n\t\tcase 1 : a+=k8[0];\n\t\t\t break;\n\t\tcase 0 : return c; /* zero length requires no mixing */\n\t\t}\n\n\t}\n\telse\n\t{\n\t\t/* need to read the key one byte at a time */\n\t\tconst uint8_t *k = (const uint8_t *)key;\n\n\t\t/*--------------- all but the last block: affect some 32 bits of (a,b,c) */\n\t\twhile (length > 12)\n\t\t{\n\t\t\ta += k[0];\n\t\t\ta += 
((uint32_t)k[1])<<8;\n\t\t\ta += ((uint32_t)k[2])<<16;\n\t\t\ta += ((uint32_t)k[3])<<24;\n\t\t\tb += k[4];\n\t\t\tb += ((uint32_t)k[5])<<8;\n\t\t\tb += ((uint32_t)k[6])<<16;\n\t\t\tb += ((uint32_t)k[7])<<24;\n\t\t\tc += k[8];\n\t\t\tc += ((uint32_t)k[9])<<8;\n\t\t\tc += ((uint32_t)k[10])<<16;\n\t\t\tc += ((uint32_t)k[11])<<24;\n\t\t\tmix(a,b,c);\n\t\t\tlength -= 12;\n\t\t\tk += 12;\n\t\t}\n\n\t\t/*-------------------------------- last block: affect all 32 bits of (c) */\n\t\tswitch(length) /* all the case statements fall through */\n\t\t{\n\t\tcase 12: c+=((uint32_t)k[11])<<24; /* FALLTHRU */\n\t\tcase 11: c+=((uint32_t)k[10])<<16; /* FALLTHRU */\n\t\tcase 10: c+=((uint32_t)k[9])<<8; /* FALLTHRU */\n\t\tcase 9 : c+=k[8]; /* FALLTHRU */\n\t\tcase 8 : b+=((uint32_t)k[7])<<24; /* FALLTHRU */\n\t\tcase 7 : b+=((uint32_t)k[6])<<16; /* FALLTHRU */\n\t\tcase 6 : b+=((uint32_t)k[5])<<8; /* FALLTHRU */\n\t\tcase 5 : b+=k[4]; /* FALLTHRU */\n\t\tcase 4 : a+=((uint32_t)k[3])<<24; /* FALLTHRU */\n\t\tcase 3 : a+=((uint32_t)k[2])<<16; /* FALLTHRU */\n\t\tcase 2 : a+=((uint32_t)k[1])<<8; /* FALLTHRU */\n\t\tcase 1 : a+=k[0];\n\t\t\t break;\n\t\tcase 0 : return c;\n\t\t}\n\t}\n\n\tfinal(a,b,c);\n\treturn c;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -360,7 +360,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_perllike_str_hash(const void *k);\nstatic unsigned long lh_perllike_str_hash(const void *k)\n{\n\tconst char *rkey = (const char *)k;\n\tunsigned hashval = 1;\n\n\twhile (*rkey)\n\t\thashval = hashval * 33 + *rkey++;\n\n\treturn hashval;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -380,7 +380,7 @@ { "text": "/* hash functions */\nstatic unsigned long lh_char_hash(const void *k);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstatic unsigned long lh_char_hash(const void *k)\n{\n#if defined _MSC_VER || defined __MINGW32__\n#define RANDOM_SEED_TYPE LONG\n#else\n#define RANDOM_SEED_TYPE int\n#endif\n\tstatic volatile RANDOM_SEED_TYPE random_seed = -1;\n\n\tif (random_seed == -1)\n\t{\n\t\tRANDOM_SEED_TYPE seed;\n\t\t/* we can't use -1 as it is the uninitialized sentinel */\n\t\twhile ((seed = json_c_get_random_seed()) == -1) {}\n#if SIZEOF_INT == 8 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8\n#define USE_SYNC_COMPARE_AND_SWAP 1\n#endif\n#if SIZEOF_INT == 4 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4\n#define USE_SYNC_COMPARE_AND_SWAP 1\n#endif\n#if SIZEOF_INT == 2 && defined __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2\n#define USE_SYNC_COMPARE_AND_SWAP 1\n#endif\n#if defined USE_SYNC_COMPARE_AND_SWAP\n\t\t(void)__sync_val_compare_and_swap(&random_seed, -1, seed);\n#elif defined _MSC_VER || defined __MINGW32__\n\t\tInterlockedCompareExchange(&random_seed, seed, -1);\n#else\n\t\t//#warning \"racy random seed initialization if used by multiple threads\"\n\t\trandom_seed = seed; /* potentially racy */\n#endif\n\t}\n\n\treturn hashlittle((const char *)k, strlen((const char *)k), (uint32_t)random_seed);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -400,7 +400,7 @@ { "text": "/* comparison functions */\nint 
lh_char_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_char_equal(const void *k1, const void *k2)\n{\n\treturn (strcmp((const char *)k1, (const char *)k2) == 0);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -420,7 +420,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_table_new(int size, lh_entry_free_fn *free_fn, lh_hash_fn *hash_fn,\n lh_equal_fn *equal_fn)\n{\n\tint i;\n\tstruct lh_table *t;\n\n\t/* Allocate space for elements to avoid divisions by zero. */\n\tassert(size > 0);\n\tt = (struct lh_table *)calloc(1, sizeof(struct lh_table));\n\tif (!t)\n\t\treturn NULL;\n\n\tt->count = 0;\n\tt->size = size;\n\tt->table = (struct lh_entry *)calloc(size, sizeof(struct lh_entry));\n\tif (!t->table)\n\t{\n\t\tfree(t);\n\t\treturn NULL;\n\t}\n\tt->free_fn = free_fn;\n\tt->hash_fn = hash_fn;\n\tt->equal_fn = equal_fn;\n\tfor (i = 0; i < size; i++)\n\t\tt->table[i].k = LH_EMPTY;\n\treturn t;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -440,7 +440,7 @@ { "text": "/* comparison functions */\nint lh_char_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kchar_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, char_hash_fn, lh_char_equal);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -460,7 +460,7 @@ { "text": "int lh_ptr_equal(const void *k1, const void *k2);\nstatic lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_table *lh_kptr_table_new(int size, lh_entry_free_fn *free_fn)\n{\n\treturn lh_table_new(size, free_fn, lh_ptr_hash, lh_ptr_equal);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -480,7 +480,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_resize(struct lh_table *t, int new_size)\n{\n\tstruct lh_table *new_t;\n\tstruct lh_entry *ent;\n\n\tnew_t = lh_table_new(new_size, NULL, t->hash_fn, t->equal_fn);\n\tif (new_t == NULL)\n\t\treturn -1;\n\n\tfor (ent = t->head; ent != NULL; ent = ent->next)\n\t{\n\t\tunsigned long h = lh_get_hash(new_t, ent->k);\n\t\tunsigned int opts = 0;\n\t\tif (ent->k_is_constant)\n\t\t\topts = JSON_C_OBJECT_ADD_CONSTANT_KEY;\n\t\tif (lh_table_insert_w_hash(new_t, ent->k, ent->v, h, opts) != 0)\n\t\t{\n\t\t\tlh_table_free(new_t);\n\t\t\treturn -1;\n\t\t}\n\t}\n\tfree(t->table);\n\tt->table = new_t->table;\n\tt->size = new_size;\n\tt->head = new_t->head;\n\tt->tail = new_t->tail;\n\tfree(new_t);\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -500,7 +500,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nvoid lh_table_free(struct lh_table *t)\n{\n\tstruct lh_entry *c;\n\tif (t->free_fn)\n\t{\n\t\tfor (c = t->head; c != NULL; c = 
c->next)\n\t\t\tt->free_fn(c);\n\t}\n\tfree(t->table);\n\tfree(t);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -520,7 +520,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert_w_hash(struct lh_table *t, const void *k, const void *v, const unsigned long h,\n const unsigned opts)\n{\n\tunsigned long n;\n\n\tif (t->count >= t->size * LH_LOAD_FACTOR)\n\t{\n\t\t/* Avoid signed integer overflow with large tables. */\n\t\tint new_size = (t->size > INT_MAX / 2) ? INT_MAX : (t->size * 2);\n\t\tif (t->size == INT_MAX || lh_table_resize(t, new_size) != 0)\n\t\t\treturn -1;\n\t}\n\n\tn = h % t->size;\n\n\twhile (1)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\t\tbreak;\n\t\tif ((int)++n == t->size)\n\t\t\tn = 0;\n\t}\n\n\tt->table[n].k = k;\n\tt->table[n].k_is_constant = (opts & JSON_C_OBJECT_ADD_CONSTANT_KEY);\n\tt->table[n].v = v;\n\tt->count++;\n\n\tif (t->head == NULL)\n\t{\n\t\tt->head = t->tail = &t->table[n];\n\t\tt->table[n].next = t->table[n].prev = NULL;\n\t}\n\telse\n\t{\n\t\tt->tail->next = &t->table[n];\n\t\tt->table[n].prev = t->tail;\n\t\tt->table[n].next = NULL;\n\t\tt->tail = &t->table[n];\n\t}\n\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -540,7 +540,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_insert(struct lh_table *t, const void *k, const void *v)\n{\n\treturn lh_table_insert_w_hash(t, k, v, lh_get_hash(t, k), 0);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -560,7 +560,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry_w_hash(struct lh_table *t, const void *k,\n const unsigned long h)\n{\n\tunsigned long n = h % t->size;\n\tint count = 0;\n\n\twhile (count < t->size)\n\t{\n\t\tif (t->table[n].k == LH_EMPTY)\n\t\t\treturn NULL;\n\t\tif (t->table[n].k != LH_FREED && t->equal_fn(t->table[n].k, k))\n\t\t\treturn &t->table[n];\n\t\tif ((int)++n == t->size)\n\t\t\tn = 0;\n\t\tcount++;\n\t}\n\treturn NULL;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -580,7 +580,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nstruct lh_entry *lh_table_lookup_entry(struct lh_table *t, const void *k)\n{\n\treturn lh_table_lookup_entry_w_hash(t, k, lh_get_hash(t, k));\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -600,7 +600,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\njson_bool lh_table_lookup_ex(struct lh_table *t, const void *k, void **v)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (e != NULL)\n\t{\n\t\tif (v != NULL)\n\t\t\t*v = lh_entry_v(e);\n\t\treturn 1; /* key found */\n\t}\n\tif (v != NULL)\n\t\t*v = NULL;\n\treturn 0; /* key not found */\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + 
"schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -620,7 +620,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete_entry(struct lh_table *t, struct lh_entry *e)\n{\n\t/* CAW: fixed to be 64bit nice, still need the crazy negative case... */\n\tptrdiff_t n = (ptrdiff_t)(e - t->table);\n\n\t/* CAW: this is bad, really bad, maybe stack goes other direction on this machine... */\n\tif (n < 0)\n\t{\n\t\treturn -2;\n\t}\n\n\tif (t->table[n].k == LH_EMPTY || t->table[n].k == LH_FREED)\n\t\treturn -1;\n\tt->count--;\n\tif (t->free_fn)\n\t\tt->free_fn(e);\n\tt->table[n].v = NULL;\n\tt->table[n].k = LH_FREED;\n\tif (t->tail == &t->table[n] && t->head == &t->table[n])\n\t{\n\t\tt->head = t->tail = NULL;\n\t}\n\telse if (t->head == &t->table[n])\n\t{\n\t\tt->head->next->prev = NULL;\n\t\tt->head = t->head->next;\n\t}\n\telse if (t->tail == &t->table[n])\n\t{\n\t\tt->tail->prev->next = NULL;\n\t\tt->tail = t->tail->prev;\n\t}\n\telse\n\t{\n\t\tt->table[n].prev->next = t->table[n].next;\n\t\tt->table[n].next->prev = t->table[n].prev;\n\t}\n\tt->table[n].next = t->table[n].prev = NULL;\n\treturn 0;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -640,7 +640,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_delete(struct lh_table *t, const void *k)\n{\n\tstruct lh_entry *e = lh_table_lookup_entry(t, k);\n\tif (!e)\n\t\treturn -1;\n\treturn lh_table_delete_entry(t, e);\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -660,7 +660,7 @@ { "text": "static lh_hash_fn *char_hash_fn = lh_char_hash;\nint lh_table_length(struct lh_table *t)\n{\n\treturn t->count;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -680,7 +680,7 @@ { "text": "#include \"config.h\"\n\n#include \n#include \n#include \n#include \n#include \n#include \n#include \n\n#ifdef HAVE_ENDIAN_H\n#include /* attempt to define endianness */\n#endif\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\n#ifndef WIN32_LEAN_AND_MEAN\n#define WIN32_LEAN_AND_MEAN\n#endif\n#include /* Get InterlockedCompareExchange */\n#endif\n\n#include \"linkhash.h\"\n#include \"random_seed.h\"\n\n/*\n * hashlittle from lookup3.c, by Bob Jenkins, May 2006, Public Domain.\n * https://burtleburtle.net/bob/c/lookup3.c\n * minor modifications to make functions static so no symbols are exported\n * minor modifications to compile with -Werror\n */\n\n/*\n-------------------------------------------------------------------------------\nlookup3.c, by Bob Jenkins, May 2006, Public Domain.\n\nThese are functions for producing 32-bit hashes for hash table lookup.\nhashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()\nare externally useful functions. Routines to test the hash are included\nif SELF_TEST is defined. You can use this free for any purpose. It's in\nthe public domain. It has no warranty.\n\nYou probably want to use hashlittle(). hashlittle() and hashbig()\nhash byte arrays. hashlittle() is faster than hashbig() on\nlittle-endian machines. 
Intel and AMD are little-endian machines.\nOn second thought, you probably want hashlittle2(), which is identical to\nhashlittle() except it returns two 32-bit hashes for the price of one.\nYou could implement hashbig2() if you wanted but I haven't bothered here.\n\nIf you want to find a hash of, say, exactly 7 integers, do\n a = i1; b = i2; c = i3;\n mix(a,b,c);\n a += i4; b += i5; c += i6;\n mix(a,b,c);\n a += i7;\n final(a,b,c);\nthen use c as the hash value. If you have a variable length array of\n4-byte integers to hash, use hashword(). If you have a byte array (like\na character string), use hashlittle(). If you have several byte arrays, or\na mix of things, see the comments above hashlittle().\n\nWhy is this so big? I read 12 bytes at a time into 3 4-byte integers,\nthen mix those integers. This is fast (you can do a lot more thorough\nmixing with 12*3 instructions on 3 integers than you can with 3 instructions\non 1 byte), but shoehorning those bytes into integers efficiently is messy.\n-------------------------------------------------------------------------------\n*/\n\n/*\n * My best guess at if you are big-endian or little-endian. This may\n * need adjustment.\n */\n#if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN) || \\\n (defined(i386) || defined(__i386__) || defined(__i486__) || defined(__i586__) || \\\n defined(__i686__) || defined(vax) || defined(MIPSEL))\n#define HASH_LITTLE_ENDIAN 1\n#define HASH_BIG_ENDIAN 0\n#elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN) || \\\n (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))\n#define HASH_LITTLE_ENDIAN 0\n#define HASH_BIG_ENDIAN 1\n#else\n#define HASH_LITTLE_ENDIAN 0\n#define HASH_BIG_ENDIAN 0\n#endif\n\n#define hashsize(n) ((uint32_t)1 << (n))\n#define hashmask(n) (hashsize(n) - 1)\n#define rot(x, k) (((x) << (k)) | ((x) >> (32 - (k))))\n\n/* clang-format on */\n\n/*\n-------------------------------------------------------------------------------\nhashlittle() -- hash a variable-length key into a 32-bit value\n k : the key (the unaligned variable-length array of bytes)\n length : the length of the key, counting by bytes\n initval : can be any 4-byte value\nReturns a 32-bit value. Every bit of the key affects every bit of\nthe return value. Two keys differing by one or two bits will have\ntotally different hash values.\n\nThe best hash table sizes are powers of 2. There is no need to do\nmod a prime (mod is sooo slow!). If you need less than 32 bits,\nuse a bitmask. 
For example, if you need only 10 bits, do\n h = (h & hashmask(10));\nIn which case, the hash table should have hashsize(10) elements.\n\nIf you are hashing n strings (uint8_t **)k, do it like this:\n for (i=0, h=0; i= 10; errno_in /= 10, ii++)\n\t{\n\t\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\t}\n\tdigbuf[ii] = \"0123456789\"[(errno_in % 10)];\n\n\t// Reverse the digits\n\tfor (start_idx = sizeof(PREFIX) - 1; ii >= 0; ii--, start_idx++)\n\t{\n\t\terrno_buf[start_idx] = digbuf[ii];\n\t}\n\terrno_buf[start_idx] = '\\0';\n\treturn errno_buf;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -717,7 +717,7 @@ { "text": "#define STRERROR_OVERRIDE_IMPL 1\n#include \"strerror_override.h\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", diff --git a/test/data/chunker_repo/Java/repo_out_chunks.json b/test/data/chunker_repo/Java/repo_out_chunks.json index 55e0125c..1af04b26 100644 --- a/test/data/chunker_repo/Java/repo_out_chunks.json +++ b/test/data/chunker_repo/Java/repo_out_chunks.json @@ -3,7 +3,7 @@ { "text": "package com.acmeair;\n\npublic interface AcmeAirConstants {\n\n\t\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -23,7 +23,7 @@ { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.Customer.PhoneType;\nimport com.acmeair.entities.Customer;\nimport com.acmeair.entities.CustomerAddress;\nimport com.acmeair.service.CustomerService;\nimport com.acmeair.service.ServiceLocator;\npublic class CustomerLoader {\n\n private CustomerService customerService = ServiceLocator.instance().getService(CustomerService.class);} public void loadCustomers(long numCustomers) {\n\t\tCustomerAddress address = customerService.createAddress(\"123 Main St.\", null, \"Anytown\", \"NC\", \"USA\", \"27617\");\n\t\tfor (long ii = 0; ii < numCustomers; ii++) {\n\t\t\tcustomerService.createCustomer(\"uid\"+ii+\"@email.com\", \"password\", Customer.MemberShipStatus.GOLD, 1000000, 1000, \"919-123-4567\", PhoneType.BUSINESS, address);\n\t\t}\n\t}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -43,7 +43,7 @@ { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.io.InputStream;\nimport java.io.InputStreamReader;\nimport java.io.LineNumberReader;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} public void loadFlights() throws Exception {\n\t\tInputStream csvInputStream = FlightLoader.class.getResourceAsStream(\"/mileage.csv\");\n\t\t\n\t\tLineNumberReader lnr = new LineNumberReader(new InputStreamReader(csvInputStream));\n\t\tString line1 = lnr.readLine();\n\t\tStringTokenizer st = new StringTokenizer(line1, \",\");\n\t\tArrayList airports = new ArrayList();\n\t\t\n\t\t// read the first line which are airport 
names\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(null, st.nextToken());\n\t\t//\tacm.setAirportName(st.nextToken());\n\t\t\tairports.add(acm);\n\t\t}\n\t\t// read the second line which contains matching airport codes for the first line\n\t\tString line2 = lnr.readLine();\n\t\tst = new StringTokenizer(line2, \",\");\n\t\tint ii = 0;\n\t\twhile (st.hasMoreTokens()) {\n\t\t\tString airportCode = st.nextToken();\n\t\t\tairports.get(ii).setAirportCode(airportCode);\n\t\t\tii++;\n\t\t}\n\t\t// read the other lines which are of format:\n\t\t// airport name, aiport code, distance from this airport to whatever airport is in the column from lines one and two\n\t\tString line;\n\t\tint flightNumber = 0;\n\t\twhile (true) {\n\t\t\tline = lnr.readLine();\n\t\t\tif (line == null || line.trim().equals(\"\")) {\n\t\t\t\tbreak;\n\t\t\t}\n\t\t\tst = new StringTokenizer(line, \",\");\n\t\t\tString airportName = st.nextToken();\n\t\t\tString airportCode = st.nextToken();\n\t\t\tif (!alreadyInCollection(airportCode, airports)) {\n\t\t\t\tAirportCodeMapping acm = flightService.createAirportCodeMapping(airportCode, airportName);\n\t\t\t\tairports.add(acm);\n\t\t\t}\n\t\t\tint indexIntoTopLine = 0;\n\t\t\twhile (st.hasMoreTokens()) {\n\t\t\t\tString milesString = st.nextToken();\n\t\t\t\tif (milesString.equals(\"NA\")) {\n\t\t\t\t\tindexIntoTopLine++;\n\t\t\t\t\tcontinue;\n\t\t\t\t}\n\t\t\t\tint miles = Integer.parseInt(milesString);\n\t\t\t\tString toAirport = airports.get(indexIntoTopLine).getAirportCode();\n\t\t\t\tString flightId = \"AA\" + flightNumber;\t\t\t\n\t\t\t\tflightService.storeFlightSegment(flightId, airportCode, toAirport, miles);\n\t\t\t\tDate now = new Date();\n\t\t\t\tfor (int daysFromNow = 0; daysFromNow < MAX_FLIGHTS_PER_SEGMENT; daysFromNow++) {\n\t\t\t\t\tCalendar c = Calendar.getInstance();\n\t\t\t\t\tc.setTime(now);\n\t\t\t\t\tc.set(Calendar.HOUR_OF_DAY, 0);\n\t\t\t\t c.set(Calendar.MINUTE, 0);\n\t\t\t\t c.set(Calendar.SECOND, 0);\n\t\t\t\t c.set(Calendar.MILLISECOND, 0);\n\t\t\t\t\tc.add(Calendar.DATE, daysFromNow);\n\t\t\t\t\tDate departureTime = c.getTime();\n\t\t\t\t\tDate arrivalTime = getArrivalTime(departureTime, miles);\n\t\t\t\t\tflightService.createNewFlight(flightId, departureTime, arrivalTime, new BigDecimal(500), new BigDecimal(200), 10, 200, \"B747\");\n\t\t\t\t\t\n\t\t\t\t}\n\t\t\t\tflightNumber++;\n\t\t\t\tindexIntoTopLine++;\n\t\t\t}\n\t\t}\n\t\t\n\t\tfor (int jj = 0; jj < airports.size(); jj++) {\n\t\t\tflightService.storeAirportMapping(airports.get(jj));\n\t\t}\n\t\tlnr.close();\n\t}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -63,7 +63,7 @@ { "text": "package com.acmeair.loader;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} private static Date getArrivalTime(Date departureTime, int mileage) {\n\t\tdouble averageSpeed = 600.0; // 600 miles/hours\n\t\tdouble hours = (double) mileage / averageSpeed; // miles / miles/hour = hours\n\t\tdouble partsOfHour = hours % 1.0;\n\t\tint minutes = (int)(60.0 * partsOfHour);\n\t\tCalendar c = 
Calendar.getInstance();\n\t\tc.setTime(departureTime);\n\t\tc.add(Calendar.HOUR, (int)hours);\n\t\tc.add(Calendar.MINUTE, minutes);\n\t\treturn c.getTime();\n\t}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -83,7 +83,7 @@ { "text": "package com.acmeair.loader;\nimport com.acmeair.entities.AirportCodeMapping;\nimport com.acmeair.service.FlightService;\nimport com.acmeair.service.ServiceLocator;\nimport java.math.*;\nimport java.util.*;\npublic class FlightLoader {\n\n private static final int MAX_FLIGHTS_PER_SEGMENT = 30; private FlightService flightService = ServiceLocator.instance().getService(FlightService.class);} static private boolean alreadyInCollection(String airportCode, ArrayList airports) {\n\t\tfor (int ii = 0; ii < airports.size(); ii++) {\n\t\t\tif (airports.get(ii).getAirportCode().equals(airportCode)) {\n\t\t\t\treturn true;\n\t\t\t}\n\t\t}\n\t\treturn false;\n\t}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", diff --git a/test/data/chunker_repo/JavaScript/repo_out_chunks.json b/test/data/chunker_repo/JavaScript/repo_out_chunks.json index c6cdf6b4..682ea477 100644 --- a/test/data/chunker_repo/JavaScript/repo_out_chunks.json +++ b/test/data/chunker_repo/JavaScript/repo_out_chunks.json @@ -3,7 +3,7 @@ { "text": "import { jQuery } from \"../core.js\";\nimport { toType } from \"../core/toType.js\";\n\n// Multifunctional method to get and set values of a collection\n// The value/s can optionally be executed if it's a function\nexport function access( elems, fn, key, value, chainable, emptyGet, raw ) {\n\tvar i = 0,\n\t\tlen = elems.length,\n\t\tbulk = key == null;\n\n\t// Sets many values\n\tif ( toType( key ) === \"object\" ) {\n\t\tchainable = true;\n\t\tfor ( i in key ) {\n\t\t\taccess( elems, fn, i, key[ i ], true, emptyGet, raw );\n\t\t}\n\n\t// Sets one value\n\t} else if ( value !== undefined ) {\n\t\tchainable = true;\n\n\t\tif ( typeof value !== \"function\" ) {\n\t\t\traw = true;\n\t\t}\n\n\t\tif ( bulk ) {\n\n\t\t\t// Bulk operations run against the entire set\n\t\t\tif ( raw ) {\n\t\t\t\tfn.call( elems, value );\n\t\t\t\tfn = null;\n\n\t\t\t// ...except when executing function values\n\t\t\t} else {\n\t\t\t\tbulk = fn;\n\t\t\t\tfn = function( elem, _key, value ) {\n\t\t\t\t\treturn bulk.call( jQuery( elem ), value );\n\t\t\t\t};\n\t\t\t}\n\t\t}\n\n\t\tif ( fn ) {\n\t\t\tfor ( ; i < len; i++ ) {\n\t\t\t\tfn(\n\t\t\t\t\telems[ i ], key, raw ?\n\t\t\t\t\t\tvalue :\n\t\t\t\t\t\tvalue.call( elems[ i ], i, fn( elems[ i ], key ) )\n\t\t\t\t);\n\t\t\t}\n\t\t}\n\t}\n\n\tif ( chainable ) {\n\t\treturn elems;\n\t}\n\n\t// Gets\n\tif ( bulk ) {\n\t\treturn fn.call( elems );\n\t}\n\n\treturn len ? 
fn( elems[ 0 ], key ) : emptyGet;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -20,7 +20,7 @@ { "text": "\nfunction getData( data ) {\n\tif ( data === \"true\" ) {\n\t\treturn true;\n\t}\n\n\tif ( data === \"false\" ) {\n\t\treturn false;\n\t}\n\n\tif ( data === \"null\" ) {\n\t\treturn null;\n\t}\n\n\t// Only convert to a number if it doesn't change the string\n\tif ( data === +data + \"\" ) {\n\t\treturn +data;\n\t}\n\n\tif ( rbrace.test( data ) ) {\n\t\treturn JSON.parse( data );\n\t}\n\n\treturn data;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -40,7 +40,7 @@ { "text": "import { dataUser } from \"./data/var/dataUser.js\";\nfunction dataAttr( elem, key, data ) {\n\tvar name;\n\n\t// If nothing was found internally, try to fetch any\n\t// data from the HTML5 data-* attribute\n\tif ( data === undefined && elem.nodeType === 1 ) {\n\t\tname = \"data-\" + key.replace( rmultiDash, \"-$&\" ).toLowerCase();\n\t\tdata = elem.getAttribute( name );\n\n\t\tif ( typeof data === \"string\" ) {\n\t\t\ttry {\n\t\t\t\tdata = getData( data );\n\t\t\t} catch ( e ) {}\n\n\t\t\t// Make sure we set the data so it isn't changed later\n\t\t\tdataUser.set( elem, key, data );\n\t\t} else {\n\t\t\tdata = undefined;\n\t\t}\n\t}\n\treturn data;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -60,7 +60,7 @@ { "text": "import { jQuery } from \"./core.js\";\nimport { access } from \"./core/access.js\";\nimport { camelCase } from \"./core/camelCase.js\";\nimport { dataPriv } from \"./data/var/dataPriv.js\";\n\n//\tImplementation Summary\n//\n//\t1. Enforce API surface and semantic compatibility with 1.9.x branch\n//\t2. Improve the module's maintainability by reducing the storage\n//\t\tpaths to a single mechanism.\n//\t3. Use the same single mechanism to support \"private\" and \"user\" data.\n//\t4. _Never_ expose \"private\" data to user code (TODO: Drop _data, _removeData)\n//\t5. Avoid exposing implementation details on user objects (eg. expando properties)\n//\t6. 
Provide a clear path for implementation upgrade to WeakMap in 2014\n\nvar rbrace = /^(?:\\{[\\w\\W]*\\}|\\[[\\w\\W]*\\])$/,\n\trmultiDash = /[A-Z]/g;\n\njQuery.extend( {\n\thasData: function( elem ) {\n\t\treturn dataUser.hasData( elem ) || dataPriv.hasData( elem );\n\t},\n\n\tdata: function( elem, name, data ) {\n\t\treturn dataUser.access( elem, name, data );\n\t},\n\n\tremoveData: function( elem, name ) {\n\t\tdataUser.remove( elem, name );\n\t},\n\n\t// TODO: Now that all calls to _data and _removeData have been replaced\n\t// with direct calls to dataPriv methods, these can be deprecated.\n\t_data: function( elem, name, data ) {\n\t\treturn dataPriv.access( elem, name, data );\n\t},\n\n\t_removeData: function( elem, name ) {\n\t\tdataPriv.remove( elem, name );\n\t}\n} );\n\njQuery.fn.extend( {\n\tdata: function( key, value ) {\n\t\tvar i, name, data,\n\t\t\telem = this[ 0 ],\n\t\t\tattrs = elem && elem.attributes;\n\n\t\t// Gets all values\n\t\tif ( key === undefined ) {\n\t\t\tif ( this.length ) {\n\t\t\t\tdata = dataUser.get( elem );\n\n\t\t\t\tif ( elem.nodeType === 1 && !dataPriv.get( elem, \"hasDataAttrs\" ) ) {\n\t\t\t\t\ti = attrs.length;\n\t\t\t\t\twhile ( i-- ) {\n\n\t\t\t\t\t\t// Support: IE 11+\n\t\t\t\t\t\t// The attrs elements can be null (trac-14894)\n\t\t\t\t\t\tif ( attrs[ i ] ) {\n\t\t\t\t\t\t\tname = attrs[ i ].name;\n\t\t\t\t\t\t\tif ( name.indexOf( \"data-\" ) === 0 ) {\n\t\t\t\t\t\t\t\tname = camelCase( name.slice( 5 ) );\n\t\t\t\t\t\t\t\tdataAttr( elem, name, data[ name ] );\n\t\t\t\t\t\t\t}\n\t\t\t\t\t\t}\n\t\t\t\t\t}\n\t\t\t\t\tdataPriv.set( elem, \"hasDataAttrs\", true );\n\t\t\t\t}\n\t\t\t}\n\n\t\t\treturn data;\n\t\t}\n\n\t\t// Sets multiple values\n\t\tif ( typeof key === \"object\" ) {\n\t\t\treturn this.each( function() {\n\t\t\t\tdataUser.set( this, key );\n\t\t\t} );\n\t\t}\n\n\t\treturn access( this, function( value ) {\n\t\t\tvar data;\n\n\t\t\t// The calling jQuery object (element matches) is not empty\n\t\t\t// (and therefore has an element appears at this[ 0 ]) and the\n\t\t\t// `value` parameter was not undefined. 
An empty jQuery object\n\t\t\t// will result in `undefined` for elem = this[ 0 ] which will\n\t\t\t// throw an exception if an attempt to read a data cache is made.\n\t\t\tif ( elem && value === undefined ) {\n\n\t\t\t\t// Attempt to get data from the cache\n\t\t\t\t// The key will always be camelCased in Data\n\t\t\t\tdata = dataUser.get( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// Attempt to \"discover\" the data in\n\t\t\t\t// HTML5 custom data-* attrs\n\t\t\t\tdata = dataAttr( elem, key );\n\t\t\t\tif ( data !== undefined ) {\n\t\t\t\t\treturn data;\n\t\t\t\t}\n\n\t\t\t\t// We tried really hard, but the data doesn't exist.\n\t\t\t\treturn;\n\t\t\t}\n\n\t\t\t// Set the data...\n\t\t\tthis.each( function() {\n\n\t\t\t\t// We always store the camelCased key\n\t\t\t\tdataUser.set( this, key, value );\n\t\t\t} );\n\t\t}, null, value, arguments.length > 1, null, true );\n\t},\n\n\tremoveData: function( key ) {\n\t\treturn this.each( function() {\n\t\t\tdataUser.remove( this, key );\n\t\t} );\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -77,7 +77,7 @@ { "text": "import { jQuery } from \"./core.js\";\nimport { toType } from \"./core/toType.js\";\nfunction buildParams( prefix, obj, traditional, add ) {\n\tvar name;\n\n\tif ( Array.isArray( obj ) ) {\n\n\t\t// Serialize array item.\n\t\tjQuery.each( obj, function( i, v ) {\n\t\t\tif ( traditional || rbracket.test( prefix ) ) {\n\n\t\t\t\t// Treat each array item as a scalar.\n\t\t\t\tadd( prefix, v );\n\n\t\t\t} else {\n\n\t\t\t\t// Item is non-scalar (array or object), encode its numeric index.\n\t\t\t\tbuildParams(\n\t\t\t\t\tprefix + \"[\" + ( typeof v === \"object\" && v != null ? i : \"\" ) + \"]\",\n\t\t\t\t\tv,\n\t\t\t\t\ttraditional,\n\t\t\t\t\tadd\n\t\t\t\t);\n\t\t\t}\n\t\t} );\n\n\t} else if ( !traditional && toType( obj ) === \"object\" ) {\n\n\t\t// Serialize object item.\n\t\tfor ( name in obj ) {\n\t\t\tbuildParams( prefix + \"[\" + name + \"]\", obj[ name ], traditional, add );\n\t\t}\n\n\t} else {\n\n\t\t// Serialize scalar item.\n\t\tadd( prefix, obj );\n\t}\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -97,7 +97,7 @@ { "text": "import { rcheckableType } from \"./var/rcheckableType.js\";\n\nimport \"./core/init.js\";\nimport \"./traversing.js\"; // filter\nimport \"./attributes/prop.js\";\n\nvar\n\trbracket = /\\[\\]$/,\n\trCRLF = /\\r?\\n/g,\n\trsubmitterTypes = /^(?:submit|button|image|reset|file)$/i,\n\trsubmittable = /^(?:input|select|textarea|keygen)/i;\n\n// Serialize an array of form elements or a set of\n// key/values into a query string\njQuery.param = function( a, traditional ) {\n\tvar prefix,\n\t\ts = [],\n\t\tadd = function( key, valueOrFunction ) {\n\n\t\t\t// If value is a function, invoke it and use its return value\n\t\t\tvar value = typeof valueOrFunction === \"function\" ?\n\t\t\t\tvalueOrFunction() :\n\t\t\t\tvalueOrFunction;\n\n\t\t\ts[ s.length ] = encodeURIComponent( key ) + \"=\" +\n\t\t\t\tencodeURIComponent( value == null ? 
\"\" : value );\n\t\t};\n\n\tif ( a == null ) {\n\t\treturn \"\";\n\t}\n\n\t// If an array was passed in, assume that it is an array of form elements.\n\tif ( Array.isArray( a ) || ( a.jquery && !jQuery.isPlainObject( a ) ) ) {\n\n\t\t// Serialize the form elements\n\t\tjQuery.each( a, function() {\n\t\t\tadd( this.name, this.value );\n\t\t} );\n\n\t} else {\n\n\t\t// If traditional, encode the \"old\" way (the way 1.3.2 or older\n\t\t// did it), otherwise encode params recursively.\n\t\tfor ( prefix in a ) {\n\t\t\tbuildParams( prefix, a[ prefix ], traditional, add );\n\t\t}\n\t}\n\n\t// Return the resulting serialization\n\treturn s.join( \"&\" );\n};\n\njQuery.fn.extend( {\n\tserialize: function() {\n\t\treturn jQuery.param( this.serializeArray() );\n\t},\n\tserializeArray: function() {\n\t\treturn this.map( function() {\n\n\t\t\t// Can add propHook for \"elements\" to filter or add form elements\n\t\t\tvar elements = jQuery.prop( this, \"elements\" );\n\t\t\treturn elements ? jQuery.makeArray( elements ) : this;\n\t\t} ).filter( function() {\n\t\t\tvar type = this.type;\n\n\t\t\t// Use .is( \":disabled\" ) so that fieldset[disabled] works\n\t\t\treturn this.name && !jQuery( this ).is( \":disabled\" ) &&\n\t\t\t\trsubmittable.test( this.nodeName ) && !rsubmitterTypes.test( type ) &&\n\t\t\t\t( this.checked || !rcheckableType.test( type ) );\n\t\t} ).map( function( _i, elem ) {\n\t\t\tvar val = jQuery( this ).val();\n\n\t\t\tif ( val == null ) {\n\t\t\t\treturn null;\n\t\t\t}\n\n\t\t\tif ( Array.isArray( val ) ) {\n\t\t\t\treturn jQuery.map( val, function( val ) {\n\t\t\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t\t\t} );\n\t\t\t}\n\n\t\t\treturn { name: elem.name, value: val.replace( rCRLF, \"\\r\\n\" ) };\n\t\t} ).get();\n\t}\n} );\n\nexport { jQuery, jQuery as $ };", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", diff --git a/test/data/chunker_repo/Python/repo_out_chunks.json b/test/data/chunker_repo/Python/repo_out_chunks.json index 139dd5dd..e2d0eff7 100644 --- a/test/data/chunker_repo/Python/repo_out_chunks.json +++ b/test/data/chunker_repo/Python/repo_out_chunks.json @@ -3,7 +3,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = []\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -23,7 +23,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, 
round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureElement(BasePageElement):\n\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: Optional[float], info: FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -43,7 +43,7 @@ { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def cells(self) -> List[TextCell]:\n \"\"\"Return text cells as a read-only view of parsed_page.textline_cells.\"\"\"\n if self.parsed_page is not None:\n return self.parsed_page.textline_cells\n else:\n return []", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -63,7 +63,7 @@ { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n def get_image(\n self,\n scale: float = 1.0,\n max_size: Optional[int] = None,\n cropbox: Optional[BoundingBox] = None,\n ) -> Optional[Image]:\n if self._backend is None:\n return self._image_cache.get(scale, None)\n\n if max_size:\n assert self.size is not None\n scale = min(scale, max_size / max(self.size.as_tuple()))\n\n if scale not in self._image_cache:\n if cropbox is None:\n self._image_cache[scale] = self._backend.get_page_image(scale=scale)\n else:\n return self._backend.get_page_image(scale=scale, cropbox=cropbox)\n\n if cropbox is None:\n return self._image_cache[scale]\n else:\n page_im = self._image_cache[scale]\n assert self.size is not None\n return page_im.crop(\n cropbox.to_top_left_origin(page_height=self.size.height)\n .scaled(scale=scale)\n .as_tuple()\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -83,7 +83,7 @@ { "text": " from docling.backend.pdf_backend import PdfPageBackend\n# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Page(BaseModel):\n\n model_config = ConfigDict(arbitrary_types_allowed=True)\n page_no: int\n size: Optional[Size] = None\n parsed_page: Optional[SegmentedPdfPage] = None\n predictions: PagePredictions = PagePredictions()\n assembled: Optional[AssembledUnit] = None\n _backend: Optional[\"PdfPageBackend\"] = (\n None # Internal PDF backend. 
By default it is cleared during assembling.\n )\n _default_image_scale: float = 1.0\n _image_cache: Dict[float, Image] = (\n {}\n )\n @property\n def image(self) -> Optional[Image]:\n return self.get_image(scale=self._default_image_scale)", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -103,7 +103,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n def _score_to_grade(self, score: ScoreValue) -> QualityGrade:\n if score < 0.5:\n return QualityGrade.POOR\n elif score < 0.8:\n return QualityGrade.FAIR\n elif score < 0.9:\n return QualityGrade.GOOD\n elif score >= 0.9:\n return QualityGrade.EXCELLENT\n\n return QualityGrade.UNSPECIFIED", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -123,7 +123,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_grade(self) -> QualityGrade:\n return self._score_to_grade(self.mean_score)", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -143,7 +143,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_grade(self) -> QualityGrade:\n return self._score_to_grade(self.low_score)", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -163,7 +163,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ]\n )\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -183,7 +183,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n 
computed_field,\n field_serializer,\n)\nimport numpy as np\nScoreValue = float\nclass PageConfidenceScores(BaseModel):\n\n parse_score: ScoreValue = np.nan\n layout_score: ScoreValue = np.nan\n table_score: ScoreValue = np.nan\n ocr_score: ScoreValue = np.nan\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanquantile(\n [\n self.ocr_score,\n self.table_score,\n self.layout_score,\n self.parse_score,\n ],\n q=0.05,\n )\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -203,7 +203,7 @@ { "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in self.pages.values()],\n )\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -223,7 +223,7 @@ { "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nScoreValue = float\nclass ConfidenceReport(PageConfidenceScores):\n\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -243,7 +243,7 @@ { "text": "from enum import Enum\nclass ConversionStatus(str, Enum):\n PENDING = \"pending\"\n STARTED = \"started\"\n FAILURE = \"failure\"\n SUCCESS = \"success\"\n PARTIAL_SUCCESS = \"partial_success\"\n SKIPPED = \"skipped\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -263,7 +263,7 @@ { "text": "from enum import Enum\nimport numpy as np\nclass InputFormat(str, Enum):\n \"\"\"A document format supported by document backend parsers.\"\"\"\n\n DOCX = \"docx\"\n PPTX = \"pptx\"\n HTML = \"html\"\n IMAGE = \"image\"\n PDF = \"pdf\"\n ASCIIDOC = \"asciidoc\"\n MD = \"md\"\n CSV = \"csv\"\n XLSX = \"xlsx\"\n XML_USPTO = \"xml_uspto\"\n XML_JATS = \"xml_jats\"\n JSON_DOCLING = \"json_docling\"\n AUDIO = \"audio\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -283,7 +283,7 @@ { "text": "from enum import Enum\nclass OutputFormat(str, Enum):\n MARKDOWN = \"md\"\n JSON = \"json\"\n HTML = 
\"html\"\n HTML_SPLIT_PAGE = \"html_split_page\"\n TEXT = \"text\"\n DOCTAGS = \"doctags\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -303,7 +303,7 @@ { "text": "from enum import Enum\nimport numpy as np\nclass DocInputType(str, Enum):\n PATH = \"path\"\n STREAM = \"stream\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -323,7 +323,7 @@ { "text": "from enum import Enum\nimport numpy as np\nclass DoclingComponentType(str, Enum):\n DOCUMENT_BACKEND = \"document_backend\"\n MODEL = \"model\"\n DOC_ASSEMBLER = \"doc_assembler\"\n USER_INPUT = \"user_input\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -343,7 +343,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass ErrorItem(BaseModel):\n component_type: DoclingComponentType\n module_name: str\n error_message: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -363,7 +363,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom docling_core.types.doc.page import SegmentedPdfPage, TextCell\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Cluster(BaseModel):\n id: int\n label: DocItemLabel\n bbox: BoundingBox\n confidence: float = 1.0\n cells: List[TextCell] = []\n children: List[\"Cluster\"] = [] # Add child cluster support\n\n @field_serializer(\"confidence\")\n def _serialize(self, value: float, info: FieldSerializationInfo) -> float:\n return round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -383,7 +383,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass BasePageElement(BaseModel):\n label: DocItemLabel\n id: int\n page_no: int\n cluster: Cluster\n text: Optional[str] = None", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -403,7 +403,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass LayoutPrediction(BaseModel):\n clusters: List[Cluster] = 
[]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -423,7 +423,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPredictionToken(BaseModel):\n text: str = \"\"\n token: int = -1\n logprob: float = -1", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -443,7 +443,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass VlmPrediction(BaseModel):\n text: str = \"\"\n generated_tokens: list[VlmPredictionToken] = []\n generation_time: float = -1", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -463,7 +463,7 @@ { "text": "from typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ContainerElement(\n BasePageElement\n): # Used for Form and Key-Value-Regions, only for typing.\n pass", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -483,7 +483,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass Table(BasePageElement):\n otsl_seq: List[str]\n num_rows: int = 0\n num_cols: int = 0\n table_cells: List[TableCell]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -503,7 +503,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass TableStructurePrediction(BaseModel):\n table_map: Dict[int, Table] = {}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -523,7 +523,7 @@ { "text": "\nclass TextElement(BasePageElement):\n text: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -543,7 +543,7 @@ { "text": "from docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom docling_core.types.doc.base import PydanticSerCtxKey, round_pydantic_float\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureElement(BasePageElement):\n annotations: List[PictureDataType] = []\n provenance: Optional[str] = None\n predicted_class: Optional[str] = None\n confidence: Optional[float] = None\n\n @field_serializer(\"confidence\")\n def _serialize(\n self, value: Optional[float], info: 
FieldSerializationInfo\n ) -> Optional[float]:\n return (\n round_pydantic_float(value, info.context, PydanticSerCtxKey.CONFID_PREC)\n if value is not None\n else None\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -563,7 +563,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass FigureClassificationPrediction(BaseModel):\n figure_count: int = 0\n figure_map: Dict[int, FigureElement] = {}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -583,7 +583,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass EquationPrediction(BaseModel):\n equation_count: int = 0\n equation_map: Dict[int, TextElement] = {}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -603,7 +603,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass PagePredictions(BaseModel):\n layout: Optional[LayoutPrediction] = None\n tablestructure: Optional[TableStructurePrediction] = None\n figures_classification: Optional[FigureClassificationPrediction] = None\n equations_prediction: Optional[EquationPrediction] = None\n vlm_response: Optional[VlmPrediction] = None", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -623,7 +623,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass AssembledUnit(BaseModel):\n elements: List[PageElement] = []\n body: List[PageElement] = []\n headers: List[PageElement] = []", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -643,7 +643,7 @@ { "text": "# DO NOT REMOVE; explicitly exposed from this location\nfrom PIL.Image import Image\nfrom docling_core.types.doc import (\n BoundingBox,\n DocItemLabel,\n NodeItem,\n PictureDataType,\n Size,\n TableCell,\n)\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass ItemAndImageEnrichmentElement(BaseModel):\n model_config = ConfigDict(arbitrary_types_allowed=True)\n\n item: NodeItem\n image: Image", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -663,7 +663,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n 
Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\n## OpenAI API Request / Response Models ##\nclass OpenAiChatMessage(BaseModel):\n role: str\n content: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -683,7 +683,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiResponseChoice(BaseModel):\n index: int\n message: OpenAiChatMessage\n finish_reason: Optional[str]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -703,7 +703,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nclass OpenAiResponseUsage(BaseModel):\n prompt_tokens: int\n completion_tokens: int\n total_tokens: int", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -723,7 +723,7 @@ { "text": "from pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nclass OpenAiApiResponse(BaseModel):\n model_config = ConfigDict(\n protected_namespaces=(),\n )\n\n id: str\n model: Optional[str] = None # returned by openai\n choices: List[OpenAiResponseChoice]\n created: int\n usage: OpenAiResponseUsage", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -743,7 +743,7 @@ { "text": "from enum import Enum\nclass QualityGrade(str, Enum):\n POOR = \"poor\"\n FAIR = \"fair\"\n GOOD = \"good\"\n EXCELLENT = \"excellent\"\n UNSPECIFIED = \"unspecified\"", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -763,7 +763,7 @@ { "text": "from collections import defaultdict\nfrom pydantic import (\n BaseModel,\n ConfigDict,\n Field,\n FieldSerializationInfo,\n computed_field,\n field_serializer,\n)\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Union\nimport numpy as np\nclass ConfidenceReport(PageConfidenceScores):\n pages: Dict[int, PageConfidenceScores] = Field(\n default_factory=lambda: defaultdict(PageConfidenceScores)\n )\n\n @computed_field # type: ignore\n @property\n def mean_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.mean_score for c in self.pages.values()],\n )\n )\n\n @computed_field # type: ignore\n @property\n def low_score(self) -> ScoreValue:\n return ScoreValue(\n np.nanmean(\n [c.low_score for c in self.pages.values()],\n )\n )", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -783,7 +783,7 @@ { "text": "if TYPE_CHECKING:\n\nFormatToExtensions: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\"docx\", \"dotx\", \"docm\", 
\"dotm\"],\n InputFormat.PPTX: [\"pptx\", \"potx\", \"ppsx\", \"pptm\", \"potm\", \"ppsm\"],\n InputFormat.PDF: [\"pdf\"],\n InputFormat.MD: [\"md\"],\n InputFormat.HTML: [\"html\", \"htm\", \"xhtml\"],\n InputFormat.XML_JATS: [\"xml\", \"nxml\"],\n InputFormat.IMAGE: [\"jpg\", \"jpeg\", \"png\", \"tif\", \"tiff\", \"bmp\", \"webp\"],\n InputFormat.ASCIIDOC: [\"adoc\", \"asciidoc\", \"asc\"],\n InputFormat.CSV: [\"csv\"],\n InputFormat.XLSX: [\"xlsx\", \"xlsm\"],\n InputFormat.XML_USPTO: [\"xml\", \"txt\"],\n InputFormat.JSON_DOCLING: [\"json\"],\n InputFormat.AUDIO: [\"wav\", \"mp3\"],\n}\n\nFormatToMimeType: Dict[InputFormat, List[str]] = {\n InputFormat.DOCX: [\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n \"application/vnd.openxmlformats-officedocument.wordprocessingml.template\",\n ],\n InputFormat.PPTX: [\n \"application/vnd.openxmlformats-officedocument.presentationml.template\",\n \"application/vnd.openxmlformats-officedocument.presentationml.slideshow\",\n \"application/vnd.openxmlformats-officedocument.presentationml.presentation\",\n ],\n InputFormat.HTML: [\"text/html\", \"application/xhtml+xml\"],\n InputFormat.XML_JATS: [\"application/xml\"],\n InputFormat.IMAGE: [\n \"image/png\",\n \"image/jpeg\",\n \"image/tiff\",\n \"image/gif\",\n \"image/bmp\",\n \"image/webp\",\n ],\n InputFormat.PDF: [\"application/pdf\"],\n InputFormat.ASCIIDOC: [\"text/asciidoc\"],\n InputFormat.MD: [\"text/markdown\", \"text/x-markdown\"],\n InputFormat.CSV: [\"text/csv\"],\n InputFormat.XLSX: [\n \"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet\"\n ],\n InputFormat.XML_USPTO: [\"application/xml\", \"text/plain\"],\n InputFormat.JSON_DOCLING: [\"application/json\"],\n InputFormat.AUDIO: [\"audio/x-wav\", \"audio/mpeg\", \"audio/wav\", \"audio/mp3\"],\n}\n\nMimeTypeToFormat: dict[str, list[InputFormat]] = {\n mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]\n for value in FormatToMimeType.values()\n for mime in value\n}\n\nPageElement = Union[TextElement, Table, FigureElement, ContainerElement]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -800,7 +800,7 @@ { "text": "from collections.abc import Iterable\nfrom docling.datamodel.document import ConversionResult, Page\nfrom docling_core.types.doc import BoundingBox, CoordOrigin\nfrom docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table\nfrom typing import Any, Dict, List, Tuple, Union\n_log = logging.getLogger(__name__)\ndef generate_multimodal_pages(\n doc_result: ConversionResult,\n) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:\n label_to_doclaynet = {\n \"title\": \"title\",\n \"table-of-contents\": \"document_index\",\n \"subtitle-level-1\": \"section_header\",\n \"checkbox-selected\": \"checkbox_selected\",\n \"checkbox-unselected\": \"checkbox_unselected\",\n \"caption\": \"caption\",\n \"page-header\": \"page_header\",\n \"page-footer\": \"page_footer\",\n \"footnote\": \"footnote\",\n \"table\": \"table\",\n \"formula\": \"formula\",\n \"list-item\": \"list_item\",\n \"code\": \"code\",\n \"figure\": \"picture\",\n \"picture\": \"picture\",\n \"reference\": \"text\",\n \"paragraph\": \"text\",\n \"text\": \"text\",\n }\n\n content_text = \"\"\n page_no = 0\n start_ix = 0\n end_ix = 0\n doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []\n\n doc = 
doc_result.legacy_document\n\n def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):\n segments = []\n\n for ix, item in doc_items:\n item_type = item.obj_type\n label = label_to_doclaynet.get(item_type, None)\n\n if label is None or item.prov is None or page.size is None:\n continue\n\n bbox = BoundingBox.from_tuple(\n tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT\n )\n new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(\n page_size=page.size\n )\n\n new_segment = {\n \"index_in_doc\": ix,\n \"label\": label,\n \"text\": item.text if item.text is not None else \"\",\n \"bbox\": new_bbox.as_tuple(),\n \"data\": [],\n }\n\n if isinstance(item, Table):\n table_html = item.export_to_html()\n new_segment[\"data\"].append(\n {\n \"html_seq\": table_html,\n \"otsl_seq\": \"\",\n }\n )\n\n segments.append(new_segment)\n\n return segments\n\n def _process_page_cells(page: Page):\n cells: List[dict] = []\n if page.size is None:\n return cells\n for cell in page.cells:\n new_bbox = (\n cell.rect.to_bounding_box()\n .to_top_left_origin(page_height=page.size.height)\n .normalized(page_size=page.size)\n )\n is_ocr = cell.from_ocr\n ocr_confidence = cell.confidence\n cells.append(\n {\n \"text\": cell.text,\n \"bbox\": new_bbox.as_tuple(),\n \"ocr\": is_ocr,\n \"ocr_confidence\": ocr_confidence,\n }\n )\n return cells\n\n def _process_page():\n page_ix = page_no - 1\n page = doc_result.pages[page_ix]\n\n page_cells = _process_page_cells(page=page)\n page_segments = _process_page_segments(doc_items=doc_items, page=page)\n content_md = doc.export_to_markdown(\n main_text_start=start_ix, main_text_stop=end_ix\n )\n # No page-tagging since we only do 1 page at the time\n content_dt = doc.export_to_document_tokens(\n main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False\n )\n\n return content_text, content_md, content_dt, page_cells, page_segments, page\n\n if doc.main_text is None:\n return\n for ix, orig_item in enumerate(doc.main_text):\n item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item\n if item is None or item.prov is None or len(item.prov) == 0:\n _log.debug(f\"Skipping item {orig_item}\")\n continue\n\n item_page = item.prov[0].page\n\n # Page is complete\n if page_no > 0 and item_page > page_no:\n yield _process_page()\n\n start_ix = ix\n doc_items = []\n content_text = \"\"\n\n page_no = item_page\n end_ix = ix\n doc_items.append((ix, item))\n if item.text is not None and item.text != \"\":\n content_text += item.text + \" \"\n\n if len(doc_items) > 0:\n yield _process_page()", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -820,7 +820,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize 
the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def is_valid(self) -> bool:\n return self.valid", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -840,7 +840,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supports_pagination(cls) -> bool:\n return False", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -860,7 +860,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n 
self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def unload(self):\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.close()\n self.path_or_stream = None", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -880,7 +880,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.base_models import InputFormat\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @classmethod\n @override\n def supported_formats(cls) -> set[InputFormat]:\n return {InputFormat.XML_JATS}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -900,7 +900,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nimport traceback\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", 
path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @override\n def convert(self) -> DoclingDocument:\n try:\n # Create empty document\n origin = DocumentOrigin(\n filename=self.file.name or \"file\",\n mimetype=\"application/xml\",\n binary_hash=self.document_hash,\n )\n doc = DoclingDocument(name=self.file.stem or \"file\", origin=origin)\n self.hlevel = 0\n\n # Get metadata XML components\n xml_components: XMLComponents = self._parse_metadata()\n\n # Add metadata to the document\n self._add_metadata(doc, xml_components)\n\n # walk over the XML body\n body = self.tree.xpath(\"//body\")\n if self.root and len(body) > 0:\n self._walk_linear(doc, self.root, body[0])\n\n # walk over the XML back matter\n back = self.tree.xpath(\"//back\")\n if self.root and len(back) > 0:\n self._walk_linear(doc, self.root, back[0])\n except Exception:\n _log.error(traceback.format_exc())\n\n return doc", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -920,7 +920,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n @staticmethod\n def _get_text(node: etree._Element, sep: Optional[str] = None) -> str:\n skip_tags = [\"term\", 
\"disp-formula\", \"inline-formula\"]\n text: str = (\n node.text.replace(\"\\n\", \" \")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n for child in list(node):\n if child.tag not in skip_tags:\n # TODO: apply styling according to child.tag when supported by docling-core\n text += JatsDocumentBackend._get_text(child, sep)\n if sep:\n text = text.rstrip(sep) + sep\n text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n return text", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -940,7 +940,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _find_metadata(self) -> Optional[etree._Element]:\n meta_names: list[str] = [\"article-meta\", \"book-part-meta\"]\n meta: Optional[etree._Element] = None\n for name in meta_names:\n node = self.tree.xpath(f\".//{name}\")\n if len(node) > 0:\n meta = node[0]\n break\n\n return meta", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -960,7 +960,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, 
BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_abstract(self) -> list[Abstract]:\n # TODO: address cases with multiple sections\n abs_list: list[Abstract] = []\n\n for abs_node in self.tree.xpath(\".//abstract\"):\n abstract: Abstract = dict(label=\"\", content=\"\")\n texts = []\n for abs_par in abs_node.xpath(\"p\"):\n texts.append(JatsDocumentBackend._get_text(abs_par).strip())\n abstract[\"content\"] = \" \".join(texts)\n\n label_node = abs_node.xpath(\"title|label\")\n if len(label_node) > 0:\n abstract[\"label\"] = label_node[0].text.strip()\n\n abs_list.append(abstract)\n\n return abs_list", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -980,7 +980,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_authors(self) -> list[Author]:\n # Get mapping between affiliation ids and names\n authors: list[Author] = []\n meta: Optional[etree._Element] = self._find_metadata()\n if meta is None:\n return authors\n\n affiliation_names = []\n for affiliation_node in meta.xpath(\".//aff[@id]\"):\n aff = \", \".join([t for t in affiliation_node.itertext() if t.strip()])\n aff = aff.replace(\"\\n\", \" \")\n label = affiliation_node.xpath(\"label\")\n if label:\n # TODO: once superscript is supported, add label with formatting\n aff = aff.removeprefix(f\"{label[0].text}, \")\n affiliation_names.append(aff)\n affiliation_ids_names = dict(\n zip(meta.xpath(\".//aff[@id]/@id\"), affiliation_names)\n )\n\n # Get 
author names and affiliation names\n for author_node in meta.xpath(\n './/contrib-group/contrib[@contrib-type=\"author\"]'\n ):\n author: Author = {\n \"name\": \"\",\n \"affiliation_names\": [],\n }\n\n # Affiliation names\n affiliation_ids = [\n a.attrib[\"rid\"] for a in author_node.xpath('xref[@ref-type=\"aff\"]')\n ]\n for id in affiliation_ids:\n if id in affiliation_ids_names:\n author[\"affiliation_names\"].append(affiliation_ids_names[id])\n\n # Name\n author[\"name\"] = (\n author_node.xpath(\"name/given-names\")[0].text\n + \" \"\n + author_node.xpath(\"name/surname\")[0].text\n )\n\n authors.append(author)\n\n return authors", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1000,7 +1000,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_title(self) -> str:\n meta_names: list[str] = [\n \"article-meta\",\n \"collection-meta\",\n \"book-meta\",\n \"book-part-meta\",\n ]\n title_names: list[str] = [\"article-title\", \"subtitle\", \"title\", \"label\"]\n titles: list[str] = [\n \" \".join(\n elem.text.replace(\"\\n\", \" \").strip()\n for elem in list(title_node)\n if elem.tag in title_names\n ).strip()\n for title_node in self.tree.xpath(\n \"|\".join([f\".//{item}/title-group\" for item in meta_names])\n )\n ]\n\n text = \" - \".join(titles)\n\n return text", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1020,7 +1020,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass 
JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_metadata(self) -> XMLComponents:\n \"\"\"Parsing JATS document metadata.\"\"\"\n xml_components: XMLComponents = {\n \"title\": self._parse_title(),\n \"authors\": self._parse_authors(),\n \"abstract\": self._parse_abstract(),\n }\n return xml_components", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1040,7 +1040,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ABSTRACT: Final = \"Abstract\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_abstract(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n for abstract in xml_components[\"abstract\"]:\n text: str = abstract[\"content\"]\n title: str = abstract[\"label\"] or DEFAULT_HEADER_ABSTRACT\n if not text:\n continue\n parent = doc.add_heading(\n parent=self.root, text=title, level=self.hlevel + 1\n )\n doc.add_text(\n parent=parent,\n text=text,\n label=DocItemLabel.TEXT,\n )\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": 
"docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1060,7 +1060,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_authors(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n # TODO: once docling supports text formatting, add affiliation reference to\n # author names through superscripts\n authors: list = [item[\"name\"] for item in xml_components[\"authors\"]]\n authors_str = \", \".join(authors)\n affiliations: list = [\n item\n for author in xml_components[\"authors\"]\n for item in author[\"affiliation_names\"]\n ]\n affiliations_str = \"; \".join(list(dict.fromkeys(affiliations)))\n if authors_str:\n doc.add_text(\n parent=self.root,\n text=authors_str,\n label=DocItemLabel.PARAGRAPH,\n )\n if affiliations_str:\n doc.add_text(\n parent=self.root,\n text=affiliations_str,\n label=DocItemLabel.PARAGRAPH,\n )\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1080,7 +1080,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = 
etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_citation(self, doc: DoclingDocument, parent: NodeItem, text: str) -> None:\n if isinstance(parent, GroupItem) and parent.label == GroupLabel.LIST:\n doc.add_list_item(text=text, enumerated=False, parent=parent)\n else:\n doc.add_text(text=text, label=DocItemLabel.TEXT, parent=parent)\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1100,7 +1100,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_TEXT_ETAL: Final = \"et al.\"\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901\n citation: Citation = {\n \"author_names\": \"\",\n \"title\": \"\",\n \"source\": \"\",\n \"year\": \"\",\n \"volume\": \"\",\n \"page\": \"\",\n \"pub_id\": \"\",\n \"publisher_name\": \"\",\n \"publisher_loc\": \"\",\n }\n\n _log.debug(\"Citation parsing started\")\n\n # Author names\n names = []\n for name_node in node.xpath(\".//name\"):\n name_str = (\n name_node.xpath(\"surname\")[0].text.replace(\"\\n\", \" \").strip()\n + \" \"\n + name_node.xpath(\"given-names\")[0].text.replace(\"\\n\", \" \").strip()\n )\n names.append(name_str)\n etal_node = node.xpath(\".//etal\")\n if len(etal_node) > 0:\n etal_text = etal_node[0].text or DEFAULT_TEXT_ETAL\n names.append(etal_text)\n citation[\"author_names\"] = \", \".join(names)\n\n titles: list[str] = [\n \"article-title\",\n \"chapter-title\",\n \"data-title\",\n \"issue-title\",\n \"part-title\",\n \"trans-title\",\n ]\n title_node: 
Optional[etree._Element] = None\n for name in titles:\n name_node = node.xpath(name)\n if len(name_node) > 0:\n title_node = name_node[0]\n break\n citation[\"title\"] = (\n JatsDocumentBackend._get_text(title_node)\n if title_node is not None\n else node.text.replace(\"\\n\", \" \").strip()\n )\n\n # Journal, year, publisher name, publisher location, volume, elocation\n fields: list[str] = [\n \"source\",\n \"year\",\n \"publisher-name\",\n \"publisher-loc\",\n \"volume\",\n ]\n for item in fields:\n item_node = node.xpath(item)\n if len(item_node) > 0:\n citation[item.replace(\"-\", \"_\")] = ( # type: ignore[literal-required]\n item_node[0].text.replace(\"\\n\", \" \").strip()\n )\n\n # Publication identifier\n if len(node.xpath(\"pub-id\")) > 0:\n pub_id: list[str] = []\n for id_node in node.xpath(\"pub-id\"):\n id_type = id_node.get(\"assigning-authority\") or id_node.get(\n \"pub-id-type\"\n )\n id_text = id_node.text\n if id_type and id_text:\n pub_id.append(\n id_type.replace(\"\\n\", \" \").strip().upper()\n + \": \"\n + id_text.replace(\"\\n\", \" \").strip()\n )\n if pub_id:\n citation[\"pub_id\"] = \", \".join(pub_id)\n\n # Pages\n if len(node.xpath(\"elocation-id\")) > 0:\n citation[\"page\"] = (\n node.xpath(\"elocation-id\")[0].text.replace(\"\\n\", \" \").strip()\n )\n elif len(node.xpath(\"fpage\")) > 0:\n citation[\"page\"] = node.xpath(\"fpage\")[0].text.replace(\"\\n\", \" \").strip()\n if len(node.xpath(\"lpage\")) > 0:\n citation[\"page\"] += (\n \"\u2013\"\n + node.xpath(\"lpage\")[0]\n .text.replace(\"\\n\", \" \")\n .strip() # noqa: RUF001\n )\n\n # Flatten the citation to string\n\n text = \"\"\n if citation[\"author_names\"]:\n text += citation[\"author_names\"].rstrip(\".\") + \". \"\n if citation[\"title\"]:\n text += citation[\"title\"] + \". \"\n if citation[\"source\"]:\n text += citation[\"source\"] + \". \"\n if citation[\"publisher_name\"]:\n if citation[\"publisher_loc\"]:\n text += f\"{citation['publisher_loc']}: \"\n text += citation[\"publisher_name\"] + \". \"\n if citation[\"volume\"]:\n text = text.rstrip(\". \")\n text += f\" {citation['volume']}. \"\n if citation[\"page\"]:\n text = text.rstrip(\". \")\n if citation[\"volume\"]:\n text += \":\"\n text += citation[\"page\"] + \". \"\n if citation[\"year\"]:\n text = text.rstrip(\". \")\n text += f\" ({citation['year']}).\"\n if citation[\"pub_id\"]:\n text = text.rstrip(\".\") + \". 
\"\n text += citation[\"pub_id\"]\n\n _log.debug(\"Citation flattened\")\n\n return text", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1120,7 +1120,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_equation(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n math_text = node.text\n math_parts = math_text.split(\"$$\")\n if len(math_parts) == 3:\n math_formula = math_parts[1]\n doc.add_text(label=DocItemLabel.FORMULA, text=math_formula, parent=parent)\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1140,7 +1140,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd 
in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_figure_captions(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n label_node = node.xpath(\"label\")\n label: Optional[str] = (\n JatsDocumentBackend._get_text(label_node[0]).strip() if label_node else \"\"\n )\n\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if len(caption_node) > 0:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n\n # TODO: format label vs caption once styling is supported\n fig_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n fig_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=fig_text)\n if fig_text\n else None\n )\n\n doc.add_picture(parent=parent, caption=fig_caption)\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1160,7 +1160,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_metadata(\n self, doc: DoclingDocument, xml_components: XMLComponents\n ) -> None:\n self._add_title(doc, xml_components)\n self._add_authors(doc, xml_components)\n self._add_abstract(doc, xml_components)\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1180,7 +1180,7 @@ { "text": "from bs4 import BeautifulSoup, Tag\nfrom docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.backend.html_backend import HTMLDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import 
(\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_table(\n self, doc: DoclingDocument, parent: NodeItem, table_xml_component: Table\n ) -> None:\n soup = BeautifulSoup(table_xml_component[\"content\"], \"html.parser\")\n table_tag = soup.find(\"table\")\n if not isinstance(table_tag, Tag):\n return\n\n data = HTMLDocumentBackend.parse_table_data(table_tag)\n\n # TODO: format label vs caption once styling is supported\n label = table_xml_component[\"label\"]\n caption = table_xml_component[\"caption\"]\n table_text: str = f\"{label}{' ' if label and caption else ''}{caption}\"\n table_caption: Optional[TextItem] = (\n doc.add_text(label=DocItemLabel.CAPTION, text=table_text)\n if table_text\n else None\n )\n\n if data is not None:\n doc.add_table(data=data, parent=parent, caption=table_caption)\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1200,7 +1200,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\n_log = logging.getLogger(__name__)\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if 
ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_tables(\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> None:\n table: Table = {\"label\": \"\", \"caption\": \"\", \"content\": \"\"}\n\n # Content\n if len(node.xpath(\"table\")) > 0:\n table_content_node = node.xpath(\"table\")[0]\n elif len(node.xpath(\"alternatives/table\")) > 0:\n table_content_node = node.xpath(\"alternatives/table\")[0]\n else:\n table_content_node = None\n if table_content_node is not None:\n table[\"content\"] = etree.tostring(table_content_node).decode(\"utf-8\")\n\n # Caption\n caption_node = node.xpath(\"caption\")\n caption: Optional[str]\n if caption_node:\n caption = \"\"\n for caption_par in list(caption_node[0]):\n if caption_par.xpath(\".//supplementary-material\"):\n continue\n caption += JatsDocumentBackend._get_text(caption_par).strip() + \" \"\n caption = caption.strip()\n else:\n caption = None\n if caption is not None:\n table[\"caption\"] = caption\n\n # Label\n if len(node.xpath(\"label\")) > 0:\n table[\"label\"] = node.xpath(\"label\")[0].text\n\n try:\n self._add_table(doc, parent, table)\n except Exception:\n _log.warning(f\"Skipping unsupported table in {self.file!s}\")\n\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1220,7 +1220,7 @@ { "text": "from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _add_title(self, doc: DoclingDocument, xml_components: XMLComponents) -> None:\n self.root = doc.add_text(\n parent=None,\n text=xml_components[\"title\"],\n label=DocItemLabel.TITLE,\n )\n return", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1240,7 +1240,7 @@ { "text": 
"from docling.backend.abstract_backend import DeclarativeDocumentBackend\nfrom docling.datamodel.document import InputDocument\nfrom docling_core.types.doc import (\n DocItemLabel,\n DoclingDocument,\n DocumentOrigin,\n GroupItem,\n GroupLabel,\n NodeItem,\n TextItem,\n)\nfrom io import BytesIO\nfrom lxml import etree\nfrom pathlib import Path\nfrom typing import Final, Optional, Union\nfrom typing_extensions import TypedDict, override\nDEFAULT_HEADER_ACKNOWLEDGMENTS: Final = \"Acknowledgments\"\nDEFAULT_HEADER_REFERENCES: Final = \"References\"\nclass JatsDocumentBackend(DeclarativeDocumentBackend):\n\n\n @override\n def __init__(\n self, in_doc: \"InputDocument\", path_or_stream: Union[BytesIO, Path]\n ) -> None:\n super().__init__(in_doc, path_or_stream)\n self.path_or_stream = path_or_stream\n\n # Initialize the root of the document hierarchy\n self.root: Optional[NodeItem] = None\n self.hlevel: int = 0\n self.valid: bool = False\n try:\n if isinstance(self.path_or_stream, BytesIO):\n self.path_or_stream.seek(0)\n self.tree: etree._ElementTree = etree.parse(self.path_or_stream)\n\n doc_info: etree.DocInfo = self.tree.docinfo\n if doc_info.system_url and any(\n kwd in doc_info.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n for ent in doc_info.internalDTD.iterentities():\n if ent.system_url and any(\n kwd in ent.system_url for kwd in JATS_DTD_URL\n ):\n self.valid = True\n return\n except Exception as exc:\n raise RuntimeError(\n f\"Could not initialize JATS backend for file with hash {self.document_hash}.\"\n ) from exc\n def _walk_linear( # noqa: C901\n self, doc: DoclingDocument, parent: NodeItem, node: etree._Element\n ) -> str:\n skip_tags = [\"term\"]\n flush_tags = [\"ack\", \"sec\", \"list\", \"boxed-text\", \"disp-formula\", \"fig\"]\n new_parent: NodeItem = parent\n node_text: str = (\n node.text.replace(\"\\n\", \" \")\n if (node.tag not in skip_tags and node.text)\n else \"\"\n )\n\n for child in list(node):\n stop_walk: bool = False\n\n # flush text into TextItem for some tags in paragraph nodes\n if node.tag == \"p\" and node_text.strip() and child.tag in flush_tags:\n doc.add_text(\n label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent\n )\n node_text = \"\"\n\n # add elements and decide whether to stop walking\n if child.tag in (\"sec\", \"ack\"):\n header = child.xpath(\"title|label\")\n text: Optional[str] = None\n if len(header) > 0:\n text = JatsDocumentBackend._get_text(header[0])\n elif child.tag == \"ack\":\n text = DEFAULT_HEADER_ACKNOWLEDGMENTS\n if text:\n self.hlevel += 1\n new_parent = doc.add_heading(\n text=text, parent=parent, level=self.hlevel\n )\n elif child.tag == \"list\":\n new_parent = doc.add_group(\n label=GroupLabel.LIST, name=\"list\", parent=parent\n )\n elif child.tag == \"list-item\":\n # TODO: address any type of content (another list, formula,...)\n # TODO: address list type and item label\n text = JatsDocumentBackend._get_text(child).strip()\n new_parent = doc.add_list_item(text=text, parent=parent)\n stop_walk = True\n elif child.tag == \"fig\":\n self._add_figure_captions(doc, parent, child)\n stop_walk = True\n elif child.tag == \"table-wrap\":\n self._add_tables(doc, parent, child)\n stop_walk = True\n elif child.tag == \"suplementary-material\":\n stop_walk = True\n elif child.tag == \"fn-group\":\n # header = child.xpath(\".//title\") or child.xpath(\".//label\")\n # if header:\n # text = JatsDocumentBackend._get_text(header[0])\n # fn_parent = doc.add_heading(text=text, parent=new_parent)\n # 
self._add_footnote_group(doc, fn_parent, child)\n stop_walk = True\n elif child.tag == \"ref-list\" and node.tag != \"ref-list\":\n header = child.xpath(\"title|label\")\n text = (\n JatsDocumentBackend._get_text(header[0])\n if len(header) > 0\n else DEFAULT_HEADER_REFERENCES\n )\n new_parent = doc.add_heading(text=text, parent=parent)\n new_parent = doc.add_group(\n parent=new_parent, label=GroupLabel.LIST, name=\"list\"\n )\n elif child.tag == \"element-citation\":\n text = self._parse_element_citation(child)\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"mixed-citation\":\n text = JatsDocumentBackend._get_text(child).strip()\n self._add_citation(doc, parent, text)\n stop_walk = True\n elif child.tag == \"tex-math\":\n self._add_equation(doc, parent, child)\n stop_walk = True\n elif child.tag == \"inline-formula\":\n # TODO: address inline formulas when supported by docling-core\n stop_walk = True\n\n # step into child\n if not stop_walk:\n new_text = self._walk_linear(doc, new_parent, child)\n if not (node.getparent().tag == \"p\" and node.tag in flush_tags):\n node_text += new_text\n if child.tag in (\"sec\", \"ack\") and text:\n self.hlevel -= 1\n\n # pick up the tail text\n node_text += child.tail.replace(\"\\n\", \" \") if child.tail else \"\"\n\n # create paragraph\n if node.tag == \"p\" and node_text.strip():\n doc.add_text(label=DocItemLabel.TEXT, text=node_text.strip(), parent=parent)\n return \"\"\n else:\n # backpropagate the text\n return node_text", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1260,7 +1260,7 @@ { "text": "from typing_extensions import TypedDict, override\nclass Abstract(TypedDict):\n label: str\n content: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1280,7 +1280,7 @@ { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Author(TypedDict):\n name: str\n affiliation_names: list[str]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1300,7 +1300,7 @@ { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Citation(TypedDict):\n author_names: str\n title: str\n source: str\n year: str\n volume: str\n page: str\n pub_id: str\n publisher_name: str\n publisher_loc: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1320,7 +1320,7 @@ { "text": "from io import BytesIO\nfrom typing_extensions import TypedDict, override\nclass Table(TypedDict):\n label: str\n caption: str\n content: str", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -1340,7 +1340,7 @@ { "text": "from typing_extensions import TypedDict, override\nclass XMLComponents(TypedDict):\n title: str\n authors: list[Author]\n abstract: list[Abstract]", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": 
"docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", diff --git a/test/data/chunker_repo/TypeScript/repo_out_chunks.json b/test/data/chunker_repo/TypeScript/repo_out_chunks.json index bfa97bc2..c70a20dc 100644 --- a/test/data/chunker_repo/TypeScript/repo_out_chunks.json +++ b/test/data/chunker_repo/TypeScript/repo_out_chunks.json @@ -3,7 +3,7 @@ { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isResolved(): boolean {\n return !!this.resolvedAt || !!this.parentComment?.isResolved;\n }", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -23,7 +23,7 @@ { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public get isReply() {\n return !!this.parentCommentId;\n }", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -43,7 +43,7 @@ { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public resolve() {\n return this.store.rootStore.comments.resolve(this.id);\n }", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -63,7 +63,7 @@ { "text": "import Model from \"./base/Model\";\nclass Comment extends Model\n public unresolve() {\n return this.store.rootStore.comments.unresolve(this.id);\n }", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -83,7 +83,7 @@ { "text": "import invariant from \"invariant\";\nimport uniq from \"lodash/uniq\";\nimport { action, computed, observable } from \"mobx\";\nimport { Pagination } from \"@shared/constants\";\nimport type { ProsemirrorData, ReactionSummary } from \"@shared/types\";\nimport User from \"~/models/User\";\nimport { client } from \"~/utils/ApiClient\";\nimport Document from \"./Document\";\n\nimport Field from \"./decorators/Field\";\nimport Relation from \"./decorators/Relation\";\n\nexport default Comment;", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -100,7 +100,7 @@ { "text": "import Group from \"./Group\";\nimport Model from \"./base/Model\";\nimport Relation from \"./decorators/Relation\";\nimport User from \"./User\";\n\nclass GroupUser extends Model {\n static modelName = \"GroupUser\";\n\n /** The ID of the user. */\n userId: string;\n\n /** The user that belongs to the group. */\n @Relation(() => User, { onDelete: \"cascade\" })\n user: User;\n\n /** The ID of the group. */\n groupId: string;\n\n /** The group that the user belongs to. 
*/\n @Relation(() => Group, { onDelete: \"cascade\" })\n group: Group;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -120,7 +120,7 @@ { "text": "export default GroupUser;", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -137,7 +137,7 @@ { "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction resolve(value: any, context: ActionContext): T {\n return typeof value === \"function\" ? value(context) : value;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -157,7 +157,7 @@ { "text": "import {\n Action,\n ActionContext,\n ActionV2,\n ActionV2Group,\n ActionV2Separator as TActionV2Separator,\n ActionV2Variant,\n ActionV2WithChildren,\n CommandBarAction,\n ExternalLinkActionV2,\n InternalLinkActionV2,\n MenuExternalLink,\n MenuInternalLink,\n MenuItem,\n MenuItemButton,\n MenuItemWithChildren,\n} from \"~/types\";\nfunction hasVisibleItems(items: MenuItem[]) {\n const applicableTypes = [\"button\", \"link\", \"route\", \"group\", \"submenu\"];\n return items.some(\n (item) => applicableTypes.includes(item.type) && item.visible\n );\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", @@ -177,7 +177,7 @@ { "text": "import flattenDeep from \"lodash/flattenDeep\";\nimport { toast } from \"sonner\";\nimport { Optional } from \"utility-types\";\nimport { v4 as uuidv4 } from \"uuid\";\n\nimport Analytics from \"~/utils/Analytics\";\nimport history from \"~/utils/history\";\n\nexport function createAction(definition: Optional): Action {\n return {\n ...definition,\n perform: definition.perform\n ? (context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? \"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform?.(context);\n }\n : undefined,\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function actionToMenuItem(\n action: Action,\n context: ActionContext\n): MenuItemButton | MenuExternalLink | MenuInternalLink | MenuItemWithChildren {\n const resolvedIcon = resolve>(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const visible = action.visible ? action.visible(context) : true;\n const title = resolve(action.name, context);\n const icon =\n resolvedIcon && action.iconInContextMenu !== false\n ? 
resolvedIcon\n : undefined;\n\n if (resolvedChildren) {\n const items = resolvedChildren\n .map((a) => actionToMenuItem(a, context))\n .filter(Boolean)\n .filter((a) => a.visible);\n\n return {\n type: \"submenu\",\n title,\n icon,\n items,\n visible: visible && items.length > 0,\n };\n }\n\n if (action.to) {\n return typeof action.to === \"string\"\n ? {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n selected: action.selected?.(context),\n }\n : {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.to,\n selected: action.selected?.(context),\n };\n }\n\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performAction(action, context),\n selected: action.selected?.(context),\n };\n}\n\nexport function actionToKBar(\n action: Action,\n context: ActionContext\n): CommandBarAction[] {\n if (typeof action.visible === \"function\" && !action.visible(context)) {\n return [];\n }\n\n const resolvedIcon = resolve(action.icon, context);\n const resolvedChildren = resolve(action.children, context);\n const resolvedSection = resolve(action.section, context);\n const resolvedName = resolve(action.name, context);\n const resolvedPlaceholder = resolve(action.placeholder, context);\n const children = resolvedChildren\n ? flattenDeep(resolvedChildren.map((a) => actionToKBar(a, context))).filter(\n (a) => !!a\n )\n : [];\n\n const sectionPriority =\n typeof action.section !== \"string\" && \"priority\" in action.section\n ? ((action.section.priority as number) ?? 0)\n : 0;\n\n return [\n {\n id: action.id,\n name: resolvedName,\n analyticsName: action.analyticsName,\n section: resolvedSection,\n placeholder: resolvedPlaceholder,\n keywords: action.keywords ?? \"\",\n shortcut: action.shortcut || [],\n icon: resolvedIcon,\n priority: (1 + (action.priority ?? 0)) * (1 + (sectionPriority ?? 0)),\n perform:\n action.perform || action.to\n ? () => performAction(action, context)\n : undefined,\n },\n ].concat(\n // @ts-expect-error ts-migrate(2769) FIXME: No overload matches this call.\n children.map((child) => ({ ...child, parent: child.parent ?? action.id }))\n );\n}\n\nexport async function performAction(action: Action, context: ActionContext) {\n const result = action.perform\n ? action.perform(context)\n : action.to\n ? typeof action.to === \"string\"\n ? history.push(action.to)\n : window.open(action.to.url, action.to.target)\n : undefined;\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}\n\n/** Actions V2 */\n\nexport const ActionV2Separator: TActionV2Separator = {\n type: \"action_separator\",\n};\n\nexport function createActionV2(\n definition: Optional, \"id\">\n): ActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"action\",\n perform: definition.perform\n ? (context) => {\n // We must use the specific analytics name here as the action name is\n // translated and potentially contains user strings.\n if (definition.analyticsName) {\n Analytics.track(\"perform_action\", definition.analyticsName, {\n context: context.isButton\n ? \"button\"\n : context.isCommandBar\n ? \"commandbar\"\n : \"contextmenu\",\n });\n }\n return definition.perform(context);\n }\n : () => {},\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createInternalLinkActionV2(\n definition: Optional, \"id\">\n): InternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"internal_link\",\n id: definition.id ?? 
uuidv4(),\n };\n}\n\nexport function createExternalLinkActionV2(\n definition: Optional, \"id\">\n): ExternalLinkActionV2 {\n return {\n ...definition,\n type: \"action\",\n variant: \"external_link\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2WithChildren(\n definition: Optional, \"id\">\n): ActionV2WithChildren {\n return {\n ...definition,\n type: \"action\",\n variant: \"action_with_children\",\n id: definition.id ?? uuidv4(),\n };\n}\n\nexport function createActionV2Group(\n definition: Omit\n): ActionV2Group {\n return {\n ...definition,\n type: \"action_group\",\n };\n}\n\nexport function createRootMenuAction(\n actions: (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n): ActionV2WithChildren {\n return {\n id: uuidv4(),\n type: \"action\",\n variant: \"action_with_children\",\n name: \"root_action\",\n section: \"Root\",\n children: actions,\n };\n}\n\nexport function actionV2ToMenuItem(\n action: ActionV2Variant | ActionV2Group | TActionV2Separator,\n context: ActionContext\n): MenuItem {\n switch (action.type) {\n case \"action\": {\n const title = resolve(action.name, context);\n const visible = resolve(action.visible, context);\n const icon =\n !!action.icon && action.iconInContextMenu !== false\n ? action.icon\n : undefined;\n\n switch (action.variant) {\n case \"action\":\n return {\n type: \"button\",\n title,\n icon,\n visible,\n dangerous: action.dangerous,\n onClick: () => performActionV2(action, context),\n };\n\n case \"internal_link\":\n return {\n type: \"route\",\n title,\n icon,\n visible,\n to: action.to,\n };\n\n case \"external_link\":\n return {\n type: \"link\",\n title,\n icon,\n visible,\n href: action.target\n ? { url: action.url, target: action.target }\n : action.url,\n };\n\n case \"action_with_children\": {\n const children = resolve<\n (ActionV2Variant | ActionV2Group | TActionV2Separator)[]\n >(action.children, context);\n const subMenuItems = children.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"submenu\",\n title,\n icon,\n items: subMenuItems,\n visible: visible && hasVisibleItems(subMenuItems),\n };\n }\n\n default:\n throw Error(\"invalid action variant\");\n }\n }\n\n case \"action_group\": {\n const groupItems = action.actions.map((a) =>\n actionV2ToMenuItem(a, context)\n );\n return {\n type: \"group\",\n title: resolve(action.name, context),\n visible: hasVisibleItems(groupItems),\n items: groupItems,\n };\n }\n\n case \"action_separator\":\n return { type: \"separator\" };\n }\n}\n\nexport async function performActionV2(\n action: ActionV2,\n context: ActionContext\n) {\n const result = action.perform(context);\n\n if (result instanceof Promise) {\n return result.catch((err: Error) => {\n toast.error(err.message);\n });\n }\n\n return result;\n}", "meta": { - "schema_name": "docling_core.transforms.chunker.DocMeta", + "schema_name": "docling_core.transforms.chunker.CodeDocMeta", "version": "1.0.0", "origin": { "mimetype": "text/plain", diff --git a/test/data/doc/2408.09869v3_enriched.dt b/test/data/doc/2408.09869v3_enriched.dt index 044ba9dd..b87f9c32 100644 --- a/test/data/doc/2408.09869v3_enriched.dt +++ b/test/data/doc/2408.09869v3_enriched.dt @@ -27,8 +27,8 @@ . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. 
Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. -<_unknown_>from docling.document_converter import DocumentConverter Large -<_unknown_>source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +<_Python_>from docling.document_converter import DocumentConverter Large +<_Python_>source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. 3 Processing pipeline Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. 
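[Note on the fixture updates in this series: the ground-truth files above now assert the CodeDocMeta schema emitted by the CodeChunker added earlier in this patch set, and the .dt fixtures now carry detected code languages. A minimal usage sketch follows; it is not part of the patch. The concrete class name PythonFunctionChunker is hypothetical (see language_code_chunkers.py for the actual exports), and the tokenizer helper is assumed from docling-core's existing chunker tooling.]

    from docling_core.transforms.chunker.language_code_chunkers import (
        PythonFunctionChunker,  # hypothetical name; use the Python chunker this patch exports
    )
    from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
    from docling_core.types import DoclingDocument

    # Load a DoclingDocument whose texts include DocItemLabel.CODE items,
    # e.g. a repo document like those under test/data/chunker_repo/repos/.
    doc = DoclingDocument.load_from_json("repo_doc.json")

    # The tokenizer supplies the token budget used when splitting oversized
    # functions into smaller chunks.
    tokenizer = HuggingFaceTokenizer.from_pretrained(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

    chunker = PythonFunctionChunker(tokenizer=tokenizer)
    for chunk in chunker.chunk(doc):
        # Each CodeChunk carries CodeDocMeta, which is what the
        # repo_out_chunks.json ground-truth files above now expect.
        print(chunk.meta.schema_name, len(chunk.text))

[The exact constructor arguments of the language-specific chunkers may differ; this only illustrates the chunk(dl_doc) iteration pattern that produces the fixtures above.]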
diff --git a/test/data/doc/2408.09869v3_enriched.dt.json b/test/data/doc/2408.09869v3_enriched.dt.json index 0051e6b1..5b3c9eaf 100644 --- a/test/data/doc/2408.09869v3_enriched.dt.json +++ b/test/data/doc/2408.09869v3_enriched.dt.json @@ -1350,7 +1350,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/25", @@ -1381,7 +1381,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/26", diff --git a/test/data/doc/2408.09869v3_enriched.json b/test/data/doc/2408.09869v3_enriched.json index 054eeabe..a91e1910 100644 --- a/test/data/doc/2408.09869v3_enriched.json +++ b/test/data/doc/2408.09869v3_enriched.json @@ -1337,7 +1337,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", @@ -1368,7 +1368,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/24", diff --git a/test/data/doc/2408.09869v3_enriched.out.dt b/test/data/doc/2408.09869v3_enriched.out.dt index 7faeee2f..516f0c04 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt +++ b/test/data/doc/2408.09869v3_enriched.out.dt @@ -27,8 +27,8 @@ . All required model assets 1 are downloaded to a local huggingface datasets cache on first use, unless you choose to pre-install the model assets in advance. Docling provides an easy code interface to convert PDF documents from file system, URLs or binary streams, and retrieve the output in either JSON or Markdown format. For convenience, separate methods are offered to convert single documents or batches of documents. A basic usage example is illustrated below. Further examples are available in the Doclign code repository. -<_unknown_>from docling.document_converter import DocumentConverter Large -<_unknown_>source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" +<_Python_>from docling.document_converter import DocumentConverter Large +<_Python_>source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL converter = DocumentConverter() result = converter.convert_single(source) print(result.render_as_markdown()) # output: "## DocLayNet: A Human -Annotated Dataset for Document -Layout Analysis [...]" Optionally, you can configure custom pipeline features and runtime options, such as turning on or off features (e.g. OCR, table structure recognition), enforcing limits on the input document size, and defining the budget of CPU threads. Advanced usage examples and options are documented in the README file. Docling also provides a Dockerfile to demonstrate how to install and run it inside a container. 3 Processing pipeline Docling implements a linear pipeline of operations, which execute sequentially on each given document (see Fig. 1). Each document is first parsed by a PDF backend, which retrieves the programmatic text tokens, consisting of string content and its coordinates on the page, and also renders a bitmap image of each page to support downstream operations. Then, the standard model pipeline applies a sequence of AI models independently on every page in the document to extract features and content, such as layout and table structures. 
Finally, the results from all pages are aggregated and passed through a post-processing stage, which augments metadata, detects the document language, infers reading-order and eventually assembles a typed document object which can be serialized to JSON or Markdown. diff --git a/test/data/doc/2408.09869v3_enriched.out.dt.json b/test/data/doc/2408.09869v3_enriched.out.dt.json index 7d3e159a..d9f25224 100644 --- a/test/data/doc/2408.09869v3_enriched.out.dt.json +++ b/test/data/doc/2408.09869v3_enriched.out.dt.json @@ -1350,7 +1350,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/25", @@ -1381,7 +1381,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/26", diff --git a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json index 7bbddf7b..b177187b 100644 --- a/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json +++ b/test/data/doc/2408.09869v3_enriched_p2_p3_p5.gt.json @@ -610,7 +610,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/13", @@ -641,7 +641,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/14", diff --git a/test/data/doc/concatenated.json b/test/data/doc/concatenated.json index 47fe4990..52fdbf6d 100644 --- a/test/data/doc/concatenated.json +++ b/test/data/doc/concatenated.json @@ -9352,7 +9352,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/314", @@ -9446,7 +9446,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/321", diff --git a/test/data/doc/constructed_doc.added_extracted_doc.json.gt b/test/data/doc/constructed_doc.added_extracted_doc.json.gt index 4013747b..720dea29 100644 --- a/test/data/doc/constructed_doc.added_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.added_extracted_doc.json.gt @@ -732,7 +732,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -826,7 +826,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.appended_child.json.gt b/test/data/doc/constructed_doc.appended_child.json.gt index 74b6fba7..57e55914 100644 --- a/test/data/doc/constructed_doc.appended_child.json.gt +++ b/test/data/doc/constructed_doc.appended_child.json.gt @@ -561,7 +561,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -655,7 +655,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.bulk_item_addition.json.gt b/test/data/doc/constructed_doc.bulk_item_addition.json.gt index 257c5b90..d6f7605c 100644 --- a/test/data/doc/constructed_doc.bulk_item_addition.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_addition.json.gt @@ -725,7 +725,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, 
{ "self_ref": "#/texts/16", @@ -819,7 +819,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt index ce4f7c6d..7ef37443 100644 --- a/test/data/doc/constructed_doc.bulk_item_insertion.json.gt +++ b/test/data/doc/constructed_doc.bulk_item_insertion.json.gt @@ -731,7 +731,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -825,7 +825,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.deleted_group.json.gt b/test/data/doc/constructed_doc.deleted_group.json.gt index 549ae6a0..0eb91b7e 100644 --- a/test/data/doc/constructed_doc.deleted_group.json.gt +++ b/test/data/doc/constructed_doc.deleted_group.json.gt @@ -558,7 +558,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -652,7 +652,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.deleted_items_range.json.gt b/test/data/doc/constructed_doc.deleted_items_range.json.gt index 91b37357..7bee5eb3 100644 --- a/test/data/doc/constructed_doc.deleted_items_range.json.gt +++ b/test/data/doc/constructed_doc.deleted_items_range.json.gt @@ -561,7 +561,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -655,7 +655,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.deleted_picture.json.gt b/test/data/doc/constructed_doc.deleted_picture.json.gt index 85890f23..c6580208 100644 --- a/test/data/doc/constructed_doc.deleted_picture.json.gt +++ b/test/data/doc/constructed_doc.deleted_picture.json.gt @@ -555,7 +555,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -649,7 +649,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.deleted_table.json.gt b/test/data/doc/constructed_doc.deleted_table.json.gt index 55cb7002..4bdfe31a 100644 --- a/test/data/doc/constructed_doc.deleted_table.json.gt +++ b/test/data/doc/constructed_doc.deleted_table.json.gt @@ -517,7 +517,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/15", @@ -593,7 +593,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/21", diff --git a/test/data/doc/constructed_doc.deleted_text.json.gt b/test/data/doc/constructed_doc.deleted_text.json.gt index 45c03c2a..d074769a 100644 --- a/test/data/doc/constructed_doc.deleted_text.json.gt +++ b/test/data/doc/constructed_doc.deleted_text.json.gt @@ -772,7 +772,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": 
"Python" }, { "self_ref": "#/texts/25", @@ -866,7 +866,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/32", diff --git a/test/data/doc/constructed_doc.dt b/test/data/doc/constructed_doc.dt index 35eb28dc..ef0fcde4 100644 --- a/test/data/doc/constructed_doc.dt +++ b/test/data/doc/constructed_doc.dt @@ -30,7 +30,7 @@ Affiliation 2 item 2 of neighboring list item 1 of sub list Here a code snippet: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") (to be displayed inline) Here a formula: @@ -40,7 +40,7 @@ Affiliation 2 Here a code block: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") Here a formula block: E=mc^2 number1 diff --git a/test/data/doc/constructed_doc.dt.gt b/test/data/doc/constructed_doc.dt.gt index 35eb28dc..ef0fcde4 100644 --- a/test/data/doc/constructed_doc.dt.gt +++ b/test/data/doc/constructed_doc.dt.gt @@ -30,7 +30,7 @@ Affiliation 2 item 2 of neighboring list item 1 of sub list Here a code snippet: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") (to be displayed inline) Here a formula: @@ -40,7 +40,7 @@ Affiliation 2 Here a code block: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") Here a formula block: E=mc^2 number1 diff --git a/test/data/doc/constructed_doc.embedded.json.gt b/test/data/doc/constructed_doc.embedded.json.gt index 4ac0e019..ef757fc9 100644 --- a/test/data/doc/constructed_doc.embedded.json.gt +++ b/test/data/doc/constructed_doc.embedded.json.gt @@ -783,7 +783,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/26", @@ -877,7 +877,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/33", diff --git a/test/data/doc/constructed_doc.embedded.yaml.gt b/test/data/doc/constructed_doc.embedded.yaml.gt index 15d93ce3..f44cd6b2 100644 --- a/test/data/doc/constructed_doc.embedded.yaml.gt +++ b/test/data/doc/constructed_doc.embedded.yaml.gt @@ -759,7 +759,7 @@ texts: text: 'Here a code snippet:' - captions: [] children: [] - code_language: unknown + code_language: Python content_layer: body footnotes: [] label: code @@ -829,7 +829,7 @@ texts: text: 'Here a code block:' - captions: [] children: [] - code_language: unknown + code_language: Python content_layer: body footnotes: [] label: code diff --git a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt index fc7a3b94..c6075711 100644 --- a/test/data/doc/constructed_doc.extracted_with_deletion.json.gt +++ b/test/data/doc/constructed_doc.extracted_with_deletion.json.gt @@ -638,7 +638,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -732,7 +732,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt index a31af507..290a78e9 100644 --- a/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt +++ b/test/data/doc/constructed_doc.inserted_extracted_doc.json.gt @@ -825,7 +825,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": 
"#/texts/16", @@ -919,7 +919,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt index 2722426c..734fc59f 100644 --- a/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_items_with_insert_*.json.gt @@ -678,7 +678,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -772,7 +772,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt index 42044db6..b44fe290 100644 --- a/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt +++ b/test/data/doc/constructed_doc.inserted_list_items_with_insert_*.json.gt @@ -718,7 +718,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -812,7 +812,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.inserted_text.json.gt b/test/data/doc/constructed_doc.inserted_text.json.gt index 6c4285f4..175922e9 100644 --- a/test/data/doc/constructed_doc.inserted_text.json.gt +++ b/test/data/doc/constructed_doc.inserted_text.json.gt @@ -789,7 +789,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/26", @@ -883,7 +883,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/33", diff --git a/test/data/doc/constructed_doc.manipulated_table.json.gt b/test/data/doc/constructed_doc.manipulated_table.json.gt index e65dd7d8..51ffeb81 100644 --- a/test/data/doc/constructed_doc.manipulated_table.json.gt +++ b/test/data/doc/constructed_doc.manipulated_table.json.gt @@ -731,7 +731,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -825,7 +825,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_doc.referenced.json.gt b/test/data/doc/constructed_doc.referenced.json.gt index 8a11418f..21a7f022 100644 --- a/test/data/doc/constructed_doc.referenced.json.gt +++ b/test/data/doc/constructed_doc.referenced.json.gt @@ -783,7 +783,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/26", @@ -877,7 +877,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/33", diff --git a/test/data/doc/constructed_doc.referenced.yaml.gt b/test/data/doc/constructed_doc.referenced.yaml.gt index bb291c11..514fd1c1 100644 --- a/test/data/doc/constructed_doc.referenced.yaml.gt +++ b/test/data/doc/constructed_doc.referenced.yaml.gt @@ -759,7 +759,7 @@ texts: text: 'Here a code snippet:' - 
captions: [] children: [] - code_language: unknown + code_language: Python content_layer: body footnotes: [] label: code @@ -829,7 +829,7 @@ texts: text: 'Here a code block:' - captions: [] children: [] - code_language: unknown + code_language: Python content_layer: body footnotes: [] label: code diff --git a/test/data/doc/constructed_doc.replaced_item.json.gt b/test/data/doc/constructed_doc.replaced_item.json.gt index 91b37357..7bee5eb3 100644 --- a/test/data/doc/constructed_doc.replaced_item.json.gt +++ b/test/data/doc/constructed_doc.replaced_item.json.gt @@ -561,7 +561,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/16", @@ -655,7 +655,7 @@ "captions": [], "references": [], "footnotes": [], - "code_language": "unknown" + "code_language": "Python" }, { "self_ref": "#/texts/23", diff --git a/test/data/doc/constructed_document.yaml.dt b/test/data/doc/constructed_document.yaml.dt index 3d7de1c7..14df9817 100644 --- a/test/data/doc/constructed_document.yaml.dt +++ b/test/data/doc/constructed_document.yaml.dt @@ -30,7 +30,7 @@ Affiliation 2 item 2 of neighboring list item 1 of sub list Here a code snippet: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") (to be displayed inline) Here a formula: @@ -40,7 +40,7 @@ Affiliation 2 Here a code block: -<_unknown_>print("Hello world") +<_Python_>print("Hello world") Here a formula block: E=mc^2 number1 From 377d5ce596e083dead9f367e2b9fc8d7a6fae7c2 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Thu, 23 Oct 2025 21:28:38 -0400 Subject: [PATCH 09/12] DCO Remediation Commit for Bridget McGinn I, Bridget McGinn , hereby add my Signed-off-by to this commit: 46bb88ae3a00ebbdd33336e51b51e030c748425b I, Bridget McGinn , hereby add my Signed-off-by to this commit: 10e9ed8a0f28bdd4ae6156a84f19435319f4a5ce I, Bridget McGinn , hereby add my Signed-off-by to this commit: d9827c749b72ecc902c66a24eb7fcb4db1b615fa I, Bridget McGinn , hereby add my Signed-off-by to this commit: 814dc6161134752b08f699eeef58535248893768 Signed-off-by: Bridget McGinn From d12fbc6c9161f320621610ac807835b1b622de2f Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Fri, 24 Oct 2025 11:23:02 -0400 Subject: [PATCH 10/12] update uv.lock Signed-off-by: Bridget McGinn --- uv.lock | 641 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 469 insertions(+), 172 deletions(-) diff --git a/uv.lock b/uv.lock index 921be35a..1532f304 100644 --- a/uv.lock +++ b/uv.lock @@ -737,11 +737,15 @@ dependencies = [ { name = "pydantic" }, { name = "pyyaml" }, { name = "tabulate" }, - { name = "tree-sitter" }, - { name = "tree-sitter-c" }, + { name = "tree-sitter", version = "0.23.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "tree-sitter", version = "0.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tree-sitter-c", version = "0.23.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "tree-sitter-c", version = "0.24.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "tree-sitter-java" }, - { name = "tree-sitter-javascript" }, - { name = "tree-sitter-python" }, + { name = "tree-sitter-javascript", version = "0.23.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = 
"tree-sitter-javascript", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tree-sitter-python", version = "0.23.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "tree-sitter-python", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "tree-sitter-typescript" }, { name = "typer" }, { name = "typing-extensions" }, @@ -794,13 +798,13 @@ requires-dist = [ { name = "tabulate", specifier = ">=0.9.0,<0.10.0" }, { name = "tiktoken", marker = "extra == 'chunking-openai'", specifier = ">=0.9.0,<0.13.0" }, { name = "transformers", marker = "extra == 'chunking'", specifier = ">=4.34.0,<5.0.0" }, - { name = "tree-sitter", specifier = "==0.23.2" }, - { name = "tree-sitter-c", specifier = "==0.23.4" }, - { name = "tree-sitter-java", specifier = "==0.23.5" }, - { name = "tree-sitter-javascript", specifier = "==0.23.1" }, - { name = "tree-sitter-python", specifier = "==0.23.6" }, - { name = "tree-sitter-typescript", specifier = "==0.23.2" }, - { name = "typer", specifier = ">=0.12.5,<0.17.0" }, + { name = "tree-sitter", specifier = ">=0.23.2,<1.0.0" }, + { name = "tree-sitter-c", specifier = ">=0.23.4,<1.0.0" }, + { name = "tree-sitter-java", specifier = ">=0.23.5,<1.0.0" }, + { name = "tree-sitter-javascript", specifier = ">=0.23.1,<1.0.0" }, + { name = "tree-sitter-python", specifier = ">=0.23.6,<1.0.0" }, + { name = "tree-sitter-typescript", specifier = ">=0.23.2,<1.0.0" }, + { name = "typer", specifier = ">=0.12.5,<0.20.0" }, { name = "typing-extensions", specifier = ">=4.12.2,<5.0.0" }, ] provides-extras = ["chunking", "chunking-openai"] @@ -969,7 +973,7 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.35.3" +version = "0.36.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", version = "3.19.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -982,9 +986,9 @@ dependencies = [ { name = "tqdm" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/10/7e/a0a97de7c73671863ca6b3f61fa12518caf35db37825e43d63a70956738c/huggingface_hub-0.35.3.tar.gz", hash = "sha256:350932eaa5cc6a4747efae85126ee220e4ef1b54e29d31c3b45c5612ddf0b32a", size = 461798, upload-time = "2025-09-29T14:29:58.625Z" } +sdist = { url = "https://files.pythonhosted.org/packages/98/63/4910c5fa9128fdadf6a9c5ac138e8b1b6cee4ca44bf7915bbfbce4e355ee/huggingface_hub-0.36.0.tar.gz", hash = "sha256:47b3f0e2539c39bf5cde015d63b72ec49baff67b6931c3d97f3f84532e2b8d25", size = 463358, upload-time = "2025-10-23T12:12:01.413Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/31/a0/651f93d154cb72323358bf2bbae3e642bdb5d2f1bfc874d096f7cb159fa0/huggingface_hub-0.35.3-py3-none-any.whl", hash = "sha256:0e3a01829c19d86d03793e4577816fe3bdfc1602ac62c7fb220d593d351224ba", size = 564262, upload-time = "2025-09-29T14:29:55.813Z" }, + { url = "https://files.pythonhosted.org/packages/cb/bd/1a875e0d592d447cbc02805fd3fe0f497714d6a2583f59d14fa9ebad96eb/huggingface_hub-0.36.0-py3-none-any.whl", hash = "sha256:7bcc9ad17d5b3f07b57c78e79d527102d08313caa278a641993acddcb894548d", size = 566094, upload-time = "2025-10-23T12:11:59.557Z" }, ] [[package]] @@ -1021,11 +1025,28 @@ wheels = [ name = "iniconfig" version = "2.1.0" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < 
'3.10'", +] sdist = { url = "https://files.pythonhosted.org/packages/f2/97/ebf4da567aa6827c909642694d71c9fcf53e5b504f2d96afea02718862f3/iniconfig-2.1.0.tar.gz", hash = "sha256:3abbd2e30b36733fee78f9c7f7308f2d0050e88f0087fd25c2645f63c773e1c7", size = 4793, upload-time = "2025-03-19T20:09:59.721Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "iniconfig" +version = "2.3.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/72/34/14ca021ce8e5dfedc35312d08ba8bf51fdd999c576889fc2c24cb97f4f10/iniconfig-2.3.0.tar.gz", hash = "sha256:c76315c77db068650d49c5b56314774a7804df16fee4402c1f19d6d15d8c4730", size = 20503, upload-time = "2025-10-18T21:55:43.219Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cb/b1/3846dd7f199d53cb17f49cba7e651e9ce294d8497c8c150530ed11865bb8/iniconfig-2.3.0-py3-none-any.whl", hash = "sha256:f631c04d2c48c52b84d0d0549c99ff3859c98df65b3101406327ecc7d53fbf12", size = 7484, upload-time = "2025-10-18T21:55:41.639Z" }, +] + [[package]] name = "invoke" version = "2.2.1" @@ -1037,7 +1058,7 @@ wheels = [ [[package]] name = "ipykernel" -version = "6.30.1" +version = "6.31.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "appnope", marker = "sys_platform == 'darwin'" }, @@ -1047,7 +1068,8 @@ dependencies = [ { name = "ipython", version = "8.37.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.10.*'" }, { name = "ipython", version = "9.6.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "jupyter-client" }, - { name = "jupyter-core" }, + { name = "jupyter-core", version = "5.8.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "jupyter-core", version = "5.9.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "matplotlib-inline" }, { name = "nest-asyncio" }, { name = "packaging" }, @@ -1056,9 +1078,9 @@ dependencies = [ { name = "tornado" }, { name = "traitlets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/bb/76/11082e338e0daadc89c8ff866185de11daf67d181901038f9e139d109761/ipykernel-6.30.1.tar.gz", hash = "sha256:6abb270161896402e76b91394fcdce5d1be5d45f456671e5080572f8505be39b", size = 166260, upload-time = "2025-08-04T15:47:35.018Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a5/1d/d5ba6edbfe6fae4c3105bca3a9c889563cc752c7f2de45e333164c7f4846/ipykernel-6.31.0.tar.gz", hash = "sha256:2372ce8bc1ff4f34e58cafed3a0feb2194b91fc7cad0fc72e79e47b45ee9e8f6", size = 167493, upload-time = "2025-10-20T11:42:39.948Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/fc/c7/b445faca8deb954fe536abebff4ece5b097b923de482b26e78448c89d1dd/ipykernel-6.30.1-py3-none-any.whl", hash = "sha256:aa6b9fb93dca949069d8b85b6c79b2518e32ac583ae9c7d37c51d119e18b3fb4", size = 117484, upload-time = "2025-08-04T15:47:32.622Z" }, + { url = 
"https://files.pythonhosted.org/packages/f6/d8/502954a4ec0efcf264f99b65b41c3c54e65a647d9f0d6f62cd02227d242c/ipykernel-6.31.0-py3-none-any.whl", hash = "sha256:abe5386f6ced727a70e0eb0cf1da801fa7c5fa6ff82147747d5a0406cd8c94af", size = 117003, upload-time = "2025-10-20T11:42:37.502Z" }, ] [[package]] @@ -1245,7 +1267,8 @@ dependencies = [ { name = "jsonschema-specifications" }, { name = "referencing", version = "0.36.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "referencing", version = "0.37.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "rpds-py" }, + { name = "rpds-py", version = "0.27.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "rpds-py", version = "0.28.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/74/69/f7185de793a29082a9f3c7728268ffb31cb5095131a9c139a74078e27336/jsonschema-4.25.1.tar.gz", hash = "sha256:e4a9655ce0da0c0b67a085847e00a3a51449e1157f4f75e9fb5aa545e122eb85", size = 357342, upload-time = "2025-08-18T17:03:50.038Z" } wheels = [ @@ -1271,7 +1294,8 @@ version = "8.6.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "importlib-metadata", marker = "python_full_version < '3.10'" }, - { name = "jupyter-core" }, + { name = "jupyter-core", version = "5.8.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "jupyter-core", version = "5.9.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "python-dateutil" }, { name = "pyzmq" }, { name = "tornado" }, @@ -1286,17 +1310,37 @@ wheels = [ name = "jupyter-core" version = "5.8.1" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] dependencies = [ { name = "platformdirs", version = "4.4.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "platformdirs", version = "4.5.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pywin32", marker = "platform_python_implementation != 'PyPy' and sys_platform == 'win32'" }, - { name = "traitlets" }, + { name = "pywin32", marker = "python_full_version < '3.10' and platform_python_implementation != 'PyPy' and sys_platform == 'win32'" }, + { name = "traitlets", marker = "python_full_version < '3.10'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/99/1b/72906d554acfeb588332eaaa6f61577705e9ec752ddb486f302dafa292d9/jupyter_core-5.8.1.tar.gz", hash = "sha256:0a5f9706f70e64786b75acba995988915ebd4601c8a52e534a40b51c95f59941", size = 88923, upload-time = "2025-05-27T07:38:16.655Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" }, ] +[[package]] +name = "jupyter-core" +version = "5.9.1" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +dependencies = [ + { name = "platformdirs", version = "4.5.0", source = { 
registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "traitlets", marker = "python_full_version >= '3.10'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/49/9d1284d0dc65e2c757b74c6687b6d319b02f822ad039e5c512df9194d9dd/jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508", size = 89814, upload-time = "2025-10-16T19:19:18.444Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" }, +] + [[package]] name = "keyring" version = "25.6.0" @@ -1359,14 +1403,14 @@ wheels = [ [[package]] name = "matplotlib-inline" -version = "0.1.7" +version = "0.2.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "traitlets" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/99/5b/a36a337438a14116b16480db471ad061c36c3694df7c2084a0da7ba538b7/matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90", size = 8159, upload-time = "2024-04-15T13:44:44.803Z" } +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/8f/8e/9ad090d3553c280a8060fbf6e24dc1c0c29704ee7d1c372f0c174aa59285/matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca", size = 9899, upload-time = "2024-04-15T13:44:43.265Z" }, + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, ] [[package]] @@ -2206,18 +2250,18 @@ wheels = [ [[package]] name = "psutil" -version = "7.1.0" +version = "7.1.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b3/31/4723d756b59344b643542936e37a31d1d3204bcdc42a7daa8ee9eb06fb50/psutil-7.1.0.tar.gz", hash = "sha256:655708b3c069387c8b77b072fc429a57d0e214221d01c0a772df7dfedcb3bcd2", size = 497660, upload-time = "2025-09-17T20:14:52.902Z" } +sdist = { url = "https://files.pythonhosted.org/packages/89/fc/889242351a932d6183eec5df1fc6539b6f36b6a88444f1e63f18668253aa/psutil-7.1.1.tar.gz", hash = "sha256:092b6350145007389c1cfe5716050f02030a05219d90057ea867d18fe8d372fc", size = 487067, upload-time = "2025-10-19T15:43:59.373Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/46/62/ce4051019ee20ce0ed74432dd73a5bb087a6704284a470bb8adff69a0932/psutil-7.1.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:76168cef4397494250e9f4e73eb3752b146de1dd950040b29186d0cce1d5ca13", size = 245242, upload-time = "2025-09-17T20:14:56.126Z" }, - { url = "https://files.pythonhosted.org/packages/38/61/f76959fba841bf5b61123fbf4b650886dc4094c6858008b5bf73d9057216/psutil-7.1.0-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:5d007560c8c372efdff9e4579c2846d71de737e4605f611437255e81efcca2c5", size = 246682, upload-time = 
"2025-09-17T20:14:58.25Z" }, - { url = "https://files.pythonhosted.org/packages/88/7a/37c99d2e77ec30d63398ffa6a660450b8a62517cabe44b3e9bae97696e8d/psutil-7.1.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:22e4454970b32472ce7deaa45d045b34d3648ce478e26a04c7e858a0a6e75ff3", size = 287994, upload-time = "2025-09-17T20:14:59.901Z" }, - { url = "https://files.pythonhosted.org/packages/9d/de/04c8c61232f7244aa0a4b9a9fbd63a89d5aeaf94b2fc9d1d16e2faa5cbb0/psutil-7.1.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c70e113920d51e89f212dd7be06219a9b88014e63a4cec69b684c327bc474e3", size = 291163, upload-time = "2025-09-17T20:15:01.481Z" }, - { url = "https://files.pythonhosted.org/packages/f4/58/c4f976234bf6d4737bc8c02a81192f045c307b72cf39c9e5c5a2d78927f6/psutil-7.1.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7d4a113425c037300de3ac8b331637293da9be9713855c4fc9d2d97436d7259d", size = 293625, upload-time = "2025-09-17T20:15:04.492Z" }, - { url = "https://files.pythonhosted.org/packages/79/87/157c8e7959ec39ced1b11cc93c730c4fb7f9d408569a6c59dbd92ceb35db/psutil-7.1.0-cp37-abi3-win32.whl", hash = "sha256:09ad740870c8d219ed8daae0ad3b726d3bf9a028a198e7f3080f6a1888b99bca", size = 244812, upload-time = "2025-09-17T20:15:07.462Z" }, - { url = "https://files.pythonhosted.org/packages/bf/e9/b44c4f697276a7a95b8e94d0e320a7bf7f3318521b23de69035540b39838/psutil-7.1.0-cp37-abi3-win_amd64.whl", hash = "sha256:57f5e987c36d3146c0dd2528cd42151cf96cd359b9d67cfff836995cc5df9a3d", size = 247965, upload-time = "2025-09-17T20:15:09.673Z" }, - { url = "https://files.pythonhosted.org/packages/26/65/1070a6e3c036f39142c2820c4b52e9243246fcfc3f96239ac84472ba361e/psutil-7.1.0-cp37-abi3-win_arm64.whl", hash = "sha256:6937cb68133e7c97b6cc9649a570c9a18ba0efebed46d8c5dae4c07fa1b67a07", size = 244971, upload-time = "2025-09-17T20:15:12.262Z" }, + { url = "https://files.pythonhosted.org/packages/51/30/f97f8fb1f9ecfbeae4b5ca738dcae66ab28323b5cfbc96cb5565f3754056/psutil-7.1.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:8fa59d7b1f01f0337f12cd10dbd76e4312a4d3c730a4fedcbdd4e5447a8b8460", size = 244221, upload-time = "2025-10-19T15:44:03.145Z" }, + { url = "https://files.pythonhosted.org/packages/7b/98/b8d1f61ebf35f4dbdbaabadf9208282d8adc820562f0257e5e6e79e67bf2/psutil-7.1.1-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:2a95104eae85d088891716db676f780c1404fc15d47fde48a46a5d61e8f5ad2c", size = 245660, upload-time = "2025-10-19T15:44:05.657Z" }, + { url = "https://files.pythonhosted.org/packages/f0/4a/b8015d7357fefdfe34bc4a3db48a107bae4bad0b94fb6eb0613f09a08ada/psutil-7.1.1-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:98629cd8567acefcc45afe2f4ba1e9290f579eacf490a917967decce4b74ee9b", size = 286963, upload-time = "2025-10-19T15:44:08.877Z" }, + { url = "https://files.pythonhosted.org/packages/3d/3c/b56076bb35303d0733fc47b110a1c9cce081a05ae2e886575a3587c1ee76/psutil-7.1.1-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92ebc58030fb054fa0f26c3206ef01c31c29d67aee1367e3483c16665c25c8d2", size = 290118, upload-time = "2025-10-19T15:44:11.897Z" }, + { url = "https://files.pythonhosted.org/packages/dc/af/c13d360c0adc6f6218bf9e2873480393d0f729c8dd0507d171f53061c0d3/psutil-7.1.1-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:146a704f224fb2ded2be3da5ac67fc32b9ea90c45b51676f9114a6ac45616967", size = 292587, upload-time = "2025-10-19T15:44:14.67Z" }, + { url = "https://files.pythonhosted.org/packages/90/2d/c933e7071ba60c7862813f2c7108ec4cf8304f1c79660efeefd0de982258/psutil-7.1.1-cp37-abi3-win32.whl", hash = "sha256:295c4025b5cd880f7445e4379e6826f7307e3d488947bf9834e865e7847dc5f7", size = 243772, upload-time = "2025-10-19T15:44:16.938Z" }, + { url = "https://files.pythonhosted.org/packages/be/f3/11fd213fff15427bc2853552138760c720fd65032d99edfb161910d04127/psutil-7.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:9b4f17c5f65e44f69bd3a3406071a47b79df45cf2236d1f717970afcb526bcd3", size = 246936, upload-time = "2025-10-19T15:44:18.663Z" }, + { url = "https://files.pythonhosted.org/packages/0a/8d/8a9a45c8b655851f216c1d44f68e3533dc8d2c752ccd0f61f1aa73be4893/psutil-7.1.1-cp37-abi3-win_arm64.whl", hash = "sha256:5457cf741ca13da54624126cd5d333871b454ab133999a9a103fb097a7d7d21a", size = 243944, upload-time = "2025-10-19T15:44:20.666Z" }, ] [[package]] @@ -2258,7 +2302,7 @@ wheels = [ [[package]] name = "pydantic" -version = "2.12.2" +version = "2.12.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, @@ -2266,9 +2310,9 @@ dependencies = [ { name = "typing-extensions" }, { name = "typing-inspection" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/8d/35/d319ed522433215526689bad428a94058b6dd12190ce7ddd78618ac14b28/pydantic-2.12.2.tar.gz", hash = "sha256:7b8fa15b831a4bbde9d5b84028641ac3080a4ca2cbd4a621a661687e741624fd", size = 816358, upload-time = "2025-10-14T15:02:21.842Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/1e/4f0a3233767010308f2fd6bd0814597e3f63f1dc98304a9112b8759df4ff/pydantic-2.12.3.tar.gz", hash = "sha256:1da1c82b0fc140bb0103bc1441ffe062154c8d38491189751ee00fd8ca65ce74", size = 819383, upload-time = "2025-10-17T15:04:21.222Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/6c/98/468cb649f208a6f1279448e6e5247b37ae79cf5e4041186f1e2ef3d16345/pydantic-2.12.2-py3-none-any.whl", hash = "sha256:25ff718ee909acd82f1ff9b1a4acfd781bb23ab3739adaa7144f19a6a4e231ae", size = 460628, upload-time = "2025-10-14T15:02:19.623Z" }, + { url = "https://files.pythonhosted.org/packages/a1/6b/83661fa77dcefa195ad5f8cd9af3d1a7450fd57cc883ad04d65446ac2029/pydantic-2.12.3-py3-none-any.whl", hash = "sha256:6986454a854bc3bc6e5443e1369e06a3a456af9d339eda45510f517d9ea5c6bf", size = 462431, upload-time = "2025-10-17T15:04:19.346Z" }, ] [[package]] @@ -2435,7 +2479,8 @@ source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, { name = "exceptiongroup", marker = "python_full_version < '3.11'" }, - { name = "iniconfig" }, + { name = "iniconfig", version = "2.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, + { name = "iniconfig", version = "2.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "packaging" }, { name = "pluggy" }, { name = "pygments" }, @@ -2737,7 +2782,7 @@ resolution-markers = [ ] dependencies = [ { name = "attrs", marker = "python_full_version < '3.10'" }, - { name = "rpds-py", marker = "python_full_version < '3.10'" }, + { name = "rpds-py", version = "0.27.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, { name = "typing-extensions", marker = "python_full_version < '3.10'" }, ] sdist = { url = 
"https://files.pythonhosted.org/packages/2f/db/98b5c277be99dd18bfd91dd04e1b759cad18d1a338188c936e92f921c7e2/referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa", size = 74744, upload-time = "2025-01-25T08:48:16.138Z" } @@ -2756,7 +2801,7 @@ resolution-markers = [ ] dependencies = [ { name = "attrs", marker = "python_full_version >= '3.10'" }, - { name = "rpds-py", marker = "python_full_version >= '3.10'" }, + { name = "rpds-py", version = "0.28.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, { name = "typing-extensions", marker = "python_full_version >= '3.10' and python_full_version < '3.13'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/22/f5/df4e9027acead3ecc63e50fe1e36aca1523e1719559c499951bb4b53188f/referencing-0.37.0.tar.gz", hash = "sha256:44aefc3142c5b842538163acb373e24cce6632bd54bdb01b21ad5863489f50d8", size = 78036, upload-time = "2025-10-13T15:30:48.871Z" } @@ -2766,124 +2811,124 @@ wheels = [ [[package]] name = "regex" -version = "2025.9.18" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/49/d3/eaa0d28aba6ad1827ad1e716d9a93e1ba963ada61887498297d3da715133/regex-2025.9.18.tar.gz", hash = "sha256:c5ba23274c61c6fef447ba6a39333297d0c247f53059dba0bca415cac511edc4", size = 400917, upload-time = "2025-09-19T00:38:35.79Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7e/d8/7e06171db8e55f917c5b8e89319cea2d86982e3fc46b677f40358223dece/regex-2025.9.18-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:12296202480c201c98a84aecc4d210592b2f55e200a1d193235c4db92b9f6788", size = 484829, upload-time = "2025-09-19T00:35:05.215Z" }, - { url = "https://files.pythonhosted.org/packages/8d/70/bf91bb39e5bedf75ce730ffbaa82ca585584d13335306d637458946b8b9f/regex-2025.9.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:220381f1464a581f2ea988f2220cf2a67927adcef107d47d6897ba5a2f6d51a4", size = 288993, upload-time = "2025-09-19T00:35:08.154Z" }, - { url = "https://files.pythonhosted.org/packages/fe/89/69f79b28365eda2c46e64c39d617d5f65a2aa451a4c94de7d9b34c2dc80f/regex-2025.9.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:87f681bfca84ebd265278b5daa1dcb57f4db315da3b5d044add7c30c10442e61", size = 286624, upload-time = "2025-09-19T00:35:09.717Z" }, - { url = "https://files.pythonhosted.org/packages/44/31/81e62955726c3a14fcc1049a80bc716765af6c055706869de5e880ddc783/regex-2025.9.18-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:34d674cbba70c9398074c8a1fcc1a79739d65d1105de2a3c695e2b05ea728251", size = 780473, upload-time = "2025-09-19T00:35:11.013Z" }, - { url = "https://files.pythonhosted.org/packages/fb/23/07072b7e191fbb6e213dc03b2f5b96f06d3c12d7deaded84679482926fc7/regex-2025.9.18-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:385c9b769655cb65ea40b6eea6ff763cbb6d69b3ffef0b0db8208e1833d4e746", size = 849290, upload-time = "2025-09-19T00:35:12.348Z" }, - { url = "https://files.pythonhosted.org/packages/b3/f0/aec7f6a01f2a112210424d77c6401b9015675fb887ced7e18926df4ae51e/regex-2025.9.18-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8900b3208e022570ae34328712bef6696de0804c122933414014bae791437ab2", size = 897335, upload-time = "2025-09-19T00:35:14.058Z" }, - { url = 
"https://files.pythonhosted.org/packages/cc/90/2e5f9da89d260de7d0417ead91a1bc897f19f0af05f4f9323313b76c47f2/regex-2025.9.18-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c204e93bf32cd7a77151d44b05eb36f469d0898e3fba141c026a26b79d9914a0", size = 789946, upload-time = "2025-09-19T00:35:15.403Z" }, - { url = "https://files.pythonhosted.org/packages/2b/d5/1c712c7362f2563d389be66bae131c8bab121a3fabfa04b0b5bfc9e73c51/regex-2025.9.18-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3acc471d1dd7e5ff82e6cacb3b286750decd949ecd4ae258696d04f019817ef8", size = 780787, upload-time = "2025-09-19T00:35:17.061Z" }, - { url = "https://files.pythonhosted.org/packages/4f/92/c54cdb4aa41009632e69817a5aa452673507f07e341076735a2f6c46a37c/regex-2025.9.18-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6479d5555122433728760e5f29edb4c2b79655a8deb681a141beb5c8a025baea", size = 773632, upload-time = "2025-09-19T00:35:18.57Z" }, - { url = "https://files.pythonhosted.org/packages/db/99/75c996dc6a2231a8652d7ad0bfbeaf8a8c77612d335580f520f3ec40e30b/regex-2025.9.18-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:431bd2a8726b000eb6f12429c9b438a24062a535d06783a93d2bcbad3698f8a8", size = 844104, upload-time = "2025-09-19T00:35:20.259Z" }, - { url = "https://files.pythonhosted.org/packages/1c/f7/25aba34cc130cb6844047dbfe9716c9b8f9629fee8b8bec331aa9241b97b/regex-2025.9.18-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0cc3521060162d02bd36927e20690129200e5ac9d2c6d32b70368870b122db25", size = 834794, upload-time = "2025-09-19T00:35:22.002Z" }, - { url = "https://files.pythonhosted.org/packages/51/eb/64e671beafa0ae29712268421597596d781704973551312b2425831d4037/regex-2025.9.18-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a021217b01be2d51632ce056d7a837d3fa37c543ede36e39d14063176a26ae29", size = 778535, upload-time = "2025-09-19T00:35:23.298Z" }, - { url = "https://files.pythonhosted.org/packages/26/33/c0ebc0b07bd0bf88f716cca240546b26235a07710ea58e271cfe390ae273/regex-2025.9.18-cp310-cp310-win32.whl", hash = "sha256:4a12a06c268a629cb67cc1d009b7bb0be43e289d00d5111f86a2efd3b1949444", size = 264115, upload-time = "2025-09-19T00:35:25.206Z" }, - { url = "https://files.pythonhosted.org/packages/59/39/aeb11a4ae68faaec2498512cadae09f2d8a91f1f65730fe62b9bffeea150/regex-2025.9.18-cp310-cp310-win_amd64.whl", hash = "sha256:47acd811589301298c49db2c56bde4f9308d6396da92daf99cba781fa74aa450", size = 276143, upload-time = "2025-09-19T00:35:26.785Z" }, - { url = "https://files.pythonhosted.org/packages/29/04/37f2d3fc334a1031fc2767c9d89cec13c2e72207c7e7f6feae8a47f4e149/regex-2025.9.18-cp310-cp310-win_arm64.whl", hash = "sha256:16bd2944e77522275e5ee36f867e19995bcaa533dcb516753a26726ac7285442", size = 268473, upload-time = "2025-09-19T00:35:28.39Z" }, - { url = "https://files.pythonhosted.org/packages/58/61/80eda662fc4eb32bfedc331f42390974c9e89c7eac1b79cd9eea4d7c458c/regex-2025.9.18-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:51076980cd08cd13c88eb7365427ae27f0d94e7cebe9ceb2bb9ffdae8fc4d82a", size = 484832, upload-time = "2025-09-19T00:35:30.011Z" }, - { url = "https://files.pythonhosted.org/packages/a6/d9/33833d9abddf3f07ad48504ddb53fe3b22f353214bbb878a72eee1e3ddbf/regex-2025.9.18-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:828446870bd7dee4e0cbeed767f07961aa07f0ea3129f38b3ccecebc9742e0b8", size = 288994, upload-time = "2025-09-19T00:35:31.733Z" }, - { url = 
"https://files.pythonhosted.org/packages/2a/b3/526ee96b0d70ea81980cbc20c3496fa582f775a52e001e2743cc33b2fa75/regex-2025.9.18-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c28821d5637866479ec4cc23b8c990f5bc6dd24e5e4384ba4a11d38a526e1414", size = 286619, upload-time = "2025-09-19T00:35:33.221Z" }, - { url = "https://files.pythonhosted.org/packages/65/4f/c2c096b02a351b33442aed5895cdd8bf87d372498d2100927c5a053d7ba3/regex-2025.9.18-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:726177ade8e481db669e76bf99de0b278783be8acd11cef71165327abd1f170a", size = 792454, upload-time = "2025-09-19T00:35:35.361Z" }, - { url = "https://files.pythonhosted.org/packages/24/15/b562c9d6e47c403c4b5deb744f8b4bf6e40684cf866c7b077960a925bdff/regex-2025.9.18-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f5cca697da89b9f8ea44115ce3130f6c54c22f541943ac8e9900461edc2b8bd4", size = 858723, upload-time = "2025-09-19T00:35:36.949Z" }, - { url = "https://files.pythonhosted.org/packages/f2/01/dba305409849e85b8a1a681eac4c03ed327d8de37895ddf9dc137f59c140/regex-2025.9.18-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:dfbde38f38004703c35666a1e1c088b778e35d55348da2b7b278914491698d6a", size = 905899, upload-time = "2025-09-19T00:35:38.723Z" }, - { url = "https://files.pythonhosted.org/packages/fe/d0/c51d1e6a80eab11ef96a4cbad17fc0310cf68994fb01a7283276b7e5bbd6/regex-2025.9.18-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f2f422214a03fab16bfa495cfec72bee4aaa5731843b771860a471282f1bf74f", size = 798981, upload-time = "2025-09-19T00:35:40.416Z" }, - { url = "https://files.pythonhosted.org/packages/c4/5e/72db90970887bbe02296612bd61b0fa31e6d88aa24f6a4853db3e96c575e/regex-2025.9.18-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a295916890f4df0902e4286bc7223ee7f9e925daa6dcdec4192364255b70561a", size = 781900, upload-time = "2025-09-19T00:35:42.077Z" }, - { url = "https://files.pythonhosted.org/packages/50/ff/596be45eea8e9bc31677fde243fa2904d00aad1b32c31bce26c3dbba0b9e/regex-2025.9.18-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:5db95ff632dbabc8c38c4e82bf545ab78d902e81160e6e455598014f0abe66b9", size = 852952, upload-time = "2025-09-19T00:35:43.751Z" }, - { url = "https://files.pythonhosted.org/packages/e5/1b/2dfa348fa551e900ed3f5f63f74185b6a08e8a76bc62bc9c106f4f92668b/regex-2025.9.18-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:fb967eb441b0f15ae610b7069bdb760b929f267efbf522e814bbbfffdf125ce2", size = 844355, upload-time = "2025-09-19T00:35:45.309Z" }, - { url = "https://files.pythonhosted.org/packages/f4/bf/aefb1def27fe33b8cbbb19c75c13aefccfbef1c6686f8e7f7095705969c7/regex-2025.9.18-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f04d2f20da4053d96c08f7fde6e1419b7ec9dbcee89c96e3d731fca77f411b95", size = 787254, upload-time = "2025-09-19T00:35:46.904Z" }, - { url = "https://files.pythonhosted.org/packages/e3/4e/8ef042e7cf0dbbb401e784e896acfc1b367b95dfbfc9ada94c2ed55a081f/regex-2025.9.18-cp311-cp311-win32.whl", hash = "sha256:895197241fccf18c0cea7550c80e75f185b8bd55b6924fcae269a1a92c614a07", size = 264129, upload-time = "2025-09-19T00:35:48.597Z" }, - { url = "https://files.pythonhosted.org/packages/b4/7d/c4fcabf80dcdd6821c0578ad9b451f8640b9110fb3dcb74793dd077069ff/regex-2025.9.18-cp311-cp311-win_amd64.whl", hash = "sha256:7e2b414deae99166e22c005e154a5513ac31493db178d8aec92b3269c9cce8c9", size = 276160, 
upload-time = "2025-09-19T00:36:00.45Z" }, - { url = "https://files.pythonhosted.org/packages/64/f8/0e13c8ae4d6df9d128afaba138342d532283d53a4c1e7a8c93d6756c8f4a/regex-2025.9.18-cp311-cp311-win_arm64.whl", hash = "sha256:fb137ec7c5c54f34a25ff9b31f6b7b0c2757be80176435bf367111e3f71d72df", size = 268471, upload-time = "2025-09-19T00:36:02.149Z" }, - { url = "https://files.pythonhosted.org/packages/b0/99/05859d87a66ae7098222d65748f11ef7f2dff51bfd7482a4e2256c90d72b/regex-2025.9.18-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:436e1b31d7efd4dcd52091d076482031c611dde58bf9c46ca6d0a26e33053a7e", size = 486335, upload-time = "2025-09-19T00:36:03.661Z" }, - { url = "https://files.pythonhosted.org/packages/97/7e/d43d4e8b978890932cf7b0957fce58c5b08c66f32698f695b0c2c24a48bf/regex-2025.9.18-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c190af81e5576b9c5fdc708f781a52ff20f8b96386c6e2e0557a78402b029f4a", size = 289720, upload-time = "2025-09-19T00:36:05.471Z" }, - { url = "https://files.pythonhosted.org/packages/bb/3b/ff80886089eb5dcf7e0d2040d9aaed539e25a94300403814bb24cc775058/regex-2025.9.18-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e4121f1ce2b2b5eec4b397cc1b277686e577e658d8f5870b7eb2d726bd2300ab", size = 287257, upload-time = "2025-09-19T00:36:07.072Z" }, - { url = "https://files.pythonhosted.org/packages/ee/66/243edf49dd8720cba8d5245dd4d6adcb03a1defab7238598c0c97cf549b8/regex-2025.9.18-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:300e25dbbf8299d87205e821a201057f2ef9aa3deb29caa01cd2cac669e508d5", size = 797463, upload-time = "2025-09-19T00:36:08.399Z" }, - { url = "https://files.pythonhosted.org/packages/df/71/c9d25a1142c70432e68bb03211d4a82299cd1c1fbc41db9409a394374ef5/regex-2025.9.18-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7b47fcf9f5316c0bdaf449e879407e1b9937a23c3b369135ca94ebc8d74b1742", size = 862670, upload-time = "2025-09-19T00:36:10.101Z" }, - { url = "https://files.pythonhosted.org/packages/f8/8f/329b1efc3a64375a294e3a92d43372bf1a351aa418e83c21f2f01cf6ec41/regex-2025.9.18-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:57a161bd3acaa4b513220b49949b07e252165e6b6dc910ee7617a37ff4f5b425", size = 910881, upload-time = "2025-09-19T00:36:12.223Z" }, - { url = "https://files.pythonhosted.org/packages/35/9e/a91b50332a9750519320ed30ec378b74c996f6befe282cfa6bb6cea7e9fd/regex-2025.9.18-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f130c3a7845ba42de42f380fff3c8aebe89a810747d91bcf56d40a069f15352", size = 802011, upload-time = "2025-09-19T00:36:13.901Z" }, - { url = "https://files.pythonhosted.org/packages/a4/1d/6be3b8d7856b6e0d7ee7f942f437d0a76e0d5622983abbb6d21e21ab9a17/regex-2025.9.18-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5f96fa342b6f54dcba928dd452e8d8cb9f0d63e711d1721cd765bb9f73bb048d", size = 786668, upload-time = "2025-09-19T00:36:15.391Z" }, - { url = "https://files.pythonhosted.org/packages/cb/ce/4a60e53df58bd157c5156a1736d3636f9910bdcc271d067b32b7fcd0c3a8/regex-2025.9.18-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:0f0d676522d68c207828dcd01fb6f214f63f238c283d9f01d85fc664c7c85b56", size = 856578, upload-time = "2025-09-19T00:36:16.845Z" }, - { url = "https://files.pythonhosted.org/packages/86/e8/162c91bfe7217253afccde112868afb239f94703de6580fb235058d506a6/regex-2025.9.18-cp312-cp312-musllinux_1_2_s390x.whl", hash = 
"sha256:40532bff8a1a0621e7903ae57fce88feb2e8a9a9116d341701302c9302aef06e", size = 849017, upload-time = "2025-09-19T00:36:18.597Z" }, - { url = "https://files.pythonhosted.org/packages/35/34/42b165bc45289646ea0959a1bc7531733e90b47c56a72067adfe6b3251f6/regex-2025.9.18-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:039f11b618ce8d71a1c364fdee37da1012f5a3e79b1b2819a9f389cd82fd6282", size = 788150, upload-time = "2025-09-19T00:36:20.464Z" }, - { url = "https://files.pythonhosted.org/packages/79/5d/cdd13b1f3c53afa7191593a7ad2ee24092a5a46417725ffff7f64be8342d/regex-2025.9.18-cp312-cp312-win32.whl", hash = "sha256:e1dd06f981eb226edf87c55d523131ade7285137fbde837c34dc9d1bf309f459", size = 264536, upload-time = "2025-09-19T00:36:21.922Z" }, - { url = "https://files.pythonhosted.org/packages/e0/f5/4a7770c9a522e7d2dc1fa3ffc83ab2ab33b0b22b447e62cffef186805302/regex-2025.9.18-cp312-cp312-win_amd64.whl", hash = "sha256:3d86b5247bf25fa3715e385aa9ff272c307e0636ce0c9595f64568b41f0a9c77", size = 275501, upload-time = "2025-09-19T00:36:23.4Z" }, - { url = "https://files.pythonhosted.org/packages/df/05/9ce3e110e70d225ecbed455b966003a3afda5e58e8aec2964042363a18f4/regex-2025.9.18-cp312-cp312-win_arm64.whl", hash = "sha256:032720248cbeeae6444c269b78cb15664458b7bb9ed02401d3da59fe4d68c3a5", size = 268601, upload-time = "2025-09-19T00:36:25.092Z" }, - { url = "https://files.pythonhosted.org/packages/d2/c7/5c48206a60ce33711cf7dcaeaed10dd737733a3569dc7e1dce324dd48f30/regex-2025.9.18-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2a40f929cd907c7e8ac7566ac76225a77701a6221bca937bdb70d56cb61f57b2", size = 485955, upload-time = "2025-09-19T00:36:26.822Z" }, - { url = "https://files.pythonhosted.org/packages/e9/be/74fc6bb19a3c491ec1ace943e622b5a8539068771e8705e469b2da2306a7/regex-2025.9.18-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:c90471671c2cdf914e58b6af62420ea9ecd06d1554d7474d50133ff26ae88feb", size = 289583, upload-time = "2025-09-19T00:36:28.577Z" }, - { url = "https://files.pythonhosted.org/packages/25/c4/9ceaa433cb5dc515765560f22a19578b95b92ff12526e5a259321c4fc1a0/regex-2025.9.18-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a351aff9e07a2dabb5022ead6380cff17a4f10e4feb15f9100ee56c4d6d06af", size = 287000, upload-time = "2025-09-19T00:36:30.161Z" }, - { url = "https://files.pythonhosted.org/packages/7d/e6/68bc9393cb4dc68018456568c048ac035854b042bc7c33cb9b99b0680afa/regex-2025.9.18-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc4b8e9d16e20ddfe16430c23468a8707ccad3365b06d4536142e71823f3ca29", size = 797535, upload-time = "2025-09-19T00:36:31.876Z" }, - { url = "https://files.pythonhosted.org/packages/6a/1c/ebae9032d34b78ecfe9bd4b5e6575b55351dc8513485bb92326613732b8c/regex-2025.9.18-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4b8cdbddf2db1c5e80338ba2daa3cfa3dec73a46fff2a7dda087c8efbf12d62f", size = 862603, upload-time = "2025-09-19T00:36:33.344Z" }, - { url = "https://files.pythonhosted.org/packages/3b/74/12332c54b3882557a4bcd2b99f8be581f5c6a43cf1660a85b460dd8ff468/regex-2025.9.18-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a276937d9d75085b2c91fb48244349c6954f05ee97bba0963ce24a9d915b8b68", size = 910829, upload-time = "2025-09-19T00:36:34.826Z" }, - { url = 
"https://files.pythonhosted.org/packages/86/70/ba42d5ed606ee275f2465bfc0e2208755b06cdabd0f4c7c4b614d51b57ab/regex-2025.9.18-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:92a8e375ccdc1256401c90e9dc02b8642894443d549ff5e25e36d7cf8a80c783", size = 802059, upload-time = "2025-09-19T00:36:36.664Z" }, - { url = "https://files.pythonhosted.org/packages/da/c5/fcb017e56396a7f2f8357412638d7e2963440b131a3ca549be25774b3641/regex-2025.9.18-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0dc6893b1f502d73037cf807a321cdc9be29ef3d6219f7970f842475873712ac", size = 786781, upload-time = "2025-09-19T00:36:38.168Z" }, - { url = "https://files.pythonhosted.org/packages/c6/ee/21c4278b973f630adfb3bcb23d09d83625f3ab1ca6e40ebdffe69901c7a1/regex-2025.9.18-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:a61e85bfc63d232ac14b015af1261f826260c8deb19401c0597dbb87a864361e", size = 856578, upload-time = "2025-09-19T00:36:40.129Z" }, - { url = "https://files.pythonhosted.org/packages/87/0b/de51550dc7274324435c8f1539373ac63019b0525ad720132866fff4a16a/regex-2025.9.18-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:1ef86a9ebc53f379d921fb9a7e42b92059ad3ee800fcd9e0fe6181090e9f6c23", size = 849119, upload-time = "2025-09-19T00:36:41.651Z" }, - { url = "https://files.pythonhosted.org/packages/60/52/383d3044fc5154d9ffe4321696ee5b2ee4833a28c29b137c22c33f41885b/regex-2025.9.18-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d3bc882119764ba3a119fbf2bd4f1b47bc56c1da5d42df4ed54ae1e8e66fdf8f", size = 788219, upload-time = "2025-09-19T00:36:43.575Z" }, - { url = "https://files.pythonhosted.org/packages/20/bd/2614fc302671b7359972ea212f0e3a92df4414aaeacab054a8ce80a86073/regex-2025.9.18-cp313-cp313-win32.whl", hash = "sha256:3810a65675845c3bdfa58c3c7d88624356dd6ee2fc186628295e0969005f928d", size = 264517, upload-time = "2025-09-19T00:36:45.503Z" }, - { url = "https://files.pythonhosted.org/packages/07/0f/ab5c1581e6563a7bffdc1974fb2d25f05689b88e2d416525271f232b1946/regex-2025.9.18-cp313-cp313-win_amd64.whl", hash = "sha256:16eaf74b3c4180ede88f620f299e474913ab6924d5c4b89b3833bc2345d83b3d", size = 275481, upload-time = "2025-09-19T00:36:46.965Z" }, - { url = "https://files.pythonhosted.org/packages/49/22/ee47672bc7958f8c5667a587c2600a4fba8b6bab6e86bd6d3e2b5f7cac42/regex-2025.9.18-cp313-cp313-win_arm64.whl", hash = "sha256:4dc98ba7dd66bd1261927a9f49bd5ee2bcb3660f7962f1ec02617280fc00f5eb", size = 268598, upload-time = "2025-09-19T00:36:48.314Z" }, - { url = "https://files.pythonhosted.org/packages/e8/83/6887e16a187c6226cb85d8301e47d3b73ecc4505a3a13d8da2096b44fd76/regex-2025.9.18-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:fe5d50572bc885a0a799410a717c42b1a6b50e2f45872e2b40f4f288f9bce8a2", size = 489765, upload-time = "2025-09-19T00:36:49.996Z" }, - { url = "https://files.pythonhosted.org/packages/51/c5/e2f7325301ea2916ff301c8d963ba66b1b2c1b06694191df80a9c4fea5d0/regex-2025.9.18-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b9d9a2d6cda6621551ca8cf7a06f103adf72831153f3c0d982386110870c4d3", size = 291228, upload-time = "2025-09-19T00:36:51.654Z" }, - { url = "https://files.pythonhosted.org/packages/91/60/7d229d2bc6961289e864a3a3cfebf7d0d250e2e65323a8952cbb7e22d824/regex-2025.9.18-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:13202e4c4ac0ef9a317fff817674b293c8f7e8c68d3190377d8d8b749f566e12", size = 289270, upload-time = "2025-09-19T00:36:53.118Z" }, - { url = 
"https://files.pythonhosted.org/packages/3c/d7/b4f06868ee2958ff6430df89857fbf3d43014bbf35538b6ec96c2704e15d/regex-2025.9.18-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:874ff523b0fecffb090f80ae53dc93538f8db954c8bb5505f05b7787ab3402a0", size = 806326, upload-time = "2025-09-19T00:36:54.631Z" }, - { url = "https://files.pythonhosted.org/packages/d6/e4/bca99034a8f1b9b62ccf337402a8e5b959dd5ba0e5e5b2ead70273df3277/regex-2025.9.18-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:d13ab0490128f2bb45d596f754148cd750411afc97e813e4b3a61cf278a23bb6", size = 871556, upload-time = "2025-09-19T00:36:56.208Z" }, - { url = "https://files.pythonhosted.org/packages/6d/df/e06ffaf078a162f6dd6b101a5ea9b44696dca860a48136b3ae4a9caf25e2/regex-2025.9.18-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:05440bc172bc4b4b37fb9667e796597419404dbba62e171e1f826d7d2a9ebcef", size = 913817, upload-time = "2025-09-19T00:36:57.807Z" }, - { url = "https://files.pythonhosted.org/packages/9e/05/25b05480b63292fd8e84800b1648e160ca778127b8d2367a0a258fa2e225/regex-2025.9.18-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5514b8e4031fdfaa3d27e92c75719cbe7f379e28cacd939807289bce76d0e35a", size = 811055, upload-time = "2025-09-19T00:36:59.762Z" }, - { url = "https://files.pythonhosted.org/packages/70/97/7bc7574655eb651ba3a916ed4b1be6798ae97af30104f655d8efd0cab24b/regex-2025.9.18-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:65d3c38c39efce73e0d9dc019697b39903ba25b1ad45ebbd730d2cf32741f40d", size = 794534, upload-time = "2025-09-19T00:37:01.405Z" }, - { url = "https://files.pythonhosted.org/packages/b4/c2/d5da49166a52dda879855ecdba0117f073583db2b39bb47ce9a3378a8e9e/regex-2025.9.18-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:ae77e447ebc144d5a26d50055c6ddba1d6ad4a865a560ec7200b8b06bc529368", size = 866684, upload-time = "2025-09-19T00:37:03.441Z" }, - { url = "https://files.pythonhosted.org/packages/bd/2d/0a5c4e6ec417de56b89ff4418ecc72f7e3feca806824c75ad0bbdae0516b/regex-2025.9.18-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:e3ef8cf53dc8df49d7e28a356cf824e3623764e9833348b655cfed4524ab8a90", size = 853282, upload-time = "2025-09-19T00:37:04.985Z" }, - { url = "https://files.pythonhosted.org/packages/f4/8e/d656af63e31a86572ec829665d6fa06eae7e144771e0330650a8bb865635/regex-2025.9.18-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:9feb29817df349c976da9a0debf775c5c33fc1c8ad7b9f025825da99374770b7", size = 797830, upload-time = "2025-09-19T00:37:06.697Z" }, - { url = "https://files.pythonhosted.org/packages/db/ce/06edc89df8f7b83ffd321b6071be4c54dc7332c0f77860edc40ce57d757b/regex-2025.9.18-cp313-cp313t-win32.whl", hash = "sha256:168be0d2f9b9d13076940b1ed774f98595b4e3c7fc54584bba81b3cc4181742e", size = 267281, upload-time = "2025-09-19T00:37:08.568Z" }, - { url = "https://files.pythonhosted.org/packages/83/9a/2b5d9c8b307a451fd17068719d971d3634ca29864b89ed5c18e499446d4a/regex-2025.9.18-cp313-cp313t-win_amd64.whl", hash = "sha256:d59ecf3bb549e491c8104fea7313f3563c7b048e01287db0a90485734a70a730", size = 278724, upload-time = "2025-09-19T00:37:10.023Z" }, - { url = "https://files.pythonhosted.org/packages/3d/70/177d31e8089a278a764f8ec9a3faac8d14a312d622a47385d4b43905806f/regex-2025.9.18-cp313-cp313t-win_arm64.whl", hash = "sha256:dbef80defe9fb21310948a2595420b36c6d641d9bea4c991175829b2cc4bc06a", size = 269771, 
upload-time = "2025-09-19T00:37:13.041Z" }, - { url = "https://files.pythonhosted.org/packages/44/b7/3b4663aa3b4af16819f2ab6a78c4111c7e9b066725d8107753c2257448a5/regex-2025.9.18-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:c6db75b51acf277997f3adcd0ad89045d856190d13359f15ab5dda21581d9129", size = 486130, upload-time = "2025-09-19T00:37:14.527Z" }, - { url = "https://files.pythonhosted.org/packages/80/5b/4533f5d7ac9c6a02a4725fe8883de2aebc713e67e842c04cf02626afb747/regex-2025.9.18-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:8f9698b6f6895d6db810e0bda5364f9ceb9e5b11328700a90cae573574f61eea", size = 289539, upload-time = "2025-09-19T00:37:16.356Z" }, - { url = "https://files.pythonhosted.org/packages/b8/8d/5ab6797c2750985f79e9995fad3254caa4520846580f266ae3b56d1cae58/regex-2025.9.18-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:29cd86aa7cb13a37d0f0d7c21d8d949fe402ffa0ea697e635afedd97ab4b69f1", size = 287233, upload-time = "2025-09-19T00:37:18.025Z" }, - { url = "https://files.pythonhosted.org/packages/cb/1e/95afcb02ba8d3a64e6ffeb801718ce73471ad6440c55d993f65a4a5e7a92/regex-2025.9.18-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7c9f285a071ee55cd9583ba24dde006e53e17780bb309baa8e4289cd472bcc47", size = 797876, upload-time = "2025-09-19T00:37:19.609Z" }, - { url = "https://files.pythonhosted.org/packages/c8/fb/720b1f49cec1f3b5a9fea5b34cd22b88b5ebccc8c1b5de9cc6f65eed165a/regex-2025.9.18-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5adf266f730431e3be9021d3e5b8d5ee65e563fec2883ea8093944d21863b379", size = 863385, upload-time = "2025-09-19T00:37:21.65Z" }, - { url = "https://files.pythonhosted.org/packages/a9/ca/e0d07ecf701e1616f015a720dc13b84c582024cbfbb3fc5394ae204adbd7/regex-2025.9.18-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1137cabc0f38807de79e28d3f6e3e3f2cc8cfb26bead754d02e6d1de5f679203", size = 910220, upload-time = "2025-09-19T00:37:23.723Z" }, - { url = "https://files.pythonhosted.org/packages/b6/45/bba86413b910b708eca705a5af62163d5d396d5f647ed9485580c7025209/regex-2025.9.18-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7cc9e5525cada99699ca9223cce2d52e88c52a3d2a0e842bd53de5497c604164", size = 801827, upload-time = "2025-09-19T00:37:25.684Z" }, - { url = "https://files.pythonhosted.org/packages/b8/a6/740fbd9fcac31a1305a8eed30b44bf0f7f1e042342be0a4722c0365ecfca/regex-2025.9.18-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:bbb9246568f72dce29bcd433517c2be22c7791784b223a810225af3b50d1aafb", size = 786843, upload-time = "2025-09-19T00:37:27.62Z" }, - { url = "https://files.pythonhosted.org/packages/80/a7/0579e8560682645906da640c9055506465d809cb0f5415d9976f417209a6/regex-2025.9.18-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:6a52219a93dd3d92c675383efff6ae18c982e2d7651c792b1e6d121055808743", size = 857430, upload-time = "2025-09-19T00:37:29.362Z" }, - { url = "https://files.pythonhosted.org/packages/8d/9b/4dc96b6c17b38900cc9fee254fc9271d0dde044e82c78c0811b58754fde5/regex-2025.9.18-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:ae9b3840c5bd456780e3ddf2f737ab55a79b790f6409182012718a35c6d43282", size = 848612, upload-time = "2025-09-19T00:37:31.42Z" }, - { url = "https://files.pythonhosted.org/packages/b3/6a/6f659f99bebb1775e5ac81a3fb837b85897c1a4ef5acffd0ff8ffe7e67fb/regex-2025.9.18-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:d488c236ac497c46a5ac2005a952c1a0e22a07be9f10c3e735bc7d1209a34773", size = 787967, upload-time = "2025-09-19T00:37:34.019Z" }, - { url = "https://files.pythonhosted.org/packages/61/35/9e35665f097c07cf384a6b90a1ac11b0b1693084a0b7a675b06f760496c6/regex-2025.9.18-cp314-cp314-win32.whl", hash = "sha256:0c3506682ea19beefe627a38872d8da65cc01ffa25ed3f2e422dffa1474f0788", size = 269847, upload-time = "2025-09-19T00:37:35.759Z" }, - { url = "https://files.pythonhosted.org/packages/af/64/27594dbe0f1590b82de2821ebfe9a359b44dcb9b65524876cd12fabc447b/regex-2025.9.18-cp314-cp314-win_amd64.whl", hash = "sha256:57929d0f92bebb2d1a83af372cd0ffba2263f13f376e19b1e4fa32aec4efddc3", size = 278755, upload-time = "2025-09-19T00:37:37.367Z" }, - { url = "https://files.pythonhosted.org/packages/30/a3/0cd8d0d342886bd7d7f252d701b20ae1a3c72dc7f34ef4b2d17790280a09/regex-2025.9.18-cp314-cp314-win_arm64.whl", hash = "sha256:6a4b44df31d34fa51aa5c995d3aa3c999cec4d69b9bd414a8be51984d859f06d", size = 271873, upload-time = "2025-09-19T00:37:39.125Z" }, - { url = "https://files.pythonhosted.org/packages/99/cb/8a1ab05ecf404e18b54348e293d9b7a60ec2bd7aa59e637020c5eea852e8/regex-2025.9.18-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:b176326bcd544b5e9b17d6943f807697c0cb7351f6cfb45bf5637c95ff7e6306", size = 489773, upload-time = "2025-09-19T00:37:40.968Z" }, - { url = "https://files.pythonhosted.org/packages/93/3b/6543c9b7f7e734d2404fa2863d0d710c907bef99d4598760ed4563d634c3/regex-2025.9.18-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:0ffd9e230b826b15b369391bec167baed57c7ce39efc35835448618860995946", size = 291221, upload-time = "2025-09-19T00:37:42.901Z" }, - { url = "https://files.pythonhosted.org/packages/cd/91/e9fdee6ad6bf708d98c5d17fded423dcb0661795a49cba1b4ffb8358377a/regex-2025.9.18-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ec46332c41add73f2b57e2f5b642f991f6b15e50e9f86285e08ffe3a512ac39f", size = 289268, upload-time = "2025-09-19T00:37:44.823Z" }, - { url = "https://files.pythonhosted.org/packages/94/a6/bc3e8a918abe4741dadeaeb6c508e3a4ea847ff36030d820d89858f96a6c/regex-2025.9.18-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b80fa342ed1ea095168a3f116637bd1030d39c9ff38dc04e54ef7c521e01fc95", size = 806659, upload-time = "2025-09-19T00:37:46.684Z" }, - { url = "https://files.pythonhosted.org/packages/2b/71/ea62dbeb55d9e6905c7b5a49f75615ea1373afcad95830047e4e310db979/regex-2025.9.18-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f4d97071c0ba40f0cf2a93ed76e660654c399a0a04ab7d85472239460f3da84b", size = 871701, upload-time = "2025-09-19T00:37:48.882Z" }, - { url = "https://files.pythonhosted.org/packages/6a/90/fbe9dedb7dad24a3a4399c0bae64bfa932ec8922a0a9acf7bc88db30b161/regex-2025.9.18-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0ac936537ad87cef9e0e66c5144484206c1354224ee811ab1519a32373e411f3", size = 913742, upload-time = "2025-09-19T00:37:51.015Z" }, - { url = "https://files.pythonhosted.org/packages/f0/1c/47e4a8c0e73d41eb9eb9fdeba3b1b810110a5139a2526e82fd29c2d9f867/regex-2025.9.18-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dec57f96d4def58c422d212d414efe28218d58537b5445cf0c33afb1b4768571", size = 811117, upload-time = "2025-09-19T00:37:52.686Z" }, - { url = 
"https://files.pythonhosted.org/packages/2a/da/435f29fddfd015111523671e36d30af3342e8136a889159b05c1d9110480/regex-2025.9.18-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:48317233294648bf7cd068857f248e3a57222259a5304d32c7552e2284a1b2ad", size = 794647, upload-time = "2025-09-19T00:37:54.626Z" }, - { url = "https://files.pythonhosted.org/packages/23/66/df5e6dcca25c8bc57ce404eebc7342310a0d218db739d7882c9a2b5974a3/regex-2025.9.18-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:274687e62ea3cf54846a9b25fc48a04459de50af30a7bd0b61a9e38015983494", size = 866747, upload-time = "2025-09-19T00:37:56.367Z" }, - { url = "https://files.pythonhosted.org/packages/82/42/94392b39b531f2e469b2daa40acf454863733b674481fda17462a5ffadac/regex-2025.9.18-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a78722c86a3e7e6aadf9579e3b0ad78d955f2d1f1a8ca4f67d7ca258e8719d4b", size = 853434, upload-time = "2025-09-19T00:37:58.39Z" }, - { url = "https://files.pythonhosted.org/packages/a8/f8/dcc64c7f7bbe58842a8f89622b50c58c3598fbbf4aad0a488d6df2c699f1/regex-2025.9.18-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:06104cd203cdef3ade989a1c45b6215bf42f8b9dd705ecc220c173233f7cba41", size = 798024, upload-time = "2025-09-19T00:38:00.397Z" }, - { url = "https://files.pythonhosted.org/packages/20/8d/edf1c5d5aa98f99a692313db813ec487732946784f8f93145e0153d910e5/regex-2025.9.18-cp314-cp314t-win32.whl", hash = "sha256:2e1eddc06eeaffd249c0adb6fafc19e2118e6308c60df9db27919e96b5656096", size = 273029, upload-time = "2025-09-19T00:38:02.383Z" }, - { url = "https://files.pythonhosted.org/packages/a7/24/02d4e4f88466f17b145f7ea2b2c11af3a942db6222429c2c146accf16054/regex-2025.9.18-cp314-cp314t-win_amd64.whl", hash = "sha256:8620d247fb8c0683ade51217b459cb4a1081c0405a3072235ba43a40d355c09a", size = 282680, upload-time = "2025-09-19T00:38:04.102Z" }, - { url = "https://files.pythonhosted.org/packages/1f/a3/c64894858aaaa454caa7cc47e2f225b04d3ed08ad649eacf58d45817fad2/regex-2025.9.18-cp314-cp314t-win_arm64.whl", hash = "sha256:b7531a8ef61de2c647cdf68b3229b071e46ec326b3138b2180acb4275f470b01", size = 273034, upload-time = "2025-09-19T00:38:05.807Z" }, - { url = "https://files.pythonhosted.org/packages/ed/d2/5b0ded10467d6e96f78de5e6f195b7f9b57251f411b1090004597cffe5d9/regex-2025.9.18-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:3dbcfcaa18e9480669030d07371713c10b4f1a41f791ffa5cb1a99f24e777f40", size = 484847, upload-time = "2025-09-19T00:38:07.367Z" }, - { url = "https://files.pythonhosted.org/packages/55/35/051da2c0ae6124e3f1aa1442ecc2bb4e2de930e95433bce1301a2e7ae255/regex-2025.9.18-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1e85f73ef7095f0380208269055ae20524bfde3f27c5384126ddccf20382a638", size = 288995, upload-time = "2025-09-19T00:38:09.253Z" }, - { url = "https://files.pythonhosted.org/packages/22/4b/4bfc51cad95263d25b6ed8c5253831b2536e8e279e6736d0a08c9f7ffe98/regex-2025.9.18-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9098e29b3ea4ffffeade423f6779665e2a4f8db64e699c0ed737ef0db6ba7b12", size = 286642, upload-time = "2025-09-19T00:38:11.012Z" }, - { url = "https://files.pythonhosted.org/packages/0e/67/d2f3e2483e09d1e9f7d93b4fe106b04933fba5e619bc901530d1c90d62da/regex-2025.9.18-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:90b6b7a2d0f45b7ecaaee1aec6b362184d6596ba2092dd583ffba1b78dd0231c", size = 779896, upload-time = "2025-09-19T00:38:12.732Z" }, - { url = 
"https://files.pythonhosted.org/packages/14/5e/49a4f07ce6f5563de02b0e321220b9534f3fd3bae275311b785dd618aea5/regex-2025.9.18-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c81b892af4a38286101502eae7aec69f7cd749a893d9987a92776954f3943408", size = 848954, upload-time = "2025-09-19T00:38:14.716Z" }, - { url = "https://files.pythonhosted.org/packages/00/8d/f5995ae51225c77ca9215d78ceb1dc30c52fa2b22c41dac977214e8b4bbd/regex-2025.9.18-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3b524d010973f2e1929aeb635418d468d869a5f77b52084d9f74c272189c251d", size = 896770, upload-time = "2025-09-19T00:38:16.381Z" }, - { url = "https://files.pythonhosted.org/packages/6b/15/2a3a744d73a557337c7561db2114bab10b4e9941c626c03169ea62f42c8f/regex-2025.9.18-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6b498437c026a3d5d0be0020023ff76d70ae4d77118e92f6f26c9d0423452446", size = 789484, upload-time = "2025-09-19T00:38:18.183Z" }, - { url = "https://files.pythonhosted.org/packages/d8/27/e425f3d17d32062a657b836d0c8a68f5e71a9e6295fa637159f265eaa609/regex-2025.9.18-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0716e4d6e58853d83f6563f3cf25c281ff46cf7107e5f11879e32cb0b59797d9", size = 780150, upload-time = "2025-09-19T00:38:19.879Z" }, - { url = "https://files.pythonhosted.org/packages/62/28/79dfae89b6fd7901b82611ac1a96ec25deceb7e918e9c5eb3f96cf5ad654/regex-2025.9.18-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:065b6956749379d41db2625f880b637d4acc14c0a4de0d25d609a62850e96d36", size = 773160, upload-time = "2025-09-19T00:38:21.641Z" }, - { url = "https://files.pythonhosted.org/packages/0b/67/df83d6ae608f487448e9be7ac26211af2afa2b6e34465fde3e07d1f11290/regex-2025.9.18-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d4a691494439287c08ddb9b5793da605ee80299dd31e95fa3f323fac3c33d9d4", size = 843555, upload-time = "2025-09-19T00:38:23.696Z" }, - { url = "https://files.pythonhosted.org/packages/32/67/c65f56f3edd3f213d3aa41e9b9b07cc2247721a23d34bcfb2947dc0f4685/regex-2025.9.18-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ef8d10cc0989565bcbe45fb4439f044594d5c2b8919d3d229ea2c4238f1d55b0", size = 834169, upload-time = "2025-09-19T00:38:25.997Z" }, - { url = "https://files.pythonhosted.org/packages/95/90/7fca37435e3aa1a032c38fa1e171fdaf809c8dbf2717508e3f6a92c75446/regex-2025.9.18-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4baeb1b16735ac969a7eeecc216f1f8b7caf60431f38a2671ae601f716a32d25", size = 778024, upload-time = "2025-09-19T00:38:28.043Z" }, - { url = "https://files.pythonhosted.org/packages/8b/05/c2ee512cdf34d6be5ac5cf938a58c1b79a9d96cbad404bc4d70404212edb/regex-2025.9.18-cp39-cp39-win32.whl", hash = "sha256:8e5f41ad24a1e0b5dfcf4c4e5d9f5bd54c895feb5708dd0c1d0d35693b24d478", size = 264151, upload-time = "2025-09-19T00:38:30.23Z" }, - { url = "https://files.pythonhosted.org/packages/f8/2f/8414fb46181b6108484f04d670ece196db6734cc4c683f41125043fd3280/regex-2025.9.18-cp39-cp39-win_amd64.whl", hash = "sha256:50e8290707f2fb8e314ab3831e594da71e062f1d623b05266f8cfe4db4949afd", size = 276232, upload-time = "2025-09-19T00:38:31.981Z" }, - { url = "https://files.pythonhosted.org/packages/61/63/f40931d477e1ed4b53105d506758a58cfec1b052c12972054930ec743ee5/regex-2025.9.18-cp39-cp39-win_arm64.whl", hash = "sha256:039a9d7195fd88c943d7c777d4941e8ef736731947becce773c31a1009cb3c35", size = 268505, upload-time = "2025-09-19T00:38:34.015Z" }, +version = 
"2025.10.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/c8/1d2160d36b11fbe0a61acb7c3c81ab032d9ec8ad888ac9e0a61b85ab99dd/regex-2025.10.23.tar.gz", hash = "sha256:8cbaf8ceb88f96ae2356d01b9adf5e6306fa42fa6f7eab6b97794e37c959ac26", size = 401266, upload-time = "2025-10-21T15:58:20.23Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/11/849d5d23633a77047465eaae4cc0cbf24ded7aa496c02e8b9710e28b1687/regex-2025.10.23-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:17bbcde374bef1c5fad9b131f0e28a6a24856dd90368d8c0201e2b5a69533daa", size = 487957, upload-time = "2025-10-21T15:54:26.151Z" }, + { url = "https://files.pythonhosted.org/packages/87/12/5985386e7e3200a0d6a6417026d2c758d783a932428a5efc0a42ca1ddf74/regex-2025.10.23-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:b4e10434279cc8567f99ca6e018e9025d14f2fded2a603380b6be2090f476426", size = 290419, upload-time = "2025-10-21T15:54:28.804Z" }, + { url = "https://files.pythonhosted.org/packages/67/cf/a8615923f962f8fdc41a3a6093a48726955e8b1993f4614b26a41d249f9b/regex-2025.10.23-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9c9bb421cbe7012c744a5a56cf4d6c80829c72edb1a2991677299c988d6339c8", size = 288285, upload-time = "2025-10-21T15:54:30.47Z" }, + { url = "https://files.pythonhosted.org/packages/4e/3d/6a3a1e12c86354cd0b3cbf8c3dd6acbe853609ee3b39d47ecd3ce95caf84/regex-2025.10.23-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:275cd1c2ed8c4a78ebfa489618d7aee762e8b4732da73573c3e38236ec5f65de", size = 781458, upload-time = "2025-10-21T15:54:31.978Z" }, + { url = "https://files.pythonhosted.org/packages/46/47/76a8da004489f2700361754859e373b87a53d043de8c47f4d1583fd39d78/regex-2025.10.23-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7b426ae7952f3dc1e73a86056d520bd4e5f021397484a6835902fc5648bcacce", size = 850605, upload-time = "2025-10-21T15:54:33.753Z" }, + { url = "https://files.pythonhosted.org/packages/67/05/fa886461f97d45a6f4b209699cb994dc6d6212d6e219d29444dac5005775/regex-2025.10.23-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c5cdaf5b6d37c7da1967dbe729d819461aab6a98a072feef65bbcff0a6e60649", size = 898563, upload-time = "2025-10-21T15:54:35.431Z" }, + { url = "https://files.pythonhosted.org/packages/2d/db/3ddd8d01455f23cabad7499f4199de0df92f5e96d39633203ff9d0b592dc/regex-2025.10.23-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bfeff0b08f296ab28b4332a7e03ca31c437ee78b541ebc874bbf540e5932f8d", size = 791535, upload-time = "2025-10-21T15:54:37.269Z" }, + { url = "https://files.pythonhosted.org/packages/7c/ae/0fa5cbf41ca92b6ec3370222fcb6c68b240d68ab10e803d086c03a19fd9e/regex-2025.10.23-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5f97236a67307b775f30a74ef722b64b38b7ab7ba3bb4a2508518a5de545459c", size = 782461, upload-time = "2025-10-21T15:54:39.187Z" }, + { url = "https://files.pythonhosted.org/packages/d4/23/70af22a016df11af4def27870eb175c2c7235b72d411ecf75a4b4a422cb6/regex-2025.10.23-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:be19e7de499940cd72475fb8e46ab2ecb1cf5906bebdd18a89f9329afb1df82f", size = 774583, upload-time = "2025-10-21T15:54:41.018Z" }, + { url = 
"https://files.pythonhosted.org/packages/7a/ee/a54a6851f6905f33d3c4ed64e8737b1d85ed01b5724712530ddc0f9abdb1/regex-2025.10.23-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:883df76ee42d9ecb82b37ff8d01caea5895b3f49630a64d21111078bbf8ef64c", size = 845649, upload-time = "2025-10-21T15:54:42.615Z" }, + { url = "https://files.pythonhosted.org/packages/80/7d/c3ec1cae14e01fab00e38c41ed35f47a853359e95e9c023e9a4381bb122c/regex-2025.10.23-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2e9117d1d35fc2addae6281019ecc70dc21c30014b0004f657558b91c6a8f1a7", size = 836037, upload-time = "2025-10-21T15:54:44.63Z" }, + { url = "https://files.pythonhosted.org/packages/15/ae/45771140dd43c4d67c87b54d3728078ed6a96599d9fc7ba6825086236782/regex-2025.10.23-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0ff1307f531a5d8cf5c20ea517254551ff0a8dc722193aab66c656c5a900ea68", size = 779705, upload-time = "2025-10-21T15:54:46.08Z" }, + { url = "https://files.pythonhosted.org/packages/b8/95/074e2581760eafce7c816a352b7d3a322536e5b68c346d1a8bacd895545c/regex-2025.10.23-cp310-cp310-win32.whl", hash = "sha256:7888475787cbfee4a7cd32998eeffe9a28129fa44ae0f691b96cb3939183ef41", size = 265663, upload-time = "2025-10-21T15:54:47.854Z" }, + { url = "https://files.pythonhosted.org/packages/f7/c7/a25f56a718847e34d3f1608c72eadeb67653bff1a0411da023dd8f4c647b/regex-2025.10.23-cp310-cp310-win_amd64.whl", hash = "sha256:ec41a905908496ce4906dab20fb103c814558db1d69afc12c2f384549c17936a", size = 277587, upload-time = "2025-10-21T15:54:49.571Z" }, + { url = "https://files.pythonhosted.org/packages/d3/e5/63eb17c6b5deaefd93c2bbb1feae7c0a8d2157da25883a6ca2569cf7a663/regex-2025.10.23-cp310-cp310-win_arm64.whl", hash = "sha256:b2b7f19a764d5e966d5a62bf2c28a8b4093cc864c6734510bdb4aeb840aec5e6", size = 269979, upload-time = "2025-10-21T15:54:51.375Z" }, + { url = "https://files.pythonhosted.org/packages/82/e5/74b7cd5cd76b4171f9793042045bb1726f7856dd56e582fc3e058a7a8a5e/regex-2025.10.23-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6c531155bf9179345e85032052a1e5fe1a696a6abf9cea54b97e8baefff970fd", size = 487960, upload-time = "2025-10-21T15:54:53.253Z" }, + { url = "https://files.pythonhosted.org/packages/b9/08/854fa4b3b20471d1df1c71e831b6a1aa480281e37791e52a2df9641ec5c6/regex-2025.10.23-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:912e9df4e89d383681268d38ad8f5780d7cccd94ba0e9aa09ca7ab7ab4f8e7eb", size = 290425, upload-time = "2025-10-21T15:54:55.21Z" }, + { url = "https://files.pythonhosted.org/packages/ab/d3/6272b1dd3ca1271661e168762b234ad3e00dbdf4ef0c7b9b72d2d159efa7/regex-2025.10.23-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4f375c61bfc3138b13e762fe0ae76e3bdca92497816936534a0177201666f44f", size = 288278, upload-time = "2025-10-21T15:54:56.862Z" }, + { url = "https://files.pythonhosted.org/packages/14/8f/c7b365dd9d9bc0a36e018cb96f2ffb60d2ba8deb589a712b437f67de2920/regex-2025.10.23-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e248cc9446081119128ed002a3801f8031e0c219b5d3c64d3cc627da29ac0a33", size = 793289, upload-time = "2025-10-21T15:54:58.352Z" }, + { url = "https://files.pythonhosted.org/packages/d4/fb/b8fbe9aa16cf0c21f45ec5a6c74b4cecbf1a1c0deb7089d4a6f83a9c1caa/regex-2025.10.23-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b52bf9282fdf401e4f4e721f0f61fc4b159b1307244517789702407dd74e38ca", size = 860321, upload-time = "2025-10-21T15:54:59.813Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/81/bf41405c772324926a9bd8a640dedaa42da0e929241834dfce0733070437/regex-2025.10.23-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5c084889ab2c59765a0d5ac602fd1c3c244f9b3fcc9a65fdc7ba6b74c5287490", size = 907011, upload-time = "2025-10-21T15:55:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/a4/fb/5ad6a8b92d3f88f3797b51bb4ef47499acc2d0b53d2fbe4487a892f37a73/regex-2025.10.23-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d80e8eb79009bdb0936658c44ca06e2fbbca67792013e3818eea3f5f228971c2", size = 800312, upload-time = "2025-10-21T15:55:04.15Z" }, + { url = "https://files.pythonhosted.org/packages/42/48/b4efba0168a2b57f944205d823f8e8a3a1ae6211a34508f014ec2c712f4f/regex-2025.10.23-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6f259118ba87b814a8ec475380aee5f5ae97a75852a3507cf31d055b01b5b40", size = 782839, upload-time = "2025-10-21T15:55:05.641Z" }, + { url = "https://files.pythonhosted.org/packages/13/2a/c9efb4c6c535b0559c1fa8e431e0574d229707c9ca718600366fcfef6801/regex-2025.10.23-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9b8c72a242683dcc72d37595c4f1278dfd7642b769e46700a8df11eab19dfd82", size = 854270, upload-time = "2025-10-21T15:55:07.27Z" }, + { url = "https://files.pythonhosted.org/packages/34/2d/68eecc1bdaee020e8ba549502291c9450d90d8590d0552247c9b543ebf7b/regex-2025.10.23-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a8d7b7a0a3df9952f9965342159e0c1f05384c0f056a47ce8b61034f8cecbe83", size = 845771, upload-time = "2025-10-21T15:55:09.477Z" }, + { url = "https://files.pythonhosted.org/packages/a5/cd/a1ae499cf9b87afb47a67316bbf1037a7c681ffe447c510ed98c0aa2c01c/regex-2025.10.23-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:413bfea20a484c524858125e92b9ce6ffdd0a4b97d4ff96b5859aa119b0f1bdd", size = 788778, upload-time = "2025-10-21T15:55:11.396Z" }, + { url = "https://files.pythonhosted.org/packages/38/f9/70765e63f5ea7d43b2b6cd4ee9d3323f16267e530fb2a420d92d991cf0fc/regex-2025.10.23-cp311-cp311-win32.whl", hash = "sha256:f76deef1f1019a17dad98f408b8f7afc4bd007cbe835ae77b737e8c7f19ae575", size = 265666, upload-time = "2025-10-21T15:55:13.306Z" }, + { url = "https://files.pythonhosted.org/packages/9c/1a/18e9476ee1b63aaec3844d8e1cb21842dc19272c7e86d879bfc0dcc60db3/regex-2025.10.23-cp311-cp311-win_amd64.whl", hash = "sha256:59bba9f7125536f23fdab5deeea08da0c287a64c1d3acc1c7e99515809824de8", size = 277600, upload-time = "2025-10-21T15:55:15.087Z" }, + { url = "https://files.pythonhosted.org/packages/1d/1b/c019167b1f7a8ec77251457e3ff0339ed74ca8bce1ea13138dc98309c923/regex-2025.10.23-cp311-cp311-win_arm64.whl", hash = "sha256:b103a752b6f1632ca420225718d6ed83f6a6ced3016dd0a4ab9a6825312de566", size = 269974, upload-time = "2025-10-21T15:55:16.841Z" }, + { url = "https://files.pythonhosted.org/packages/f6/57/eeb274d83ab189d02d778851b1ac478477522a92b52edfa6e2ae9ff84679/regex-2025.10.23-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:7a44d9c00f7a0a02d3b777429281376370f3d13d2c75ae74eb94e11ebcf4a7fc", size = 489187, upload-time = "2025-10-21T15:55:18.322Z" }, + { url = "https://files.pythonhosted.org/packages/55/5c/7dad43a9b6ea88bf77e0b8b7729a4c36978e1043165034212fd2702880c6/regex-2025.10.23-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b83601f84fde939ae3478bb32a3aef36f61b58c3208d825c7e8ce1a735f143f2", size = 291122, upload-time = "2025-10-21T15:55:20.2Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/21/38b71e6f2818f0f4b281c8fba8d9d57cfca7b032a648fa59696e0a54376a/regex-2025.10.23-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ec13647907bb9d15fd192bbfe89ff06612e098a5709e7d6ecabbdd8f7908fc45", size = 288797, upload-time = "2025-10-21T15:55:21.932Z" }, + { url = "https://files.pythonhosted.org/packages/be/95/888f069c89e7729732a6d7cca37f76b44bfb53a1e35dda8a2c7b65c1b992/regex-2025.10.23-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:78d76dd2957d62501084e7012ddafc5fcd406dd982b7a9ca1ea76e8eaaf73e7e", size = 798442, upload-time = "2025-10-21T15:55:23.747Z" }, + { url = "https://files.pythonhosted.org/packages/76/70/4f903c608faf786627a8ee17c06e0067b5acade473678b69c8094b248705/regex-2025.10.23-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8668e5f067e31a47699ebb354f43aeb9c0ef136f915bd864243098524482ac43", size = 864039, upload-time = "2025-10-21T15:55:25.656Z" }, + { url = "https://files.pythonhosted.org/packages/62/19/2df67b526bf25756c7f447dde554fc10a220fd839cc642f50857d01e4a7b/regex-2025.10.23-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a32433fe3deb4b2d8eda88790d2808fed0dc097e84f5e683b4cd4f42edef6cca", size = 912057, upload-time = "2025-10-21T15:55:27.309Z" }, + { url = "https://files.pythonhosted.org/packages/99/14/9a39b7c9e007968411bc3c843cc14cf15437510c0a9991f080cab654fd16/regex-2025.10.23-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d97d73818c642c938db14c0668167f8d39520ca9d983604575ade3fda193afcc", size = 803374, upload-time = "2025-10-21T15:55:28.9Z" }, + { url = "https://files.pythonhosted.org/packages/d4/f7/3495151dd3ca79949599b6d069b72a61a2c5e24fc441dccc79dcaf708fe6/regex-2025.10.23-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bca7feecc72ee33579e9f6ddf8babbe473045717a0e7dbc347099530f96e8b9a", size = 787714, upload-time = "2025-10-21T15:55:30.628Z" }, + { url = "https://files.pythonhosted.org/packages/28/65/ee882455e051131869957ee8597faea45188c9a98c0dad724cfb302d4580/regex-2025.10.23-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7e24af51e907d7457cc4a72691ec458320b9ae67dc492f63209f01eecb09de32", size = 858392, upload-time = "2025-10-21T15:55:32.322Z" }, + { url = "https://files.pythonhosted.org/packages/53/25/9287fef5be97529ebd3ac79d256159cb709a07eb58d4be780d1ca3885da8/regex-2025.10.23-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:d10bcde58bbdf18146f3a69ec46dd03233b94a4a5632af97aa5378da3a47d288", size = 850484, upload-time = "2025-10-21T15:55:34.037Z" }, + { url = "https://files.pythonhosted.org/packages/f3/b4/b49b88b4fea2f14dc73e5b5842755e782fc2e52f74423d6f4adc130d5880/regex-2025.10.23-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:44383bc0c933388516c2692c9a7503e1f4a67e982f20b9a29d2fb70c6494f147", size = 789634, upload-time = "2025-10-21T15:55:35.958Z" }, + { url = "https://files.pythonhosted.org/packages/b6/3c/2f8d199d0e84e78bcd6bdc2be9b62410624f6b796e2893d1837ae738b160/regex-2025.10.23-cp312-cp312-win32.whl", hash = "sha256:6040a86f95438a0114bba16e51dfe27f1bc004fd29fe725f54a586f6d522b079", size = 266060, upload-time = "2025-10-21T15:55:37.902Z" }, + { url = "https://files.pythonhosted.org/packages/d7/67/c35e80969f6ded306ad70b0698863310bdf36aca57ad792f45ddc0e2271f/regex-2025.10.23-cp312-cp312-win_amd64.whl", hash = "sha256:436b4c4352fe0762e3bfa34a5567079baa2ef22aa9c37cf4d128979ccfcad842", size = 
276931, upload-time = "2025-10-21T15:55:39.502Z" }, + { url = "https://files.pythonhosted.org/packages/f5/a1/4ed147de7d2b60174f758412c87fa51ada15cd3296a0ff047f4280aaa7ca/regex-2025.10.23-cp312-cp312-win_arm64.whl", hash = "sha256:f4b1b1991617055b46aff6f6db24888c1f05f4db9801349d23f09ed0714a9335", size = 270103, upload-time = "2025-10-21T15:55:41.24Z" }, + { url = "https://files.pythonhosted.org/packages/28/c6/195a6217a43719d5a6a12cc192a22d12c40290cecfa577f00f4fb822f07d/regex-2025.10.23-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:b7690f95404a1293923a296981fd943cca12c31a41af9c21ba3edd06398fc193", size = 488956, upload-time = "2025-10-21T15:55:42.887Z" }, + { url = "https://files.pythonhosted.org/packages/4c/93/181070cd1aa2fa541ff2d3afcf763ceecd4937b34c615fa92765020a6c90/regex-2025.10.23-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1a32d77aeaea58a13230100dd8797ac1a84c457f3af2fdf0d81ea689d5a9105b", size = 290997, upload-time = "2025-10-21T15:55:44.53Z" }, + { url = "https://files.pythonhosted.org/packages/b6/c5/9d37fbe3a40ed8dda78c23e1263002497540c0d1522ed75482ef6c2000f0/regex-2025.10.23-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:b24b29402f264f70a3c81f45974323b41764ff7159655360543b7cabb73e7d2f", size = 288686, upload-time = "2025-10-21T15:55:46.186Z" }, + { url = "https://files.pythonhosted.org/packages/5f/e7/db610ff9f10c2921f9b6ac0c8d8be4681b28ddd40fc0549429366967e61f/regex-2025.10.23-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:563824a08c7c03d96856d84b46fdb3bbb7cfbdf79da7ef68725cda2ce169c72a", size = 798466, upload-time = "2025-10-21T15:55:48.24Z" }, + { url = "https://files.pythonhosted.org/packages/90/10/aab883e1fa7fe2feb15ac663026e70ca0ae1411efa0c7a4a0342d9545015/regex-2025.10.23-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a0ec8bdd88d2e2659c3518087ee34b37e20bd169419ffead4240a7004e8ed03b", size = 863996, upload-time = "2025-10-21T15:55:50.478Z" }, + { url = "https://files.pythonhosted.org/packages/a2/b0/8f686dd97a51f3b37d0238cd00a6d0f9ccabe701f05b56de1918571d0d61/regex-2025.10.23-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b577601bfe1d33913fcd9276d7607bbac827c4798d9e14d04bf37d417a6c41cb", size = 912145, upload-time = "2025-10-21T15:55:52.215Z" }, + { url = "https://files.pythonhosted.org/packages/a3/ca/639f8cd5b08797bca38fc5e7e07f76641a428cf8c7fca05894caf045aa32/regex-2025.10.23-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7c9f2c68ac6cb3de94eea08a437a75eaa2bd33f9e97c84836ca0b610a5804368", size = 803370, upload-time = "2025-10-21T15:55:53.944Z" }, + { url = "https://files.pythonhosted.org/packages/0d/1e/a40725bb76959eddf8abc42a967bed6f4851b39f5ac4f20e9794d7832aa5/regex-2025.10.23-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:89f8b9ea3830c79468e26b0e21c3585f69f105157c2154a36f6b7839f8afb351", size = 787767, upload-time = "2025-10-21T15:55:56.004Z" }, + { url = "https://files.pythonhosted.org/packages/3d/d8/8ee9858062936b0f99656dce390aa667c6e7fb0c357b1b9bf76fb5e2e708/regex-2025.10.23-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:98fd84c4e4ea185b3bb5bf065261ab45867d8875032f358a435647285c722673", size = 858335, upload-time = "2025-10-21T15:55:58.185Z" }, + { url = "https://files.pythonhosted.org/packages/d8/0a/ed5faaa63fa8e3064ab670e08061fbf09e3a10235b19630cf0cbb9e48c0a/regex-2025.10.23-cp313-cp313-musllinux_1_2_s390x.whl", hash = 
"sha256:1e11d3e5887b8b096f96b4154dfb902f29c723a9556639586cd140e77e28b313", size = 850402, upload-time = "2025-10-21T15:56:00.023Z" }, + { url = "https://files.pythonhosted.org/packages/79/14/d05f617342f4b2b4a23561da500ca2beab062bfcc408d60680e77ecaf04d/regex-2025.10.23-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f13450328a6634348d47a88367e06b64c9d84980ef6a748f717b13f8ce64e87", size = 789739, upload-time = "2025-10-21T15:56:01.967Z" }, + { url = "https://files.pythonhosted.org/packages/f9/7b/e8ce8eef42a15f2c3461f8b3e6e924bbc86e9605cb534a393aadc8d3aff8/regex-2025.10.23-cp313-cp313-win32.whl", hash = "sha256:37be9296598a30c6a20236248cb8b2c07ffd54d095b75d3a2a2ee5babdc51df1", size = 266054, upload-time = "2025-10-21T15:56:05.291Z" }, + { url = "https://files.pythonhosted.org/packages/71/2d/55184ed6be6473187868d2f2e6a0708195fc58270e62a22cbf26028f2570/regex-2025.10.23-cp313-cp313-win_amd64.whl", hash = "sha256:ea7a3c283ce0f06fe789365841e9174ba05f8db16e2fd6ae00a02df9572c04c0", size = 276917, upload-time = "2025-10-21T15:56:07.303Z" }, + { url = "https://files.pythonhosted.org/packages/9c/d4/927eced0e2bd45c45839e556f987f8c8f8683268dd3c00ad327deb3b0172/regex-2025.10.23-cp313-cp313-win_arm64.whl", hash = "sha256:d9a4953575f300a7bab71afa4cd4ac061c7697c89590a2902b536783eeb49a4f", size = 270105, upload-time = "2025-10-21T15:56:09.857Z" }, + { url = "https://files.pythonhosted.org/packages/3e/b3/95b310605285573341fc062d1d30b19a54f857530e86c805f942c4ff7941/regex-2025.10.23-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:7d6606524fa77b3912c9ef52a42ef63c6cfbfc1077e9dc6296cd5da0da286044", size = 491850, upload-time = "2025-10-21T15:56:11.685Z" }, + { url = "https://files.pythonhosted.org/packages/a4/8f/207c2cec01e34e56db1eff606eef46644a60cf1739ecd474627db90ad90b/regex-2025.10.23-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:c037aadf4d64bdc38af7db3dbd34877a057ce6524eefcb2914d6d41c56f968cc", size = 292537, upload-time = "2025-10-21T15:56:13.963Z" }, + { url = "https://files.pythonhosted.org/packages/98/3b/025240af4ada1dc0b5f10d73f3e5122d04ce7f8908ab8881e5d82b9d61b6/regex-2025.10.23-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:99018c331fb2529084a0c9b4c713dfa49fafb47c7712422e49467c13a636c656", size = 290904, upload-time = "2025-10-21T15:56:16.016Z" }, + { url = "https://files.pythonhosted.org/packages/81/8e/104ac14e2d3450c43db18ec03e1b96b445a94ae510b60138f00ce2cb7ca1/regex-2025.10.23-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fd8aba965604d70306eb90a35528f776e59112a7114a5162824d43b76fa27f58", size = 807311, upload-time = "2025-10-21T15:56:17.818Z" }, + { url = "https://files.pythonhosted.org/packages/19/63/78aef90141b7ce0be8a18e1782f764f6997ad09de0e05251f0d2503a914a/regex-2025.10.23-cp313-cp313t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:238e67264b4013e74136c49f883734f68656adf8257bfa13b515626b31b20f8e", size = 873241, upload-time = "2025-10-21T15:56:19.941Z" }, + { url = "https://files.pythonhosted.org/packages/b3/a8/80eb1201bb49ae4dba68a1b284b4211ed9daa8e74dc600018a10a90399fb/regex-2025.10.23-cp313-cp313t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b2eb48bd9848d66fd04826382f5e8491ae633de3233a3d64d58ceb4ecfa2113a", size = 914794, upload-time = "2025-10-21T15:56:22.488Z" }, + { url = 
"https://files.pythonhosted.org/packages/f0/d5/1984b6ee93281f360a119a5ca1af6a8ca7d8417861671388bf750becc29b/regex-2025.10.23-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d36591ce06d047d0c0fe2fc5f14bfbd5b4525d08a7b6a279379085e13f0e3d0e", size = 812581, upload-time = "2025-10-21T15:56:24.319Z" }, + { url = "https://files.pythonhosted.org/packages/c4/39/11ebdc6d9927172a64ae237d16763145db6bd45ebb4055c17b88edab72a7/regex-2025.10.23-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b5d4ece8628d6e364302006366cea3ee887db397faebacc5dacf8ef19e064cf8", size = 795346, upload-time = "2025-10-21T15:56:26.232Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b4/89a591bcc08b5e436af43315284bd233ba77daf0cf20e098d7af12f006c1/regex-2025.10.23-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:39a7e8083959cb1c4ff74e483eecb5a65d3b3e1d821b256e54baf61782c906c6", size = 868214, upload-time = "2025-10-21T15:56:28.597Z" }, + { url = "https://files.pythonhosted.org/packages/3d/ff/58ba98409c1dbc8316cdb20dafbc63ed267380a07780cafecaf5012dabc9/regex-2025.10.23-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:842d449a8fefe546f311656cf8c0d6729b08c09a185f1cad94c756210286d6a8", size = 854540, upload-time = "2025-10-21T15:56:30.875Z" }, + { url = "https://files.pythonhosted.org/packages/9a/f2/4a9e9338d67626e2071b643f828a482712ad15889d7268e11e9a63d6f7e9/regex-2025.10.23-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d614986dc68506be8f00474f4f6960e03e4ca9883f7df47744800e7d7c08a494", size = 799346, upload-time = "2025-10-21T15:56:32.725Z" }, + { url = "https://files.pythonhosted.org/packages/63/be/543d35c46bebf6f7bf2be538cca74d6585f25714700c36f37f01b92df551/regex-2025.10.23-cp313-cp313t-win32.whl", hash = "sha256:a5b7a26b51a9df473ec16a1934d117443a775ceb7b39b78670b2e21893c330c9", size = 268657, upload-time = "2025-10-21T15:56:34.577Z" }, + { url = "https://files.pythonhosted.org/packages/14/9f/4dd6b7b612037158bb2c9bcaa710e6fb3c40ad54af441b9c53b3a137a9f1/regex-2025.10.23-cp313-cp313t-win_amd64.whl", hash = "sha256:ce81c5544a5453f61cb6f548ed358cfb111e3b23f3cd42d250a4077a6be2a7b6", size = 280075, upload-time = "2025-10-21T15:56:36.767Z" }, + { url = "https://files.pythonhosted.org/packages/81/7a/5bd0672aa65d38c8da6747c17c8b441bdb53d816c569e3261013af8e83cf/regex-2025.10.23-cp313-cp313t-win_arm64.whl", hash = "sha256:e9bf7f6699f490e4e43c44757aa179dab24d1960999c84ab5c3d5377714ed473", size = 271219, upload-time = "2025-10-21T15:56:39.033Z" }, + { url = "https://files.pythonhosted.org/packages/73/f6/0caf29fec943f201fbc8822879c99d31e59c1d51a983d9843ee5cf398539/regex-2025.10.23-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:5b5cb5b6344c4c4c24b2dc87b0bfee78202b07ef7633385df70da7fcf6f7cec6", size = 488960, upload-time = "2025-10-21T15:56:40.849Z" }, + { url = "https://files.pythonhosted.org/packages/8e/7d/ebb7085b8fa31c24ce0355107cea2b92229d9050552a01c5d291c42aecea/regex-2025.10.23-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:a6ce7973384c37bdf0f371a843f95a6e6f4e1489e10e0cf57330198df72959c5", size = 290932, upload-time = "2025-10-21T15:56:42.875Z" }, + { url = "https://files.pythonhosted.org/packages/27/41/43906867287cbb5ca4cee671c3cc8081e15deef86a8189c3aad9ac9f6b4d/regex-2025.10.23-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2ee3663f2c334959016b56e3bd0dd187cbc73f948e3a3af14c3caaa0c3035d10", size = 288766, upload-time = "2025-10-21T15:56:44.894Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/9e/ea66132776700fc77a39b1056e7a5f1308032fead94507e208dc6716b7cd/regex-2025.10.23-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2003cc82a579107e70d013482acce8ba773293f2db534fb532738395c557ff34", size = 798884, upload-time = "2025-10-21T15:56:47.178Z" }, + { url = "https://files.pythonhosted.org/packages/d5/99/aed1453687ab63819a443930770db972c5c8064421f0d9f5da9ad029f26b/regex-2025.10.23-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:182c452279365a93a9f45874f7f191ec1c51e1f1eb41bf2b16563f1a40c1da3a", size = 864768, upload-time = "2025-10-21T15:56:49.793Z" }, + { url = "https://files.pythonhosted.org/packages/99/5d/732fe747a1304805eb3853ce6337eea16b169f7105a0d0dd9c6a5ffa9948/regex-2025.10.23-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b1249e9ff581c5b658c8f0437f883b01f1edcf424a16388591e7c05e5e9e8b0c", size = 911394, upload-time = "2025-10-21T15:56:52.186Z" }, + { url = "https://files.pythonhosted.org/packages/5e/48/58a1f6623466522352a6efa153b9a3714fc559d9f930e9bc947b4a88a2c3/regex-2025.10.23-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b841698f93db3ccc36caa1900d2a3be281d9539b822dc012f08fc80b46a3224", size = 803145, upload-time = "2025-10-21T15:56:55.142Z" }, + { url = "https://files.pythonhosted.org/packages/ea/f6/7dea79be2681a5574ab3fc237aa53b2c1dfd6bd2b44d4640b6c76f33f4c1/regex-2025.10.23-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:956d89e0c92d471e8f7eee73f73fdff5ed345886378c45a43175a77538a1ffe4", size = 787831, upload-time = "2025-10-21T15:56:57.203Z" }, + { url = "https://files.pythonhosted.org/packages/3a/ad/07b76950fbbe65f88120ca2d8d845047c401450f607c99ed38862904671d/regex-2025.10.23-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5c259cb363299a0d90d63b5c0d7568ee98419861618a95ee9d91a41cb9954462", size = 859162, upload-time = "2025-10-21T15:56:59.195Z" }, + { url = "https://files.pythonhosted.org/packages/41/87/374f3b2021b22aa6a4fc0b750d63f9721e53d1631a238f7a1c343c1cd288/regex-2025.10.23-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:185d2b18c062820b3a40d8fefa223a83f10b20a674bf6e8c4a432e8dfd844627", size = 849899, upload-time = "2025-10-21T15:57:01.747Z" }, + { url = "https://files.pythonhosted.org/packages/12/4a/7f7bb17c5a5a9747249807210e348450dab9212a46ae6d23ebce86ba6a2b/regex-2025.10.23-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:281d87fa790049c2b7c1b4253121edd80b392b19b5a3d28dc2a77579cb2a58ec", size = 789372, upload-time = "2025-10-21T15:57:04.018Z" }, + { url = "https://files.pythonhosted.org/packages/c9/dd/9c7728ff544fea09bbc8635e4c9e7c423b11c24f1a7a14e6ac4831466709/regex-2025.10.23-cp314-cp314-win32.whl", hash = "sha256:63b81eef3656072e4ca87c58084c7a9c2b81d41a300b157be635a8a675aacfb8", size = 271451, upload-time = "2025-10-21T15:57:06.266Z" }, + { url = "https://files.pythonhosted.org/packages/48/f8/ef7837ff858eb74079c4804c10b0403c0b740762e6eedba41062225f7117/regex-2025.10.23-cp314-cp314-win_amd64.whl", hash = "sha256:0967c5b86f274800a34a4ed862dfab56928144d03cb18821c5153f8777947796", size = 280173, upload-time = "2025-10-21T15:57:08.206Z" }, + { url = "https://files.pythonhosted.org/packages/8e/d0/d576e1dbd9885bfcd83d0e90762beea48d9373a6f7ed39170f44ed22e336/regex-2025.10.23-cp314-cp314-win_arm64.whl", hash = "sha256:c70dfe58b0a00b36aa04cdb0f798bf3e0adc31747641f69e191109fd8572c9a9", size = 273206, 
upload-time = "2025-10-21T15:57:10.367Z" }, + { url = "https://files.pythonhosted.org/packages/a6/d0/2025268315e8b2b7b660039824cb7765a41623e97d4cd421510925400487/regex-2025.10.23-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1f5799ea1787aa6de6c150377d11afad39a38afd033f0c5247aecb997978c422", size = 491854, upload-time = "2025-10-21T15:57:12.526Z" }, + { url = "https://files.pythonhosted.org/packages/44/35/5681c2fec5e8b33454390af209c4353dfc44606bf06d714b0b8bd0454ffe/regex-2025.10.23-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:a9639ab7540cfea45ef57d16dcbea2e22de351998d614c3ad2f9778fa3bdd788", size = 292542, upload-time = "2025-10-21T15:57:15.158Z" }, + { url = "https://files.pythonhosted.org/packages/5d/17/184eed05543b724132e4a18149e900f5189001fcfe2d64edaae4fbaf36b4/regex-2025.10.23-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:08f52122c352eb44c3421dab78b9b73a8a77a282cc8314ae576fcaa92b780d10", size = 290903, upload-time = "2025-10-21T15:57:17.108Z" }, + { url = "https://files.pythonhosted.org/packages/25/d0/5e3347aa0db0de382dddfa133a7b0ae72f24b4344f3989398980b44a3924/regex-2025.10.23-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ebf1baebef1c4088ad5a5623decec6b52950f0e4d7a0ae4d48f0a99f8c9cb7d7", size = 807546, upload-time = "2025-10-21T15:57:19.179Z" }, + { url = "https://files.pythonhosted.org/packages/d2/bb/40c589bbdce1be0c55e9f8159789d58d47a22014f2f820cf2b517a5cd193/regex-2025.10.23-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:16b0f1c2e2d566c562d5c384c2b492646be0a19798532fdc1fdedacc66e3223f", size = 873322, upload-time = "2025-10-21T15:57:21.36Z" }, + { url = "https://files.pythonhosted.org/packages/fe/56/a7e40c01575ac93360e606278d359f91829781a9f7fb6e5aa435039edbda/regex-2025.10.23-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f7ada5d9dceafaab92646aa00c10a9efd9b09942dd9b0d7c5a4b73db92cc7e61", size = 914855, upload-time = "2025-10-21T15:57:24.044Z" }, + { url = "https://files.pythonhosted.org/packages/5c/4b/d55587b192763db3163c3f508b3b67b31bb6f5e7a0e08b83013d0a59500a/regex-2025.10.23-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a36b4005770044bf08edecc798f0e41a75795b9e7c9c12fe29da8d792ef870c", size = 812724, upload-time = "2025-10-21T15:57:26.123Z" }, + { url = "https://files.pythonhosted.org/packages/33/20/18bac334955fbe99d17229f4f8e98d05e4a501ac03a442be8facbb37c304/regex-2025.10.23-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:af7b2661dcc032da1fae82069b5ebf2ac1dfcd5359ef8b35e1367bfc92181432", size = 795439, upload-time = "2025-10-21T15:57:28.497Z" }, + { url = "https://files.pythonhosted.org/packages/67/46/c57266be9df8549c7d85deb4cb82280cb0019e46fff677534c5fa1badfa4/regex-2025.10.23-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:1cb976810ac1416a67562c2e5ba0accf6f928932320fef302e08100ed681b38e", size = 868336, upload-time = "2025-10-21T15:57:30.867Z" }, + { url = "https://files.pythonhosted.org/packages/b8/f3/bd5879e41ef8187fec5e678e94b526a93f99e7bbe0437b0f2b47f9101694/regex-2025.10.23-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:1a56a54be3897d62f54290190fbcd754bff6932934529fbf5b29933da28fcd43", size = 854567, upload-time = "2025-10-21T15:57:33.062Z" }, + { url = 
"https://files.pythonhosted.org/packages/e6/57/2b6bbdbd2f24dfed5b028033aa17ad8f7d86bb28f1a892cac8b3bc89d059/regex-2025.10.23-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:8f3e6d202fb52c2153f532043bbcf618fd177df47b0b306741eb9b60ba96edc3", size = 799565, upload-time = "2025-10-21T15:57:35.153Z" }, + { url = "https://files.pythonhosted.org/packages/c7/ba/a6168f542ba73b151ed81237adf6b869c7b2f7f8d51618111296674e20ee/regex-2025.10.23-cp314-cp314t-win32.whl", hash = "sha256:1fa1186966b2621b1769fd467c7b22e317e6ba2d2cdcecc42ea3089ef04a8521", size = 274428, upload-time = "2025-10-21T15:57:37.996Z" }, + { url = "https://files.pythonhosted.org/packages/ef/a0/c84475e14a2829e9b0864ebf77c3f7da909df9d8acfe2bb540ff0072047c/regex-2025.10.23-cp314-cp314t-win_amd64.whl", hash = "sha256:08a15d40ce28362eac3e78e83d75475147869c1ff86bc93285f43b4f4431a741", size = 284140, upload-time = "2025-10-21T15:57:40.027Z" }, + { url = "https://files.pythonhosted.org/packages/51/33/6a08ade0eee5b8ba79386869fa6f77afeb835b60510f3525db987e2fffc4/regex-2025.10.23-cp314-cp314t-win_arm64.whl", hash = "sha256:a93e97338e1c8ea2649e130dcfbe8cd69bba5e1e163834752ab64dcb4de6d5ed", size = 274497, upload-time = "2025-10-21T15:57:42.389Z" }, + { url = "https://files.pythonhosted.org/packages/ed/16/ebf3f7dec606a5b0f23a01317c7989037f152f407170f17030ee977d4211/regex-2025.10.23-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d8d286760ee5b77fd21cf6b33cc45e0bffd1deeda59ca65b9be996f590a9828a", size = 487962, upload-time = "2025-10-21T15:57:44.433Z" }, + { url = "https://files.pythonhosted.org/packages/ee/77/2893ad1c98a9eebe13a7a622c77ade288c93280d5581c83265d10e473935/regex-2025.10.23-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e72e3b84b170fec02193d32620a0a7060a22e52c46e45957dcd14742e0d28fb", size = 290418, upload-time = "2025-10-21T15:57:46.764Z" }, + { url = "https://files.pythonhosted.org/packages/a6/5e/362fa14750a38efeb312f066f9ac941ae49960567331e48bf615ba11ad75/regex-2025.10.23-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ec506e8114fa12d21616deb44800f536d6bf2e1a69253dbf611f69af92395c99", size = 288298, upload-time = "2025-10-21T15:57:48.881Z" }, + { url = "https://files.pythonhosted.org/packages/e7/20/147df33bc304ec77e5c97f68a930ea97890f846a2d64b43402344002a00d/regex-2025.10.23-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d7e481f9710e8e24228ce2c77b41db7662a3f68853395da86a292b49dadca2aa", size = 780875, upload-time = "2025-10-21T15:57:51.248Z" }, + { url = "https://files.pythonhosted.org/packages/1d/62/ec306048d4da04fe4b620b26759df9fd4276f4d896de0560b4e49cec3f8a/regex-2025.10.23-cp39-cp39-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4663ff2fc367735ae7b90b4f0e05b25554446df4addafc76fdaacaaa0ba852b5", size = 850304, upload-time = "2025-10-21T15:57:53.507Z" }, + { url = "https://files.pythonhosted.org/packages/98/43/10f900eac7745475021d627d43d73d458c0b0503e42877c9040f11001ae7/regex-2025.10.23-cp39-cp39-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0879dd3251a42d2e9b938e1e03b1e9f60de90b4d153015193f5077a376a18439", size = 897914, upload-time = "2025-10-21T15:57:55.788Z" }, + { url = "https://files.pythonhosted.org/packages/44/4b/9b0eade50d8100f363b87c549e73039d3f639a2ab0b035f48551b89caa74/regex-2025.10.23-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:651c58aecbab7e97bdf8ec76298a28d2bf2b6238c099ec6bf32e6d41e2f9a9cb", size = 791074, upload-time = 
"2025-10-21T15:57:58.252Z" }, + { url = "https://files.pythonhosted.org/packages/25/87/1392f0cbc5b4592d37c947e051603caf5b6f006188c9959077231170e9b4/regex-2025.10.23-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ceabc62a0e879169cd1bf066063bd6991c3e41e437628936a2ce66e0e2071c32", size = 781781, upload-time = "2025-10-21T15:58:00.478Z" }, + { url = "https://files.pythonhosted.org/packages/ba/6a/d327cf755d3171855a8916f1770aefe7551248027688505a88490904dee1/regex-2025.10.23-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:bfdf4e9aa3e7b7d02fda97509b4ceeed34542361694ecc0a81db1688373ecfbd", size = 774142, upload-time = "2025-10-21T15:58:02.854Z" }, + { url = "https://files.pythonhosted.org/packages/22/5c/74f9caf0836707c3f4a4e19bbd9c6c93faa48cd658dfde54588d898e0cfb/regex-2025.10.23-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:92f565ff9beb9f51bc7cc8c578a7e92eb5c4576b69043a4c58cd05d73fda83c5", size = 845084, upload-time = "2025-10-21T15:58:05.36Z" }, + { url = "https://files.pythonhosted.org/packages/cf/7f/1dd095103748636616919f5f507aab6ed3a3df0ded7f4607d6418c84b75e/regex-2025.10.23-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:abbea548b1076eaf8635caf1071c9d86efdf0fa74abe71fca26c05a2d64cda80", size = 835448, upload-time = "2025-10-21T15:58:07.978Z" }, + { url = "https://files.pythonhosted.org/packages/e2/72/572c46603ae8cc3ad77bcaac45f395cdf4051d57406f9f6db2131c92f251/regex-2025.10.23-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:33535dcf34f47821381e341f7b715cbd027deda4223af4d3932adcd371d3192a", size = 779300, upload-time = "2025-10-21T15:58:10.569Z" }, + { url = "https://files.pythonhosted.org/packages/1b/d8/8f98716394bcfe9c243f08bda4df481020c53777d1a342ab0a180484c741/regex-2025.10.23-cp39-cp39-win32.whl", hash = "sha256:345c9df49a15bf6460534b104b336581bc5f35c286cac526416e7a63d389b09b", size = 265700, upload-time = "2025-10-21T15:58:12.895Z" }, + { url = "https://files.pythonhosted.org/packages/b8/cf/6d2a18663fadd8a2bc35829497b8e94e7a0b876dff22c8ac2d0c650de8f5/regex-2025.10.23-cp39-cp39-win_amd64.whl", hash = "sha256:f668fe1fd3358c5423355a289a4a003e58005ce829d217b828f80bd605a90145", size = 277666, upload-time = "2025-10-21T15:58:15.169Z" }, + { url = "https://files.pythonhosted.org/packages/86/e8/43773997a0de7cac2fdc76b7db7e5156326cd2f5eedf37447bee021d93b4/regex-2025.10.23-cp39-cp39-win_arm64.whl", hash = "sha256:07a3fd25d9074923e4d7258b551ae35ab6bdfe01904b8f0d5341c7d8b20eb18d", size = 270006, upload-time = "2025-10-21T15:58:18.112Z" }, ] [[package]] @@ -2940,6 +2985,9 @@ wheels = [ name = "rpds-py" version = "0.27.1" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] sdist = { url = "https://files.pythonhosted.org/packages/e9/dd/2c0cbe774744272b0ae725f44032c77bdcab6e8bcf544bffa3b6e70c8dba/rpds_py-0.27.1.tar.gz", hash = "sha256:26a1c73171d10b7acccbded82bf6a586ab8203601e565badc74bbbf8bc5a10f8", size = 27479, upload-time = "2025-08-27T12:16:36.024Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/a5/ed/3aef893e2dd30e77e35d20d4ddb45ca459db59cead748cad9796ad479411/rpds_py-0.27.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:68afeec26d42ab3b47e541b272166a0b4400313946871cba3ed3a4fc0cab1cef", size = 371606, upload-time = "2025-08-27T12:12:25.189Z" }, @@ -3098,6 +3146,133 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/60/94/384cf54c430b9dac742bbd2ec26c23feb78ded0d43d6d78563a281aec017/rpds_py-0.27.1-pp39-pypy39_pp73-win_amd64.whl", hash = 
"sha256:4fc9b7fe29478824361ead6e14e4f5aed570d477e06088826537e202d25fe859", size = 228784, upload-time = "2025-08-27T12:16:34.428Z" }, ] +[[package]] +name = "rpds-py" +version = "0.28.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/48/dc/95f074d43452b3ef5d06276696ece4b3b5d696e7c9ad7173c54b1390cd70/rpds_py-0.28.0.tar.gz", hash = "sha256:abd4df20485a0983e2ca334a216249b6186d6e3c1627e106651943dbdb791aea", size = 27419, upload-time = "2025-10-22T22:24:29.327Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/82/f8/13bb772dc7cbf2c3c5b816febc34fa0cb2c64a08e0569869585684ce6631/rpds_py-0.28.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:7b6013db815417eeb56b2d9d7324e64fcd4fa289caeee6e7a78b2e11fc9b438a", size = 362820, upload-time = "2025-10-22T22:21:15.074Z" }, + { url = "https://files.pythonhosted.org/packages/84/91/6acce964aab32469c3dbe792cb041a752d64739c534e9c493c701ef0c032/rpds_py-0.28.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:1a4c6b05c685c0c03f80dabaeb73e74218c49deea965ca63f76a752807397207", size = 348499, upload-time = "2025-10-22T22:21:17.658Z" }, + { url = "https://files.pythonhosted.org/packages/f1/93/c05bb1f4f5e0234db7c4917cb8dd5e2e0a9a7b26dc74b1b7bee3c9cfd477/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4794c6c3fbe8f9ac87699b131a1f26e7b4abcf6d828da46a3a52648c7930eba", size = 379356, upload-time = "2025-10-22T22:21:19.847Z" }, + { url = "https://files.pythonhosted.org/packages/5c/37/e292da436f0773e319753c567263427cdf6c645d30b44f09463ff8216cda/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2e8456b6ee5527112ff2354dd9087b030e3429e43a74f480d4a5ca79d269fd85", size = 390151, upload-time = "2025-10-22T22:21:21.569Z" }, + { url = "https://files.pythonhosted.org/packages/76/87/a4e3267131616e8faf10486dc00eaedf09bd61c87f01e5ef98e782ee06c9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:beb880a9ca0a117415f241f66d56025c02037f7c4efc6fe59b5b8454f1eaa50d", size = 524831, upload-time = "2025-10-22T22:21:23.394Z" }, + { url = "https://files.pythonhosted.org/packages/e1/c8/4a4ca76f0befae9515da3fad11038f0fce44f6bb60b21fe9d9364dd51fb0/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6897bebb118c44b38c9cb62a178e09f1593c949391b9a1a6fe777ccab5934ee7", size = 404687, upload-time = "2025-10-22T22:21:25.201Z" }, + { url = "https://files.pythonhosted.org/packages/6a/65/118afe854424456beafbbebc6b34dcf6d72eae3a08b4632bc4220f8240d9/rpds_py-0.28.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1b553dd06e875249fd43efd727785efb57a53180e0fde321468222eabbeaafa", size = 382683, upload-time = "2025-10-22T22:21:26.536Z" }, + { url = "https://files.pythonhosted.org/packages/f7/bc/0625064041fb3a0c77ecc8878c0e8341b0ae27ad0f00cf8f2b57337a1e63/rpds_py-0.28.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:f0b2044fdddeea5b05df832e50d2a06fe61023acb44d76978e1b060206a8a476", size = 398927, upload-time = "2025-10-22T22:21:27.864Z" }, + { url = "https://files.pythonhosted.org/packages/5d/1a/fed7cf2f1ee8a5e4778f2054153f2cfcf517748875e2f5b21cf8907cd77d/rpds_py-0.28.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = 
"sha256:05cf1e74900e8da73fa08cc76c74a03345e5a3e37691d07cfe2092d7d8e27b04", size = 411590, upload-time = "2025-10-22T22:21:29.474Z" }, + { url = "https://files.pythonhosted.org/packages/c1/64/a8e0f67fa374a6c472dbb0afdaf1ef744724f165abb6899f20e2f1563137/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:efd489fec7c311dae25e94fe7eeda4b3d06be71c68f2cf2e8ef990ffcd2cd7e8", size = 559843, upload-time = "2025-10-22T22:21:30.917Z" }, + { url = "https://files.pythonhosted.org/packages/a9/ea/e10353f6d7c105be09b8135b72787a65919971ae0330ad97d87e4e199880/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ada7754a10faacd4f26067e62de52d6af93b6d9542f0df73c57b9771eb3ba9c4", size = 584188, upload-time = "2025-10-22T22:21:32.827Z" }, + { url = "https://files.pythonhosted.org/packages/18/b0/a19743e0763caf0c89f6fc6ba6fbd9a353b24ffb4256a492420c5517da5a/rpds_py-0.28.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c2a34fd26588949e1e7977cfcbb17a9a42c948c100cab890c6d8d823f0586457", size = 550052, upload-time = "2025-10-22T22:21:34.702Z" }, + { url = "https://files.pythonhosted.org/packages/de/bc/ec2c004f6c7d6ab1e25dae875cdb1aee087c3ebed5b73712ed3000e3851a/rpds_py-0.28.0-cp310-cp310-win32.whl", hash = "sha256:f9174471d6920cbc5e82a7822de8dfd4dcea86eb828b04fc8c6519a77b0ee51e", size = 215110, upload-time = "2025-10-22T22:21:36.645Z" }, + { url = "https://files.pythonhosted.org/packages/6c/de/4ce8abf59674e17187023933547d2018363e8fc76ada4f1d4d22871ccb6e/rpds_py-0.28.0-cp310-cp310-win_amd64.whl", hash = "sha256:6e32dd207e2c4f8475257a3540ab8a93eff997abfa0a3fdb287cae0d6cd874b8", size = 223850, upload-time = "2025-10-22T22:21:38.006Z" }, + { url = "https://files.pythonhosted.org/packages/a6/34/058d0db5471c6be7bef82487ad5021ff8d1d1d27794be8730aad938649cf/rpds_py-0.28.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:03065002fd2e287725d95fbc69688e0c6daf6c6314ba38bdbaa3895418e09296", size = 362344, upload-time = "2025-10-22T22:21:39.713Z" }, + { url = "https://files.pythonhosted.org/packages/5d/67/9503f0ec8c055a0782880f300c50a2b8e5e72eb1f94dfc2053da527444dd/rpds_py-0.28.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:28ea02215f262b6d078daec0b45344c89e161eab9526b0d898221d96fdda5f27", size = 348440, upload-time = "2025-10-22T22:21:41.056Z" }, + { url = "https://files.pythonhosted.org/packages/68/2e/94223ee9b32332a41d75b6f94b37b4ce3e93878a556fc5f152cbd856a81f/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25dbade8fbf30bcc551cb352376c0ad64b067e4fc56f90e22ba70c3ce205988c", size = 379068, upload-time = "2025-10-22T22:21:42.593Z" }, + { url = "https://files.pythonhosted.org/packages/b4/25/54fd48f9f680cfc44e6a7f39a5fadf1d4a4a1fd0848076af4a43e79f998c/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3c03002f54cc855860bfdc3442928ffdca9081e73b5b382ed0b9e8efe6e5e205", size = 390518, upload-time = "2025-10-22T22:21:43.998Z" }, + { url = "https://files.pythonhosted.org/packages/1b/85/ac258c9c27f2ccb1bd5d0697e53a82ebcf8088e3186d5d2bf8498ee7ed44/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9699fa7990368b22032baf2b2dce1f634388e4ffc03dfefaaac79f4695edc95", size = 525319, upload-time = "2025-10-22T22:21:45.645Z" }, + { url = "https://files.pythonhosted.org/packages/40/cb/c6734774789566d46775f193964b76627cd5f42ecf246d257ce84d1912ed/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:b9b06fe1a75e05e0713f06ea0c89ecb6452210fd60e2f1b6ddc1067b990e08d9", size = 404896, upload-time = "2025-10-22T22:21:47.544Z" }, + { url = "https://files.pythonhosted.org/packages/1f/53/14e37ce83202c632c89b0691185dca9532288ff9d390eacae3d2ff771bae/rpds_py-0.28.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac9f83e7b326a3f9ec3ef84cda98fb0a74c7159f33e692032233046e7fd15da2", size = 382862, upload-time = "2025-10-22T22:21:49.176Z" }, + { url = "https://files.pythonhosted.org/packages/6a/83/f3642483ca971a54d60caa4449f9d6d4dbb56a53e0072d0deff51b38af74/rpds_py-0.28.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:0d3259ea9ad8743a75a43eb7819324cdab393263c91be86e2d1901ee65c314e0", size = 398848, upload-time = "2025-10-22T22:21:51.024Z" }, + { url = "https://files.pythonhosted.org/packages/44/09/2d9c8b2f88e399b4cfe86efdf2935feaf0394e4f14ab30c6c5945d60af7d/rpds_py-0.28.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a7548b345f66f6695943b4ef6afe33ccd3f1b638bd9afd0f730dd255c249c9e", size = 412030, upload-time = "2025-10-22T22:21:52.665Z" }, + { url = "https://files.pythonhosted.org/packages/dd/f5/e1cec473d4bde6df1fd3738be8e82d64dd0600868e76e92dfeaebbc2d18f/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c9a40040aa388b037eb39416710fbcce9443498d2eaab0b9b45ae988b53f5c67", size = 559700, upload-time = "2025-10-22T22:21:54.123Z" }, + { url = "https://files.pythonhosted.org/packages/8d/be/73bb241c1649edbf14e98e9e78899c2c5e52bbe47cb64811f44d2cc11808/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f60c7ea34e78c199acd0d3cda37a99be2c861dd2b8cf67399784f70c9f8e57d", size = 584581, upload-time = "2025-10-22T22:21:56.102Z" }, + { url = "https://files.pythonhosted.org/packages/9c/9c/ffc6e9218cd1eb5c2c7dbd276c87cd10e8c2232c456b554169eb363381df/rpds_py-0.28.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1571ae4292649100d743b26d5f9c63503bb1fedf538a8f29a98dce2d5ba6b4e6", size = 549981, upload-time = "2025-10-22T22:21:58.253Z" }, + { url = "https://files.pythonhosted.org/packages/5f/50/da8b6d33803a94df0149345ee33e5d91ed4d25fc6517de6a25587eae4133/rpds_py-0.28.0-cp311-cp311-win32.whl", hash = "sha256:5cfa9af45e7c1140af7321fa0bef25b386ee9faa8928c80dc3a5360971a29e8c", size = 214729, upload-time = "2025-10-22T22:21:59.625Z" }, + { url = "https://files.pythonhosted.org/packages/12/fd/b0f48c4c320ee24c8c20df8b44acffb7353991ddf688af01eef5f93d7018/rpds_py-0.28.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd8d86b5d29d1b74100982424ba53e56033dc47720a6de9ba0259cf81d7cecaa", size = 223977, upload-time = "2025-10-22T22:22:01.092Z" }, + { url = "https://files.pythonhosted.org/packages/b4/21/c8e77a2ac66e2ec4e21f18a04b4e9a0417ecf8e61b5eaeaa9360a91713b4/rpds_py-0.28.0-cp311-cp311-win_arm64.whl", hash = "sha256:4e27d3a5709cc2b3e013bf93679a849213c79ae0573f9b894b284b55e729e120", size = 217326, upload-time = "2025-10-22T22:22:02.944Z" }, + { url = "https://files.pythonhosted.org/packages/b8/5c/6c3936495003875fe7b14f90ea812841a08fca50ab26bd840e924097d9c8/rpds_py-0.28.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:6b4f28583a4f247ff60cd7bdda83db8c3f5b05a7a82ff20dd4b078571747708f", size = 366439, upload-time = "2025-10-22T22:22:04.525Z" }, + { url = "https://files.pythonhosted.org/packages/56/f9/a0f1ca194c50aa29895b442771f036a25b6c41a35e4f35b1a0ea713bedae/rpds_py-0.28.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d678e91b610c29c4b3d52a2c148b641df2b4676ffe47c59f6388d58b99cdc424", size = 348170, upload-time = 
"2025-10-22T22:22:06.397Z" }, + { url = "https://files.pythonhosted.org/packages/18/ea/42d243d3a586beb72c77fa5def0487daf827210069a95f36328e869599ea/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e819e0e37a44a78e1383bf1970076e2ccc4dc8c2bbaa2f9bd1dc987e9afff628", size = 378838, upload-time = "2025-10-22T22:22:07.932Z" }, + { url = "https://files.pythonhosted.org/packages/e7/78/3de32e18a94791af8f33601402d9d4f39613136398658412a4e0b3047327/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5ee514e0f0523db5d3fb171f397c54875dbbd69760a414dccf9d4d7ad628b5bd", size = 393299, upload-time = "2025-10-22T22:22:09.435Z" }, + { url = "https://files.pythonhosted.org/packages/13/7e/4bdb435afb18acea2eb8a25ad56b956f28de7c59f8a1d32827effa0d4514/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3fa06d27fdcee47f07a39e02862da0100cb4982508f5ead53ec533cd5fe55e", size = 518000, upload-time = "2025-10-22T22:22:11.326Z" }, + { url = "https://files.pythonhosted.org/packages/31/d0/5f52a656875cdc60498ab035a7a0ac8f399890cc1ee73ebd567bac4e39ae/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46959ef2e64f9e4a41fc89aa20dbca2b85531f9a72c21099a3360f35d10b0d5a", size = 408746, upload-time = "2025-10-22T22:22:13.143Z" }, + { url = "https://files.pythonhosted.org/packages/3e/cd/49ce51767b879cde77e7ad9fae164ea15dce3616fe591d9ea1df51152706/rpds_py-0.28.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8455933b4bcd6e83fde3fefc987a023389c4b13f9a58c8d23e4b3f6d13f78c84", size = 386379, upload-time = "2025-10-22T22:22:14.602Z" }, + { url = "https://files.pythonhosted.org/packages/6a/99/e4e1e1ee93a98f72fc450e36c0e4d99c35370220e815288e3ecd2ec36a2a/rpds_py-0.28.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ad50614a02c8c2962feebe6012b52f9802deec4263946cddea37aaf28dd25a66", size = 401280, upload-time = "2025-10-22T22:22:16.063Z" }, + { url = "https://files.pythonhosted.org/packages/61/35/e0c6a57488392a8b319d2200d03dad2b29c0db9996f5662c3b02d0b86c02/rpds_py-0.28.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e5deca01b271492553fdb6c7fd974659dce736a15bae5dad7ab8b93555bceb28", size = 412365, upload-time = "2025-10-22T22:22:17.504Z" }, + { url = "https://files.pythonhosted.org/packages/ff/6a/841337980ea253ec797eb084665436007a1aad0faac1ba097fb906c5f69c/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:735f8495a13159ce6a0d533f01e8674cec0c57038c920495f87dcb20b3ddb48a", size = 559573, upload-time = "2025-10-22T22:22:19.108Z" }, + { url = "https://files.pythonhosted.org/packages/e7/5e/64826ec58afd4c489731f8b00729c5f6afdb86f1df1df60bfede55d650bb/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:961ca621ff10d198bbe6ba4957decca61aa2a0c56695384c1d6b79bf61436df5", size = 583973, upload-time = "2025-10-22T22:22:20.768Z" }, + { url = "https://files.pythonhosted.org/packages/b6/ee/44d024b4843f8386a4eeaa4c171b3d31d55f7177c415545fd1a24c249b5d/rpds_py-0.28.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2374e16cc9131022e7d9a8f8d65d261d9ba55048c78f3b6e017971a4f5e6353c", size = 553800, upload-time = "2025-10-22T22:22:22.25Z" }, + { url = "https://files.pythonhosted.org/packages/7d/89/33e675dccff11a06d4d85dbb4d1865f878d5020cbb69b2c1e7b2d3f82562/rpds_py-0.28.0-cp312-cp312-win32.whl", hash = "sha256:d15431e334fba488b081d47f30f091e5d03c18527c325386091f31718952fe08", size = 216954, upload-time = 
"2025-10-22T22:22:24.105Z" }, + { url = "https://files.pythonhosted.org/packages/af/36/45f6ebb3210887e8ee6dbf1bc710ae8400bb417ce165aaf3024b8360d999/rpds_py-0.28.0-cp312-cp312-win_amd64.whl", hash = "sha256:a410542d61fc54710f750d3764380b53bf09e8c4edbf2f9141a82aa774a04f7c", size = 227844, upload-time = "2025-10-22T22:22:25.551Z" }, + { url = "https://files.pythonhosted.org/packages/57/91/f3fb250d7e73de71080f9a221d19bd6a1c1eb0d12a1ea26513f6c1052ad6/rpds_py-0.28.0-cp312-cp312-win_arm64.whl", hash = "sha256:1f0cfd1c69e2d14f8c892b893997fa9a60d890a0c8a603e88dca4955f26d1edd", size = 217624, upload-time = "2025-10-22T22:22:26.914Z" }, + { url = "https://files.pythonhosted.org/packages/d3/03/ce566d92611dfac0085c2f4b048cd53ed7c274a5c05974b882a908d540a2/rpds_py-0.28.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:e9e184408a0297086f880556b6168fa927d677716f83d3472ea333b42171ee3b", size = 366235, upload-time = "2025-10-22T22:22:28.397Z" }, + { url = "https://files.pythonhosted.org/packages/00/34/1c61da1b25592b86fd285bd7bd8422f4c9d748a7373b46126f9ae792a004/rpds_py-0.28.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:edd267266a9b0448f33dc465a97cfc5d467594b600fe28e7fa2f36450e03053a", size = 348241, upload-time = "2025-10-22T22:22:30.171Z" }, + { url = "https://files.pythonhosted.org/packages/fc/00/ed1e28616848c61c493a067779633ebf4b569eccaacf9ccbdc0e7cba2b9d/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:85beb8b3f45e4e32f6802fb6cd6b17f615ef6c6a52f265371fb916fae02814aa", size = 378079, upload-time = "2025-10-22T22:22:31.644Z" }, + { url = "https://files.pythonhosted.org/packages/11/b2/ccb30333a16a470091b6e50289adb4d3ec656fd9951ba8c5e3aaa0746a67/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d2412be8d00a1b895f8ad827cc2116455196e20ed994bb704bf138fe91a42724", size = 393151, upload-time = "2025-10-22T22:22:33.453Z" }, + { url = "https://files.pythonhosted.org/packages/8c/d0/73e2217c3ee486d555cb84920597480627d8c0240ff3062005c6cc47773e/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cf128350d384b777da0e68796afdcebc2e9f63f0e9f242217754e647f6d32491", size = 517520, upload-time = "2025-10-22T22:22:34.949Z" }, + { url = "https://files.pythonhosted.org/packages/c4/91/23efe81c700427d0841a4ae7ea23e305654381831e6029499fe80be8a071/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a2036d09b363aa36695d1cc1a97b36865597f4478470b0697b5ee9403f4fe399", size = 408699, upload-time = "2025-10-22T22:22:36.584Z" }, + { url = "https://files.pythonhosted.org/packages/ca/ee/a324d3198da151820a326c1f988caaa4f37fc27955148a76fff7a2d787a9/rpds_py-0.28.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8e1e9be4fa6305a16be628959188e4fd5cd6f1b0e724d63c6d8b2a8adf74ea6", size = 385720, upload-time = "2025-10-22T22:22:38.014Z" }, + { url = "https://files.pythonhosted.org/packages/19/ad/e68120dc05af8b7cab4a789fccd8cdcf0fe7e6581461038cc5c164cd97d2/rpds_py-0.28.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:0a403460c9dd91a7f23fc3188de6d8977f1d9603a351d5db6cf20aaea95b538d", size = 401096, upload-time = "2025-10-22T22:22:39.869Z" }, + { url = "https://files.pythonhosted.org/packages/99/90/c1e070620042459d60df6356b666bb1f62198a89d68881816a7ed121595a/rpds_py-0.28.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d7366b6553cdc805abcc512b849a519167db8f5e5c3472010cd1228b224265cb", size = 411465, upload-time = 
"2025-10-22T22:22:41.395Z" }, + { url = "https://files.pythonhosted.org/packages/68/61/7c195b30d57f1b8d5970f600efee72a4fad79ec829057972e13a0370fd24/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:5b43c6a3726efd50f18d8120ec0551241c38785b68952d240c45ea553912ac41", size = 558832, upload-time = "2025-10-22T22:22:42.871Z" }, + { url = "https://files.pythonhosted.org/packages/b0/3d/06f3a718864773f69941d4deccdf18e5e47dd298b4628062f004c10f3b34/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:0cb7203c7bc69d7c1585ebb33a2e6074492d2fc21ad28a7b9d40457ac2a51ab7", size = 583230, upload-time = "2025-10-22T22:22:44.877Z" }, + { url = "https://files.pythonhosted.org/packages/66/df/62fc783781a121e77fee9a21ead0a926f1b652280a33f5956a5e7833ed30/rpds_py-0.28.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:7a52a5169c664dfb495882adc75c304ae1d50df552fbd68e100fdc719dee4ff9", size = 553268, upload-time = "2025-10-22T22:22:46.441Z" }, + { url = "https://files.pythonhosted.org/packages/84/85/d34366e335140a4837902d3dea89b51f087bd6a63c993ebdff59e93ee61d/rpds_py-0.28.0-cp313-cp313-win32.whl", hash = "sha256:2e42456917b6687215b3e606ab46aa6bca040c77af7df9a08a6dcfe8a4d10ca5", size = 217100, upload-time = "2025-10-22T22:22:48.342Z" }, + { url = "https://files.pythonhosted.org/packages/3c/1c/f25a3f3752ad7601476e3eff395fe075e0f7813fbb9862bd67c82440e880/rpds_py-0.28.0-cp313-cp313-win_amd64.whl", hash = "sha256:e0a0311caedc8069d68fc2bf4c9019b58a2d5ce3cd7cb656c845f1615b577e1e", size = 227759, upload-time = "2025-10-22T22:22:50.219Z" }, + { url = "https://files.pythonhosted.org/packages/e0/d6/5f39b42b99615b5bc2f36ab90423ea404830bdfee1c706820943e9a645eb/rpds_py-0.28.0-cp313-cp313-win_arm64.whl", hash = "sha256:04c1b207ab8b581108801528d59ad80aa83bb170b35b0ddffb29c20e411acdc1", size = 217326, upload-time = "2025-10-22T22:22:51.647Z" }, + { url = "https://files.pythonhosted.org/packages/5c/8b/0c69b72d1cee20a63db534be0df271effe715ef6c744fdf1ff23bb2b0b1c/rpds_py-0.28.0-cp313-cp313t-macosx_10_12_x86_64.whl", hash = "sha256:f296ea3054e11fc58ad42e850e8b75c62d9a93a9f981ad04b2e5ae7d2186ff9c", size = 355736, upload-time = "2025-10-22T22:22:53.211Z" }, + { url = "https://files.pythonhosted.org/packages/f7/6d/0c2ee773cfb55c31a8514d2cece856dd299170a49babd50dcffb15ddc749/rpds_py-0.28.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:5a7306c19b19005ad98468fcefeb7100b19c79fc23a5f24a12e06d91181193fa", size = 342677, upload-time = "2025-10-22T22:22:54.723Z" }, + { url = "https://files.pythonhosted.org/packages/e2/1c/22513ab25a27ea205144414724743e305e8153e6abe81833b5e678650f5a/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5d9b86aa501fed9862a443c5c3116f6ead8bc9296185f369277c42542bd646b", size = 371847, upload-time = "2025-10-22T22:22:56.295Z" }, + { url = "https://files.pythonhosted.org/packages/60/07/68e6ccdb4b05115ffe61d31afc94adef1833d3a72f76c9632d4d90d67954/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e5bbc701eff140ba0e872691d573b3d5d30059ea26e5785acba9132d10c8c31d", size = 381800, upload-time = "2025-10-22T22:22:57.808Z" }, + { url = "https://files.pythonhosted.org/packages/73/bf/6d6d15df80781d7f9f368e7c1a00caf764436518c4877fb28b029c4624af/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9a5690671cd672a45aa8616d7374fdf334a1b9c04a0cac3c854b1136e92374fe", size = 518827, upload-time = "2025-10-22T22:22:59.826Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/d3/2decbb2976cc452cbf12a2b0aaac5f1b9dc5dd9d1f7e2509a3ee00421249/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9f1d92ecea4fa12f978a367c32a5375a1982834649cdb96539dcdc12e609ab1a", size = 399471, upload-time = "2025-10-22T22:23:01.968Z" }, + { url = "https://files.pythonhosted.org/packages/b1/2c/f30892f9e54bd02e5faca3f6a26d6933c51055e67d54818af90abed9748e/rpds_py-0.28.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8d252db6b1a78d0a3928b6190156042d54c93660ce4d98290d7b16b5296fb7cc", size = 377578, upload-time = "2025-10-22T22:23:03.52Z" }, + { url = "https://files.pythonhosted.org/packages/f0/5d/3bce97e5534157318f29ac06bf2d279dae2674ec12f7cb9c12739cee64d8/rpds_py-0.28.0-cp313-cp313t-manylinux_2_31_riscv64.whl", hash = "sha256:d61b355c3275acb825f8777d6c4505f42b5007e357af500939d4a35b19177259", size = 390482, upload-time = "2025-10-22T22:23:05.391Z" }, + { url = "https://files.pythonhosted.org/packages/e3/f0/886bd515ed457b5bd93b166175edb80a0b21a210c10e993392127f1e3931/rpds_py-0.28.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:acbe5e8b1026c0c580d0321c8aae4b0a1e1676861d48d6e8c6586625055b606a", size = 402447, upload-time = "2025-10-22T22:23:06.93Z" }, + { url = "https://files.pythonhosted.org/packages/42/b5/71e8777ac55e6af1f4f1c05b47542a1eaa6c33c1cf0d300dca6a1c6e159a/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:8aa23b6f0fc59b85b4c7d89ba2965af274346f738e8d9fc2455763602e62fd5f", size = 552385, upload-time = "2025-10-22T22:23:08.557Z" }, + { url = "https://files.pythonhosted.org/packages/5d/cb/6ca2d70cbda5a8e36605e7788c4aa3bea7c17d71d213465a5a675079b98d/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:7b14b0c680286958817c22d76fcbca4800ddacef6f678f3a7c79a1fe7067fe37", size = 575642, upload-time = "2025-10-22T22:23:10.348Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d4/407ad9960ca7856d7b25c96dcbe019270b5ffdd83a561787bc682c797086/rpds_py-0.28.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bcf1d210dfee61a6c86551d67ee1031899c0fdbae88b2d44a569995d43797712", size = 544507, upload-time = "2025-10-22T22:23:12.434Z" }, + { url = "https://files.pythonhosted.org/packages/51/31/2f46fe0efcac23fbf5797c6b6b7e1c76f7d60773e525cb65fcbc582ee0f2/rpds_py-0.28.0-cp313-cp313t-win32.whl", hash = "sha256:3aa4dc0fdab4a7029ac63959a3ccf4ed605fee048ba67ce89ca3168da34a1342", size = 205376, upload-time = "2025-10-22T22:23:13.979Z" }, + { url = "https://files.pythonhosted.org/packages/92/e4/15947bda33cbedfc134490a41841ab8870a72a867a03d4969d886f6594a2/rpds_py-0.28.0-cp313-cp313t-win_amd64.whl", hash = "sha256:7b7d9d83c942855e4fdcfa75d4f96f6b9e272d42fffcb72cd4bb2577db2e2907", size = 215907, upload-time = "2025-10-22T22:23:15.5Z" }, + { url = "https://files.pythonhosted.org/packages/08/47/ffe8cd7a6a02833b10623bf765fbb57ce977e9a4318ca0e8cf97e9c3d2b3/rpds_py-0.28.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:dcdcb890b3ada98a03f9f2bb108489cdc7580176cb73b4f2d789e9a1dac1d472", size = 353830, upload-time = "2025-10-22T22:23:17.03Z" }, + { url = "https://files.pythonhosted.org/packages/f9/9f/890f36cbd83a58491d0d91ae0db1702639edb33fb48eeb356f80ecc6b000/rpds_py-0.28.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:f274f56a926ba2dc02976ca5b11c32855cbd5925534e57cfe1fda64e04d1add2", size = 341819, upload-time = "2025-10-22T22:23:18.57Z" }, + { url = 
"https://files.pythonhosted.org/packages/09/e3/921eb109f682aa24fb76207698fbbcf9418738f35a40c21652c29053f23d/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fe0438ac4a29a520ea94c8c7f1754cdd8feb1bc490dfda1bfd990072363d527", size = 373127, upload-time = "2025-10-22T22:23:20.216Z" }, + { url = "https://files.pythonhosted.org/packages/23/13/bce4384d9f8f4989f1a9599c71b7a2d877462e5fd7175e1f69b398f729f4/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8a358a32dd3ae50e933347889b6af9a1bdf207ba5d1a3f34e1a38cd3540e6733", size = 382767, upload-time = "2025-10-22T22:23:21.787Z" }, + { url = "https://files.pythonhosted.org/packages/23/e1/579512b2d89a77c64ccef5a0bc46a6ef7f72ae0cf03d4b26dcd52e57ee0a/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e80848a71c78aa328fefaba9c244d588a342c8e03bda518447b624ea64d1ff56", size = 517585, upload-time = "2025-10-22T22:23:23.699Z" }, + { url = "https://files.pythonhosted.org/packages/62/3c/ca704b8d324a2591b0b0adcfcaadf9c862375b11f2f667ac03c61b4fd0a6/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f586db2e209d54fe177e58e0bc4946bea5fb0102f150b1b2f13de03e1f0976f8", size = 399828, upload-time = "2025-10-22T22:23:25.713Z" }, + { url = "https://files.pythonhosted.org/packages/da/37/e84283b9e897e3adc46b4c88bb3f6ec92a43bd4d2f7ef5b13459963b2e9c/rpds_py-0.28.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ae8ee156d6b586e4292491e885d41483136ab994e719a13458055bec14cf370", size = 375509, upload-time = "2025-10-22T22:23:27.32Z" }, + { url = "https://files.pythonhosted.org/packages/1a/c2/a980beab869d86258bf76ec42dec778ba98151f253a952b02fe36d72b29c/rpds_py-0.28.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:a805e9b3973f7e27f7cab63a6b4f61d90f2e5557cff73b6e97cd5b8540276d3d", size = 392014, upload-time = "2025-10-22T22:23:29.332Z" }, + { url = "https://files.pythonhosted.org/packages/da/b5/b1d3c5f9d3fa5aeef74265f9c64de3c34a0d6d5cd3c81c8b17d5c8f10ed4/rpds_py-0.28.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5d3fd16b6dc89c73a4da0b4ac8b12a7ecc75b2864b95c9e5afed8003cb50a728", size = 402410, upload-time = "2025-10-22T22:23:31.14Z" }, + { url = "https://files.pythonhosted.org/packages/74/ae/cab05ff08dfcc052afc73dcb38cbc765ffc86f94e966f3924cd17492293c/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6796079e5d24fdaba6d49bda28e2c47347e89834678f2bc2c1b4fc1489c0fb01", size = 553593, upload-time = "2025-10-22T22:23:32.834Z" }, + { url = "https://files.pythonhosted.org/packages/70/80/50d5706ea2a9bfc9e9c5f401d91879e7c790c619969369800cde202da214/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:76500820c2af232435cbe215e3324c75b950a027134e044423f59f5b9a1ba515", size = 576925, upload-time = "2025-10-22T22:23:34.47Z" }, + { url = "https://files.pythonhosted.org/packages/ab/12/85a57d7a5855a3b188d024b099fd09c90db55d32a03626d0ed16352413ff/rpds_py-0.28.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:bbdc5640900a7dbf9dd707fe6388972f5bbd883633eb68b76591044cfe346f7e", size = 542444, upload-time = "2025-10-22T22:23:36.093Z" }, + { url = "https://files.pythonhosted.org/packages/6c/65/10643fb50179509150eb94d558e8837c57ca8b9adc04bd07b98e57b48f8c/rpds_py-0.28.0-cp314-cp314-win32.whl", hash = "sha256:adc8aa88486857d2b35d75f0640b949759f79dc105f50aa2c27816b2e0dd749f", size = 207968, upload-time = "2025-10-22T22:23:37.638Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/84/0c11fe4d9aaea784ff4652499e365963222481ac647bcd0251c88af646eb/rpds_py-0.28.0-cp314-cp314-win_amd64.whl", hash = "sha256:66e6fa8e075b58946e76a78e69e1a124a21d9a48a5b4766d15ba5b06869d1fa1", size = 218876, upload-time = "2025-10-22T22:23:39.179Z" }, + { url = "https://files.pythonhosted.org/packages/0f/e0/3ab3b86ded7bb18478392dc3e835f7b754cd446f62f3fc96f4fe2aca78f6/rpds_py-0.28.0-cp314-cp314-win_arm64.whl", hash = "sha256:a6fe887c2c5c59413353b7c0caff25d0e566623501ccfff88957fa438a69377d", size = 212506, upload-time = "2025-10-22T22:23:40.755Z" }, + { url = "https://files.pythonhosted.org/packages/51/ec/d5681bb425226c3501eab50fc30e9d275de20c131869322c8a1729c7b61c/rpds_py-0.28.0-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:7a69df082db13c7070f7b8b1f155fa9e687f1d6aefb7b0e3f7231653b79a067b", size = 355433, upload-time = "2025-10-22T22:23:42.259Z" }, + { url = "https://files.pythonhosted.org/packages/be/ec/568c5e689e1cfb1ea8b875cffea3649260955f677fdd7ddc6176902d04cd/rpds_py-0.28.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b1cde22f2c30ebb049a9e74c5374994157b9b70a16147d332f89c99c5960737a", size = 342601, upload-time = "2025-10-22T22:23:44.372Z" }, + { url = "https://files.pythonhosted.org/packages/32/fe/51ada84d1d2a1d9d8f2c902cfddd0133b4a5eb543196ab5161d1c07ed2ad/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5338742f6ba7a51012ea470bd4dc600a8c713c0c72adaa0977a1b1f4327d6592", size = 372039, upload-time = "2025-10-22T22:23:46.025Z" }, + { url = "https://files.pythonhosted.org/packages/07/c1/60144a2f2620abade1a78e0d91b298ac2d9b91bc08864493fa00451ef06e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1460ebde1bcf6d496d80b191d854adedcc619f84ff17dc1c6d550f58c9efbba", size = 382407, upload-time = "2025-10-22T22:23:48.098Z" }, + { url = "https://files.pythonhosted.org/packages/45/ed/091a7bbdcf4038a60a461df50bc4c82a7ed6d5d5e27649aab61771c17585/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e3eb248f2feba84c692579257a043a7699e28a77d86c77b032c1d9fbb3f0219c", size = 518172, upload-time = "2025-10-22T22:23:50.16Z" }, + { url = "https://files.pythonhosted.org/packages/54/dd/02cc90c2fd9c2ef8016fd7813bfacd1c3a1325633ec8f244c47b449fc868/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3bbba5def70b16cd1c1d7255666aad3b290fbf8d0fe7f9f91abafb73611a91", size = 399020, upload-time = "2025-10-22T22:23:51.81Z" }, + { url = "https://files.pythonhosted.org/packages/ab/81/5d98cc0329bbb911ccecd0b9e19fbf7f3a5de8094b4cda5e71013b2dd77e/rpds_py-0.28.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3114f4db69ac5a1f32e7e4d1cbbe7c8f9cf8217f78e6e002cedf2d54c2a548ed", size = 377451, upload-time = "2025-10-22T22:23:53.711Z" }, + { url = "https://files.pythonhosted.org/packages/b4/07/4d5bcd49e3dfed2d38e2dcb49ab6615f2ceb9f89f5a372c46dbdebb4e028/rpds_py-0.28.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4b0cb8a906b1a0196b863d460c0222fb8ad0f34041568da5620f9799b83ccf0b", size = 390355, upload-time = "2025-10-22T22:23:55.299Z" }, + { url = "https://files.pythonhosted.org/packages/3f/79/9f14ba9010fee74e4f40bf578735cfcbb91d2e642ffd1abe429bb0b96364/rpds_py-0.28.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cf681ac76a60b667106141e11a92a3330890257e6f559ca995fbb5265160b56e", size = 403146, upload-time = "2025-10-22T22:23:56.929Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/4c/f08283a82ac141331a83a40652830edd3a4a92c34e07e2bbe00baaea2f5f/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:1e8ee6413cfc677ce8898d9cde18cc3a60fc2ba756b0dec5b71eb6eb21c49fa1", size = 552656, upload-time = "2025-10-22T22:23:58.62Z" }, + { url = "https://files.pythonhosted.org/packages/61/47/d922fc0666f0dd8e40c33990d055f4cc6ecff6f502c2d01569dbed830f9b/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:b3072b16904d0b5572a15eb9d31c1954e0d3227a585fc1351aa9878729099d6c", size = 576782, upload-time = "2025-10-22T22:24:00.312Z" }, + { url = "https://files.pythonhosted.org/packages/d3/0c/5bafdd8ccf6aa9d3bfc630cfece457ff5b581af24f46a9f3590f790e3df2/rpds_py-0.28.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b670c30fd87a6aec281c3c9896d3bae4b205fd75d79d06dc87c2503717e46092", size = 544671, upload-time = "2025-10-22T22:24:02.297Z" }, + { url = "https://files.pythonhosted.org/packages/2c/37/dcc5d8397caa924988693519069d0beea077a866128719351a4ad95e82fc/rpds_py-0.28.0-cp314-cp314t-win32.whl", hash = "sha256:8014045a15b4d2b3476f0a287fcc93d4f823472d7d1308d47884ecac9e612be3", size = 205749, upload-time = "2025-10-22T22:24:03.848Z" }, + { url = "https://files.pythonhosted.org/packages/d7/69/64d43b21a10d72b45939a28961216baeb721cc2a430f5f7c3bfa21659a53/rpds_py-0.28.0-cp314-cp314t-win_amd64.whl", hash = "sha256:7a4e59c90d9c27c561eb3160323634a9ff50b04e4f7820600a2beb0ac90db578", size = 216233, upload-time = "2025-10-22T22:24:05.471Z" }, + { url = "https://files.pythonhosted.org/packages/ae/bc/b43f2ea505f28119bd551ae75f70be0c803d2dbcd37c1b3734909e40620b/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f5e7101145427087e493b9c9b959da68d357c28c562792300dd21a095118ed16", size = 363913, upload-time = "2025-10-22T22:24:07.129Z" }, + { url = "https://files.pythonhosted.org/packages/28/f2/db318195d324c89a2c57dc5195058cbadd71b20d220685c5bd1da79ee7fe/rpds_py-0.28.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:31eb671150b9c62409a888850aaa8e6533635704fe2b78335f9aaf7ff81eec4d", size = 350452, upload-time = "2025-10-22T22:24:08.754Z" }, + { url = "https://files.pythonhosted.org/packages/ae/f2/1391c819b8573a4898cedd6b6c5ec5bc370ce59e5d6bdcebe3c9c1db4588/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48b55c1f64482f7d8bd39942f376bfdf2f6aec637ee8c805b5041e14eeb771db", size = 380957, upload-time = "2025-10-22T22:24:10.826Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5c/e5de68ee7eb7248fce93269833d1b329a196d736aefb1a7481d1e99d1222/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:24743a7b372e9a76171f6b69c01aedf927e8ac3e16c474d9fe20d552a8cb45c7", size = 391919, upload-time = "2025-10-22T22:24:12.559Z" }, + { url = "https://files.pythonhosted.org/packages/fb/4f/2376336112cbfeb122fd435d608ad8d5041b3aed176f85a3cb32c262eb80/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:389c29045ee8bbb1627ea190b4976a310a295559eaf9f1464a1a6f2bf84dde78", size = 528541, upload-time = "2025-10-22T22:24:14.197Z" }, + { url = "https://files.pythonhosted.org/packages/68/53/5ae232e795853dd20da7225c5dd13a09c0a905b1a655e92bdf8d78a99fd9/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23690b5827e643150cf7b49569679ec13fe9a610a15949ed48b85eb7f98f34ec", size = 405629, upload-time = "2025-10-22T22:24:16.001Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/2d/351a3b852b683ca9b6b8b38ed9efb2347596973849ba6c3a0e99877c10aa/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6f0c9266c26580e7243ad0d72fc3e01d6b33866cfab5084a6da7576bcf1c4f72", size = 384123, upload-time = "2025-10-22T22:24:17.585Z" }, + { url = "https://files.pythonhosted.org/packages/e0/15/870804daa00202728cc91cb8e2385fa9f1f4eb49857c49cfce89e304eae6/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:4c6c4db5d73d179746951486df97fd25e92396be07fc29ee8ff9a8f5afbdfb27", size = 400923, upload-time = "2025-10-22T22:24:19.512Z" }, + { url = "https://files.pythonhosted.org/packages/53/25/3706b83c125fa2a0bccceac951de3f76631f6bd0ee4d02a0ed780712ef1b/rpds_py-0.28.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:a3b695a8fa799dd2cfdb4804b37096c5f6dba1ac7f48a7fbf6d0485bcd060316", size = 413767, upload-time = "2025-10-22T22:24:21.316Z" }, + { url = "https://files.pythonhosted.org/packages/ef/f9/ce43dbe62767432273ed2584cef71fef8411bddfb64125d4c19128015018/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:6aa1bfce3f83baf00d9c5fcdbba93a3ab79958b4c7d7d1f55e7fe68c20e63912", size = 561530, upload-time = "2025-10-22T22:24:22.958Z" }, + { url = "https://files.pythonhosted.org/packages/46/c9/ffe77999ed8f81e30713dd38fd9ecaa161f28ec48bb80fa1cd9118399c27/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:7b0f9dceb221792b3ee6acb5438eb1f02b0cb2c247796a72b016dcc92c6de829", size = 585453, upload-time = "2025-10-22T22:24:24.779Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d2/4a73b18821fd4669762c855fd1f4e80ceb66fb72d71162d14da58444a763/rpds_py-0.28.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5d0145edba8abd3db0ab22b5300c99dc152f5c9021fab861be0f0544dc3cbc5f", size = 552199, upload-time = "2025-10-22T22:24:26.54Z" }, +] + [[package]] name = "safetensors" version = "0.6.2" @@ -3454,6 +3629,9 @@ wheels = [ name = "tree-sitter" version = "0.23.2" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] sdist = { url = "https://files.pythonhosted.org/packages/0f/50/fd5fafa42b884f741b28d9e6fd366c3f34e15d2ed3aa9633b34e388379e2/tree-sitter-0.23.2.tar.gz", hash = "sha256:66bae8dd47f1fed7bdef816115146d3a41c39b5c482d7bad36d9ba1def088450", size = 166800, upload-time = "2024-10-24T15:31:02.238Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/91/04/2068a7b725265ecfcbf63ecdae038f1d4124ebccd55b8a7ce145b70e2b6a/tree_sitter-0.23.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3a937f5d8727bc1c74c4bf2a9d1c25ace049e8628273016ad0d45914ae904e10", size = 139289, upload-time = "2024-10-24T15:29:59.27Z" }, @@ -3498,19 +3676,90 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/b5/9eaf794fc71490573ab14a366affca415bc1ddbf86a14d78e54583db4254/tree_sitter-0.23.2-cp39-cp39-win_arm64.whl", hash = "sha256:b848e0fdd522fbb8888cdb4f4d93f8fad97ae10d70c122fb922e51363c7febcd", size = 102787, upload-time = "2024-10-24T15:31:01.084Z" }, ] +[[package]] +name = "tree-sitter" +version = "0.25.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = 
"sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e2/d4/f7ffb855cb039b7568aba4911fbe42e4c39c0e4398387c8e0d8251489992/tree_sitter-0.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72a510931c3c25f134aac2daf4eb4feca99ffe37a35896d7150e50ac3eee06c7", size = 146749, upload-time = "2025-09-25T17:37:16.475Z" }, + { url = "https://files.pythonhosted.org/packages/9a/58/f8a107f9f89700c0ab2930f1315e63bdedccbb5fd1b10fcbc5ebadd54ac8/tree_sitter-0.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:44488e0e78146f87baaa009736886516779253d6d6bac3ef636ede72bc6a8234", size = 137766, upload-time = "2025-09-25T17:37:18.138Z" }, + { url = "https://files.pythonhosted.org/packages/19/fb/357158d39f01699faea466e8fd5a849f5a30252c68414bddc20357a9ac79/tree_sitter-0.25.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2f8e7d6b2f8489d4a9885e3adcaef4bc5ff0a275acd990f120e29c4ab3395c5", size = 599809, upload-time = "2025-09-25T17:37:19.169Z" }, + { url = "https://files.pythonhosted.org/packages/c5/a4/68ae301626f2393a62119481cb660eb93504a524fc741a6f1528a4568cf6/tree_sitter-0.25.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20b570690f87f1da424cd690e51cc56728d21d63f4abd4b326d382a30353acc7", size = 627676, upload-time = "2025-09-25T17:37:20.715Z" }, + { url = "https://files.pythonhosted.org/packages/69/fe/4c1bef37db5ca8b17ca0b3070f2dff509468a50b3af18f17665adcab42b9/tree_sitter-0.25.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0ec41b895da717bc218a42a3a7a0bfcfe9a213d7afaa4255353901e0e21f696", size = 624281, upload-time = "2025-09-25T17:37:21.823Z" }, + { url = "https://files.pythonhosted.org/packages/d4/30/3283cb7fa251cae2a0bf8661658021a789810db3ab1b0569482d4a3671fd/tree_sitter-0.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:7712335855b2307a21ae86efe949c76be36c6068d76df34faa27ce9ee40ff444", size = 127295, upload-time = "2025-09-25T17:37:22.977Z" }, + { url = "https://files.pythonhosted.org/packages/88/90/ceb05e6de281aebe82b68662890619580d4ffe09283ebd2ceabcf5df7b4a/tree_sitter-0.25.2-cp310-cp310-win_arm64.whl", hash = "sha256:a925364eb7fbb9cdce55a9868f7525a1905af512a559303bd54ef468fd88cb37", size = 113991, upload-time = "2025-09-25T17:37:23.854Z" }, + { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, + { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, + { url = "https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, + { url = 
"https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, + { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, + { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, + { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, + { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, + { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, + { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, + { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, + { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, + { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = "sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, + { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = 
"2025-09-25T17:37:42.074Z" }, + { url = "https://files.pythonhosted.org/packages/8c/67/67492014ce32729b63d7ef318a19f9cfedd855d677de5773476caf771e96/tree_sitter-0.25.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0628671f0de69bb279558ef6b640bcfc97864fe0026d840f872728a86cd6b6cd", size = 146926, upload-time = "2025-09-25T17:37:43.041Z" }, + { url = "https://files.pythonhosted.org/packages/4e/9c/a278b15e6b263e86c5e301c82a60923fa7c59d44f78d7a110a89a413e640/tree_sitter-0.25.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f5ddcd3e291a749b62521f71fc953f66f5fd9743973fd6dd962b092773569601", size = 137712, upload-time = "2025-09-25T17:37:44.039Z" }, + { url = "https://files.pythonhosted.org/packages/54/9a/423bba15d2bf6473ba67846ba5244b988cd97a4b1ea2b146822162256794/tree_sitter-0.25.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd88fbb0f6c3a0f28f0a68d72df88e9755cf5215bae146f5a1bdc8362b772053", size = 607873, upload-time = "2025-09-25T17:37:45.477Z" }, + { url = "https://files.pythonhosted.org/packages/ed/4c/b430d2cb43f8badfb3a3fa9d6cd7c8247698187b5674008c9d67b2a90c8e/tree_sitter-0.25.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b878e296e63661c8e124177cc3084b041ba3f5936b43076d57c487822426f614", size = 636313, upload-time = "2025-09-25T17:37:46.68Z" }, + { url = "https://files.pythonhosted.org/packages/9d/27/5f97098dbba807331d666a0997662e82d066e84b17d92efab575d283822f/tree_sitter-0.25.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d77605e0d353ba3fe5627e5490f0fbfe44141bafa4478d88ef7954a61a848dae", size = 631370, upload-time = "2025-09-25T17:37:47.993Z" }, + { url = "https://files.pythonhosted.org/packages/d4/3c/87caaed663fabc35e18dc704cd0e9800a0ee2f22bd18b9cbe7c10799895d/tree_sitter-0.25.2-cp313-cp313-win_amd64.whl", hash = "sha256:463c032bd02052d934daa5f45d183e0521ceb783c2548501cf034b0beba92c9b", size = 127157, upload-time = "2025-09-25T17:37:48.967Z" }, + { url = "https://files.pythonhosted.org/packages/d5/23/f8467b408b7988aff4ea40946a4bd1a2c1a73d17156a9d039bbaff1e2ceb/tree_sitter-0.25.2-cp313-cp313-win_arm64.whl", hash = "sha256:b3f63a1796886249bd22c559a5944d64d05d43f2be72961624278eff0dcc5cb8", size = 113975, upload-time = "2025-09-25T17:37:49.922Z" }, + { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, + { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, + { url = "https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, + { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, + { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, + { url = "https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, + { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, +] + +[[package]] +name = "tree-sitter-c" +version = "0.23.6" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] +sdist = { url = "https://files.pythonhosted.org/packages/27/27/5218b7aadabcf8020d06a3b13f8f87dd0e6e958f43d9839847e3f12b02c7/tree_sitter_c-0.23.6.tar.gz", hash = "sha256:1d3b4a6ca8ebc7b0727857cc63a874118e0c04d353a4909b5c104e913fd69864", size = 221969, upload-time = "2025-05-24T16:05:16.753Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/2e/ba7d982c1b3c8a01e4b106cd9c8c292445366c77cb0fd9da598558d6b2a3/tree_sitter_c-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:0b46335c2603b86c75e7fc587e29c9299cf06e9634ce1a69ac1e928dfe568af2", size = 80847, upload-time = "2025-05-24T16:05:09.665Z" }, + { url = "https://files.pythonhosted.org/packages/57/ac/08081eb00119e528127a5a67008383e4730d099560f0f6e66f6e539710e2/tree_sitter_c-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffc36baf02f46744df354e4a00eab78d1034db480e649554c625ba79ee4b6b9c", size = 86208, upload-time = "2025-05-24T16:05:10.943Z" }, + { url = "https://files.pythonhosted.org/packages/2c/cb/98f0165f4cbdc6df35625358a9958176221bb098d38f58c25f5c6a04f9e5/tree_sitter_c-0.23.6-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96ef21fffd7135839010b37066d6653101ff74fa8961468ffbb0bcf3ae22d61", size = 109935, upload-time = "2025-05-24T16:05:12.126Z" }, + { url = "https://files.pythonhosted.org/packages/b6/eb/1bfae083aa5e6b04e36de75f55491eaa495e84a0d06a87257cbb7c404a08/tree_sitter_c-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfa9044039460632ef333afd6e907fdc67a657890afe49c8592bd223de059712", size = 98063, upload-time = "2025-05-24T16:05:12.975Z" }, + { url = "https://files.pythonhosted.org/packages/be/1f/85d34bbedb09bacb21c3861bbb26129420f26af289972906b75277150dea/tree_sitter_c-0.23.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a25cc5f275109f59dd6d5e636355ff038e46fc1048404519b591935a2b5c96d3", size = 94072, upload-time = "2025-05-24T16:05:13.814Z" }, + { url = "https://files.pythonhosted.org/packages/e6/35/c78cbe4ac9426f2208bacf20a6de9c262af8b9e8d379a6249c6876916978/tree_sitter_c-0.23.6-cp39-abi3-win_amd64.whl", hash = "sha256:1fccc265a0fe1b09874321c20046b297b1513e2cef1af7e17ac53b9b5cf6878e", size = 84626, upload-time = "2025-05-24T16:05:14.65Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/53/d0f910b86e9d270e76d45f7accabd9efe96448e89c9f713ca2501a8876bf/tree_sitter_c-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:ac92b69880d9844c89253a352937dada56e3647fbb8d5acb33f820eeb7763fd7", size = 82655, upload-time = "2025-05-24T16:05:15.894Z" }, +] + [[package]] name = "tree-sitter-c" -version = "0.23.4" +version = "0.24.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/27/27/254ebffa4066b3073dddee00c1915893794f5cbf938335c1cc926cd32385/tree_sitter_c-0.23.4.tar.gz", hash = "sha256:9215c7888dd019038f162ea5646178f6e129cd2b49fc506d14becf5e426121d7", size = 223089, upload-time = "2024-12-15T22:24:42.833Z" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/84/a9/41e5177fd9309bf142d6772f6885e6a93baa0ad40f17c7a4144ba1275c9c/tree_sitter_c-0.23.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2c92c0571b36b6da06f8882f34151dc11e67a493e9101cc0026a16da27709c05", size = 80812, upload-time = "2024-12-15T22:24:26.318Z" }, - { url = "https://files.pythonhosted.org/packages/90/99/cf0a3a8a661fffc7f6843cafbbc1887c47e1a79f751cf9c88002008c8eae/tree_sitter_c-0.23.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:98c285a23bf4fb6fb34140d6ea0f0d25d0a93e0d93692f9dffe3db6d1fe08534", size = 85813, upload-time = "2024-12-15T22:24:28.438Z" }, - { url = "https://files.pythonhosted.org/packages/01/c1/d346a08e05223bff3cea08a8f96d685d19bc2c022fde719bfd3e9f6aaaac/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e42a3519825ca59c91b2b7aec08dd3c89e02690c7b315d54a1e1743f9be3f15", size = 110085, upload-time = "2024-12-15T22:24:30.823Z" }, - { url = "https://files.pythonhosted.org/packages/a8/88/b7d395038b109d42a4682b9f3d72f8e02de8f7c7caf9ad2b289991f1ac19/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c15c7588c3d95872328019073a8d5eaf7c2691b4d4ef0393a0168399b2ad2356", size = 98075, upload-time = "2024-12-15T22:24:32.946Z" }, - { url = "https://files.pythonhosted.org/packages/e8/12/754a8166d3860cdd614bf7d117c94a740ce1ab1ab2ba766321249909e7b1/tree_sitter_c-0.23.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:013403e74765d74e523f380f9df8f3d99e9fe94132a3fc0c8b29cba538a7b2bf", size = 94071, upload-time = "2024-12-15T22:24:34.974Z" }, - { url = "https://files.pythonhosted.org/packages/14/da/2f97b96f081d6ac9b37c87c9d8e5c0ff5948802562ae28b1a58afd8dec1d/tree_sitter_c-0.23.4-cp39-abi3-win_amd64.whl", hash = "sha256:a4d7bdeaca8f1da72352a945853f56aa5d34e7bc22569ec5bda5d7c1a04e5b0f", size = 84483, upload-time = "2024-12-15T22:24:37.052Z" }, - { url = "https://files.pythonhosted.org/packages/d9/33/0d3b72634e2f34e64b07aaf100207cf3d01e32d814e72e144af0a0e785ad/tree_sitter_c-0.23.4-cp39-abi3-win_arm64.whl", hash = "sha256:edd36e12cc79b8b5bbc81fc336ff7d2577d0fe16afd18163c9aff7ae3ff69e15", size = 82482, upload-time = "2024-12-15T22:24:40.758Z" }, + { url = 
"https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, + { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, + { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, + { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, + { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, + { url = "https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, + { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, ] [[package]] @@ -3532,6 +3781,9 @@ wheels = [ name = "tree-sitter-javascript" version = "0.23.1" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] sdist = { url = "https://files.pythonhosted.org/packages/cd/dc/1c55c33cc6bbe754359b330534cf9f261c1b9b2c26ddf23aef3c5fa67759/tree_sitter_javascript-0.23.1.tar.gz", hash = "sha256:b2059ce8b150162cda05a457ca3920450adbf915119c04b8c67b5241cd7fcfed", size = 110058, upload-time = "2024-11-10T05:40:42.357Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/20/d3/c67d7d49967344b51208ad19f105233be1afdf07d3dcb35b471900265227/tree_sitter_javascript-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6ca583dad4bd79d3053c310b9f7208cd597fd85f9947e4ab2294658bb5c11e35", size = 59333, upload-time = "2024-11-10T05:40:31.988Z" }, @@ -3543,10 +3795,34 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/79/ceb21988e6de615355a63eebcf806cd2a0fe875bec27b429d58b63e7fb5f/tree_sitter_javascript-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:eb28130cd2fb30d702d614cbf61ef44d1c7f6869e7d864a9cc17111e370be8f7", size = 57027, upload-time = "2024-11-10T05:40:40.841Z" }, ] +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = { registry = 
"https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, + { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, + { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, + { url = "https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, + { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = "2025-09-01T07:13:40.866Z" }, + { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, + { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time = "2025-09-01T07:13:42.828Z" }, + { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, +] + [[package]] name = "tree-sitter-python" version = "0.23.6" source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.10'", +] sdist = { url = "https://files.pythonhosted.org/packages/1c/30/6766433b31be476fda6569a3a374c2220e45ffee0bff75460038a57bf23b/tree_sitter_python-0.23.6.tar.gz", hash = 
"sha256:354bfa0a2f9217431764a631516f85173e9711af2c13dbd796a8815acfe505d9", size = 155868, upload-time = "2024-12-22T23:09:55.918Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ab/67/577a02acae5f776007c924ca86ef14c19c12e71de0aa9d2a036f3c248e7b/tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb", size = 74361, upload-time = "2024-12-22T23:09:42.37Z" }, @@ -3558,6 +3834,27 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/cb/ce35a65f83a47b510d8a2f1eddf3bdbb0d57aabc87351c8788caf3309f76/tree_sitter_python-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:71334371bd73d5fe080aed39fbff49ed8efb9506edebe16795b0c7567ed6a272", size = 73649, upload-time = "2024-12-22T23:09:53.71Z" }, ] +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12'", + "python_full_version == '3.11.*'", + "python_full_version == '3.10.*'", +] +sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = "sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, + { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, + { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, + { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, + { url = "https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, + { url = "https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, + { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", 
hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, + { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, +] + [[package]] name = "tree-sitter-typescript" version = "0.23.2" @@ -3596,7 +3893,7 @@ wheels = [ [[package]] name = "typer" -version = "0.16.1" +version = "0.19.2" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, @@ -3605,9 +3902,9 @@ dependencies = [ { name = "shellingham" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/43/78/d90f616bf5f88f8710ad067c1f8705bf7618059836ca084e5bb2a0855d75/typer-0.16.1.tar.gz", hash = "sha256:d358c65a464a7a90f338e3bb7ff0c74ac081449e53884b12ba658cbd72990614", size = 102836, upload-time = "2025-08-18T19:18:22.898Z" } +sdist = { url = "https://files.pythonhosted.org/packages/21/ca/950278884e2ca20547ff3eb109478c6baf6b8cf219318e6bc4f666fad8e8/typer-0.19.2.tar.gz", hash = "sha256:9ad824308ded0ad06cc716434705f691d4ee0bfd0fb081839d2e426860e7fdca", size = 104755, upload-time = "2025-09-23T09:47:48.256Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/2d/76/06dbe78f39b2203d2a47d5facc5df5102d0561e2807396471b5f7c5a30a1/typer-0.16.1-py3-none-any.whl", hash = "sha256:90ee01cb02d9b8395ae21ee3368421faf21fa138cb2a541ed369c08cec5237c9", size = 46397, upload-time = "2025-08-18T19:18:21.663Z" }, + { url = "https://files.pythonhosted.org/packages/00/22/35617eee79080a5d071d0f14ad698d325ee6b3bf824fc0467c03b30e7fa8/typer-0.19.2-py3-none-any.whl", hash = "sha256:755e7e19670ffad8283db353267cb81ef252f595aa6834a0d1ca9312d9326cb9", size = 46748, upload-time = "2025-09-23T09:47:46.777Z" }, ] [[package]] From b417cae8f5ece6cd5efce9fae207313092533435 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Fri, 24 Oct 2025 14:01:37 -0400 Subject: [PATCH 11/12] revert to stricter treesitter versioning due to compatibility Signed-off-by: Bridget McGinn --- pyproject.toml | 12 ++-- uv.lock | 164 ++++++------------------------------------------- 2 files changed, 25 insertions(+), 151 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d8561f8e..2be66c63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,12 +50,12 @@ dependencies = [ 'typing-extensions (>=4.12.2,<5.0.0)', 'typer (>=0.12.5,<0.20.0)', 'latex2mathml (>=3.77.0,<4.0.0)', - "tree-sitter (>=0.23.2,<1.0.0)", - "tree-sitter-python (>=0.23.6,<1.0.0)", - "tree-sitter-c (>=0.23.4,<1.0.0)", - "tree-sitter-java (>=0.23.5,<1.0.0)", - "tree-sitter-javascript (>=0.23.1,<1.0.0)", - "tree-sitter-typescript (>=0.23.2,<1.0.0)", + "tree-sitter (>=0.23.2,<0.24)", + "tree-sitter-python (>=0.23.6,<0.24)", + "tree-sitter-c (==0.23.4)", + "tree-sitter-java (>=0.23.5,<0.24)", + "tree-sitter-javascript (>=0.23.1,<0.24)", + "tree-sitter-typescript (>=0.23.2,<0.24)", ] [project.urls] diff --git a/uv.lock b/uv.lock index 1532f304..f3566990 100644 --- a/uv.lock +++ b/uv.lock @@ -737,15 +737,11 @@ dependencies = [ { name = "pydantic" }, { name = "pyyaml" }, { name = "tabulate" }, - { name = "tree-sitter", version = "0.23.2", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version < '3.10'" }, - { name = "tree-sitter", version = "0.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "tree-sitter-c", version = "0.23.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "tree-sitter-c", version = "0.24.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tree-sitter" }, + { name = "tree-sitter-c" }, { name = "tree-sitter-java" }, - { name = "tree-sitter-javascript", version = "0.23.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "tree-sitter-javascript", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "tree-sitter-python", version = "0.23.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "tree-sitter-python", version = "0.25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "tree-sitter-javascript" }, + { name = "tree-sitter-python" }, { name = "tree-sitter-typescript" }, { name = "typer" }, { name = "typing-extensions" }, @@ -798,12 +794,12 @@ requires-dist = [ { name = "tabulate", specifier = ">=0.9.0,<0.10.0" }, { name = "tiktoken", marker = "extra == 'chunking-openai'", specifier = ">=0.9.0,<0.13.0" }, { name = "transformers", marker = "extra == 'chunking'", specifier = ">=4.34.0,<5.0.0" }, - { name = "tree-sitter", specifier = ">=0.23.2,<1.0.0" }, - { name = "tree-sitter-c", specifier = ">=0.23.4,<1.0.0" }, - { name = "tree-sitter-java", specifier = ">=0.23.5,<1.0.0" }, - { name = "tree-sitter-javascript", specifier = ">=0.23.1,<1.0.0" }, - { name = "tree-sitter-python", specifier = ">=0.23.6,<1.0.0" }, - { name = "tree-sitter-typescript", specifier = ">=0.23.2,<1.0.0" }, + { name = "tree-sitter", specifier = ">=0.23.2,<0.24" }, + { name = "tree-sitter-c", specifier = "==0.23.4" }, + { name = "tree-sitter-java", specifier = ">=0.23.5,<0.24" }, + { name = "tree-sitter-javascript", specifier = ">=0.23.1,<0.24" }, + { name = "tree-sitter-python", specifier = ">=0.23.6,<0.24" }, + { name = "tree-sitter-typescript", specifier = ">=0.23.2,<0.24" }, { name = "typer", specifier = ">=0.12.5,<0.20.0" }, { name = "typing-extensions", specifier = ">=4.12.2,<5.0.0" }, ] @@ -3629,9 +3625,6 @@ wheels = [ name = "tree-sitter" version = "0.23.2" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] sdist = { url = "https://files.pythonhosted.org/packages/0f/50/fd5fafa42b884f741b28d9e6fd366c3f34e15d2ed3aa9633b34e388379e2/tree-sitter-0.23.2.tar.gz", hash = "sha256:66bae8dd47f1fed7bdef816115146d3a41c39b5c482d7bad36d9ba1def088450", size = 166800, upload-time = "2024-10-24T15:31:02.238Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/91/04/2068a7b725265ecfcbf63ecdae038f1d4124ebccd55b8a7ce145b70e2b6a/tree_sitter-0.23.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:3a937f5d8727bc1c74c4bf2a9d1c25ace049e8628273016ad0d45914ae904e10", size = 139289, upload-time = "2024-10-24T15:29:59.27Z" }, @@ -3676,90 +3669,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/10/b5/9eaf794fc71490573ab14a366affca415bc1ddbf86a14d78e54583db4254/tree_sitter-0.23.2-cp39-cp39-win_arm64.whl", hash = "sha256:b848e0fdd522fbb8888cdb4f4d93f8fad97ae10d70c122fb922e51363c7febcd", size 
= 102787, upload-time = "2024-10-24T15:31:01.084Z" }, ] -[[package]] -name = "tree-sitter" -version = "0.25.2" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/66/7c/0350cfc47faadc0d3cf7d8237a4e34032b3014ddf4a12ded9933e1648b55/tree-sitter-0.25.2.tar.gz", hash = "sha256:fe43c158555da46723b28b52e058ad444195afd1db3ca7720c59a254544e9c20", size = 177961, upload-time = "2025-09-25T17:37:59.751Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e2/d4/f7ffb855cb039b7568aba4911fbe42e4c39c0e4398387c8e0d8251489992/tree_sitter-0.25.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:72a510931c3c25f134aac2daf4eb4feca99ffe37a35896d7150e50ac3eee06c7", size = 146749, upload-time = "2025-09-25T17:37:16.475Z" }, - { url = "https://files.pythonhosted.org/packages/9a/58/f8a107f9f89700c0ab2930f1315e63bdedccbb5fd1b10fcbc5ebadd54ac8/tree_sitter-0.25.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:44488e0e78146f87baaa009736886516779253d6d6bac3ef636ede72bc6a8234", size = 137766, upload-time = "2025-09-25T17:37:18.138Z" }, - { url = "https://files.pythonhosted.org/packages/19/fb/357158d39f01699faea466e8fd5a849f5a30252c68414bddc20357a9ac79/tree_sitter-0.25.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c2f8e7d6b2f8489d4a9885e3adcaef4bc5ff0a275acd990f120e29c4ab3395c5", size = 599809, upload-time = "2025-09-25T17:37:19.169Z" }, - { url = "https://files.pythonhosted.org/packages/c5/a4/68ae301626f2393a62119481cb660eb93504a524fc741a6f1528a4568cf6/tree_sitter-0.25.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:20b570690f87f1da424cd690e51cc56728d21d63f4abd4b326d382a30353acc7", size = 627676, upload-time = "2025-09-25T17:37:20.715Z" }, - { url = "https://files.pythonhosted.org/packages/69/fe/4c1bef37db5ca8b17ca0b3070f2dff509468a50b3af18f17665adcab42b9/tree_sitter-0.25.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:a0ec41b895da717bc218a42a3a7a0bfcfe9a213d7afaa4255353901e0e21f696", size = 624281, upload-time = "2025-09-25T17:37:21.823Z" }, - { url = "https://files.pythonhosted.org/packages/d4/30/3283cb7fa251cae2a0bf8661658021a789810db3ab1b0569482d4a3671fd/tree_sitter-0.25.2-cp310-cp310-win_amd64.whl", hash = "sha256:7712335855b2307a21ae86efe949c76be36c6068d76df34faa27ce9ee40ff444", size = 127295, upload-time = "2025-09-25T17:37:22.977Z" }, - { url = "https://files.pythonhosted.org/packages/88/90/ceb05e6de281aebe82b68662890619580d4ffe09283ebd2ceabcf5df7b4a/tree_sitter-0.25.2-cp310-cp310-win_arm64.whl", hash = "sha256:a925364eb7fbb9cdce55a9868f7525a1905af512a559303bd54ef468fd88cb37", size = 113991, upload-time = "2025-09-25T17:37:23.854Z" }, - { url = "https://files.pythonhosted.org/packages/7c/22/88a1e00b906d26fa8a075dd19c6c3116997cb884bf1b3c023deb065a344d/tree_sitter-0.25.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b8ca72d841215b6573ed0655b3a5cd1133f9b69a6fa561aecad40dca9029d75b", size = 146752, upload-time = "2025-09-25T17:37:24.775Z" }, - { url = "https://files.pythonhosted.org/packages/57/1c/22cc14f3910017b7a76d7358df5cd315a84fe0c7f6f7b443b49db2e2790d/tree_sitter-0.25.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:cc0351cfe5022cec5a77645f647f92a936b38850346ed3f6d6babfbeeeca4d26", size = 137765, upload-time = "2025-09-25T17:37:26.103Z" }, - { url = 
"https://files.pythonhosted.org/packages/1c/0c/d0de46ded7d5b34631e0f630d9866dab22d3183195bf0f3b81de406d6622/tree_sitter-0.25.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1799609636c0193e16c38f366bda5af15b1ce476df79ddaae7dd274df9e44266", size = 604643, upload-time = "2025-09-25T17:37:27.398Z" }, - { url = "https://files.pythonhosted.org/packages/34/38/b735a58c1c2f60a168a678ca27b4c1a9df725d0bf2d1a8a1c571c033111e/tree_sitter-0.25.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3e65ae456ad0d210ee71a89ee112ac7e72e6c2e5aac1b95846ecc7afa68a194c", size = 632229, upload-time = "2025-09-25T17:37:28.463Z" }, - { url = "https://files.pythonhosted.org/packages/32/f6/cda1e1e6cbff5e28d8433578e2556d7ba0b0209d95a796128155b97e7693/tree_sitter-0.25.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:49ee3c348caa459244ec437ccc7ff3831f35977d143f65311572b8ba0a5f265f", size = 629861, upload-time = "2025-09-25T17:37:29.593Z" }, - { url = "https://files.pythonhosted.org/packages/f9/19/427e5943b276a0dd74c2a1f1d7a7393443f13d1ee47dedb3f8127903c080/tree_sitter-0.25.2-cp311-cp311-win_amd64.whl", hash = "sha256:56ac6602c7d09c2c507c55e58dc7026b8988e0475bd0002f8a386cce5e8e8adc", size = 127304, upload-time = "2025-09-25T17:37:30.549Z" }, - { url = "https://files.pythonhosted.org/packages/eb/d9/eef856dc15f784d85d1397a17f3ee0f82df7778efce9e1961203abfe376a/tree_sitter-0.25.2-cp311-cp311-win_arm64.whl", hash = "sha256:b3d11a3a3ac89bb8a2543d75597f905a9926f9c806f40fcca8242922d1cc6ad5", size = 113990, upload-time = "2025-09-25T17:37:31.852Z" }, - { url = "https://files.pythonhosted.org/packages/3c/9e/20c2a00a862f1c2897a436b17edb774e831b22218083b459d0d081c9db33/tree_sitter-0.25.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ddabfff809ffc983fc9963455ba1cecc90295803e06e140a4c83e94c1fa3d960", size = 146941, upload-time = "2025-09-25T17:37:34.813Z" }, - { url = "https://files.pythonhosted.org/packages/ef/04/8512e2062e652a1016e840ce36ba1cc33258b0dcc4e500d8089b4054afec/tree_sitter-0.25.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c0c0ab5f94938a23fe81928a21cc0fac44143133ccc4eb7eeb1b92f84748331c", size = 137699, upload-time = "2025-09-25T17:37:36.349Z" }, - { url = "https://files.pythonhosted.org/packages/47/8a/d48c0414db19307b0fb3bb10d76a3a0cbe275bb293f145ee7fba2abd668e/tree_sitter-0.25.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dd12d80d91d4114ca097626eb82714618dcdfacd6a5e0955216c6485c350ef99", size = 607125, upload-time = "2025-09-25T17:37:37.725Z" }, - { url = "https://files.pythonhosted.org/packages/39/d1/b95f545e9fc5001b8a78636ef942a4e4e536580caa6a99e73dd0a02e87aa/tree_sitter-0.25.2-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b43a9e4c89d4d0839de27cd4d6902d33396de700e9ff4c5ab7631f277a85ead9", size = 635418, upload-time = "2025-09-25T17:37:38.922Z" }, - { url = "https://files.pythonhosted.org/packages/de/4d/b734bde3fb6f3513a010fa91f1f2875442cdc0382d6a949005cd84563d8f/tree_sitter-0.25.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbb1706407c0e451c4f8cc016fec27d72d4b211fdd3173320b1ada7a6c74c3ac", size = 631250, upload-time = "2025-09-25T17:37:40.039Z" }, - { url = "https://files.pythonhosted.org/packages/46/f2/5f654994f36d10c64d50a192239599fcae46677491c8dd53e7579c35a3e3/tree_sitter-0.25.2-cp312-cp312-win_amd64.whl", hash = 
"sha256:6d0302550bbe4620a5dc7649517c4409d74ef18558276ce758419cf09e578897", size = 127156, upload-time = "2025-09-25T17:37:41.132Z" }, - { url = "https://files.pythonhosted.org/packages/67/23/148c468d410efcf0a9535272d81c258d840c27b34781d625f1f627e2e27d/tree_sitter-0.25.2-cp312-cp312-win_arm64.whl", hash = "sha256:0c8b6682cac77e37cfe5cf7ec388844957f48b7bd8d6321d0ca2d852994e10d5", size = 113984, upload-time = "2025-09-25T17:37:42.074Z" }, - { url = "https://files.pythonhosted.org/packages/8c/67/67492014ce32729b63d7ef318a19f9cfedd855d677de5773476caf771e96/tree_sitter-0.25.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:0628671f0de69bb279558ef6b640bcfc97864fe0026d840f872728a86cd6b6cd", size = 146926, upload-time = "2025-09-25T17:37:43.041Z" }, - { url = "https://files.pythonhosted.org/packages/4e/9c/a278b15e6b263e86c5e301c82a60923fa7c59d44f78d7a110a89a413e640/tree_sitter-0.25.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f5ddcd3e291a749b62521f71fc953f66f5fd9743973fd6dd962b092773569601", size = 137712, upload-time = "2025-09-25T17:37:44.039Z" }, - { url = "https://files.pythonhosted.org/packages/54/9a/423bba15d2bf6473ba67846ba5244b988cd97a4b1ea2b146822162256794/tree_sitter-0.25.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd88fbb0f6c3a0f28f0a68d72df88e9755cf5215bae146f5a1bdc8362b772053", size = 607873, upload-time = "2025-09-25T17:37:45.477Z" }, - { url = "https://files.pythonhosted.org/packages/ed/4c/b430d2cb43f8badfb3a3fa9d6cd7c8247698187b5674008c9d67b2a90c8e/tree_sitter-0.25.2-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b878e296e63661c8e124177cc3084b041ba3f5936b43076d57c487822426f614", size = 636313, upload-time = "2025-09-25T17:37:46.68Z" }, - { url = "https://files.pythonhosted.org/packages/9d/27/5f97098dbba807331d666a0997662e82d066e84b17d92efab575d283822f/tree_sitter-0.25.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d77605e0d353ba3fe5627e5490f0fbfe44141bafa4478d88ef7954a61a848dae", size = 631370, upload-time = "2025-09-25T17:37:47.993Z" }, - { url = "https://files.pythonhosted.org/packages/d4/3c/87caaed663fabc35e18dc704cd0e9800a0ee2f22bd18b9cbe7c10799895d/tree_sitter-0.25.2-cp313-cp313-win_amd64.whl", hash = "sha256:463c032bd02052d934daa5f45d183e0521ceb783c2548501cf034b0beba92c9b", size = 127157, upload-time = "2025-09-25T17:37:48.967Z" }, - { url = "https://files.pythonhosted.org/packages/d5/23/f8467b408b7988aff4ea40946a4bd1a2c1a73d17156a9d039bbaff1e2ceb/tree_sitter-0.25.2-cp313-cp313-win_arm64.whl", hash = "sha256:b3f63a1796886249bd22c559a5944d64d05d43f2be72961624278eff0dcc5cb8", size = 113975, upload-time = "2025-09-25T17:37:49.922Z" }, - { url = "https://files.pythonhosted.org/packages/07/e3/d9526ba71dfbbe4eba5e51d89432b4b333a49a1e70712aa5590cd22fc74f/tree_sitter-0.25.2-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:65d3c931013ea798b502782acab986bbf47ba2c452610ab0776cf4a8ef150fc0", size = 146776, upload-time = "2025-09-25T17:37:50.898Z" }, - { url = "https://files.pythonhosted.org/packages/42/97/4bd4ad97f85a23011dd8a535534bb1035c4e0bac1234d58f438e15cff51f/tree_sitter-0.25.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:bda059af9d621918efb813b22fb06b3fe00c3e94079c6143fcb2c565eb44cb87", size = 137732, upload-time = "2025-09-25T17:37:51.877Z" }, - { url = 
"https://files.pythonhosted.org/packages/b6/19/1e968aa0b1b567988ed522f836498a6a9529a74aab15f09dd9ac1e41f505/tree_sitter-0.25.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:eac4e8e4c7060c75f395feec46421eb61212cb73998dbe004b7384724f3682ab", size = 609456, upload-time = "2025-09-25T17:37:52.925Z" }, - { url = "https://files.pythonhosted.org/packages/48/b6/cf08f4f20f4c9094006ef8828555484e842fc468827ad6e56011ab668dbd/tree_sitter-0.25.2-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:260586381b23be33b6191a07cea3d44ecbd6c01aa4c6b027a0439145fcbc3358", size = 636772, upload-time = "2025-09-25T17:37:54.647Z" }, - { url = "https://files.pythonhosted.org/packages/57/e2/d42d55bf56360987c32bc7b16adb06744e425670b823fb8a5786a1cea991/tree_sitter-0.25.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:7d2ee1acbacebe50ba0f85fff1bc05e65d877958f00880f49f9b2af38dce1af0", size = 631522, upload-time = "2025-09-25T17:37:55.833Z" }, - { url = "https://files.pythonhosted.org/packages/03/87/af9604ebe275a9345d88c3ace0cf2a1341aa3f8ef49dd9fc11662132df8a/tree_sitter-0.25.2-cp314-cp314-win_amd64.whl", hash = "sha256:4973b718fcadfb04e59e746abfbb0288694159c6aeecd2add59320c03368c721", size = 130864, upload-time = "2025-09-25T17:37:57.453Z" }, - { url = "https://files.pythonhosted.org/packages/a6/6e/e64621037357acb83d912276ffd30a859ef117f9c680f2e3cb955f47c680/tree_sitter-0.25.2-cp314-cp314-win_arm64.whl", hash = "sha256:b8d4429954a3beb3e844e2872610d2a4800ba4eb42bb1990c6a4b1949b18459f", size = 117470, upload-time = "2025-09-25T17:37:58.431Z" }, -] - -[[package]] -name = "tree-sitter-c" -version = "0.23.6" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] -sdist = { url = "https://files.pythonhosted.org/packages/27/27/5218b7aadabcf8020d06a3b13f8f87dd0e6e958f43d9839847e3f12b02c7/tree_sitter_c-0.23.6.tar.gz", hash = "sha256:1d3b4a6ca8ebc7b0727857cc63a874118e0c04d353a4909b5c104e913fd69864", size = 221969, upload-time = "2025-05-24T16:05:16.753Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/2e/ba7d982c1b3c8a01e4b106cd9c8c292445366c77cb0fd9da598558d6b2a3/tree_sitter_c-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:0b46335c2603b86c75e7fc587e29c9299cf06e9634ce1a69ac1e928dfe568af2", size = 80847, upload-time = "2025-05-24T16:05:09.665Z" }, - { url = "https://files.pythonhosted.org/packages/57/ac/08081eb00119e528127a5a67008383e4730d099560f0f6e66f6e539710e2/tree_sitter_c-0.23.6-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffc36baf02f46744df354e4a00eab78d1034db480e649554c625ba79ee4b6b9c", size = 86208, upload-time = "2025-05-24T16:05:10.943Z" }, - { url = "https://files.pythonhosted.org/packages/2c/cb/98f0165f4cbdc6df35625358a9958176221bb098d38f58c25f5c6a04f9e5/tree_sitter_c-0.23.6-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96ef21fffd7135839010b37066d6653101ff74fa8961468ffbb0bcf3ae22d61", size = 109935, upload-time = "2025-05-24T16:05:12.126Z" }, - { url = "https://files.pythonhosted.org/packages/b6/eb/1bfae083aa5e6b04e36de75f55491eaa495e84a0d06a87257cbb7c404a08/tree_sitter_c-0.23.6-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfa9044039460632ef333afd6e907fdc67a657890afe49c8592bd223de059712", size = 98063, upload-time = "2025-05-24T16:05:12.975Z" }, - { url = 
"https://files.pythonhosted.org/packages/be/1f/85d34bbedb09bacb21c3861bbb26129420f26af289972906b75277150dea/tree_sitter_c-0.23.6-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a25cc5f275109f59dd6d5e636355ff038e46fc1048404519b591935a2b5c96d3", size = 94072, upload-time = "2025-05-24T16:05:13.814Z" }, - { url = "https://files.pythonhosted.org/packages/e6/35/c78cbe4ac9426f2208bacf20a6de9c262af8b9e8d379a6249c6876916978/tree_sitter_c-0.23.6-cp39-abi3-win_amd64.whl", hash = "sha256:1fccc265a0fe1b09874321c20046b297b1513e2cef1af7e17ac53b9b5cf6878e", size = 84626, upload-time = "2025-05-24T16:05:14.65Z" }, - { url = "https://files.pythonhosted.org/packages/be/53/d0f910b86e9d270e76d45f7accabd9efe96448e89c9f713ca2501a8876bf/tree_sitter_c-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:ac92b69880d9844c89253a352937dada56e3647fbb8d5acb33f820eeb7763fd7", size = 82655, upload-time = "2025-05-24T16:05:15.894Z" }, -] - [[package]] name = "tree-sitter-c" -version = "0.24.1" +version = "0.23.4" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/f1/f5/ba8cd08d717277551ade8537d3aa2a94b907c6c6e0fbcf4e4d8b1c747fa3/tree_sitter_c-0.24.1.tar.gz", hash = "sha256:7d2d0cda0b8dda428c81440c1e94367f9f13548eedca3f49768bde66b1422ad6", size = 228014, upload-time = "2025-05-24T17:32:58.384Z" } +sdist = { url = "https://files.pythonhosted.org/packages/27/27/254ebffa4066b3073dddee00c1915893794f5cbf938335c1cc926cd32385/tree_sitter_c-0.23.4.tar.gz", hash = "sha256:9215c7888dd019038f162ea5646178f6e129cd2b49fc506d14becf5e426121d7", size = 223089, upload-time = "2024-12-15T22:24:42.833Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/15/c7/c817be36306e457c2d36cc324789046390d9d8c555c38772429ffdb7d361/tree_sitter_c-0.24.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:9c06ac26a1efdcc8b26a8a6970fbc6997c4071857359e5837d4c42892d45fe1e", size = 80940, upload-time = "2025-05-24T17:32:49.967Z" }, - { url = "https://files.pythonhosted.org/packages/7a/42/283909467290b24fdbc29bb32ee20e409a19a55002b43175d66d091ca1a4/tree_sitter_c-0.24.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:942bcd7cbecd810dcf7ca6f8f834391ebf0771a89479646d891ba4ca2fdfdc88", size = 86304, upload-time = "2025-05-24T17:32:51.271Z" }, - { url = "https://files.pythonhosted.org/packages/94/53/fb4f61d4e5f15ec3da85774a4df8e58d3b5b73036cf167f0203b4dd9d158/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9a74cfd7a11ca5a961fafd4d751892ee65acae667d2818968a6f079397d8d28c", size = 109996, upload-time = "2025-05-24T17:32:52.119Z" }, - { url = "https://files.pythonhosted.org/packages/5e/e8/fc541d34ee81c386c5453c2596c1763e8e9cd7cb0725f39d7dfa2276afa4/tree_sitter_c-0.24.1-cp310-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6a807705a3978911dc7ee26a7ad36dcfacb6adfc13c190d496660ec9bd66707", size = 98137, upload-time = "2025-05-24T17:32:53.361Z" }, - { url = "https://files.pythonhosted.org/packages/32/c6/d0563319cae0d5b5780a92e2806074b24afea2a07aa4c10599b899bda3ec/tree_sitter_c-0.24.1-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:789781afcb710df34144f7e2a20cd80e325114b9119e3956c6bd1dd2d365df98", size = 94148, upload-time = "2025-05-24T17:32:54.855Z" }, - { url = 
"https://files.pythonhosted.org/packages/50/5a/6361df7f3fa2310c53a0d26b4702a261c332da16fa9d801e381e3a86e25f/tree_sitter_c-0.24.1-cp310-abi3-win_amd64.whl", hash = "sha256:290bff0f9c79c966496ebae45042f77543e6e4aea725f40587a8611d566231a8", size = 84703, upload-time = "2025-05-24T17:32:56.084Z" }, - { url = "https://files.pythonhosted.org/packages/22/6a/210a302e8025ac492cbaea58d3720d66b7d8034c5d747ac5e4d2d235aa25/tree_sitter_c-0.24.1-cp310-abi3-win_arm64.whl", hash = "sha256:d46bbda06f838c2dcb91daf767813671fd366b49ad84ff37db702129267b46e1", size = 82715, upload-time = "2025-05-24T17:32:57.248Z" }, + { url = "https://files.pythonhosted.org/packages/84/a9/41e5177fd9309bf142d6772f6885e6a93baa0ad40f17c7a4144ba1275c9c/tree_sitter_c-0.23.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2c92c0571b36b6da06f8882f34151dc11e67a493e9101cc0026a16da27709c05", size = 80812, upload-time = "2024-12-15T22:24:26.318Z" }, + { url = "https://files.pythonhosted.org/packages/90/99/cf0a3a8a661fffc7f6843cafbbc1887c47e1a79f751cf9c88002008c8eae/tree_sitter_c-0.23.4-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:98c285a23bf4fb6fb34140d6ea0f0d25d0a93e0d93692f9dffe3db6d1fe08534", size = 85813, upload-time = "2024-12-15T22:24:28.438Z" }, + { url = "https://files.pythonhosted.org/packages/01/c1/d346a08e05223bff3cea08a8f96d685d19bc2c022fde719bfd3e9f6aaaac/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5e42a3519825ca59c91b2b7aec08dd3c89e02690c7b315d54a1e1743f9be3f15", size = 110085, upload-time = "2024-12-15T22:24:30.823Z" }, + { url = "https://files.pythonhosted.org/packages/a8/88/b7d395038b109d42a4682b9f3d72f8e02de8f7c7caf9ad2b289991f1ac19/tree_sitter_c-0.23.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c15c7588c3d95872328019073a8d5eaf7c2691b4d4ef0393a0168399b2ad2356", size = 98075, upload-time = "2024-12-15T22:24:32.946Z" }, + { url = "https://files.pythonhosted.org/packages/e8/12/754a8166d3860cdd614bf7d117c94a740ce1ab1ab2ba766321249909e7b1/tree_sitter_c-0.23.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:013403e74765d74e523f380f9df8f3d99e9fe94132a3fc0c8b29cba538a7b2bf", size = 94071, upload-time = "2024-12-15T22:24:34.974Z" }, + { url = "https://files.pythonhosted.org/packages/14/da/2f97b96f081d6ac9b37c87c9d8e5c0ff5948802562ae28b1a58afd8dec1d/tree_sitter_c-0.23.4-cp39-abi3-win_amd64.whl", hash = "sha256:a4d7bdeaca8f1da72352a945853f56aa5d34e7bc22569ec5bda5d7c1a04e5b0f", size = 84483, upload-time = "2024-12-15T22:24:37.052Z" }, + { url = "https://files.pythonhosted.org/packages/d9/33/0d3b72634e2f34e64b07aaf100207cf3d01e32d814e72e144af0a0e785ad/tree_sitter_c-0.23.4-cp39-abi3-win_arm64.whl", hash = "sha256:edd36e12cc79b8b5bbc81fc336ff7d2577d0fe16afd18163c9aff7ae3ff69e15", size = 82482, upload-time = "2024-12-15T22:24:40.758Z" }, ] [[package]] @@ -3781,9 +3703,6 @@ wheels = [ name = "tree-sitter-javascript" version = "0.23.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] sdist = { url = "https://files.pythonhosted.org/packages/cd/dc/1c55c33cc6bbe754359b330534cf9f261c1b9b2c26ddf23aef3c5fa67759/tree_sitter_javascript-0.23.1.tar.gz", hash = "sha256:b2059ce8b150162cda05a457ca3920450adbf915119c04b8c67b5241cd7fcfed", size = 110058, upload-time = "2024-11-10T05:40:42.357Z" } wheels = [ { url = 
"https://files.pythonhosted.org/packages/20/d3/c67d7d49967344b51208ad19f105233be1afdf07d3dcb35b471900265227/tree_sitter_javascript-0.23.1-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6ca583dad4bd79d3053c310b9f7208cd597fd85f9947e4ab2294658bb5c11e35", size = 59333, upload-time = "2024-11-10T05:40:31.988Z" }, @@ -3795,34 +3714,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/79/ceb21988e6de615355a63eebcf806cd2a0fe875bec27b429d58b63e7fb5f/tree_sitter_javascript-0.23.1-cp39-abi3-win_arm64.whl", hash = "sha256:eb28130cd2fb30d702d614cbf61ef44d1c7f6869e7d864a9cc17111e370be8f7", size = 57027, upload-time = "2024-11-10T05:40:40.841Z" }, ] -[[package]] -name = "tree-sitter-javascript" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/59/e0/e63103c72a9d3dfd89a31e02e660263ad84b7438e5f44ee82e443e65bbde/tree_sitter_javascript-0.25.0.tar.gz", hash = "sha256:329b5414874f0588a98f1c291f1b28138286617aa907746ffe55adfdcf963f38", size = 132338, upload-time = "2025-09-01T07:13:44.792Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2c/df/5106ac250cd03661ebc3cc75da6b3d9f6800a3606393a0122eca58038104/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:b70f887fb269d6e58c349d683f59fa647140c410cfe2bee44a883b20ec92e3dc", size = 64052, upload-time = "2025-09-01T07:13:36.865Z" }, - { url = "https://files.pythonhosted.org/packages/b1/8f/6b4b2bc90d8ab3955856ce852cc9d1e82c81d7ab9646385f0e75ffd5b5d3/tree_sitter_javascript-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:8264a996b8845cfce06965152a013b5d9cbb7d199bc3503e12b5682e62bb1de1", size = 66440, upload-time = "2025-09-01T07:13:37.962Z" }, - { url = "https://files.pythonhosted.org/packages/5f/c4/7da74ecdcd8a398f88bd003a87c65403b5fe0e958cdd43fbd5fd4a398fcf/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:9dc04ba91fc8583344e57c1f1ed5b2c97ecaaf47480011b92fbeab8dda96db75", size = 99728, upload-time = "2025-09-01T07:13:38.755Z" }, - { url = "https://files.pythonhosted.org/packages/96/c8/97da3af4796495e46421e9344738addb3602fa6426ea695be3fcbadbee37/tree_sitter_javascript-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:199d09985190852e0912da2b8d26c932159be314bc04952cf917ed0e4c633e6b", size = 106072, upload-time = "2025-09-01T07:13:39.798Z" }, - { url = "https://files.pythonhosted.org/packages/13/be/c964e8130be08cc9bd6627d845f0e4460945b158429d39510953bbcb8fcc/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:dfcf789064c58dc13c0a4edb550acacfc6f0f280577f1e7a00de3e89fc7f8ddc", size = 104388, upload-time = "2025-09-01T07:13:40.866Z" }, - { url = "https://files.pythonhosted.org/packages/ee/89/9b773dee0f8961d1bb8d7baf0a204ab587618df19897c1ef260916f318ec/tree_sitter_javascript-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:1b852d3aee8a36186dbcc32c798b11b4869f9b5041743b63b65c2ef793db7a54", size = 98377, upload-time = "2025-09-01T07:13:41.838Z" }, - { url = "https://files.pythonhosted.org/packages/3b/dc/d90cb1790f8cec9b4878d278ad9faf7c8f893189ce0f855304fd704fc274/tree_sitter_javascript-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:e5ed840f5bd4a3f0272e441d19429b26eedc257abe5574c8546da6b556865e3c", size = 62975, upload-time 
= "2025-09-01T07:13:42.828Z" }, - { url = "https://files.pythonhosted.org/packages/2e/1f/f9eba1038b7d4394410f3c0a6ec2122b590cd7acb03f196e52fa57ebbe72/tree_sitter_javascript-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:622a69d677aa7f6ee2931d8c77c981a33f0ebb6d275aa9d43d3397c879a9bb0b", size = 61668, upload-time = "2025-09-01T07:13:43.803Z" }, -] - [[package]] name = "tree-sitter-python" version = "0.23.6" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version < '3.10'", -] sdist = { url = "https://files.pythonhosted.org/packages/1c/30/6766433b31be476fda6569a3a374c2220e45ffee0bff75460038a57bf23b/tree_sitter_python-0.23.6.tar.gz", hash = "sha256:354bfa0a2f9217431764a631516f85173e9711af2c13dbd796a8815acfe505d9", size = 155868, upload-time = "2024-12-22T23:09:55.918Z" } wheels = [ { url = "https://files.pythonhosted.org/packages/ab/67/577a02acae5f776007c924ca86ef14c19c12e71de0aa9d2a036f3c248e7b/tree_sitter_python-0.23.6-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:28fbec8f74eeb2b30292d97715e60fac9ccf8a8091ce19b9d93e9b580ed280fb", size = 74361, upload-time = "2024-12-22T23:09:42.37Z" }, @@ -3834,27 +3729,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/87/cb/ce35a65f83a47b510d8a2f1eddf3bdbb0d57aabc87351c8788caf3309f76/tree_sitter_python-0.23.6-cp39-abi3-win_arm64.whl", hash = "sha256:71334371bd73d5fe080aed39fbff49ed8efb9506edebe16795b0c7567ed6a272", size = 73649, upload-time = "2024-12-22T23:09:53.71Z" }, ] -[[package]] -name = "tree-sitter-python" -version = "0.25.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12'", - "python_full_version == '3.11.*'", - "python_full_version == '3.10.*'", -] -sdist = { url = "https://files.pythonhosted.org/packages/b8/8b/c992ff0e768cb6768d5c96234579bf8842b3a633db641455d86dd30d5dac/tree_sitter_python-0.25.0.tar.gz", hash = "sha256:b13e090f725f5b9c86aa455a268553c65cadf325471ad5b65cd29cac8a1a68ac", size = 159845, upload-time = "2025-09-11T06:47:58.159Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/cf/64/a4e503c78a4eb3ac46d8e72a29c1b1237fa85238d8e972b063e0751f5a94/tree_sitter_python-0.25.0-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:14a79a47ddef72f987d5a2c122d148a812169d7484ff5c75a3db9609d419f361", size = 73790, upload-time = "2025-09-11T06:47:47.652Z" }, - { url = "https://files.pythonhosted.org/packages/e6/1d/60d8c2a0cc63d6ec4ba4e99ce61b802d2e39ef9db799bdf2a8f932a6cd4b/tree_sitter_python-0.25.0-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:480c21dbd995b7fe44813e741d71fed10ba695e7caab627fb034e3828469d762", size = 76691, upload-time = "2025-09-11T06:47:49.038Z" }, - { url = "https://files.pythonhosted.org/packages/aa/cb/d9b0b67d037922d60cbe0359e0c86457c2da721bc714381a63e2c8e35eba/tree_sitter_python-0.25.0-cp310-abi3-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:86f118e5eecad616ecdb81d171a36dde9bef5a0b21ed71ea9c3e390813c3baf5", size = 108133, upload-time = "2025-09-11T06:47:50.499Z" }, - { url = "https://files.pythonhosted.org/packages/40/bd/bf4787f57e6b2860f3f1c8c62f045b39fb32d6bac4b53d7a9e66de968440/tree_sitter_python-0.25.0-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be71650ca2b93b6e9649e5d65c6811aad87a7614c8c1003246b303f6b150f61b", size = 110603, upload-time = "2025-09-11T06:47:51.985Z" }, - { url = 
"https://files.pythonhosted.org/packages/5d/25/feff09f5c2f32484fbce15db8b49455c7572346ce61a699a41972dea7318/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:e6d5b5799628cc0f24691ab2a172a8e676f668fe90dc60468bee14084a35c16d", size = 108998, upload-time = "2025-09-11T06:47:53.046Z" }, - { url = "https://files.pythonhosted.org/packages/75/69/4946da3d6c0df316ccb938316ce007fb565d08f89d02d854f2d308f0309f/tree_sitter_python-0.25.0-cp310-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:71959832fc5d9642e52c11f2f7d79ae520b461e63334927e93ca46cd61cd9683", size = 107268, upload-time = "2025-09-11T06:47:54.388Z" }, - { url = "https://files.pythonhosted.org/packages/ed/a2/996fc2dfa1076dc460d3e2f3c75974ea4b8f02f6bc925383aaae519920e8/tree_sitter_python-0.25.0-cp310-abi3-win_amd64.whl", hash = "sha256:9bcde33f18792de54ee579b00e1b4fe186b7926825444766f849bf7181793a76", size = 76073, upload-time = "2025-09-11T06:47:55.773Z" }, - { url = "https://files.pythonhosted.org/packages/07/19/4b5569d9b1ebebb5907d11554a96ef3fa09364a30fcfabeff587495b512f/tree_sitter_python-0.25.0-cp310-abi3-win_arm64.whl", hash = "sha256:0fbf6a3774ad7e89ee891851204c2e2c47e12b63a5edbe2e9156997731c128bb", size = 74169, upload-time = "2025-09-11T06:47:56.747Z" }, -] - [[package]] name = "tree-sitter-typescript" version = "0.23.2" From 607347f65a93e855e8a4e8c47753bd5b5ba9d559 Mon Sep 17 00:00:00 2001 From: Bridget McGinn Date: Fri, 24 Oct 2025 14:06:29 -0400 Subject: [PATCH 12/12] DCO Remediation Commit for Bridget McGinn I, Bridget McGinn , hereby add my Signed-off-by to this commit: a4a21e90d63da3aa46fd6f0681b065d6bf85305b I, Bridget McGinn , hereby add my Signed-off-by to this commit: 0266c6387066cd8e4d45ea02fcb417ade5eedd18 I, Bridget McGinn , hereby add my Signed-off-by to this commit: 336dd6a2078272dd30ade58b4fc6392e7a2edf86 I, Bridget McGinn , hereby add my Signed-off-by to this commit: 68890e9f48e6f310e27eb20eeb2d98d45c90bd36 I, Bridget McGinn , hereby add my Signed-off-by to this commit: 3c65eef59e817e9bd8cd3c55e98e843e326dd7a2 Signed-off-by: Bridget McGinn