diff --git a/docling_core/experimental/idoctags.py b/docling_core/experimental/idoctags.py index 9f497f14..7990f4cd 100644 --- a/docling_core/experimental/idoctags.py +++ b/docling_core/experimental/idoctags.py @@ -1,7 +1,7 @@ """Define classes for DocTags serialization.""" from enum import Enum -from typing import Any, Final, Optional +from typing import Any, Final, Optional, Tuple from xml.dom.minidom import parseString from pydantic import BaseModel @@ -9,6 +9,7 @@ from docling_core.transforms.serializer.base import ( BaseDocSerializer, + BaseListSerializer, BaseMetaSerializer, BasePictureSerializer, BaseTableSerializer, @@ -28,6 +29,8 @@ DescriptionMetaField, DocItem, DoclingDocument, + ListGroup, + ListItem, MetaFieldName, MoleculeMetaField, NodeItem, @@ -38,7 +41,10 @@ TabularChartMetaField, ) from docling_core.types.doc.labels import DocItemLabel -from docling_core.types.doc.tokens import DocumentToken +from docling_core.types.doc.tokens import ( + _CodeLanguageToken, + _PictureClassificationToken, +) DOCTAGS_VERSION: Final = "1.0.0" @@ -61,6 +67,127 @@ class IDocTagsTableToken(str, Enum): OTSL_RHED = "" # - row header cell, OTSL_SROW = "" # - section row cell + @classmethod + def get_special_tokens( + cls, + ): + """Return all table-related special tokens. + + Includes the opening/closing OTSL tags and each enum token value. 
+ """ + special_tokens: list[str] = ["", ""] + for token in cls: + special_tokens.append(f"{token.value}") + + return special_tokens + + +class IDocTagsToken(str, Enum): + """IDocTagsToken.""" + + _LOC_PREFIX = "loc_" + _SECTION_HEADER_PREFIX = "section_header_level_" + + DOCUMENT = "doctag" + VERSION = "version" + + OTSL = "otsl" + ORDERED_LIST = "ordered_list" + UNORDERED_LIST = "unordered_list" + + PAGE_BREAK = "page_break" + + CAPTION = "caption" + FOOTNOTE = "footnote" + FORMULA = "formula" + LIST_ITEM = "list_item" + PAGE_FOOTER = "page_footer" + PAGE_HEADER = "page_header" + PICTURE = "picture" + SECTION_HEADER = "section_header" + TABLE = "table" + TEXT = "text" + TITLE = "title" + DOCUMENT_INDEX = "document_index" + CODE = "code" + CHECKBOX_SELECTED = "checkbox_selected" + CHECKBOX_UNSELECTED = "checkbox_unselected" + FORM = "form" + EMPTY_VALUE = "empty_value" # used for empty value fields in fillable forms + + @classmethod + def get_special_tokens( + cls, + *, + page_dimension: Tuple[int, int] = (500, 500), + include_location_tokens: bool = True, + include_code_class: bool = False, + include_picture_class: bool = False, + ): + """Function to get all special document tokens.""" + special_tokens: list[str] = [] + for token in cls: + if not token.value.endswith("_"): + special_tokens.append(f"<{token.value}>") + special_tokens.append(f"") + + for i in range(6): + special_tokens += [ + f"<{IDocTagsToken._SECTION_HEADER_PREFIX.value}{i}>", + f"", + ] + + special_tokens.extend(IDocTagsTableToken.get_special_tokens()) + + if include_picture_class: + special_tokens.extend([t.value for t in _PictureClassificationToken]) + + if include_code_class: + special_tokens.extend([t.value for t in _CodeLanguageToken]) + + if include_location_tokens: + # Adding dynamically generated location-tokens + for i in range(0, max(page_dimension[0], page_dimension[1])): + special_tokens.append(f"<{IDocTagsToken._LOC_PREFIX.value}{i}/>") + + return special_tokens + + @classmethod + 
def create_token_name_from_doc_item_label(cls, label: str, level: int = 1) -> str: + """Get token corresponding to passed doc item label.""" + doc_token_by_item_label = { + DocItemLabel.CAPTION: IDocTagsToken.CAPTION, + DocItemLabel.FOOTNOTE: IDocTagsToken.FOOTNOTE, + DocItemLabel.FORMULA: IDocTagsToken.FORMULA, + DocItemLabel.LIST_ITEM: IDocTagsToken.LIST_ITEM, + DocItemLabel.PAGE_FOOTER: IDocTagsToken.PAGE_FOOTER, + DocItemLabel.PAGE_HEADER: IDocTagsToken.PAGE_HEADER, + DocItemLabel.PICTURE: IDocTagsToken.PICTURE, + DocItemLabel.TABLE: IDocTagsToken.TABLE, + DocItemLabel.TEXT: IDocTagsToken.TEXT, + DocItemLabel.TITLE: IDocTagsToken.TITLE, + DocItemLabel.DOCUMENT_INDEX: IDocTagsToken.DOCUMENT_INDEX, + DocItemLabel.CODE: IDocTagsToken.CODE, + DocItemLabel.CHECKBOX_SELECTED: IDocTagsToken.CHECKBOX_SELECTED, + DocItemLabel.CHECKBOX_UNSELECTED: IDocTagsToken.CHECKBOX_UNSELECTED, + DocItemLabel.FORM: IDocTagsToken.FORM, + # Fallback mappings for labels without dedicated tokens in IDocTagsToken + DocItemLabel.KEY_VALUE_REGION: IDocTagsToken.TEXT, + DocItemLabel.PARAGRAPH: IDocTagsToken.TEXT, + DocItemLabel.REFERENCE: IDocTagsToken.TEXT, + DocItemLabel.CHART: IDocTagsToken.PICTURE, + } + + res: str + if label == DocItemLabel.SECTION_HEADER: + res = f"{IDocTagsToken._SECTION_HEADER_PREFIX.value}{level}" + else: + try: + res = doc_token_by_item_label[DocItemLabel(label)].value + except KeyError as e: + raise RuntimeError(f"Unexpected DocItemLabel: {label}") from e + return res + class IDocTagsParams(DocTagsParams): """DocTags-specific serialization parameters.""" @@ -69,6 +196,136 @@ class IDocTagsParams(DocTagsParams): pretty_indentation: Optional[str] = 2 * " " +class IDocTagsListSerializer(BaseModel, BaseListSerializer): + """DocTags-specific list serializer.""" + + indent: int = 4 + + @override + def serialize( + self, + *, + item: ListGroup, + doc_serializer: "BaseDocSerializer", + doc: DoclingDocument, + list_level: int = 0, + is_inline_scope: bool = False, + visited:
Optional[set[str]] = None, # refs of visited items + **kwargs: Any, + ) -> SerializationResult: + """Serialize a ``ListGroup`` into IDocTags markup. + + This emits list containers (````/````) and + serializes children explicitly. Nested ``ListGroup`` items are emitted as + siblings without an enclosing ```` wrapper, while structural + wrappers are still preserved even when content is suppressed. + + Args: + item: The list group to serialize. + doc_serializer: The document-level serializer to delegate nested items. + doc: The document that provides item resolution. + list_level: Current nesting depth (0-based). + is_inline_scope: Whether serialization happens in an inline context. + visited: Set of already visited item refs to avoid cycles. + **kwargs: Additional serializer parameters forwarded to ``IDocTagsParams``. + + Returns: + A ``SerializationResult`` containing serialized text and metadata. + """ + my_visited = visited if visited is not None else set() + params = IDocTagsParams(**kwargs) + + # Build list children explicitly. Requirements: + # 1) / can be children of lists. + # 2) Do NOT wrap nested lists into , even if they are + # children of a ListItem in the logical structure. + # 3) Still ensure structural wrappers are preserved even when + # content is suppressed (e.g., add_content=False). + item_results: list[SerializationResult] = [] + child_results_wrapped: list[str] = [] + + excluded = doc_serializer.get_excluded_refs(**kwargs) + for child_ref in item.children: + child = child_ref.resolve(doc) + + # If a nested list group is present directly under this list group, + # emit it as a sibling (no wrapper). 
+ if isinstance(child, ListGroup): + if child.self_ref in my_visited or child.self_ref in excluded: + continue + my_visited.add(child.self_ref) + sub_res = doc_serializer.serialize( + item=child, + list_level=list_level + 1, + is_inline_scope=is_inline_scope, + visited=my_visited, + **kwargs, + ) + if sub_res.text: + child_results_wrapped.append(sub_res.text) + item_results.append(sub_res) + continue + + # Normal case: ListItem under ListGroup + if not isinstance(child, ListItem): + continue + if child.self_ref in my_visited or child.self_ref in excluded: + continue + + my_visited.add(child.self_ref) + + # Serialize the list item content (DocTagsTextSerializer will not wrap it) + child_res = doc_serializer.serialize( + item=child, + list_level=list_level + 1, + is_inline_scope=is_inline_scope, + visited=my_visited, + **kwargs, + ) + item_results.append(child_res) + # Wrap the content into , without any nested list content. + child_text_wrapped = _wrap( + text=f"{child_res.text}", + wrap_tag=IDocTagsToken.LIST_ITEM.value, + ) + child_results_wrapped.append(child_text_wrapped) + + # After the , append any nested lists (children of this ListItem) + # as siblings at the same level (not wrapped in ). 
+ for subref in child.children: + sub = subref.resolve(doc) + if ( + isinstance(sub, ListGroup) + and sub.self_ref not in my_visited + and sub.self_ref not in excluded + ): + my_visited.add(sub.self_ref) + sub_res = doc_serializer.serialize( + item=sub, + list_level=list_level + 1, + is_inline_scope=is_inline_scope, + visited=my_visited, + **kwargs, + ) + if sub_res.text: + child_results_wrapped.append(sub_res.text) + item_results.append(sub_res) + + delim = _get_delim(params=params) + if child_results_wrapped: + text_res = delim.join(child_results_wrapped) + text_res = f"{text_res}{delim}" + wrap_tag = ( + IDocTagsToken.ORDERED_LIST.value + if item.first_item_is_enumerated(doc) + else IDocTagsToken.UNORDERED_LIST.value + ) + text_res = _wrap(text=text_res, wrap_tag=wrap_tag) + else: + text_res = "" + return create_ser_result(text=text_res, span_source=item_results) + + class IDocTagsMetaSerializer(BaseModel, BaseMetaSerializer): """DocTags-specific meta serializer.""" @@ -187,6 +444,8 @@ def serialize( otsl_content = temp_table.export_to_otsl( temp_doc, add_cell_location=False, + # Suppress chart cell text if global content is off + add_cell_text=params.add_content, self_closing=params.do_self_closing, table_token=IDocTagsTableToken, ) @@ -200,7 +459,7 @@ def serialize( text_res = "".join([r.text for r in res_parts]) if text_res: - token = DocumentToken.create_token_name_from_doc_item_label( + token = IDocTagsToken.create_token_name_from_doc_item_label( label=DocItemLabel.CHART if is_chart else DocItemLabel.PICTURE, ) text_res = _wrap(text=text_res, wrap_tag=token) @@ -238,12 +497,16 @@ def serialize_doc( text_res = delim.join([p.text for p in parts if p.text]) if self.params.add_page_break: - page_sep = f"<{DocumentToken.PAGE_BREAK.value}{'/' if self.params.do_self_closing else ''}>" + page_sep = f"<{IDocTagsToken.PAGE_BREAK.value}{'/' if self.params.do_self_closing else ''}>" for full_match, _, _ in self._get_page_breaks(text=text_res): text_res = 
text_res.replace(full_match, page_sep) - wrap_tag = DocumentToken.DOCUMENT.value - text_res = f"<{wrap_tag}>{DOCTAGS_VERSION}{text_res}{delim}" + tmp = f"<{IDocTagsToken.DOCUMENT.value}>" + tmp += f"<{IDocTagsToken.VERSION.value}>{DOCTAGS_VERSION}" + tmp += f"{text_res}" + tmp += f"" + + text_res = tmp if self.params.pretty_indentation and ( my_root := parseString(text_res).documentElement @@ -252,4 +515,5 @@ def serialize_doc( text_res = "\n".join( [line for line in text_res.split("\n") if line.strip()] ) + return create_ser_result(text=text_res, span_source=parts) diff --git a/docling_core/transforms/serializer/common.py b/docling_core/transforms/serializer/common.py index 4930e839..b494eb0e 100644 --- a/docling_core/transforms/serializer/common.py +++ b/docling_core/transforms/serializer/common.py @@ -470,6 +470,10 @@ def get_parts( parts: list[SerializationResult] = [] my_visited: set[str] = visited if visited is not None else set() params = self.params.merge_with_patch(patch=kwargs) + add_content = True + + if hasattr(params, "add_content"): + add_content = getattr(params, "add_content") for node, lvl in _iterate_items( node=item, @@ -489,7 +493,7 @@ def get_parts( visited=my_visited, **(dict(level=lvl) | kwargs), ) - if part.text: + if len(part.text.strip()) > 0 or (not add_content): parts.append(part) return parts diff --git a/docling_core/transforms/serializer/doctags.py b/docling_core/transforms/serializer/doctags.py index 807b7750..beff6168 100644 --- a/docling_core/transforms/serializer/doctags.py +++ b/docling_core/transforms/serializer/doctags.py @@ -106,10 +106,16 @@ def serialize( """Serializes the passed item.""" my_visited = visited if visited is not None else set() params = DocTagsParams(**kwargs) - wrap_tag: Optional[str] = DocumentToken.create_token_name_from_doc_item_label( - label=item.label, - **({"level": item.level} if isinstance(item, SectionHeaderItem) else {}), + # Decide wrapping up-front so ListItem never gets wrapped here + 
wrap_tag_token: Optional[str] = ( + DocumentToken.create_token_name_from_doc_item_label( + label=item.label, + **( + {"level": item.level} if isinstance(item, SectionHeaderItem) else {} + ), + ) ) + wrap_tag: Optional[str] = None if isinstance(item, ListItem) else wrap_tag_token parts: list[str] = [] if item.meta: @@ -152,8 +158,6 @@ def serialize( text_part = f"{language_token}{text_part}" else: text_part = text_part.strip() - if isinstance(item, ListItem): - wrap_tag = None # deferring list item tags to list handling if text_part: parts.append(text_part) @@ -203,7 +207,8 @@ def serialize( otsl_text = item.export_to_otsl( doc=doc, add_cell_location=params.add_table_cell_location, - add_cell_text=params.add_table_cell_text, + # Suppress cell text when global content is disabled + add_cell_text=(params.add_table_cell_text and params.add_content), xsize=params.xsize, ysize=params.ysize, visited=visited, @@ -460,6 +465,7 @@ def serialize( **kwargs, ) delim = _get_delim(params=params) + if parts: text_res = delim.join( [ @@ -636,18 +642,19 @@ def serialize_captions( results: list[SerializationResult] = [] if item.captions: cap_res = super().serialize_captions(item, **kwargs) - if cap_res.text: - if params.add_location: - for caption in item.captions: - if caption.cref not in self.get_excluded_refs(**kwargs): - if isinstance(cap := caption.resolve(self.doc), DocItem): - loc_txt = cap.get_location_tokens( - doc=self.doc, - xsize=params.xsize, - ysize=params.ysize, - self_closing=params.do_self_closing, - ) - results.append(create_ser_result(text=loc_txt)) + if cap_res.text and params.add_location: + for caption in item.captions: + if caption.cref not in self.get_excluded_refs(**kwargs): + if isinstance(cap := caption.resolve(self.doc), DocItem): + loc_txt = cap.get_location_tokens( + doc=self.doc, + xsize=params.xsize, + ysize=params.ysize, + self_closing=params.do_self_closing, + ) + results.append(create_ser_result(text=loc_txt)) + # Only include caption textual 
content when add_content is True + if cap_res.text and params.add_content: results.append(cap_res) text_res = "".join([r.text for r in results]) if text_res: diff --git a/test/data/doc/ddoc_0.json b/test/data/doc/ddoc_0.json new file mode 100644 index 00000000..7894bed1 --- /dev/null +++ b/test/data/doc/ddoc_0.json @@ -0,0 +1,2103 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "00073b00f3fbd33ef92f0c4902c5c7397c89f07f6a5528c5c97af53c67c4dcc7", + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/groups/3" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/groups/4" + }, + { + "$ref": "#/tables/5" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/groups/5" + }, + { + "$ref": "#/tables/6" + }, + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "list_standalone_10", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "list_standalone_11", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/6" + } + ], + 
"content_layer": "body", + "name": "list_standalone_12", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "list_standalone_13", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "list_standalone_14", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "list_standalone_15", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 412.099992, + "t": 1510.53408, + "r": 847.769328, + "b": 1489.28472, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 47 + ] + } + ], + "orig": "ndbinfo_select_all - Select From ndbinfo Tables", + "text": "ndbinfo_select_all - Select From ndbinfo Tables" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 223.603992, + "t": 1315.97136, + "r": 1144.0813679999999, + "b": 1294.722, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 104 + ] + } + ], + "orig": "This option sets the number of times to execute the select. Use --delay to set the time between loops.", + "text": "This option sets the number of times to execute the select. Use --delay to set the time between loops." 
+ }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 204.531624, + "t": 1271.087136, + "r": 451.80899999999997, + "b": 1249.8377759999998, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 21 + ] + } + ], + "orig": "\u2022 --ndb-connectstring", + "text": "\u2022 --ndb-connectstring", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 224.271072, + "t": 1084.613904, + "r": 1142.0813520000002, + "b": 1039.4017920000001, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 143 + ] + } + ], + "orig": "Set connect string for connecting to ndb_mgmd. Syntax: \"[nodeid=id;][host=]hostname[:port]\". Overrides entries in NDB_CONNECTSTRING and my.cnf.", + "text": "Set connect string for connecting to ndb_mgmd. Syntax: \"[nodeid=id;][host=]hostname[:port]\". Overrides entries in NDB_CONNECTSTRING and my.cnf." 
+ }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 204.740928, + "t": 1014.705648, + "r": 403.115832, + "b": 996.925248, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 17 + ] + } + ], + "orig": "\u2022 --ndb-mgmd-host", + "text": "\u2022 --ndb-mgmd-host", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 223.91855999999999, + "t": 829.993824, + "r": 541.732608, + "b": 808.744464, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 30 + ] + } + ], + "orig": "Same as --ndb-connectstring .", + "text": "Same as --ndb-connectstring ." + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 203.139936, + "t": 783.451152, + "r": 367.819344, + "b": 769.089024, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 14 + ] + } + ], + "orig": "\u2022 --ndb-nodeid", + "text": "\u2022 --ndb-nodeid", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 224.521992, + "t": 598.5096480000001, + "r": 849.322584, + "b": 577.260288, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 72 + ] + } + ], + "orig": "Set node ID for this node, overriding any ID set by --ndb-connectstring.", + "text": "Set node ID for this node, overriding any ID set by --ndb-connectstring." 
+ }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 204.8058, + "t": 552.36456, + "r": 584.730504, + "b": 536.87304, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 32 + ] + } + ], + "orig": "\u2022 --ndb-optimized-node-selection", + "text": "\u2022 --ndb-optimized-node-selection", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 223.919784, + "t": 434.44684800000005, + "r": 1102.77504, + "b": 388.3603679999999, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 136 + ] + } + ], + "orig": "Enable optimizations for selection of nodes for transactions. Enabled by default; use --skip-ndb- optimized-node-selection to disable.", + "text": "Enable optimizations for selection of nodes for transactions. Enabled by default; use --skip-ndb- optimized-node-selection to disable." 
+ }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 204.31619999999998, + "t": 363.80044799999996, + "r": 377.693352, + "b": 349.65532800000005, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 15 + ] + } + ], + "orig": "\u2022 --no-defaults", + "text": "\u2022 --no-defaults", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 225.182952, + "t": 245.847888, + "r": 818.1216, + "b": 224.598528, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 71 + ] + } + ], + "orig": "Do not read default options from any option file other than login file.", + "text": "Do not read default options from any option file other than login file." + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/5" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 204.862104, + "t": 199.52063999999996, + "r": 414.566352, + "b": 180.91339199999993, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 18 + ] + } + ], + "orig": "\u2022 --print-defaults", + "text": "\u2022 --print-defaults", + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 224.58808800000003, + "t": 80.88537599999995, + "r": 545.5931039999999, + "b": 59.61700799999994, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 37 + ] + } + ], + "orig": "Print program argument list and exit.", + "text": "Print program argument list and exit." 
+ }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "text", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 1105.84116, + "t": 93.37996799999996, + "r": 1151.544096, + "b": 77.81716800000004, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 4 + ] + } + ], + "orig": "4253", + "text": "4253" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 222.250248, + "t": 1438.47, + "r": 1151.596728, + "b": 1340.410896, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 53 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 224.533008, + "t": 146.833632, + "r": 688.3127280000001, + "b": 176.839344, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 146.833632, + "r": 1149.957792, + "b": 176.839344, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.51464800000002, + "t": 176.642928, + "r": 688.3127280000001, + "b": 209.54419199999998, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Minimum Value", + "column_header": false, + 
"row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 176.642928, + "r": 1149.957792, + "b": 209.54419199999998, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.553816, + "t": 209.526768, + "r": 688.3127280000001, + "b": 241.270128, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Maximum Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 209.526768, + "r": 1149.957792, + "b": 242.42803199999997, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "MAX_INT", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 224.533008, + "t": 146.833632, + "r": 688.3127280000001, + "b": 176.839344, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 146.833632, + "r": 1149.957792, + "b": 176.839344, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + 
"end_col_offset_idx": 2, + "text": "1", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.51464800000002, + "t": 176.642928, + "r": 688.3127280000001, + "b": 209.54419199999998, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Minimum Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 176.642928, + "r": 1149.957792, + "b": 209.54419199999998, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "0", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.553816, + "t": 209.526768, + "r": 688.3127280000001, + "b": 241.270128, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Maximum Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.2906959999999, + "t": 209.526768, + "r": 1149.957792, + "b": 242.42803199999997, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "MAX_INT", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + 
"l": 222.977304, + "t": 1210.340736, + "r": 1153.22832, + "b": 1109.547648, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 90 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 224.89898399999998, + "t": 375.309792, + "r": 686.81088, + "b": 408.21105600000004, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8488239999999, + "t": 375.309792, + "r": 1150.3458, + "b": 408.21105600000004, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-connectstring=connection-string", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.672544, + "t": 407.9196, + "r": 686.81088, + "b": 440.820864, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8488239999999, + "t": 407.9196, + "r": 1150.1193600000001, + "b": 440.820864, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "String", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.63092799999998, + "t": 440.996688, + "r": 686.81088, + "b": 472.518288, + "coord_origin": 
"TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8488239999999, + "t": 440.996688, + "r": 1150.0777440000002, + "b": 472.518288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 224.89898399999998, + "t": 375.309792, + "r": 686.81088, + "b": 408.21105600000004, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8488239999999, + "t": 375.309792, + "r": 1150.3458, + "b": 408.21105600000004, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-connectstring=connection-string", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.672544, + "t": 407.9196, + "r": 686.81088, + "b": 440.820864, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { 
+ "l": 686.8488239999999, + "t": 407.9196, + "r": 1150.1193600000001, + "b": 440.820864, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "String", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.63092799999998, + "t": 440.996688, + "r": 686.81088, + "b": 472.518288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8488239999999, + "t": 440.996688, + "r": 1150.0777440000002, + "b": 472.518288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 223.20129599999999, + "t": 955.31832, + "r": 1153.452312, + "b": 854.525232, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 86 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 225.70927200000003, + "t": 630.3464640000001, + "r": 688.6591199999999, + "b": 661.868064, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + 
"column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 630.3464640000001, + "r": 1151.156088, + "b": 661.868064, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-mgmd-host=connection-string", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.87083199999998, + "t": 661.646304, + "r": 688.6591199999999, + "b": 694.872288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 661.646304, + "r": 1150.771752, + "b": 694.872288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "String", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 225.110736, + "t": 695.24928, + "r": 688.6591199999999, + "b": 726.77088, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 695.24928, + "r": 1150.557552, + "b": 726.77088, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", 
+ "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 225.70927200000003, + "t": 630.3464640000001, + "r": 688.6591199999999, + "b": 661.868064, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 630.3464640000001, + "r": 1151.156088, + "b": 661.868064, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-mgmd-host=connection-string", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.87083199999998, + "t": 661.646304, + "r": 688.6591199999999, + "b": 694.872288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 661.646304, + "r": 1150.771752, + "b": 694.872288, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "String", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 225.110736, + "t": 695.24928, + "r": 688.6591199999999, + "b": 726.77088, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, 
+ "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 688.993272, + "t": 695.24928, + "r": 1150.557552, + "b": 726.77088, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 222.433848, + "t": 724.960368, + "r": 1152.684864, + "b": 624.16728, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 68 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 224.98344, + "t": 860.6774879999999, + "r": 686.840256, + "b": 893.903472, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 860.6774879999999, + "r": 1150.917408, + "b": 893.903472, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-nodeid=#", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 224.908776, + "t": 893.849616, + "r": 686.840256, + "b": 927.0756, + "coord_origin": "TOPLEFT" + }, + 
"row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 893.849616, + "r": 1150.842744, + "b": 927.0756, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Integer", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 225.055656, + "t": 927.378144, + "r": 686.840256, + "b": 957.981024, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 927.378144, + "r": 1150.960248, + "b": 958.152096, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 3, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 224.98344, + "t": 860.6774879999999, + "r": 686.840256, + "b": 893.903472, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 860.6774879999999, + "r": 1150.917408, + 
"b": 893.903472, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-nodeid=#", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 224.908776, + "t": 893.849616, + "r": 686.840256, + "b": 927.0756, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Type", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 893.849616, + "r": 1150.842744, + "b": 927.0756, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Integer", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "bbox": { + "l": 225.055656, + "t": 927.378144, + "r": 686.840256, + "b": 957.981024, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Default Value", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.8684079999999, + "t": 927.378144, + "r": 1150.960248, + "b": 958.152096, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "[none]", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/4", + "parent": { + 
"$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 223.6248, + "t": 492.69369600000005, + "r": 1154.5906320000001, + "b": 459.70056, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 50 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 225.216, + "t": 1092.2915520000001, + "r": 686.2784399999999, + "b": 1123.383888, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.6272799999999, + "t": 1093.1120640000001, + "r": 1150.917408, + "b": 1122.805728, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--ndb-optimized-node-selection", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 1, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 225.216, + "t": 1092.2915520000001, + "r": 686.2784399999999, + "b": 1123.383888, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 686.6272799999999, + "t": 1093.1120640000001, + "r": 1150.917408, + "b": 1122.805728, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": 
"--ndb-optimized-node-selection", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 222.891624, + "t": 304.4162879999999, + "r": 1153.14264, + "b": 269.734608, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 33 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 224.082576, + "t": 1281.102768, + "r": 687.7031760000001, + "b": 1314.122832, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 687.775392, + "t": 1280.29968, + "r": 1150.0165439999998, + "b": 1314.122832, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--no-defaults", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 1, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 224.082576, + "t": 1281.102768, + "r": 687.7031760000001, + "b": 1314.122832, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 687.775392, + "t": 1280.29968, + "r": 1150.0165439999998, + "b": 1314.122832, + "coord_origin": 
"TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--no-defaults", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + }, + { + "self_ref": "#/tables/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 224.27352000000002, + "t": 138.81384000000003, + "r": 1153.274832, + "b": 105.69873600000005, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 36 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "bbox": { + "l": 224.298, + "t": 1445.525136, + "r": 689.071608, + "b": 1478.317104, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": false, + "fillable": false + }, + { + "bbox": { + "l": 689.0924160000001, + "t": 1445.525136, + "r": 1152.3348, + "b": 1478.248992, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--print-defaults", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 1, + "num_cols": 2, + "grid": [ + [ + { + "bbox": { + "l": 224.298, + "t": 1445.525136, + "r": 689.071608, + "b": 1478.317104, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Command-Line Format", + "column_header": false, + "row_header": true, + "row_section": 
false, + "fillable": false + }, + { + "bbox": { + "l": 689.0924160000001, + "t": 1445.525136, + "r": 1152.3348, + "b": 1478.248992, + "coord_origin": "TOPLEFT" + }, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "--print-defaults", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 1224.0, + "height": 1584.0 + }, + "image": { + "mimetype": "image/png", + "dpi": 72, + "size": { + "width": 1224.0, + "height": 1584.0 + }, + "uri": "GroundTruthPageImages/0" + }, + "page_no": 1 + } + } +} \ No newline at end of file diff --git a/test/data/doc/ddoc_0.v0.gt.dt b/test/data/doc/ddoc_0.v0.gt.dt new file mode 100644 index 00000000..7408867f --- /dev/null +++ b/test/data/doc/ddoc_0.v0.gt.dt @@ -0,0 +1,237 @@ + + 1.0.0 + + + + + + ndbinfo_select_all - Select From ndbinfo Tables + + + + + + + + Default Value + + 1 + + + Minimum Value + + 0 + + + Maximum Value + + MAX_INT + + + + + + + + This option sets the number of times to execute the select. Use --delay to set the time between loops. + + + + + + + + • --ndb-connectstring + + + + + + + + + Command-Line Format + + --ndb-connectstring=connection-string + + + Type + + String + + + Default Value + + [none] + + + + + + + + Set connect string for connecting to ndb_mgmd. Syntax: "[nodeid=id;][host=]hostname[:port]". Overrides entries in NDB_CONNECTSTRING and my.cnf. + + + + + + + + • --ndb-mgmd-host + + + + + + + + + Command-Line Format + + --ndb-mgmd-host=connection-string + + + Type + + String + + + Default Value + + [none] + + + + + + + + Same as --ndb-connectstring . 
+ + + + + + + + • --ndb-nodeid + + + + + + + + + Command-Line Format + + --ndb-nodeid=# + + + Type + + Integer + + + Default Value + + [none] + + + + + + + + Set node ID for this node, overriding any ID set by --ndb-connectstring. + + + + + + + + • --ndb-optimized-node-selection + + + + + + + + + Command-Line Format + + --ndb-optimized-node-selection + + + + + + + + Enable optimizations for selection of nodes for transactions. Enabled by default; use --skip-ndb- optimized-node-selection to disable. + + + + + + + + • --no-defaults + + + + + + + + + Command-Line Format + + --no-defaults + + + + + + + + Do not read default options from any option file other than login file. + + + + + + + + • --print-defaults + + + + + + + + + Command-Line Format + + --print-defaults + + + + + + + + Print program argument list and exit. + + + + + + + 4253 + + diff --git a/test/data/doc/ddoc_0.v1.gt.dt b/test/data/doc/ddoc_0.v1.gt.dt new file mode 100644 index 00000000..90808c23 --- /dev/null +++ b/test/data/doc/ddoc_0.v1.gt.dt @@ -0,0 +1,192 @@ + + 1.0.0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/data/doc/ddoc_0.v2.gt.dt b/test/data/doc/ddoc_0.v2.gt.dt new file mode 100644 index 00000000..bba8d2b9 --- /dev/null +++ b/test/data/doc/ddoc_0.v2.gt.dt @@ -0,0 +1 @@ +1.0.0 diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py index b03a12e5..9776e791 100644 --- a/test/test_doc_schema.py +++ b/test/test_doc_schema.py @@ -43,7 +43,7 @@ def test_ccs_document(): assert False, f"Data in file {filename} should be invalid for CCSDocument model" except ValidationError as e: for error in e.errors(): - print(type(error)) + # print(type(error)) 
assert all( item in error["loc"] for item in ("description", "logs") ), f"Data in file {filename} should fail in logs" diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index f10c978f..5a043f57 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -724,7 +724,7 @@ def _test_export_methods( second_page = first_page + 1 if second_page in doc.pages: # Only test if document has at least 2 pages dt_pages_pred = doc.export_to_doctags(pages={first_page, second_page}) - print(dt_pages_pred) + # print(dt_pages_pred) _verify_regression_test(dt_pages_pred, filename=filename, ext="pages.dt") # Test Tables export ... diff --git a/test/test_doctags_content_suppression.py b/test/test_doctags_content_suppression.py new file mode 100644 index 00000000..1a6cf4b2 --- /dev/null +++ b/test/test_doctags_content_suppression.py @@ -0,0 +1,66 @@ +from docling_core.transforms.serializer.doctags import ( + DocTagsDocSerializer, + DocTagsParams, +) +from docling_core.types.doc.document import DoclingDocument, TableData +from docling_core.types.doc.labels import DocItemLabel + + +def serialize_doctags(doc: DoclingDocument, **param_overrides) -> str: + params = DocTagsParams(**param_overrides) + ser = DocTagsDocSerializer(doc=doc, params=params) + return ser.serialize().text + + +def test_no_content_suppresses_caption_and_table_cell_text(): + doc = DoclingDocument(name="t") + + # Add a caption text item + cap = doc.add_text(label=DocItemLabel.CAPTION, text="Table Caption Text") + + # Build a 2x2 table with header row and data row + td = TableData(num_rows=0, num_cols=2) + td.add_row(["H1", "H2"]) # header + td.add_row(["C1", "C2"]) # data + doc.add_table(data=td, caption=cap) + + txt = serialize_doctags(doc, add_content=False) + + # Caption text suppressed + assert "Table Caption Text" not in txt + + # No table cell text + for cell_text in ["H1", "H2", "C1", "C2"]: + assert cell_text not in txt + + # OTSL structural tokens should remain + assert "" in txt and "" 
in txt + + +def test_no_content_suppresses_figure_caption_text(): + doc = DoclingDocument(name="t") + cap = doc.add_text(label=DocItemLabel.CAPTION, text="Figure Caption Text") + doc.add_picture(caption=cap) + + txt = serialize_doctags(doc, add_content=False) + assert "Figure Caption Text" not in txt + + +def test_list_items_not_double_wrapped_when_no_content(): + doc = DoclingDocument(name="t") + lst = doc.add_list_group() + doc.add_list_item("Item A", parent=lst) + doc.add_list_item("Item B", parent=lst) + + txt = serialize_doctags(doc, add_content=True) + print(f"txt with content:\n{txt}") + + txt = serialize_doctags(doc, add_content=False) + print(f"txt without content:\n{txt}") + + # No nested + assert "" not in txt + + # Should still have exactly two opening list_item wrappers (for the two items) + # Note: other occurrences could appear in location tokens etc., so be conservative + assert txt.count("") >= 2 diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py index b2d15786..9a6acbe4 100644 --- a/test/test_json_schema_to_search_mapper.py +++ b/test/test_json_schema_to_search_mapper.py @@ -56,7 +56,7 @@ def test_json_schema_to_search_mapper_0(): def test_json_schema_to_search_mapper_1(): """Test the class JsonSchemaToSearchMapper.""" s = Record.model_json_schema() - print(json.dumps(s, indent=2)) + # print(json.dumps(s, indent=2)) _meta = { "aliases": [".production", "ccc"], diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index 4b3534f3..84dd5005 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -274,10 +274,14 @@ def test_table_export_to_otsl(): otsl_string = doc.tables[0].export_to_otsl( add_cell_location=False, add_cell_text=False, doc=doc ) - print_friendly = otsl_string.split("") - print("OTSL out:") + otsl_string.split("") + # print("OTSL out:") + + """ for s in print_friendly: print(s) + """ + assert ( otsl_string == "" diff --git 
a/test/test_serialization.py b/test/test_serialization.py index 3f17492e..a783c410 100644 --- a/test/test_serialization.py +++ b/test/test_serialization.py @@ -4,9 +4,12 @@ import pytest -from docling_core.experimental.idoctags import IDocTagsDocSerializer +from docling_core.experimental.idoctags import IDocTagsDocSerializer, IDocTagsParams from docling_core.transforms.serializer.common import _DEFAULT_LABELS -from docling_core.transforms.serializer.doctags import DocTagsDocSerializer +from docling_core.transforms.serializer.doctags import ( + DocTagsDocSerializer, + DocTagsParams, +) from docling_core.transforms.serializer.html import ( HTMLDocSerializer, HTMLOutputStyle, @@ -38,6 +41,16 @@ def verify(exp_file: Path, actual: str): with open(exp_file, "r", encoding="utf-8") as f: expected = f.read().rstrip() + # Normalize platform-dependent quote escaping for DocTags outputs + name = exp_file.name + if name.endswith(".dt") or name.endswith(".idt.xml"): + + def _normalize_quotes(s: str) -> str: + return s.replace(""", '"').replace(""", '"') + + expected = _normalize_quotes(expected) + actual = _normalize_quotes(actual) + assert expected == actual @@ -593,6 +606,43 @@ def test_doctags_meta(): # =============================== +def test_idoctags(): + src = Path("./test/data/doc/ddoc_0.json") + doc = DoclingDocument.load_from_json(src) + + if True: + # Human readable, indented and with content + params = IDocTagsParams() + params.add_content = True + + ser = IDocTagsDocSerializer(doc=doc, params=params) + actual = ser.serialize().text + + verify(exp_file=src.with_suffix(".v0.gt.dt"), actual=actual) + + if True: + # Human readable, indented but without content + params = IDocTagsParams() + params.add_content = False + + ser = IDocTagsDocSerializer(doc=doc, params=params) + actual = ser.serialize().text + + verify(exp_file=src.with_suffix(".v1.gt.dt"), actual=actual) + + if True: + # Machine readable, not indented and without content + params = IDocTagsParams() + 
params.pretty_indentation = "" + params.add_content = False + params.mode = DocTagsParams.Mode.MINIFIED + + ser = IDocTagsDocSerializer(doc=doc, params=params) + actual = ser.serialize().text + + verify(exp_file=src.with_suffix(".v2.gt.dt"), actual=actual) + + def test_idoctags_meta(): src = Path("./test/data/doc/dummy_doc_with_meta.yaml") doc = DoclingDocument.load_from_yaml(src)