diff --git a/packtools/sps/utils/xml_fixer.py b/packtools/sps/utils/xml_fixer.py
index 504c224c7..94cd0a9b7 100644
--- a/packtools/sps/utils/xml_fixer.py
+++ b/packtools/sps/utils/xml_fixer.py
@@ -1,147 +1,145 @@
-import logging
-from lxml import etree
-
-logger = logging.getLogger(__name__)
-
+"""
+XML Fixer utilities for completing and fixing XML structures.
-def _remove_and_get_info(xmltree, inline_graphic):
- """
- Removes inline-graphic from its current position and returns information about it.
-
- Args:
- xmltree: XML tree for XPath generation
- inline_graphic: inline-graphic element to be removed
-
- Returns:
- tuple: (old_parent, xpath) - parent element and XPath of the removed element
-
- Raises:
- ValueError: If inline-graphic has no parent
- """
- # Store information before modification
- old_parent = inline_graphic.getparent()
+This module provides utilities to fix and complete XML structures,
+particularly for SciELO Publishing Schema (SPS) compliance.
+"""
- if old_parent is None:
- raise ValueError("inline-graphic has no parent element")
-
- # Generate XPath for modification record
- try:
- xpath = xmltree.getroottree().getpath(inline_graphic)
- except (AttributeError, ValueError):
- xpath = f"./{old_parent.tag}/inline-graphic"
-
- # Remove inline-graphic from current position
- old_parent.remove(inline_graphic)
-
- return old_parent, xpath
+from lxml import etree
-def fix_inline_graphic_in_caption(xmltree):
+def complete_pub_date(xmltree, default_day=15, default_month=6):
"""
- Fixes inline-graphic elements incorrectly positioned inside caption/label.
-
- This function searches for containers (fig, table-wrap, disp-formula) that:
- - Contain inline-graphic inside label or caption
- - Do NOT have a graphic element
- - Do NOT have other child elements besides label and caption
-
- For each container found, if there is exactly one inline-graphic,
- it removes it from inside label/caption and creates a graphic element at the container level.
-
+ Completa elementos pub-date incompletos com valores padrão para day e month.
+
+ Esta função processa elementos <pub-date> no XML e adiciona elementos <month> e <day>
+ quando estes estão ausentes, mantendo a ordem correta dos elementos (year, month, day).
+
Args:
- xmltree: XML tree to be processed
-
+ xmltree: Árvore XML (lxml.etree.Element) a ser processada
+ default_day (int, optional): Dia padrão para completar (1-31). Padrão: 15
+ default_month (int, optional): Mês padrão para completar (1-12). Padrão: 6
+
Returns:
- list: List of dictionaries with the modifications performed
- """
- if xmltree is None:
- raise ValueError("xmltree cannot be None")
-
- modifications = []
-
- # XPath that searches for valid containers needing correction:
- # - Are fig, table-wrap or disp-formula
- # - Have inline-graphic inside label or caption
- # - Do NOT have a direct child graphic element
- xpath_containers = """
- (//fig | //table-wrap | //disp-formula)
- [(label//inline-graphic or caption//inline-graphic) and not(.//graphic)]
+ list: Lista de dicionários contendo as mudanças realizadas. Cada dicionário tem:
+ - xpath (str): XPath do elemento pub-date modificado
+ - element_added (str): Nome do elemento adicionado ('day' ou 'month')
+ - value (str): Valor adicionado
+
+ Raises:
+ ValueError: Se default_day não está entre 1-31 ou default_month não está entre 1-12
+
+ Examples:
+ >>> from lxml import etree
+ >>> xml = '''
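+ ... <article>
+ ... <article-meta>
+ ... <pub-date pub-type="pub">
+ ... <year>2024</year>
+ ... </pub-date>
+ ... </article-meta>
+ ... </article>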
+ ... '''
+ >>> tree = etree.fromstring(xml)
+ >>> changes = complete_pub_date(tree)
+ >>> len(changes)
+ 2
+ >>> changes[0]['element_added']
+ 'month'
+ >>> changes[1]['element_added']
+ 'day'
+
+ >>> # Após a execução, o XML terá:
+ >>> pub_date = tree.find('.//pub-date')
+ >>> pub_date.findtext('month')
+ '6'
+ >>> pub_date.findtext('day')
+ '15'
+
+ >>> # Com valores personalizados:
+ >>> tree = etree.fromstring(xml)
+ >>> changes = complete_pub_date(tree, default_day=1, default_month=1)
+ >>> pub_date = tree.find('.//pub-date')
+ >>> pub_date.findtext('month')
+ '1'
+ >>> pub_date.findtext('day')
+ '1'
+
+ >>> # Não modifica elementos já existentes:
+ >>> xml_complete = '''
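+ ... <article>
+ ... <article-meta>
+ ... <pub-date pub-type="pub">
+ ... <year>2024</year>
+ ... <month>3</month>
+ ... <day>20</day>
+ ... </pub-date>
+ ... </article-meta>
+ ... </article>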
+ ... '''
+ >>> tree = etree.fromstring(xml_complete)
+ >>> changes = complete_pub_date(tree)
+ >>> len(changes)
+ 0
"""
-
- containers = xmltree.xpath(xpath_containers)
-
- for container in containers:
- # Search for all inline-graphics inside label or caption of this container
- inline_graphics = container.xpath(
- ".//label//inline-graphic | .//caption//inline-graphic"
- )
-
- # Process only if there is exactly 1 inline-graphic
- if len(inline_graphics) != 1:
+ # Validar parâmetros
+ if not isinstance(default_day, int) or default_day < 1 or default_day > 31:
+ raise ValueError("default_day must be between 1 and 31")
+
+ if not isinstance(default_month, int) or default_month < 1 or default_month > 12:
+ raise ValueError("default_month must be between 1 and 12")
+
+ changes = []
+
+ # Buscar elementos pub-date com pub-type='pub' ou publication-format='electronic'
+ xpath_query = (
+ ".//pub-date[@pub-type='pub'] | "
+ ".//pub-date[@publication-format='electronic']"
+ )
+
+ pub_date_nodes = xmltree.xpath(xpath_query)
+
+ for pub_date_node in pub_date_nodes:
+ # Obter xpath do elemento para reportar
+ tree = pub_date_node.getroottree()
+ xpath = tree.getpath(pub_date_node)
+
+ # Verificar se year existe (necessário para processar)
+ year_elem = pub_date_node.find('year')
+ if year_elem is None:
continue
-
- # Check if the container has only label and/or caption as children
- # If there are other elements (table, mathml:math, etc.), do not process
- has_only_label_caption = True
- for child in container.getchildren():
- if child.tag not in ("label", "caption"):
- has_only_label_caption = False
- break
-
- if not has_only_label_caption:
- logger.debug(
- f"Container {container.tag} has other children besides label/caption, skipping",
- extra={'container_tag': container.tag, 'container_id': container.get('id')}
- )
- continue
-
- inline_graphic = inline_graphics[0]
-
- try:
- # Remove inline-graphic and get its information
- old_parent, xpath = _remove_and_get_info(xmltree, inline_graphic)
-
- # Change tag from inline-graphic to graphic (preserves all attributes, text, tail, and children)
- inline_graphic.tag = "graphic"
-
- # Append graphic after label and caption (container only has label/caption at this point)
- container.append(inline_graphic)
-
- # Record modification performed
- modifications.append({
- "xpath": xpath,
- "action": "moved_and_renamed",
- "old_parent": old_parent.tag if old_parent is not None else "unknown",
- "new_parent": container.tag
+
+ # Verificar e adicionar month se ausente
+ month_elem = pub_date_node.find('month')
+ if month_elem is None:
+ month_elem = etree.Element('month')
+ month_elem.text = str(default_month)
+
+ # Inserir após year
+ year_index = list(pub_date_node).index(year_elem)
+ pub_date_node.insert(year_index + 1, month_elem)
+
+ changes.append({
+ 'xpath': xpath,
+ 'element_added': 'month',
+ 'value': str(default_month)
})
-
- except AttributeError as e:
- logger.error(
- f"Error processing inline-graphic in container {container.tag}: "
- f"missing attribute - {e}",
- extra={'container_tag': container.tag}
- )
- continue
- except ValueError as e:
- logger.error(
- f"Error processing inline-graphic in container {container.tag}: "
- f"invalid value - {e}",
- extra={'container_tag': container.tag}
- )
- continue
- except (etree.Error, etree.LxmlError) as e:
- logger.error(
- f"Error processing inline-graphic in container {container.tag}: "
- f"XML structure error - {e}",
- extra={'container_tag': container.tag}
- )
- continue
- except TypeError as e:
- logger.error(
- f"Error processing inline-graphic in container {container.tag}: "
- f"type error - {e}",
- extra={'container_tag': container.tag}
- )
- continue
-
- return modifications
+
+ # Verificar e adicionar day se ausente
+ day_elem = pub_date_node.find('day')
+ if day_elem is None:
+ day_elem = etree.Element('day')
+ day_elem.text = str(default_day)
+
+ # Inserir após month
+ month_elem = pub_date_node.find('month') # Atualizar referência
+ month_index = list(pub_date_node).index(month_elem)
+ pub_date_node.insert(month_index + 1, day_elem)
+
+ changes.append({
+ 'xpath': xpath,
+ 'element_added': 'day',
+ 'value': str(default_day)
+ })
+
+ return changes
diff --git a/tests/sps/utils/test_xml_fixer.py b/tests/sps/utils/test_xml_fixer.py
index 2d2953b94..2b78b0ebb 100644
--- a/tests/sps/utils/test_xml_fixer.py
+++ b/tests/sps/utils/test_xml_fixer.py
@@ -1,411 +1,320 @@
+# coding: utf-8
import unittest
-from lxml import etree
-
-from packtools.sps.utils.xml_fixer import fix_inline_graphic_in_caption
-
-
-class XMLFixerTest(unittest.TestCase):
- """Tests for fix_inline_graphic_in_caption"""
-
- def test_fix_inline_graphic_simple_case(self):
- """Basic test: inline-graphic inside caption"""
- xml = """
-
-
-
+from packtools.sps.utils.xml_fixer import complete_pub_date
+
+
+class TestCompletePubDate(unittest.TestCase):
+ """Test suite for complete_pub_date function."""
+
+ def test_complete_pub_date_only_year(self):
+ """Test completing pub-date with only year element."""
+ xml = """
+
+
+
+ 2024
+
+
+ """
+
tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 2)
- graphics = tree.findall(".//graphic")
- self.assertEqual(len(graphics), 2)
- self.assertIsNone(tree.find(".//inline-graphic"))
-
- def test_multiple_inline_graphics_same_container_no_modification(self):
- """Test: multiple inline-graphics in SAME container - should NOT modify"""
- xml = """
-
-
Title
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- inline_graphics = tree.findall(".//inline-graphic")
- self.assertEqual(len(inline_graphics), 2)
- self.assertIsNone(tree.find(".//graphic"))
-
- def test_two_inline_graphics_in_caption(self):
- """Test: two inline-graphics inside caption - should NOT modify"""
- xml = """
-
-
- Title
-
Text
-
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- inline_graphics = tree.findall(".//inline-graphic")
- self.assertEqual(len(inline_graphics), 2)
- self.assertIsNone(tree.find(".//graphic"))
-
- def test_graphic_already_exists(self):
- """Test: should not modify when graphic already exists"""
- xml = """
-
-
Title
-
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- self.assertIsNotNone(tree.find(".//inline-graphic"))
- graphics = tree.findall(".//graphic")
- self.assertEqual(len(graphics), 1)
-
- def test_container_with_table_no_modification(self):
- """Test: container with table element should NOT be modified"""
- xml = """
-
-
Title
-
-
Data
-
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- self.assertIsNotNone(tree.find(".//inline-graphic"))
- self.assertIsNone(tree.find(".//graphic"))
-
- def test_container_with_mathml_no_modification(self):
- """Test: container with mathml element should NOT be modified"""
- xml = """
-
-
Equation
- x
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- self.assertIsNotNone(tree.find(".//inline-graphic"))
- self.assertIsNone(tree.find(".//graphic"))
-
- def test_container_with_paragraph_no_modification(self):
- """Test: container with paragraph element should NOT be modified"""
- xml = """
-
-
Title
-
Some description text
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 0)
- self.assertIsNotNone(tree.find(".//inline-graphic"))
- self.assertIsNone(tree.find(".//graphic"))
-
- def test_table_wrap_context(self):
- """Test: inline-graphic in table-wrap context (no table element)"""
- xml = """
-
-
Title
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 1)
- self.assertEqual(mods[0]["new_parent"], "table-wrap")
- self.assertIsNotNone(tree.find(".//graphic"))
- self.assertIsNone(tree.find(".//inline-graphic"))
-
- def test_disp_formula_context(self):
- """Test: inline-graphic in disp-formula context (no mathml)"""
- xml = """
-
-
Equation
- """
- tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- self.assertEqual(len(mods), 1)
- self.assertEqual(mods[0]["new_parent"], "disp-formula")
- self.assertIsNotNone(tree.find(".//graphic"))
- self.assertIsNone(tree.find(".//inline-graphic"))
-
- def test_preserve_attributes(self):
- """Test: preservation of all attributes"""
- xml = """
-
-
Title
- """
+ changes = complete_pub_date(tree)
+
+ # Should add both month and day
+ self.assertEqual(len(changes), 2)
+ self.assertEqual(changes[0]['element_added'], 'month')
+ self.assertEqual(changes[0]['value'], '6')
+ self.assertEqual(changes[1]['element_added'], 'day')
+ self.assertEqual(changes[1]['value'], '15')
+
+ # Verify XML structure
+ pub_date = tree.find('.//pub-date')
+ self.assertEqual(pub_date.findtext('year'), '2024')
+ self.assertEqual(pub_date.findtext('month'), '6')
+ self.assertEqual(pub_date.findtext('day'), '15')
+
+ # Verify order: year, month, day
+ elements = [elem.tag for elem in pub_date]
+ self.assertEqual(elements, ['year', 'month', 'day'])
+
+ def test_complete_pub_date_year_and_month(self):
+ """Test completing pub-date with year and month elements."""
+ xml = """
+
+
+
+ 2024
+ 3
+
+
+
+ """
+
tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- graphic = tree.find(".//graphic")
- self.assertIsNotNone(graphic)
- self.assertEqual(graphic.get("{http://www.w3.org/1999/xlink}href"), "img1.jpg")
- self.assertEqual(graphic.get("id"), "ig1")
- self.assertEqual(graphic.get("content-type"), "image/jpeg")
-
- def test_preserve_child_elements(self):
- """Test: preservation of child elements"""
- xml = """
-
-
- Title
-
- Alternative text
-
-
-
- """
+ changes = complete_pub_date(tree)
+
+ # Should add only day
+ self.assertEqual(len(changes), 1)
+ self.assertEqual(changes[0]['element_added'], 'day')
+ self.assertEqual(changes[0]['value'], '15')
+
+ # Verify XML structure
+ pub_date = tree.find('.//pub-date')
+ self.assertEqual(pub_date.findtext('year'), '2024')
+ self.assertEqual(pub_date.findtext('month'), '3')
+ self.assertEqual(pub_date.findtext('day'), '15')
+
+ # Verify order
+ elements = [elem.tag for elem in pub_date]
+ self.assertEqual(elements, ['year', 'month', 'day'])
+
+ def test_complete_pub_date_already_complete(self):
+ """Test that complete pub-date is not modified."""
+ xml = """
+
+
+
+ 2024
+ 3
+ 20
+
+
+
+ """
+
tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- graphic = tree.find(".//graphic")
- self.assertIsNotNone(graphic)
- alt_text = graphic.find(".//alt-text")
- self.assertIsNotNone(alt_text)
- self.assertEqual(alt_text.text, "Alternative text")
-
- def test_inline_graphic_position_after_caption(self):
- """Test: graphic is inserted after label and caption"""
- xml = """
-
-
Title
- """
+ changes = complete_pub_date(tree)
+
+ # Should not add anything
+ self.assertEqual(len(changes), 0)
+
+ # Verify XML structure unchanged
+ pub_date = tree.find('.//pub-date')
+ self.assertEqual(pub_date.findtext('year'), '2024')
+ self.assertEqual(pub_date.findtext('month'), '3')
+ self.assertEqual(pub_date.findtext('day'), '20')
+
+ def test_complete_pub_date_with_publication_format_electronic(self):
+ """Test completing pub-date with publication-format='electronic'."""
+ xml = """
+
+
+
+ 2024
+
+
+
+ """
+
tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- children = list(tree)
- self.assertEqual(children[0].tag, "label")
- self.assertEqual(children[1].tag, "caption")
- self.assertEqual(children[2].tag, "graphic")
-
- def test_position_after_label_only(self):
- """Test: graphic after label when there is no caption"""
- xml = """
-
- """
+ changes = complete_pub_date(tree)
+
+ # Should add both month and day
+ self.assertEqual(len(changes), 2)
+ self.assertEqual(changes[0]['element_added'], 'month')
+ self.assertEqual(changes[1]['element_added'], 'day')
+
+ # Verify XML structure
+ pub_date = tree.find('.//pub-date')
+ self.assertEqual(pub_date.findtext('month'), '6')
+ self.assertEqual(pub_date.findtext('day'), '15')
+
+ def test_complete_pub_date_ignores_other_pub_types(self):
+ """Test that pub-date with other pub-types are ignored."""
+ xml = """
+
+
+
+ 2024
+
+
+
+ """
+
tree = etree.fromstring(xml)
- mods = fix_inline_graphic_in_caption(tree)
-
- children = list(tree)
- self.assertEqual(children[0].tag, "label")
- self.assertEqual(children[1].tag, "graphic")
-
- def test_empty_modifications_no_inline_graphics(self):
- """Test: returns empty list when there are no inline-graphics"""
- xml = """
-
-