From 6552a1e09cd8d49738ea42abff558c5766f550e1 Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Wed, 5 Feb 2025 22:15:12 +0000 Subject: [PATCH] Avoid stripping nonbreaking spaces Nonbreaking spaces should be preserved in places such as the start of a paragraph or blockquote, so change various places to strip only ASCII `' \t\r\n'`. There may be other places that should also avoid stripping nonbreaking spaces (or, conversely, where *trailing* such spaces could safely be stripped even if they no longer are after this change), but this seems a reasonable starting point to fix issues in this area. --- markdownify/__init__.py | 10 +++++----- tests/test_conversions.py | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 9e4c99f..4f9f001 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -265,7 +265,7 @@ def process_text(self, el): if (should_remove_whitespace_outside(el.previous_sibling) or (should_remove_whitespace_inside(el.parent) and not el.previous_sibling)): - text = text.lstrip() + text = text.lstrip(' \t\r\n') if (should_remove_whitespace_outside(el.next_sibling) or (should_remove_whitespace_inside(el.parent) and not el.next_sibling)): @@ -351,7 +351,7 @@ def convert_a(self, el, text, convert_as_inline): def convert_blockquote(self, el, text, convert_as_inline): # handle some early-exit scenarios - text = (text or '').strip() + text = (text or '').strip(' \t\r\n') if convert_as_inline: return ' ' + text + ' ' if not text: @@ -525,8 +525,8 @@ def _indent_for_li(match): def convert_p(self, el, text, convert_as_inline): if convert_as_inline: - return ' ' + text.strip() + ' ' - text = text.strip() + return ' ' + text.strip(' \t\r\n') + ' ' + text = text.strip(' \t\r\n') if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
<br /> tags. Newlines in the input have already been @@ -535,7 +535,7 @@ def convert_p(self, el, text, convert_as_inline): lines = text.split('\n') new_lines = [] for line in lines: - line = line.lstrip() + line = line.lstrip(' \t\r\n') line_no_trailing = line.rstrip() trailing = line[len(line_no_trailing):] line = fill(line, diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 1739cb9..e851ac2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -59,6 +59,7 @@ def test_b_spaces(): def test_blockquote(): assert md('<blockquote>Hello</blockquote>') == '\n> Hello\n\n' assert md('<blockquote>\nHello\n</blockquote>') == '\n> Hello\n\n' + assert md('<blockquote>&nbsp;Hello</blockquote>') == '\n> \u00a0Hello\n\n' def test_blockquote_with_nested_paragraph(): @@ -266,6 +267,7 @@ def test_p(): assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012  \n67890\n\n' assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' + assert md('<p>&nbsp;x y</p>', wrap=True, wrap_width=80) == '\n\n\u00a0x y\n\n' def test_pre():