Skip to content

Fix sphinx.writers.text.TextWrapper #13762

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ Bugs fixed
to improve `semantic HTML structure
<https://html.spec.whatwg.org/multipage/text-level-semantics.html>`__.
Patch by Mark Ostroth.
* #13741: text: fix an infinite loop when processing CSV tables.
Patch by Bénédikt Tran.


Testing
-------
70 changes: 48 additions & 22 deletions sphinx/writers/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,18 +292,18 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:
else:
indent = self.initial_indent

# Note that column_width(x) > len(x) is possible,
# but _handle_long_word() handles negative widths.
width = self.width - column_width(indent)

if self.drop_whitespace and not chunks[-1].strip() and lines:
del chunks[-1]

while chunks:
l = column_width(chunks[-1])

if cur_len + l <= width:
chunk_width = column_width(chunks[-1])
if cur_len + chunk_width <= width:
cur_line.append(chunks.pop())
cur_len += l

cur_len += chunk_width
else:
break

Expand All @@ -318,43 +318,69 @@ def _wrap_chunks(self, chunks: list[str]) -> list[str]:

return lines

def _break_word(self, word: str, space_left: int) -> tuple[str, str]:
"""Break line by unicode width instead of len(word)."""
@staticmethod
def _find_break_end(word: str, space_left: int) -> int:
"""Break word by unicode width instead of len(word).

The returned position 'end' satisfies::

assert column_width(word[:end]) <= space_left
assert end == len(word) or column_width(word[:end+1]) > space_left
"""
total = 0
for i, c in enumerate(word):
for end, c in enumerate(word, 1):
total += column_width(c)
if total > space_left:
return word[: i - 1], word[i - 1 :]
return word, ''
return end - 1
return len(word)

def _split(self, text: str) -> list[str]:
"""Override original method that only split by 'wordsep_re'.

This '_split' splits wide-characters into chunks by one character.
"""

def split(t: str) -> list[str]:
return super(TextWrapper, self)._split(t)
def column_width_safe(x: str) -> int:
# Handle characters that are 0-width. We should refine
# the grouping to prevent splitting a word at combining
# characters or in a group of combining characters with
# at most one non-combining character as the combining
# characters may act on the right or left character.
#
# See https://github.com/sphinx-doc/sphinx/issues/13741.
return max(1, column_width(x))

chunks: list[str] = []
for chunk in split(text):
for w, g in groupby(chunk, column_width):
if w == 1:
chunks.extend(split(''.join(g)))
for chunk in super()._split(text):
for w, g in groupby(chunk, column_width_safe):
if w <= 1:
chunks.extend(super()._split(''.join(g)))
else:
chunks.extend(list(g))
return chunks

def _handle_long_word(
self, reversed_chunks: list[str], cur_line: list[str], cur_len: int, width: int
) -> None:
"""Override original method for using self._break_word() instead of slice."""
space_left = max(width - cur_len, 1)
if self.break_long_words:
l, r = self._break_word(reversed_chunks[-1], space_left)
cur_line.append(l)
reversed_chunks[-1] = r
"""Override using self._find_break() instead of str.find()."""
# Make sure at least one character is stripped off on every pass.
#
# Do NOT use space_left = max(width - cur_len, 1) as corner cases
# with "self.drop_whitespace == False" and "self.width == 1" fail.
space_left = 1 if width < 1 else (width - cur_len)

if self.break_long_words:
# Some characters may have len(X) < space_left < column_width(X)
# so we should only wrap chunks for which len(X) > space_left.
end = space_left
chunk = reversed_chunks[-1]
if space_left > 0:
end = self._find_break_end(chunk, space_left)
if end == 0 and space_left:
# force processing at least one character
end = 1
cur_line.append(chunk[:end])
reversed_chunks[-1] = chunk[end:]
elif not cur_line:
cur_line.append(reversed_chunks.pop())

Expand Down
183 changes: 183 additions & 0 deletions tests/test_writers/test_writer_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
"""Test the LaTeX writer"""

from __future__ import annotations

import pytest
from docutils.utils import column_width

from sphinx.writers.text import TextWrapper

find_break_end = TextWrapper._find_break_end


@pytest.mark.parametrize(
# glyph of column width 0
'glyph',
['', '\N{COMBINING TILDE}'],
)
def test_text_wrapper_break_phantom_symbol(glyph: str) -> None:
assert column_width(glyph) == 0
glyph_length = len(glyph)

for n in range(1, 5):
# Since the glyph has length 0 and column width 0,
# we can always take the entire glpyh.
assert find_break_end(glyph, n) == glyph_length
for m in range(1, 5):
# The multiplied glyph may have non-zero length
# but its column width will always be 0, so we
# take the entire glyph again.
assert find_break_end(m * glyph, n) == m * glyph_length


@pytest.mark.parametrize(
('text', 'colwidth'),
[
# Glyph of length 1 and column width 1
('X', 1),
# Glyph of length 1 and column width 2
('\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
# Glyph of length 2 and column width 1
('\N{COMBINING TILDE}X', 1),
# Glyph of length 2 and column width 2
('\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}', 2),
# Glyph of length 3 and column width 1
('\N{COMBINING TILDE}\N{COMBINING BREVE}X', 1),
],
)
def test_text_wrapper_break_visible_symbol(text: str, colwidth: int) -> None:
assert column_width(text) == colwidth
for n in range(1, 5):
end = find_break_end(text, n)
assert column_width(text[:end]) <= n
for m in range(2, 5):
m_text = m * text
end = find_break_end(m_text, n)
assert column_width(m_text[:end]) <= n
assert end == m * len(text) or column_width(m_text[: end + 1]) > n


def test_text_wrapper_break_stop_after_combining_symbols() -> None:
tilde = '\N{COMBINING TILDE}'
multi = '\N{CJK UNIFIED IDEOGRAPH-65E5}'

head = tilde + tilde + '....'
tail = multi + tilde + tilde
text = head + tail
assert find_break_end(head + tail, column_width(head)) == len(head)


@pytest.mark.parametrize(
('text', 'results'),
[
('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
(
'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
{
1: list('Helloa\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
2: ['He', 'll', 'o', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
3: ['Hel', 'lo', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
},
),
(
'ab c\N{COMBINING TILDE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}d', 'ef'],
3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
},
),
(
'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
{
1: [
'a',
'b',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'd',
'e',
'f',
],
2: [
'ab',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'de',
'f',
],
3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
},
),
(
'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
},
),
],
)
def test_text_wrapper(text: str, results: dict[int, list[str]]) -> None:
for width, expected in results.items():
w = TextWrapper(width=width, drop_whitespace=True)
assert w.wrap(text) == expected


@pytest.mark.parametrize(
('text', 'results'),
[
('Hello', {1: list('Hello'), 2: ['He', 'll', 'o']}),
(
'Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!',
{
1: list('Hello a\N{CJK UNIFIED IDEOGRAPH-65E5}ab!'),
2: ['He', 'll', 'o ', 'a', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab', '!'],
3: ['Hel', 'lo ', 'a\N{CJK UNIFIED IDEOGRAPH-65E5}', 'ab!'],
},
),
(
'ab c\N{COMBINING TILDE}def',
{
1: ['a', 'b', ' ', 'c\N{COMBINING TILDE}', 'd', 'e', 'f'],
2: ['ab', ' c\N{COMBINING TILDE}', 'de', 'f'],
3: ['ab ', 'c\N{COMBINING TILDE}de', 'f'],
},
),
(
'abc\N{COMBINING TILDE}\N{CJK UNIFIED IDEOGRAPH-65E5}def',
{
1: [
'a',
'b',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'd',
'e',
'f',
],
2: [
'ab',
'c\N{COMBINING TILDE}',
'\N{CJK UNIFIED IDEOGRAPH-65E5}',
'de',
'f',
],
3: ['abc\N{COMBINING TILDE}', '\N{CJK UNIFIED IDEOGRAPH-65E5}', 'def'],
},
),
(
'abc\N{COMBINING TILDE}\N{COMBINING BREVE}def',
{
1: ['a', 'b', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}', 'd', 'e', 'f'],
2: ['ab', 'c\N{COMBINING TILDE}\N{COMBINING BREVE}d', 'ef'],
3: ['abc\N{COMBINING TILDE}\N{COMBINING BREVE}', 'def'],
},
),
],
)
def test_text_wrapper_drop_ws(text: str, results: dict[int, list[str]]) -> None:
for width, expected in results.items():
w = TextWrapper(width=width, drop_whitespace=False)
assert w.wrap(text) == expected
Loading