Skip to content

Commit 9a88ce1

Browse files
Merge pull request #644 from Crozzers/fix-em-strong-issues
Fix a number of em/strong issues (#641, #642, #643)
2 parents 8d50892 + 40bd17f commit 9a88ce1

11 files changed

+55
-9
lines changed

CHANGES.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
- [pull #639] Fix middle-word-em interfering with strongs (#637)
66
- [pull #640] Fix code friendly extra stopping other syntax being processed (#638)
7+
- [pull #644] Fix a number of em/strong issues (#641, #642, #643)
78

89

910
## python-markdown2 2.5.4

lib/markdown2.py

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1988,14 +1988,48 @@ def _encode_code(self, text: str) -> str:
19881988
self._code_table[text] = hashed
19891989
return hashed
19901990

1991-
_strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
1991+
_strong_re = re.compile(r'''
1992+
(?:_{1,}|\*{1,})? # ignore any leading em chars because we want to wrap `<strong>` as tightly around the text as possible
1993+
# eg: `***abc***` -> `*<strong>abc</strong>*` instead of `<strong>*abc*</strong>`
1994+
# Makes subsequent <em> processing easier
1995+
(\*\*|__)(?=\S) # strong syntax - must be followed by a non whitespace char
1996+
(.+?) # the strong text itself
1997+
(?<=\S)\1 # closing syntax - must be preceded by non whitespace char
1998+
''',
1999+
re.S | re.X
2000+
)
19922001
_em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
19932002

19942003
@mark_stage(Stage.ITALIC_AND_BOLD)
19952004
def _do_italics_and_bold(self, text: str) -> str:
2005+
def sub(match: re.Match):
2006+
'''
2007+
regex sub function that checks that the match isn't matching across spans.
2008+
The span shouldn't be across a closing or opening HTML tag, although spans within
2009+
the span are acceptable.
2010+
'''
2011+
contents: str = match.group(2)
2012+
# the strong re also checks for leading em chars, so the match may cover some additional text
2013+
prefix = match.string[match.start(): match.regs[1][0]]
2014+
# look for all possible span HTML tags
2015+
for tag in re.findall(rf'</?({self._span_tags})', contents):
2016+
# if it's unbalanced then that violates the rules
2017+
if not self._tag_is_closed(tag, contents):
2018+
return prefix + match.group(1) + contents + match.group(1)
2019+
2020+
# if it is balanced, but the closing tag is before the opening then
2021+
# the text probably looks like `_</strong>abcdef<strong>_`, which is across 2 spans
2022+
close_index = contents.find(f'</{tag}')
2023+
open_index = contents.find(f'<{tag}')
2024+
if close_index != -1 and close_index < open_index:
2025+
return prefix + match.group(1) + contents + match.group(1)
2026+
2027+
syntax = 'strong' if len(match.group(1)) == 2 else 'em'
2028+
return f'{prefix}<{syntax}>{contents}</{syntax}>'
2029+
19962030
# <strong> must go first:
1997-
text = self._strong_re.sub(r"<strong>\2</strong>", text)
1998-
text = self._em_re.sub(r"<em>\2</em>", text)
2031+
text = self._strong_re.sub(sub, text)
2032+
text = self._em_re.sub(sub, text)
19992033
return text
20002034

20012035
_block_quote_base = r'''
@@ -3320,7 +3354,7 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
33203354
self.middle_word_em_re = re.compile(
33213355
r'''
33223356
(?<!^) # To be middle of a word, it cannot be at the start of the input
3323-
(?<![*_\s]) # cannot be preceeded by em character or whitespace (must be in middle of word)
3357+
(?<![*_\W]) # cannot be preceded by em char or non word char (must be in middle of word)
33243358
([*_]) # em char
33253359
(?=\S) # must be followed by non-whitespace char
33263360
(?![*_]|$|\W) # cannot be followed by another em char, EOF or a non-word char
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
1-
<p><strong><em>This is strong and em.</em></strong></p>
1+
<p><em><strong>This is strong and em.</strong></em></p>
22

3-
<p>So is <strong><em>this</em></strong> word.</p>
3+
<p>So is <em><strong>this</strong></em> word.</p>
44

5-
<p><strong><em>This is strong and em.</em></strong></p>
5+
<p><em><strong>This is strong and em.</strong></em></p>
66

7-
<p>So is <strong><em>this</em></strong> word.</p>
7+
<p>So is <em><strong>this</strong></em> word.</p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p><strong>strong</strong><em>em</em><strong>strong</strong></p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
**strong***em***strong**
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<p><strong>_confusing</strong> ident is <strong>_confusing</strong></p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
**_confusing** ident is **_confusing**
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
<p><strong>Strong</strong> (<em>em</em>)</p>
2+
3+
<p>note:<em>this is good</em>, but <em>this is not</em></p>
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{'extras': {'middle-word-em': False}}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
**Strong** (*em*)
2+
3+
note:*this is good*, but *this is not*

0 commit comments

Comments
 (0)