Merge pull request #644 from Crozzers/fix-em-strong-issues

nicholasserra · web-flow · commit 9a88ce16a96c · 2025-10-05T20:15:14.000-04:00
Fix a number of em/strong issues (#641, #642, #643)
diff --git a/CHANGES.md b/CHANGES.md
@@ -4,6 +4,7 @@
 
 - [pull #639] Fix middle-word-em interfering with strongs (#637)
 - [pull #640] Fix code friendly extra stopping other syntax being processed (#638)
+- [pull #644] Fix a number of em/strong issues (#641, #642, #643)
 
 
 ## python-markdown2 2.5.4
diff --git a/lib/markdown2.py b/lib/markdown2.py
@@ -1988,14 +1988,48 @@ def _encode_code(self, text: str) -> str:
         self._code_table[text] = hashed
         return hashed
 
-    _strong_re = re.compile(r"(\*\*|__)(?=\S)(.+?[*_]?)(?<=\S)\1", re.S)
+    _strong_re = re.compile(r'''
+        (?:_{1,}|\*{1,})?  # ignore any leading em chars because we want to wrap `<strong>` as tightly around the text as possible
+                           # eg: `***abc***` -> `*<strong>abc</strong>*` instead of `<strong>*abc*</strong>`
+                           # Makes subsequent <em> processing easier
+        (\*\*|__)(?=\S)    # strong syntax - must be followed by a non whitespace char
+        (.+?)              # the strong text itself
+        (?<=\S)\1          # closing syntax - must be preceded by a non-whitespace char
+        ''',
+        re.S | re.X
+    )
     _em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.S)
 
     @mark_stage(Stage.ITALIC_AND_BOLD)
     def _do_italics_and_bold(self, text: str) -> str:
+        def sub(match: re.Match):
+            '''
+            Regex sub function that checks that the match doesn't cross span boundaries.
+            The match shouldn't straddle an opening or closing HTML tag, although complete
+            spans nested within the match are acceptable.
+            '''
+            contents: str = match.group(2)
+            # the strong re also checks for leading em chars, so the match may cover some additional text
+            prefix = match.string[match.start(): match.regs[1][0]]
+            # look for all possible span HTML tags
+            for tag in re.findall(rf'</?({self._span_tags})', contents):
+                # if it's unbalanced then that violates the rules
+                if not self._tag_is_closed(tag, contents):
+                    return prefix + match.group(1) + contents + match.group(1)
+
+                # if it is balanced, but the closing tag is before the opening then
+                # the text probably looks like `_</strong>abcdef<strong>_`, which is across 2 spans
+                close_index = contents.find(f'</{tag}')
+                open_index = contents.find(f'<{tag}')
+                if close_index != -1 and close_index < open_index:
+                    return prefix + match.group(1) + contents + match.group(1)
+
+            syntax = 'strong' if len(match.group(1)) == 2 else 'em'
+            return f'{prefix}<{syntax}>{contents}</{syntax}>'
+
         # <strong> must go first:
-        text = self._strong_re.sub(r"<strong>\2</strong>", text)
-        text = self._em_re.sub(r"<em>\2</em>", text)
+        text = self._strong_re.sub(sub, text)
+        text = self._em_re.sub(sub, text)
         return text
 
     _block_quote_base = r'''
@@ -3320,7 +3354,7 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
         self.middle_word_em_re = re.compile(
             r'''
             (?<!^)         # To be middle of a word, it cannot be at the start of the input
-            (?<![*_\s])    # cannot be preceeded by em character or whitespace (must be in middle of word)
+            (?<![*_\W])    # cannot be preceded by em char or non-word char (must be in middle of word)
             ([*_])         # em char
             (?=\S)         # must be followed by non-whitespace char
             (?![*_]|$|\W)  # cannot be followed by another em char, EOF or a non-word char
diff --git a/test/markdowntest-cases/Strong and em together.html b/test/markdowntest-cases/Strong and em together.html
@@ -1,7 +1,7 @@
-<p><strong><em>This is strong and em.</em></strong></p>
+<p><em><strong>This is strong and em.</strong></em></p>
 
-<p>So is <strong><em>this</em></strong> word.</p>
+<p>So is <em><strong>this</strong></em> word.</p>
 
-<p><strong><em>This is strong and em.</em></strong></p>
+<p><em><strong>This is strong and em.</strong></em></p>
 
-<p>So is <strong><em>this</em></strong> word.</p>
+<p>So is <em><strong>this</strong></em> word.</p>
diff --git a/test/tm-cases/consecutive_strong_and_em.html b/test/tm-cases/consecutive_strong_and_em.html
@@ -0,0 +1 @@
+<p><strong>strong</strong><em>em</em><strong>strong</strong></p>
diff --git a/test/tm-cases/consecutive_strong_and_em.text b/test/tm-cases/consecutive_strong_and_em.text
@@ -0,0 +1 @@
+**strong***em***strong**
diff --git a/test/tm-cases/ems_across_spans.html b/test/tm-cases/ems_across_spans.html
@@ -0,0 +1 @@
+<p><strong>_confusing</strong> ident is <strong>_confusing</strong></p>
diff --git a/test/tm-cases/ems_across_spans.text b/test/tm-cases/ems_across_spans.text
@@ -0,0 +1 @@
+**_confusing** ident is **_confusing**
diff --git a/test/tm-cases/middle_word_em_issue641.html b/test/tm-cases/middle_word_em_issue641.html
@@ -0,0 +1,3 @@
+<p><strong>Strong</strong> (<em>em</em>)</p>
+
+<p>note:<em>this is good</em>, but <em>this is not</em></p>
diff --git a/test/tm-cases/middle_word_em_issue641.opts b/test/tm-cases/middle_word_em_issue641.opts
@@ -0,0 +1 @@
+{'extras': {'middle-word-em': False}}
diff --git a/test/tm-cases/middle_word_em_issue641.text b/test/tm-cases/middle_word_em_issue641.text
@@ -0,0 +1,3 @@
+**Strong** (*em*)
+
+note:*this is good*, but *this is not*
diff --git a/test/tm-cases/middle_word_em_with_extra_ems.html b/test/tm-cases/middle_word_em_with_extra_ems.html
@@ -2,7 +2,7 @@
 
 <p><strong>one_two_three</strong></p>
 
-<p><strong><em>one_two_three</em></strong></p>
+<p><em><strong>one_two_three</strong></em></p>
 
 <p><em><strong>one_two_three</strong></em></p>
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+<p><strong>strong</strong><em>em</em><strong>strong</strong></p>`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+<p><strong>_confusing</strong> ident is <strong>_confusing</strong></p>`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+<p><strong>Strong</strong> (<em>em</em>)</p>`
	`2`	`+`
	`3`	`+<p>note:<em>this is good</em>, but <em>this is not</em></p>`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+**Strong** (*em*)`
	`2`	`+`
	`3`	`+note:*this is good*, but *this is not*`