@@ -1988,14 +1988,48 @@ def _encode_code(self, text: str) -> str:
19881988 self ._code_table [text ] = hashed
19891989 return hashed
19901990
# Matcher for bold spans (`**text**` / `__text__`).
#   group 1 -> the delimiter that opened the span (`**` or `__`)
#   group 2 -> the text between the delimiters
# The optional, non-capturing run at the front swallows any em characters
# that precede the delimiter, so the overall match can start before group 1.
# DOTALL lets the span cross newlines; VERBOSE allows the inline comments.
_strong_re = re.compile(
    r'''
    (?:_{1,}|\*{1,})?   # ignore any leading em chars because we want to wrap `<strong>` as tightly around the text as possible
                        # eg: `***abc***` -> `*<strong>abc</strong>*` instead of `<strong>*abc*</strong>`
                        # Makes subsequent <em> processing easier
    (\*\*|__)(?=\S)     # strong syntax - must be followed by a non whitespace char
    (.+?)               # the strong text itself
    (?<=\S)\1           # closing syntax - must be preceeded by non whitespace char
    ''',
    re.VERBOSE | re.DOTALL,
)

# Matcher for italic spans (`*text*` / `_text_`).
#   group 1 -> the single-character delimiter
#   group 2 -> the text, which must end in a non-whitespace character
_em_re = re.compile(r"(\*|_)(?=\S)(.*?\S)\1", re.DOTALL)
19932002
@mark_stage(Stage.ITALIC_AND_BOLD)
def _do_italics_and_bold(self, text: str) -> str:
    """Convert `**bold**` / `__bold__` and `*em*` / `_em_` markup to HTML.

    Bold is substituted first and italics second. A candidate match is
    left untouched when wrapping it would cut across an inline HTML
    element, i.e. when the matched text contains a span tag that is not
    balanced, or whose closing tag appears before its opening tag.

    Args:
        text: The text to process.

    Returns:
        The text with accepted matches wrapped in `<strong>` / `<em>`.
    """
    # `\b` makes the alternation match whole tag names only; without it a
    # short name can match as the prefix of a longer one (`<a` in `<abbr`).
    # NOTE(review): assumes `self._span_tags` is a regex alternation of
    # plain tag names, as its interpolation into a pattern implies.
    span_tag_re = re.compile(rf'</?({self._span_tags})\b')

    def sub(match: re.Match) -> str:
        """Return the replacement for one `_strong_re` / `_em_re` match.

        Gives back the matched text unchanged when the span would straddle
        an HTML span tag boundary; tags fully nested inside are accepted.
        """
        contents: str = match.group(2)
        # `_strong_re` may consume leading em chars before the delimiter,
        # so the match can start earlier than group 1. Keep that text.
        prefix = match.string[match.start():match.start(1)]

        # Each distinct tag only needs checking once.
        for tag in set(span_tag_re.findall(contents)):
            # An unbalanced tag means the span crosses an element boundary.
            if not self._tag_is_closed(tag, contents):
                return match.group(0)

            # Balanced, but closing before opening: the text looks like
            # `_</strong>abcdef<strong>_`, which spans two elements.
            name = re.escape(tag)
            closing = re.search(rf'</{name}\b', contents)
            opening = re.search(rf'<{name}\b', contents)
            if closing and opening and closing.start() < opening.start():
                return match.group(0)

        # A two-character delimiter (`**` / `__`) is bold, one is italic.
        syntax = 'strong' if len(match.group(1)) == 2 else 'em'
        return f'{prefix}<{syntax}>{contents}</{syntax}>'

    # <strong> must go first:
    text = self._strong_re.sub(sub, text)
    text = self._em_re.sub(sub, text)
    return text
20002034
20012035 _block_quote_base = r'''
@@ -3320,7 +3354,7 @@ def __init__(self, md: Markdown, options: Union[dict, bool, None]):
33203354 self .middle_word_em_re = re .compile (
33213355 r'''
33223356 (?<!^) # To be middle of a word, it cannot be at the start of the input
3323- (?<![*_\s ]) # cannot be preceeded by em character or whitespace (must be in middle of word)
3357+ (?<![*_\W ]) # cannot be preceeded by em char or non word char (must be in middle of word)
33243358 ([*_]) # em char
33253359 (?=\S) # must be followed by non-whitespace char
33263360 (?![*_]|$|\W) # cannot be followed by another em char, EOF or a non-word char
0 commit comments