From 6dc3dfadbbfd0a8a7fde63b80905e7a97c118183 Mon Sep 17 00:00:00 2001 From: Shinon Date: Mon, 17 Nov 2025 12:19:12 +0800 Subject: [PATCH 1/8] ruff format --- markdownify/__init__.py | 567 +++++++++++++++++++++------------------ markdownify/__init__.pyi | 17 +- markdownify/main.py | 197 +++++++++----- 3 files changed, 445 insertions(+), 336 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 148d340..7df448b 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -5,77 +5,77 @@ # General-purpose regex patterns -re_convert_heading = re.compile(r'convert_h(\d+)') -re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE) -re_whitespace = re.compile(r'[\t ]+') -re_all_whitespace = re.compile(r'[\t \r\n]+') -re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') -re_html_heading = re.compile(r'h(\d+)') -re_pre_lstrip1 = re.compile(r'^ *\n') -re_pre_rstrip1 = re.compile(r'\n *$') -re_pre_lstrip = re.compile(r'^[ \n]*\n') -re_pre_rstrip = re.compile(r'[ \n]*$') +re_convert_heading = re.compile(r"convert_h(\d+)") +re_line_with_content = re.compile(r"^(.*)", flags=re.MULTILINE) +re_whitespace = re.compile(r"[\t ]+") +re_all_whitespace = re.compile(r"[\t \r\n]+") +re_newline_whitespace = re.compile(r"[\t \r\n]*[\r\n][\t \r\n]*") +re_html_heading = re.compile(r"h(\d+)") +re_pre_lstrip1 = re.compile(r"^ *\n") +re_pre_rstrip1 = re.compile(r"\n *$") +re_pre_lstrip = re.compile(r"^[ \n]*\n") +re_pre_rstrip = re.compile(r"[ \n]*$") # Pattern for creating convert_ function names from tag names -re_make_convert_fn_name = re.compile(r'[\[\]:-]') +re_make_convert_fn_name = re.compile(r"[\[\]:-]") # Extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) -re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) +re_extract_newlines = re.compile(r"^(\n*)((?:.*[^\n])?)(\n*)$", flags=re.DOTALL) # Escape 
miscellaneous special Markdown characters -re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])') +re_escape_misc_chars = re.compile(r"([]\\&<`[>~=+|])") # Escape sequence of one or more consecutive '-', preceded # and followed by whitespace or start/end of fragment, as it # might be confused with an underline of a header, or with a # list marker -re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))') +re_escape_misc_dash_sequences = re.compile(r"(\s|^)(-+(?:\s|$))") # Escape sequence of up to six consecutive '#', preceded # and followed by whitespace or start/end of fragment, as # it might be confused with an ATX heading -re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))') +re_escape_misc_hashes = re.compile(r"(\s|^)(#{1,6}(?:\s|$))") # Escape '.' or ')' preceded by up to nine digits, as it might be # confused with a list item -re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))') +re_escape_misc_list_items = re.compile(r"((?:\s|^)[0-9]{1,9})([.)](?:\s|$))") # Find consecutive backtick sequences in a string -re_backtick_runs = re.compile(r'`+') +re_backtick_runs = re.compile(r"`+") # Heading styles -ATX = 'atx' -ATX_CLOSED = 'atx_closed' -UNDERLINED = 'underlined' +ATX = "atx" +ATX_CLOSED = "atx_closed" +UNDERLINED = "underlined" SETEXT = UNDERLINED # Newline style -SPACES = 'spaces' -BACKSLASH = 'backslash' +SPACES = "spaces" +BACKSLASH = "backslash" # Strong and emphasis style -ASTERISK = '*' -UNDERSCORE = '_' +ASTERISK = "*" +UNDERSCORE = "_" # Document/pre strip styles -LSTRIP = 'lstrip' -RSTRIP = 'rstrip' -STRIP = 'strip' -STRIP_ONE = 'strip_one' +LSTRIP = "lstrip" +RSTRIP = "rstrip" +STRIP = "strip" +STRIP_ONE = "strip_one" def strip1_pre(text): """Strip one leading and trailing newline from a
 string."""
-    text = re_pre_lstrip1.sub('', text)
-    text = re_pre_rstrip1.sub('', text)
+    text = re_pre_lstrip1.sub("", text)
+    text = re_pre_rstrip1.sub("", text)
     return text
 
 
 def strip_pre(text):
     """Strip all leading and trailing newlines from a <pre> string."""
-    text = re_pre_lstrip.sub('', text)
-    text = re_pre_rstrip.sub('', text)
+    text = re_pre_lstrip.sub("", text)
+    text = re_pre_rstrip.sub("", text)
     return text
 
 
@@ -86,8 +86,8 @@ def chomp(text):
     This function is used to prevent conversions like
          foo => ** foo**
     """
-    prefix = ' ' if text and text[0] == ' ' else ''
-    suffix = ' ' if text and text[-1] == ' ' else ''
+    prefix = " " if text and text[0] == " " else ""
+    suffix = " " if text and text[-1] == " " else ""
     text = text.strip()
     return (prefix, suffix, text)
 
@@ -100,23 +100,25 @@ def abstract_inline_conversion(markup_fn):
     the text if it looks like an HTML tag. markup_fn is necessary to allow for
     references to self.strong_em_symbol etc.
     """
+
     def implementation(self, el, text, parent_tags):
         markup_prefix = markup_fn(self)
-        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
-            markup_suffix = '"):
+            markup_suffix = "), ignore adjacent whitespace elements.
                     return True
-                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
+                elif should_remove_whitespace_outside(
+                    el.previous_sibling
+                ) or should_remove_whitespace_outside(el.next_sibling):
                     # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                     return True
                 else:
@@ -263,7 +284,7 @@ def _can_ignore(el):
             elif el is None:
                 return True
             else:
-                raise ValueError('Unexpected element type: %s' % type(el))
+                raise ValueError("Unexpected element type: %s" % type(el))
 
         children_to_convert = [el for el in node.children if not _can_ignore(el)]
 
@@ -275,13 +296,13 @@ def _can_ignore(el):
         # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
         if (
             re_html_heading.match(node.name) is not None  # headings
-            or node.name in {'td', 'th'}  # table cells
+            or node.name in {"td", "th"}  # table cells
         ):
-            parent_tags_for_children.add('_inline')
+            parent_tags_for_children.add("_inline")
 
         # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
-        if node.name in {'pre', 'code', 'kbd', 'samp'}:
-            parent_tags_for_children.add('_noformat')
+        if node.name in {"pre", "code", "kbd", "samp"}:
+            parent_tags_for_children.add("_noformat")
 
         # Convert the children elements into a list of result strings.
         child_strings = [
@@ -293,22 +314,26 @@ def _can_ignore(el):
         child_strings = [s for s in child_strings if s]
 
         # Collapse newlines at child element boundaries, if needed.
-        if node.name == 'pre' or node.find_parent('pre'):
+        if node.name == "pre" or node.find_parent("pre"):
             # Inside <pre> blocks, do not collapse newlines.
             pass
         else:
             # Collapse newlines at child element boundaries.
-            updated_child_strings = ['']  # so the first lookback works
+            updated_child_strings = [""]  # so the first lookback works
             for child_string in child_strings:
                 # Separate the leading/trailing newlines from the content.
-                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()
+                leading_nl, content, trailing_nl = re_extract_newlines.match(
+                    child_string
+                ).groups()
 
                 # If the last child had trailing newlines and this child has leading newlines,
                 # use the larger newline count, limited to 2.
                 if updated_child_strings[-1] and leading_nl:
-                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
+                    prev_trailing_nl = (
+                        updated_child_strings.pop()
+                    )  # will be replaced by the collapsed value
                     num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
-                    leading_nl = '\n' * num_newlines
+                    leading_nl = "\n" * num_newlines
 
                 # Add the results to the updated child string list.
                 updated_child_strings.extend([leading_nl, content, trailing_nl])
@@ -316,7 +341,7 @@ def _can_ignore(el):
             child_strings = updated_child_strings
 
         # Join all child text strings into a single string.
-        text = ''.join(child_strings)
+        text = "".join(child_strings)
 
         # apply this tag's final conversion function
         convert_fn = self.get_conv_fn_cached(node.name)
@@ -327,16 +352,18 @@ def _can_ignore(el):
 
     def convert__document_(self, el, text, parent_tags):
         """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
-        if self.options['strip_document'] == LSTRIP:
-            text = text.lstrip('\n')  # remove leading separation newlines
-        elif self.options['strip_document'] == RSTRIP:
-            text = text.rstrip('\n')  # remove trailing separation newlines
-        elif self.options['strip_document'] == STRIP:
-            text = text.strip('\n')  # remove leading and trailing separation newlines
-        elif self.options['strip_document'] is None:
+        if self.options["strip_document"] == LSTRIP:
+            text = text.lstrip("\n")  # remove leading separation newlines
+        elif self.options["strip_document"] == RSTRIP:
+            text = text.rstrip("\n")  # remove trailing separation newlines
+        elif self.options["strip_document"] == STRIP:
+            text = text.strip("\n")  # remove leading and trailing separation newlines
+        elif self.options["strip_document"] is None:
             pass  # leave leading and trailing separation newlines as-is
         else:
-            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+            raise ValueError(
+                "Invalid value for strip_document: %s" % self.options["strip_document"]
+            )
 
         return text
 
@@ -345,30 +372,30 @@ def process_text(self, el, parent_tags=None):
         if parent_tags is None:
             parent_tags = set()
 
-        text = six.text_type(el) or ''
+        text = six.text_type(el) or ""
 
         # normalize whitespace if we're not inside a preformatted element
-        if 'pre' not in parent_tags:
-            if self.options['wrap']:
-                text = re_all_whitespace.sub(' ', text)
+        if "pre" not in parent_tags:
+            if self.options["wrap"]:
+                text = re_all_whitespace.sub(" ", text)
             else:
-                text = re_newline_whitespace.sub('\n', text)
-                text = re_whitespace.sub(' ', text)
+                text = re_newline_whitespace.sub("\n", text)
+                text = re_whitespace.sub(" ", text)
 
         # escape special characters if we're not inside a preformatted or code element
-        if '_noformat' not in parent_tags:
+        if "_noformat" not in parent_tags:
             text = self.escape(text, parent_tags)
 
         # remove leading whitespace at the start or just after a
         # block-level element; remove traliing whitespace at the end
         # or just before a block-level element.
-        if (should_remove_whitespace_outside(el.previous_sibling)
-                or (should_remove_whitespace_inside(el.parent)
-                    and not el.previous_sibling)):
-            text = text.lstrip(' \t\r\n')
-        if (should_remove_whitespace_outside(el.next_sibling)
-                or (should_remove_whitespace_inside(el.parent)
-                    and not el.next_sibling)):
+        if should_remove_whitespace_outside(el.previous_sibling) or (
+            should_remove_whitespace_inside(el.parent) and not el.previous_sibling
+        ):
+            text = text.lstrip(" \t\r\n")
+        if should_remove_whitespace_outside(el.next_sibling) or (
+            should_remove_whitespace_inside(el.parent) and not el.next_sibling
+        ):
             text = text.rstrip()
 
         return text
@@ -400,15 +427,17 @@ def get_conv_fn(self, tag_name):
         match = re_html_heading.match(tag_name)
         if match:
             n = int(match.group(1))  # get value of N from <hN>
-            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
+            return lambda el, text, parent_tags: self.convert_hN(
+                n, el, text, parent_tags
+            )
 
         # No conversion function was found
         return None
 
     def should_convert_tag(self, tag):
         """Given a tag name, return whether to convert based on strip/convert options."""
-        strip = self.options['strip']
-        convert = self.options['convert']
+        strip = self.options["strip"]
+        convert = self.options["convert"]
         if strip is not None:
             return tag not in strip
         elif convert is not None:
@@ -418,123 +447,137 @@ def should_convert_tag(self, tag):
 
     def escape(self, text, parent_tags):
         if not text:
-            return ''
-        if self.options['escape_misc']:
-            text = re_escape_misc_chars.sub(r'\\\1', text)
-            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
-            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
-            text = re_escape_misc_list_items.sub(r'\1\\\2', text)
-
-        if self.options['escape_asterisks']:
-            text = text.replace('*', r'\*')
-        if self.options['escape_underscores']:
-            text = text.replace('_', r'\_')
+            return ""
+        if self.options["escape_misc"]:
+            text = re_escape_misc_chars.sub(r"\\\1", text)
+            text = re_escape_misc_dash_sequences.sub(r"\1\\\2", text)
+            text = re_escape_misc_hashes.sub(r"\1\\\2", text)
+            text = re_escape_misc_list_items.sub(r"\1\\\2", text)
+
+        if self.options["escape_asterisks"]:
+            text = text.replace("*", r"\*")
+        if self.options["escape_underscores"]:
+            text = text.replace("_", r"\_")
         return text
 
     def underline(self, text, pad_char):
-        text = (text or '').rstrip()
-        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+        text = (text or "").rstrip()
+        return "\n\n%s\n%s\n\n" % (text, pad_char * len(text)) if text else ""
 
     def convert_a(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+        if "_noformat" in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
         if not text:
-            return ''
-        href = el.get('href')
-        title = el.get('title')
+            return ""
+        href = el.get("href")
+        title = el.get("title")
         # For the replacement see #29: text nodes underscores are escaped
-        if (self.options['autolinks']
-                and text.replace(r'\_', '_') == href
-                and not title
-                and not self.options['default_title']):
+        if (
+            self.options["autolinks"]
+            and text.replace(r"\_", "_") == href
+            and not title
+            and not self.options["default_title"]
+        ):
             # Shortcut syntax
-            return '<%s>' % href
-        if self.options['default_title'] and not title:
+            return "<%s>" % href
+        if self.options["default_title"] and not title:
             title = href
-        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        return (
+            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
+            if href
+            else text
+        )
 
-    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
+    convert_b = abstract_inline_conversion(
+        lambda self: 2 * self.options["strong_em_symbol"]
+    )
 
     def convert_blockquote(self, el, text, parent_tags):
         # handle some early-exit scenarios
-        text = (text or '').strip(' \t\r\n')
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+        text = (text or "").strip(" \t\r\n")
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
             return "\n"
 
         # indent lines with blockquote marker
         def _indent_for_blockquote(match):
             line_content = match.group(1)
-            return '> ' + line_content if line_content else '>'
+            return "> " + line_content if line_content else ">"
+
         text = re_line_with_content.sub(_indent_for_blockquote, text)
 
-        return '\n' + text + '\n\n'
+        return "\n" + text + "\n\n"
 
     def convert_br(self, el, text, parent_tags):
-        if '_inline' in parent_tags:
-            return ' '
+        if "_inline" in parent_tags:
+            return " "
 
-        if self.options['newline_style'].lower() == BACKSLASH:
-            return '\\\n'
+        if self.options["newline_style"].lower() == BACKSLASH:
+            return "\\\n"
         else:
-            return '  \n'
+            return "  \n"
 
     def convert_code(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+        if "_noformat" in parent_tags:
             return text
 
         prefix, suffix, text = chomp(text)
         if not text:
-            return ''
+            return ""
 
         # Find the maximum number of consecutive backticks in the text, then
         # delimit the code span with one more backtick than that
-        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
-        markup_delimiter = '`' * (max_backticks + 1)
+        max_backticks = max(
+            (len(match) for match in re.findall(re_backtick_runs, text)), default=0
+        )
+        markup_delimiter = "`" * (max_backticks + 1)
 
         # If the maximum number of backticks is greater than zero, add a space
         # to avoid interpretation of inside backticks as literals
         if max_backticks > 0:
             text = " " + text + " "
 
-        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
+        return "%s%s%s%s%s" % (prefix, markup_delimiter, text, markup_delimiter, suffix)
 
-    convert_del = abstract_inline_conversion(lambda self: '~~')
+    convert_del = abstract_inline_conversion(lambda self: "~~")
 
     def convert_div(self, el, text, parent_tags):
-        if '_inline' in parent_tags:
-            return ' ' + text.strip() + ' '
+        if "_inline" in parent_tags:
+            return " " + text.strip() + " "
         text = text.strip()
-        return '\n\n%s\n\n' % text if text else ''
+        return "\n\n%s\n\n" % text if text else ""
 
     convert_article = convert_div
 
     convert_section = convert_div
 
-    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
+    convert_em = abstract_inline_conversion(
+        lambda self: self.options["strong_em_symbol"]
+    )
 
     convert_kbd = convert_code
 
     def convert_dd(self, el, text, parent_tags):
-        text = (text or '').strip()
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+        text = (text or "").strip()
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
-            return '\n'
+            return "\n"
 
         # indent definition content lines by four spaces
         def _indent_for_dd(match):
             line_content = match.group(1)
-            return '    ' + line_content if line_content else ''
+            return "    " + line_content if line_content else ""
+
         text = re_line_with_content.sub(_indent_for_dd, text)
 
         # insert definition marker into first-line indent whitespace
-        text = ':' + text[1:]
+        text = ":" + text[1:]
 
-        return '%s\n' % text
+        return "%s\n" % text
 
     # definition lists are formatted as follows:
     #   https://pandoc.org/MANUAL.html#definition-lists
@@ -543,175 +586,183 @@ def _indent_for_dd(match):
 
     def convert_dt(self, el, text, parent_tags):
         # remove newlines from term text
-        text = (text or '').strip()
-        text = re_all_whitespace.sub(' ', text)
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+        text = (text or "").strip()
+        text = re_all_whitespace.sub(" ", text)
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
-            return '\n'
+            return "\n"
 
         # TODO - format consecutive 
elements as directly adjacent lines): # https://michelf.ca/projects/php-markdown/extra/#def-list - return '\n\n%s\n' % text + return "\n\n%s\n" % text def convert_hN(self, n, el, text, parent_tags): # convert_hN() converts tags, where N is any integer - if '_inline' in parent_tags: + if "_inline" in parent_tags: return text # Markdown does not support heading depths of n > 6 n = max(1, min(6, n)) - style = self.options['heading_style'].lower() + style = self.options["heading_style"].lower() text = text.strip() if style == UNDERLINED and n <= 2: - line = '=' if n == 1 else '-' + line = "=" if n == 1 else "-" return self.underline(text, line) - text = re_all_whitespace.sub(' ', text) - hashes = '#' * n + text = re_all_whitespace.sub(" ", text) + hashes = "#" * n if style == ATX_CLOSED: - return '\n\n%s %s %s\n\n' % (hashes, text, hashes) - return '\n\n%s %s\n\n' % (hashes, text) + return "\n\n%s %s %s\n\n" % (hashes, text, hashes) + return "\n\n%s %s\n\n" % (hashes, text) def convert_hr(self, el, text, parent_tags): - return '\n\n---\n\n' + return "\n\n---\n\n" convert_i = convert_em def convert_img(self, el, text, parent_tags): - alt = el.attrs.get('alt', None) or '' - src = el.attrs.get('src', None) or '' - title = el.attrs.get('title', None) or '' - title_part = ' "%s"' % title.replace('"', r'\"') if title else '' - if ('_inline' in parent_tags - and el.parent.name not in self.options['keep_inline_images_in']): + alt = el.attrs.get("alt", None) or "" + src = el.attrs.get("src", None) or "" + title = el.attrs.get("title", None) or "" + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + if ( + "_inline" in parent_tags + and el.parent.name not in self.options["keep_inline_images_in"] + ): return alt - return '![%s](%s%s)' % (alt, src, title_part) + return "![%s](%s%s)" % (alt, src, title_part) def convert_video(self, el, text, parent_tags): - if ('_inline' in parent_tags - and el.parent.name not in self.options['keep_inline_images_in']): + if ( + 
"_inline" in parent_tags + and el.parent.name not in self.options["keep_inline_images_in"] + ): return text - src = el.attrs.get('src', None) or '' + src = el.attrs.get("src", None) or "" if not src: - sources = el.find_all('source', attrs={'src': True}) + sources = el.find_all("source", attrs={"src": True}) if sources: - src = sources[0].attrs.get('src', None) or '' - poster = el.attrs.get('poster', None) or '' + src = sources[0].attrs.get("src", None) or "" + poster = el.attrs.get("poster", None) or "" if src and poster: - return '[![%s](%s)](%s)' % (text, poster, src) + return "[![%s](%s)](%s)" % (text, poster, src) if src: - return '[%s](%s)' % (text, src) + return "[%s](%s)" % (text, src) if poster: - return '![%s](%s)' % (text, poster) + return "![%s](%s)" % (text, poster) return text def convert_list(self, el, text, parent_tags): - # Converting a list to inline is undefined. # Ignoring inline conversion parents for list. before_paragraph = False next_sibling = _next_block_content_sibling(el) - if next_sibling and next_sibling.name not in ['ul', 'ol']: + if next_sibling and next_sibling.name not in ["ul", "ol"]: before_paragraph = True - if 'li' in parent_tags: + if "li" in parent_tags: # remove trailing newline if we're in a nested list - return '\n' + text.rstrip() - return '\n\n' + text + ('\n' if before_paragraph else '') + return "\n" + text.rstrip() + return "\n\n" + text + ("\n" if before_paragraph else "") convert_ul = convert_list convert_ol = convert_list def convert_li(self, el, text, parent_tags): # handle some early-exit scenarios - text = (text or '').strip() + text = (text or "").strip() if not text: return "\n" # determine list item bullet character to use parent = el.parent - if parent is not None and parent.name == 'ol': + if parent is not None and parent.name == "ol": if parent.get("start") and str(parent.get("start")).isnumeric(): start = int(parent.get("start")) else: start = 1 - bullet = '%s.' 
% (start + len(el.find_previous_siblings('li'))) + bullet = "%s." % (start + len(el.find_previous_siblings("li"))) else: depth = -1 while el: - if el.name == 'ul': + if el.name == "ul": depth += 1 el = el.parent - bullets = self.options['bullets'] + bullets = self.options["bullets"] bullet = bullets[depth % len(bullets)] - bullet = bullet + ' ' + bullet = bullet + " " bullet_width = len(bullet) - bullet_indent = ' ' * bullet_width + bullet_indent = " " * bullet_width # indent content lines by bullet width def _indent_for_li(match): line_content = match.group(1) - return bullet_indent + line_content if line_content else '' + return bullet_indent + line_content if line_content else "" + text = re_line_with_content.sub(_indent_for_li, text) # insert bullet into first-line indent whitespace text = bullet + text[bullet_width:] - return '%s\n' % text + return "%s\n" % text def convert_p(self, el, text, parent_tags): - if '_inline' in parent_tags: - return ' ' + text.strip(' \t\r\n') + ' ' - text = text.strip(' \t\r\n') - if self.options['wrap']: + if "_inline" in parent_tags: + return " " + text.strip(" \t\r\n") + " " + text = text.strip(" \t\r\n") + if self.options["wrap"]: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been # replaced by spaces. - if self.options['wrap_width'] is not None: - lines = text.split('\n') + if self.options["wrap_width"] is not None: + lines = text.split("\n") new_lines = [] for line in lines: - line = line.lstrip(' \t\r\n') + line = line.lstrip(" \t\r\n") line_no_trailing = line.rstrip() - trailing = line[len(line_no_trailing):] - line = fill(line, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) + trailing = line[len(line_no_trailing) :] + line = fill( + line, + width=self.options["wrap_width"], + break_long_words=False, + break_on_hyphens=False, + ) new_lines.append(line + trailing) - text = '\n'.join(new_lines) - return '\n\n%s\n\n' % text if text else '' + text = "\n".join(new_lines) + return "\n\n%s\n\n" % text if text else "" def convert_pre(self, el, text, parent_tags): if not text: - return '' - code_language = self.options['code_language'] + return "" + code_language = self.options["code_language"] - if self.options['code_language_callback']: - code_language = self.options['code_language_callback'](el) or code_language + if self.options["code_language_callback"]: + code_language = self.options["code_language_callback"](el) or code_language - if self.options['strip_pre'] == STRIP: + if self.options["strip_pre"] == STRIP: text = strip_pre(text) # remove all leading/trailing newlines - elif self.options['strip_pre'] == STRIP_ONE: + elif self.options["strip_pre"] == STRIP_ONE: text = strip1_pre(text) # remove one leading/trailing newline - elif self.options['strip_pre'] is None: + elif self.options["strip_pre"] is None: pass # leave leading and trailing newlines as-is else: - raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre']) + raise ValueError( + "Invalid value for strip_pre: %s" % self.options["strip_pre"] + ) - return '\n\n```%s\n%s\n```\n\n' % (code_language, text) + return "\n\n```%s\n%s\n```\n\n" % (code_language, text) def 
convert_q(self, el, text, parent_tags): return '"' + text + '"' def convert_script(self, el, text, parent_tags): - return '' + return "" def convert_style(self, el, text, parent_tags): - return '' + return "" convert_s = convert_del @@ -719,75 +770,75 @@ def convert_style(self, el, text, parent_tags): convert_samp = convert_code - convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol']) + convert_sub = abstract_inline_conversion(lambda self: self.options["sub_symbol"]) - convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) + convert_sup = abstract_inline_conversion(lambda self: self.options["sup_symbol"]) def convert_table(self, el, text, parent_tags): - return '\n\n' + text.strip() + '\n\n' + return "\n\n" + text.strip() + "\n\n" def convert_caption(self, el, text, parent_tags): - return text.strip() + '\n\n' + return text.strip() + "\n\n" def convert_figcaption(self, el, text, parent_tags): - return '\n\n' + text.strip() + '\n\n' + return "\n\n" + text.strip() + "\n\n" def convert_td(self, el, text, parent_tags): colspan = 1 - if 'colspan' in el.attrs and el['colspan'].isdigit(): - colspan = max(1, min(1000, int(el['colspan']))) - return ' ' + text.strip().replace("\n", " ") + ' |' * colspan + if "colspan" in el.attrs and el["colspan"].isdigit(): + colspan = max(1, min(1000, int(el["colspan"]))) + return " " + text.strip().replace("\n", " ") + " |" * colspan def convert_th(self, el, text, parent_tags): colspan = 1 - if 'colspan' in el.attrs and el['colspan'].isdigit(): - colspan = max(1, min(1000, int(el['colspan']))) - return ' ' + text.strip().replace("\n", " ") + ' |' * colspan + if "colspan" in el.attrs and el["colspan"].isdigit(): + colspan = max(1, min(1000, int(el["colspan"]))) + return " " + text.strip().replace("\n", " ") + " |" * colspan def convert_tr(self, el, text, parent_tags): - cells = el.find_all(['td', 'th']) + cells = el.find_all(["td", "th"]) is_first_row = el.find_previous_sibling() is 
None - is_headrow = ( - all([cell.name == 'th' for cell in cells]) - or (el.parent.name == 'thead' - # avoid multiple tr in thead - and len(el.parent.find_all('tr')) == 1) + is_headrow = all([cell.name == "th" for cell in cells]) or ( + el.parent.name == "thead" + # avoid multiple tr in thead + and len(el.parent.find_all("tr")) == 1 ) - is_head_row_missing = ( - (is_first_row and not el.parent.name == 'tbody') - or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) + is_head_row_missing = (is_first_row and not el.parent.name == "tbody") or ( + is_first_row + and el.parent.name == "tbody" + and len(el.parent.parent.find_all(["thead"])) < 1 ) - overline = '' - underline = '' + overline = "" + underline = "" full_colspan = 0 for cell in cells: - if 'colspan' in cell.attrs and cell['colspan'].isdigit(): - full_colspan += max(1, min(1000, int(cell['colspan']))) + if "colspan" in cell.attrs and cell["colspan"].isdigit(): + full_colspan += max(1, min(1000, int(cell["colspan"]))) else: full_colspan += 1 - if ((is_headrow - or (is_head_row_missing - and self.options['table_infer_header'])) - and is_first_row): + if ( + is_headrow or (is_head_row_missing and self.options["table_infer_header"]) + ) and is_first_row: # first row and: # - is headline or # - headline is missing and header inference is enabled # print headline underline - underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - elif ((is_head_row_missing - and not self.options['table_infer_header']) - or (is_first_row - and (el.parent.name == 'table' - or (el.parent.name == 'tbody' - and not el.parent.find_previous_sibling())))): + underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n" + elif (is_head_row_missing and not self.options["table_infer_header"]) or ( + is_first_row + and ( + el.parent.name == "table" + or (el.parent.name == "tbody" and not el.parent.find_previous_sibling()) + ) + ): # headline is missing and header inference is 
disabled or: # first row, not headline, and: # - the parent is table or # - the parent is tbody at the beginning of a table. # print empty headline above this row - overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n' - overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - return overline + '|' + text + '\n' + underline + overline += "| " + " | ".join([""] * full_colspan) + " |" + "\n" + overline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n" + return overline + "|" + text + "\n" + underline def markdownify(html, **options): diff --git a/markdownify/__init__.pyi b/markdownify/__init__.pyi index 5f9b852..ccb587f 100644 --- a/markdownify/__init__.pyi +++ b/markdownify/__init__.pyi @@ -14,7 +14,6 @@ RSTRIP: str STRIP: str STRIP_ONE: str - def markdownify( html: str, autolinks: bool = ..., @@ -41,7 +40,6 @@ def markdownify( wrap_width: int = ..., ) -> str: ... - class MarkdownConverter: def __init__( self, @@ -49,7 +47,9 @@ class MarkdownConverter: bs4_options: str = ..., bullets: str = ..., code_language: str = ..., - code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ..., + code_language_callback: Union[ + Callable[[Incomplete], Union[str, None]], None + ] = ..., convert: Union[list[str], None] = ..., default_title: bool = ..., escape_asterisks: bool = ..., @@ -67,11 +67,6 @@ class MarkdownConverter: table_infer_header: bool = ..., wrap: bool = ..., wrap_width: int = ..., - ) -> None: - ... - - def convert(self, html: str) -> str: - ... - - def convert_soup(self, soup: Incomplete) -> str: - ... + ) -> None: ... + def convert(self, html: str) -> str: ... + def convert_soup(self, soup: Incomplete) -> str: ... 
diff --git a/markdownify/main.py b/markdownify/main.py index ba70671..6c32f28 100755 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -3,82 +3,145 @@ import argparse import sys -from markdownify import markdownify, ATX, ATX_CLOSED, UNDERLINED, \ - SPACES, BACKSLASH, ASTERISK, UNDERSCORE +from markdownify import ( + markdownify, + ATX, + ATX_CLOSED, + UNDERLINED, + SPACES, + BACKSLASH, + ASTERISK, + UNDERSCORE, +) def main(argv=sys.argv[1:]): parser = argparse.ArgumentParser( - prog='markdownify', - description='Converts html to markdown.', + prog="markdownify", + description="Converts html to markdown.", ) - parser.add_argument('html', nargs='?', type=argparse.FileType('r'), - default=sys.stdin, - help="The html file to convert. Defaults to STDIN if not " - "provided.") - parser.add_argument('-s', '--strip', nargs='*', - help="A list of tags to strip. This option can't be used with " - "the --convert option.") - parser.add_argument('-c', '--convert', nargs='*', - help="A list of tags to convert. 
This option can't be used with " - "the --strip option.") - parser.add_argument('-a', '--autolinks', action='store_true', - help="A boolean indicating whether the 'automatic link' style " - "should be used when a 'a' tag's contents match its href.") - parser.add_argument('--default-title', action='store_false', - help="A boolean to enable setting the title of a link to its " - "href, if no title is given.") - parser.add_argument('--heading-style', default=UNDERLINED, - choices=(ATX, ATX_CLOSED, UNDERLINED), - help="Defines how headings should be converted.") - parser.add_argument('-b', '--bullets', default='*+-', - help="A string of bullet styles to use; the bullet will " - "alternate based on nesting level.") - parser.add_argument('--strong-em-symbol', default=ASTERISK, - choices=(ASTERISK, UNDERSCORE), - help="Use * or _ to convert strong and italics text"), - parser.add_argument('--sub-symbol', default='', - help="Define the chars that surround ''.") - parser.add_argument('--sup-symbol', default='', - help="Define the chars that surround ''.") - parser.add_argument('--newline-style', default=SPACES, - choices=(SPACES, BACKSLASH), - help="Defines the style of
conversions: two spaces " - "or backslash at the and of the line thet should break.") - parser.add_argument('--code-language', default='', - help="Defines the language that should be assumed for all " - "'
' sections.")
-    parser.add_argument('--no-escape-asterisks', dest='escape_asterisks',
-                        action='store_false',
-                        help="Do not escape '*' to '\\*' in text.")
-    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
-                        action='store_false',
-                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in',
-                        default=[],
-                        nargs='*',
-                        help="Images are converted to their alt-text when the images are "
-                        "located inside headlines or table cells. If some inline images "
-                        "should be converted to markdown images instead, this option can "
-                        "be set to a list of parent tags that should be allowed to "
-                        "contain inline images.")
-    parser.add_argument('--table-infer-header', dest='table_infer_header',
-                        action='store_true',
-                        help="When a table has no header row (as indicated by '' "
-                        "or ''), use the first body row as the header row.")
-    parser.add_argument('-w', '--wrap', action='store_true',
-                        help="Wrap all text paragraphs at --wrap-width characters.")
-    parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('--bs4-options',
-                        default='html.parser',
-                        help="Specifies the parser that BeautifulSoup should use to parse "
-                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
-                             "'html5lib'.")
+    parser.add_argument(
+        "html",
+        nargs="?",
+        type=argparse.FileType("r"),
+        default=sys.stdin,
+        help="The html file to convert. Defaults to STDIN if not provided.",
+    )
+    parser.add_argument(
+        "-s",
+        "--strip",
+        nargs="*",
+        help="A list of tags to strip. This option can't be used with "
+        "the --convert option.",
+    )
+    parser.add_argument(
+        "-c",
+        "--convert",
+        nargs="*",
+        help="A list of tags to convert. This option can't be used with "
+        "the --strip option.",
+    )
+    parser.add_argument(
+        "-a",
+        "--autolinks",
+        action="store_true",
+        help="A boolean indicating whether the 'automatic link' style "
+        "should be used when a 'a' tag's contents match its href.",
+    )
+    parser.add_argument(
+        "--default-title",
+        action="store_false",
+        help="A boolean to enable setting the title of a link to its "
+        "href, if no title is given.",
+    )
+    parser.add_argument(
+        "--heading-style",
+        default=UNDERLINED,
+        choices=(ATX, ATX_CLOSED, UNDERLINED),
+        help="Defines how headings should be converted.",
+    )
+    parser.add_argument(
+        "-b",
+        "--bullets",
+        default="*+-",
+        help="A string of bullet styles to use; the bullet will "
+        "alternate based on nesting level.",
+    )
+    (
+        parser.add_argument(
+            "--strong-em-symbol",
+            default=ASTERISK,
+            choices=(ASTERISK, UNDERSCORE),
+            help="Use * or _ to convert strong and italics text",
+        ),
+    )
+    parser.add_argument(
+        "--sub-symbol", default="", help="Define the chars that surround ''."
+    )
+    parser.add_argument(
+        "--sup-symbol", default="", help="Define the chars that surround ''."
+    )
+    parser.add_argument(
+        "--newline-style",
+        default=SPACES,
+        choices=(SPACES, BACKSLASH),
+        help="Defines the style of 
conversions: two spaces " + "or backslash at the and of the line thet should break.", + ) + parser.add_argument( + "--code-language", + default="", + help="Defines the language that should be assumed for all '
' sections.",
+    )
+    parser.add_argument(
+        "--no-escape-asterisks",
+        dest="escape_asterisks",
+        action="store_false",
+        help="Do not escape '*' to '\\*' in text.",
+    )
+    parser.add_argument(
+        "--no-escape-underscores",
+        dest="escape_underscores",
+        action="store_false",
+        help="Do not escape '_' to '\\_' in text.",
+    )
+    parser.add_argument(
+        "-i",
+        "--keep-inline-images-in",
+        default=[],
+        nargs="*",
+        help="Images are converted to their alt-text when the images are "
+        "located inside headlines or table cells. If some inline images "
+        "should be converted to markdown images instead, this option can "
+        "be set to a list of parent tags that should be allowed to "
+        "contain inline images.",
+    )
+    parser.add_argument(
+        "--table-infer-header",
+        dest="table_infer_header",
+        action="store_true",
+        help="When a table has no header row (as indicated by '' "
+        "or ''), use the first body row as the header row.",
+    )
+    parser.add_argument(
+        "-w",
+        "--wrap",
+        action="store_true",
+        help="Wrap all text paragraphs at --wrap-width characters.",
+    )
+    parser.add_argument("--wrap-width", type=int, default=80)
+    parser.add_argument(
+        "--bs4-options",
+        default="html.parser",
+        help="Specifies the parser that BeautifulSoup should use to parse "
+        "the HTML markup. Examples include 'html5.parser', 'lxml', and "
+        "'html5lib'.",
+    )
 
     args = parser.parse_args(argv)
     print(markdownify(**vars(args)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()

From ad71318fe43905529de59bb4d2347e36552ce4ae Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 12:28:36 +0800
Subject: [PATCH 2/8] Initial conversion work to selectolax

---
 markdownify/__init__.py | 127 +++++++++++++++++++++++++---------------
 1 file changed, 81 insertions(+), 46 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 7df448b..76b048b 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -1,8 +1,8 @@
-from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
-from textwrap import fill
 import re
-import six
+from textwrap import fill
+from typing import Any, Callable
 
+from selectolax.lexbor import LexborHTMLParser, LexborNode
 
 # General-purpose regex patterns
 re_convert_heading = re.compile(r"convert_h(\d+)")
@@ -79,7 +79,18 @@ def strip_pre(text):
     return text
 
 
-def chomp(text):
+def find_parent(node: LexborNode | None, node_tag: str):
+    """Finds a parent with the specified tag"""
+    while node:
+        node = node.parent
+        if node is None:
+            break
+        if node.tag == node_tag:
+            return node
+    return node
+
+
+def chomp(text: str):
     """
     If the text in an inline tag like b, a, or em contains a leading or trailing
     space, strip the string and return a space as suffix of prefix, if needed.
@@ -92,7 +103,7 @@ def chomp(text):
     return (prefix, suffix, text)
 
 
-def abstract_inline_conversion(markup_fn):
+def abstract_inline_conversion(markup_fn: Callable):
     """
     This abstracts all simple inline tags like b, em, del, ...
     Returns a function that wraps the chomped text in a pair of the string
@@ -117,17 +128,12 @@ def implementation(self, el, text, parent_tags):
     return implementation
 
 
-def _todict(obj):
+def _todict(obj:Any):
     return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith("_"))
 
 
-def should_remove_whitespace_inside(el):
-    """Return to remove whitespace immediately inside a block-level element."""
-    if not el or not el.name:
-        return False
-    if re_html_heading.match(el.name) is not None:
-        return True
-    return el.name in (
+WHITESPACE_ABLE = set(
+    [
         "p",
         "blockquote",
         "article",
@@ -146,27 +152,49 @@ def should_remove_whitespace_inside(el):
         "tr",
         "td",
         "th",
-    )
+    ]
+)
 
 
-def should_remove_whitespace_outside(el):
+def should_remove_whitespace_inside(el: LexborNode | None):
+    """Return to remove whitespace immediately inside a block-level element."""
+    if not el or not el.tag:
+        return False
+    if re_html_heading.match(el.tag) is not None:
+        return True
+    return el.tag in WHITESPACE_ABLE
+
+
+def should_remove_whitespace_outside(el: LexborNode | None):
     """Return to remove whitespace immediately outside a block-level element."""
-    return should_remove_whitespace_inside(el) or (el and el.name == "pre")
+    return should_remove_whitespace_inside(el) or (el and el.tag == "pre")
 
 
-def _is_block_content_element(el):
+def is_tag(el: LexborNode):
+    """Returns True if the lexbor node is a tag"""
+    return (
+        True
+        if el.tag_id not in [None, "-text", "-document", "-comment", "-doctype"]
+        else False
+    )
+
+
+def _is_block_content_element(el: LexborNode | None):
     """
     In a block context, returns:
 
     - True for content elements (tags and non-whitespace text)
     - False for non-content elements (whitespace text, comments, doctypes)
     """
-    if isinstance(el, Tag):
+    if not el:
+        return False
+    if is_tag(el):
         return True
-    elif isinstance(el, (Comment, Doctype)):
+    elif el.tag == "-comment":
         return False  # (subclasses of NavigableString, must test first)
-    elif isinstance(el, NavigableString):
-        return el.strip() != ""
+    elif el.tag == "-text":
+        text = el.text_content if el.text_content else ""
+        return text.strip() != ""
     else:
         return False
 
@@ -189,10 +217,9 @@ def _next_block_content_sibling(el):
     return None
 
 
-class MarkdownConverter(object):
+class MarkdownConverter:
     class DefaultOptions:
         autolinks = True
-        bs4_options = "html.parser"
         bullets = "*+-"  # An iterable of bullet types.
         code_language = ""
         code_language_callback = None
@@ -235,20 +262,26 @@ def __init__(self, **options):
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
 
-    def convert(self, html):
-        soup = BeautifulSoup(html, **self.options["bs4_options"])
+    def convert(self, html: str | bytes) -> str | None:
+        soup = LexborHTMLParser(html)
         return self.convert_soup(soup)
 
-    def convert_soup(self, soup):
-        return self.process_tag(soup, parent_tags=set())
+    def convert_soup(self, soup: LexborHTMLParser | LexborNode) -> str | None:
+        if isinstance(soup, LexborHTMLParser) and soup.root:
+            return self.process_tag(soup.root, parent_tags=set())
+        elif isinstance(soup, LexborNode):
+            return self.process_tag(soup, parent_tags=set())
+        raise NotImplementedError(
+            f"Unexpected type: {type(soup)} passed to convert_soup()."
+        )
 
-    def process_element(self, node, parent_tags=None):
-        if isinstance(node, NavigableString):
+    def process_element(self, node: LexborNode, parent_tags=None):
+        if node.tag and node.tag == "-text":
             return self.process_text(node, parent_tags=parent_tags)
         else:
             return self.process_tag(node, parent_tags=parent_tags)
 
-    def process_tag(self, node, parent_tags=None):
+    def process_tag(self, node: LexborNode, parent_tags=None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
@@ -257,26 +290,24 @@ def process_tag(self, node, parent_tags=None):
         # adjacent to the inner/outer boundaries of block elements.
         should_remove_inside = should_remove_whitespace_inside(node)
 
-        def _can_ignore(el):
-            if isinstance(el, Tag):
+        def _can_ignore(el: LexborNode):
+            if is_tag(el):
                 # Tags are always processed.
                 return False
-            elif isinstance(el, (Comment, Doctype)):
+            elif el.tag in ["-comment", "-doctype"]:
                 # Comment and Doctype elements are always ignored.
                 # (subclasses of NavigableString, must test first)
                 return True
-            elif isinstance(el, NavigableString):
-                if six.text_type(el).strip() != "":
+            elif el.tag == "-text":
+                if el.text_content and el.text_content.strip():
                     # Non-whitespace text nodes are always processed.
                     return False
-                elif should_remove_inside and (
-                    not el.previous_sibling or not el.next_sibling
-                ):
+                elif should_remove_inside and (not el.prev or not el.next):
                     # Inside block elements (excluding 
), ignore adjacent whitespace elements.
                     return True
                 elif should_remove_whitespace_outside(
-                    el.previous_sibling
-                ) or should_remove_whitespace_outside(el.next_sibling):
+                    el.prev
+                ) or should_remove_whitespace_outside(el.next):
                     # Outside block elements (including 
), ignore adjacent whitespace elements.
                     return True
                 else:
@@ -286,22 +317,26 @@ def _can_ignore(el):
             else:
                 raise ValueError("Unexpected element type: %s" % type(el))
 
-        children_to_convert = [el for el in node.children if not _can_ignore(el)]
+        children_to_convert = [
+            el
+            for el in node.iter(include_text=True)
+            if not _can_ignore(el) and el != node
+        ]
 
         # Create a copy of this tag's parent context, then update it to include this tag
         # to propagate down into the children.
         parent_tags_for_children = set(parent_tags)
-        parent_tags_for_children.add(node.name)
+        parent_tags_for_children.add(node.tag)
 
         # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
         if (
-            re_html_heading.match(node.name) is not None  # headings
-            or node.name in {"td", "th"}  # table cells
+            (node.tag and re_html_heading.match(node.tag) is not None)  # headings
+            or node.tag in {"td", "th"}  # table cells
         ):
             parent_tags_for_children.add("_inline")
 
         # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
-        if node.name in {"pre", "code", "kbd", "samp"}:
+        if node.tag in {"pre", "code", "kbd", "samp"}:
             parent_tags_for_children.add("_noformat")
 
         # Convert the children elements into a list of result strings.
@@ -314,7 +349,7 @@ def _can_ignore(el):
         child_strings = [s for s in child_strings if s]
 
         # Collapse newlines at child element boundaries, if needed.
-        if node.name == "pre" or node.find_parent("pre"):
+        if node.tag == "pre" or find_parent(node, "pre"):
             # Inside 
 blocks, do not collapse newlines.
             pass
         else:

From 5adc61d87b4a5c7f0026d4382ab7d6250a0103fa Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 12:45:39 +0800
Subject: [PATCH 3/8] Finish conversion to selectolax & remove bs4 options

---
 markdownify/__init__.py  | 216 +++++++++++++++++++++++----------------
 markdownify/__init__.pyi |   2 -
 2 files changed, 129 insertions(+), 89 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 76b048b..bfbdbad 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -10,7 +10,6 @@
 re_whitespace = re.compile(r"[\t ]+")
 re_all_whitespace = re.compile(r"[\t \r\n]+")
 re_newline_whitespace = re.compile(r"[\t \r\n]*[\r\n][\t \r\n]*")
-re_html_heading = re.compile(r"h(\d+)")
 re_pre_lstrip1 = re.compile(r"^ *\n")
 re_pre_rstrip1 = re.compile(r"\n *$")
 re_pre_lstrip = re.compile(r"^[ \n]*\n")
@@ -65,6 +64,21 @@
 STRIP_ONE = "strip_one"
 
 
+def is_header_tag(tag_name: str):
+    """Returns True if the tag is a header (h1, h2, h3 ...)"""
+    tag_name = tag_name.lower()
+    # XXX: isdigit() is the fastest, but can be inaccurate
+    return tag_name[0] == "h" and tag_name[1:].isdigit()
+
+
+def find_previous_siblings(el: LexborNode | None, tag: str):
+    """Finds a previous element with specified tag"""
+    while el:
+        el = el.prev
+        if el and el.tag == tag:
+            yield el
+
+
 def strip1_pre(text):
     """Strip one leading and trailing newline from a <pre> string."""
     text = re_pre_lstrip1.sub("", text)
@@ -79,15 +93,15 @@ def strip_pre(text):
     return text
 
 
-def find_parent(node: LexborNode | None, node_tag: str):
+def find_parent(el: LexborNode | None, node_tag: str):
     """Finds a parent with the specified tag"""
-    while node:
-        node = node.parent
-        if node is None:
+    while el:
+        el = el.parent
+        if el is None:
             break
-        if node.tag == node_tag:
-            return node
-    return node
+        if el.tag == node_tag:
+            return el
+    return el
 
 
 def chomp(text: str):
@@ -128,7 +142,7 @@ def implementation(self, el, text, parent_tags):
     return implementation
 
 
-def _todict(obj:Any):
+def _todict(obj: Any):
     return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith("_"))
 
 
@@ -160,7 +174,7 @@ def should_remove_whitespace_inside(el: LexborNode | None):
     """Return to remove whitespace immediately inside a block-level element."""
     if not el or not el.tag:
         return False
-    if re_html_heading.match(el.tag) is not None:
+    if is_header_tag(el.tag) is not None:
         return True
     return el.tag in WHITESPACE_ABLE
 
@@ -255,9 +269,6 @@ def __init__(self, **options):
                 "You may specify either tags to strip or tags to convert, but not both."
             )
 
-        # If a string or list is passed to bs4_options, assume it is a 'features' specification
-        if not isinstance(self.options["bs4_options"], dict):
-            self.options["bs4_options"] = {"features": self.options["bs4_options"]}
 
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
@@ -275,20 +286,22 @@ def convert_soup(self, soup: LexborHTMLParser | LexborNode) -> str | None:
             f"Unexpected type: {type(soup)} passed to convert_soup()."
         )
 
-    def process_element(self, node: LexborNode, parent_tags=None):
-        if node.tag and node.tag == "-text":
-            return self.process_text(node, parent_tags=parent_tags)
+    def process_element(self, el: LexborNode, parent_tags=None):
+        if el.tag and el.tag == "-text":
+            return self.process_text(el, parent_tags=parent_tags)
         else:
-            return self.process_tag(node, parent_tags=parent_tags)
+            return self.process_tag(el, parent_tags=parent_tags)
 
-    def process_tag(self, node: LexborNode, parent_tags=None):
+    def process_tag(self, el: LexborNode, parent_tags=None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
 
+        node_tag = el.tag
+
         # Collect child elements to process, ignoring whitespace-only text elements
         # adjacent to the inner/outer boundaries of block elements.
-        should_remove_inside = should_remove_whitespace_inside(node)
+        should_remove_inside = should_remove_whitespace_inside(el)
 
         def _can_ignore(el: LexborNode):
             if is_tag(el):
@@ -318,25 +331,23 @@ def _can_ignore(el: LexborNode):
                 raise ValueError("Unexpected element type: %s" % type(el))
 
         children_to_convert = [
-            el
-            for el in node.iter(include_text=True)
-            if not _can_ignore(el) and el != node
+            el for el in el.iter(include_text=True) if not _can_ignore(el) and el != el
         ]
 
         # Create a copy of this tag's parent context, then update it to include this tag
         # to propagate down into the children.
         parent_tags_for_children = set(parent_tags)
-        parent_tags_for_children.add(node.tag)
+        parent_tags_for_children.add(el.tag)
 
         # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
         if (
-            (node.tag and re_html_heading.match(node.tag) is not None)  # headings
-            or node.tag in {"td", "th"}  # table cells
+            (node_tag and is_header_tag(node_tag) is not None)  # headings
+            or node_tag in {"td", "th"}  # table cells
         ):
             parent_tags_for_children.add("_inline")
 
         # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
-        if node.tag in {"pre", "code", "kbd", "samp"}:
+        if node_tag in {"pre", "code", "kbd", "samp"}:
             parent_tags_for_children.add("_noformat")
 
         # Convert the children elements into a list of result strings.
@@ -349,7 +360,7 @@ def _can_ignore(el: LexborNode):
         child_strings = [s for s in child_strings if s]
 
         # Collapse newlines at child element boundaries, if needed.
-        if node.tag == "pre" or find_parent(node, "pre"):
+        if node_tag == "pre" or find_parent(el, "pre"):
             # Inside <pre> blocks, do not collapse newlines.
             pass
         else:
@@ -378,14 +389,17 @@ def _can_ignore(el: LexborNode):
         # Join all child text strings into a single string.
         text = "".join(child_strings)
 
+        # Ensure node.tag is valid.
+        if el.tag is None:
+            raise NotImplementedError("Expected tag to be valid. Got None.")
         # apply this tag's final conversion function
-        convert_fn = self.get_conv_fn_cached(node.name)
+        convert_fn = self.get_conv_fn_cached(el.tag)
         if convert_fn is not None:
-            text = convert_fn(node, text, parent_tags=parent_tags)
+            text = convert_fn(el, text, parent_tags=parent_tags)
 
         return text
 
-    def convert__document_(self, el, text, parent_tags):
+    def convert__document_(self, el: LexborNode, text, parent_tags):
         """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
         if self.options["strip_document"] == LSTRIP:
             text = text.lstrip("\n")  # remove leading separation newlines
@@ -402,12 +416,12 @@ def convert__document_(self, el, text, parent_tags):
 
         return text
 
-    def process_text(self, el, parent_tags=None):
+    def process_text(self, el: LexborNode, parent_tags=None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
 
-        text = six.text_type(el) or ""
+        text = el.text_content or ""
 
         # normalize whitespace if we're not inside a preformatted element
         if "pre" not in parent_tags:
@@ -424,18 +438,18 @@ def process_text(self, el, parent_tags=None):
         # remove leading whitespace at the start or just after a
         # block-level element; remove traliing whitespace at the end
         # or just before a block-level element.
-        if should_remove_whitespace_outside(el.previous_sibling) or (
-            should_remove_whitespace_inside(el.parent) and not el.previous_sibling
+        if should_remove_whitespace_outside(el.prev) or (
+            should_remove_whitespace_inside(el.parent) and not el.prev
         ):
             text = text.lstrip(" \t\r\n")
-        if should_remove_whitespace_outside(el.next_sibling) or (
-            should_remove_whitespace_inside(el.parent) and not el.next_sibling
+        if should_remove_whitespace_outside(el.next) or (
+            should_remove_whitespace_inside(el.parent) and not el.next
         ):
             text = text.rstrip()
 
         return text
 
-    def get_conv_fn_cached(self, tag_name):
+    def get_conv_fn_cached(self, tag_name: str):
         """Given a tag name, return the conversion function using the cache."""
         # If conversion function is not in cache, add it
         if tag_name not in self.convert_fn_cache:
@@ -444,7 +458,7 @@ def get_conv_fn_cached(self, tag_name):
         # Return the cached entry
         return self.convert_fn_cache[tag_name]
 
-    def get_conv_fn(self, tag_name):
+    def get_conv_fn(self, tag_name: str):
         """Given a tag name, find and return the conversion function."""
         tag_name = tag_name.lower()
 
@@ -459,9 +473,9 @@ def get_conv_fn(self, tag_name):
             return convert_fn
 
         # If tag is any heading, handle with convert_hN() function
-        match = re_html_heading.match(tag_name)
+        match = is_header_tag(tag_name)
         if match:
-            n = int(match.group(1))  # get value of N from <hN>
+            n = int(tag_name[1:])  # get value of N from <hN>
             return lambda el, text, parent_tags: self.convert_hN(
                 n, el, text, parent_tags
             )
@@ -469,7 +483,7 @@ def get_conv_fn(self, tag_name):
         # No conversion function was found
         return None
 
-    def should_convert_tag(self, tag):
+    def should_convert_tag(self, tag: str):
         """Given a tag name, return whether to convert based on strip/convert options."""
         strip = self.options["strip"]
         convert = self.options["convert"]
@@ -499,14 +513,15 @@ def underline(self, text, pad_char):
         text = (text or "").rstrip()
         return "\n\n%s\n%s\n\n" % (text, pad_char * len(text)) if text else ""
 
-    def convert_a(self, el, text, parent_tags):
+    def convert_a(self, el: LexborNode, text, parent_tags):
         if "_noformat" in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
         if not text:
             return ""
-        href = el.get("href")
-        title = el.get("title")
+        attributes = el.attributes
+        href = attributes.get("href")
+        title = attributes.get("title")
         # For the replacement see #29: text nodes underscores are escaped
         if (
             self.options["autolinks"]
@@ -529,7 +544,7 @@ def convert_a(self, el, text, parent_tags):
         lambda self: 2 * self.options["strong_em_symbol"]
     )
 
-    def convert_blockquote(self, el, text, parent_tags):
+    def convert_blockquote(self, el: LexborNode, text, parent_tags):
         # handle some early-exit scenarios
         text = (text or "").strip(" \t\r\n")
         if "_inline" in parent_tags:
@@ -546,7 +561,7 @@ def _indent_for_blockquote(match):
 
         return "\n" + text + "\n\n"
 
-    def convert_br(self, el, text, parent_tags):
+    def convert_br(self, el: LexborNode, text, parent_tags):
         if "_inline" in parent_tags:
             return " "
 
@@ -555,7 +570,7 @@ def convert_br(self, el, text, parent_tags):
         else:
             return "  \n"
 
-    def convert_code(self, el, text, parent_tags):
+    def convert_code(self, el: LexborNode, text, parent_tags):
         if "_noformat" in parent_tags:
             return text
 
@@ -657,31 +672,41 @@ def convert_hr(self, el, text, parent_tags):
 
     convert_i = convert_em
 
-    def convert_img(self, el, text, parent_tags):
-        alt = el.attrs.get("alt", None) or ""
-        src = el.attrs.get("src", None) or ""
-        title = el.attrs.get("title", None) or ""
+    def convert_img(self, el: LexborNode, text, parent_tags):
+        if not el.parent:
+            raise NotImplementedError(
+                "img element does not have a children. Potentially malformed?"
+            )
+        attrs = el.attributes
+        alt = attrs.get("alt", None) or ""
+        src = attrs.get("src", None) or ""
+        title = attrs.get("title", None) or ""
         title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
         if (
             "_inline" in parent_tags
-            and el.parent.name not in self.options["keep_inline_images_in"]
+            and el.parent.tag not in self.options["keep_inline_images_in"]
         ):
             return alt
 
         return "![%s](%s%s)" % (alt, src, title_part)
 
-    def convert_video(self, el, text, parent_tags):
+    def convert_video(self, el: LexborNode, text, parent_tags):
+        if not el.parent:
+            raise NotImplementedError(
+                "video element does not have a children. Potentially malformed?"
+            )
         if (
             "_inline" in parent_tags
-            and el.parent.name not in self.options["keep_inline_images_in"]
+            and el.parent.tag not in self.options["keep_inline_images_in"]
         ):
             return text
-        src = el.attrs.get("src", None) or ""
+        attrs = el.attributes
+        src = attrs.get("src", None) or ""
         if not src:
-            sources = el.find_all("source", attrs={"src": True})
+            sources = el.css("source[src]")
             if sources:
-                src = sources[0].attrs.get("src", None) or ""
-        poster = el.attrs.get("poster", None) or ""
+                src = sources[0].attributes.get("src", None) or ""
+        poster = attrs.get("poster", None) or ""
         if src and poster:
             return "[![%s](%s)](%s)" % (text, poster, src)
         if src:
@@ -690,7 +715,7 @@ def convert_video(self, el, text, parent_tags):
             return "![%s](%s)" % (text, poster)
         return text
 
-    def convert_list(self, el, text, parent_tags):
+    def convert_list(self, el: LexborNode, text, parent_tags):
         # Converting a list to inline is undefined.
         # Ignoring inline conversion parents for list.
 
@@ -706,7 +731,11 @@ def convert_list(self, el, text, parent_tags):
     convert_ul = convert_list
     convert_ol = convert_list
 
-    def convert_li(self, el, text, parent_tags):
+    def convert_li(self, el: LexborNode, text, parent_tags):
+        if not el.parent:
+            raise NotImplementedError(
+                "li element does not have a children. Potentially malformed?"
+            )
         # handle some early-exit scenarios
         text = (text or "").strip()
         if not text:
@@ -714,16 +743,17 @@ def convert_li(self, el, text, parent_tags):
 
         # determine list item bullet character to use
         parent = el.parent
-        if parent is not None and parent.name == "ol":
-            if parent.get("start") and str(parent.get("start")).isnumeric():
-                start = int(parent.get("start"))
+        if parent is not None and parent.tag == "ol":
+            start_attribute = parent.attributes.get("start")
+            if start_attribute and str(start_attribute).isnumeric():
+                start = int(start_attribute)
             else:
                 start = 1
-            bullet = "%s." % (start + len(el.find_previous_siblings("li")))
+            bullet = "%s." % (start + len(list(find_previous_siblings(el, "li"))))
         else:
             depth = -1
             while el:
-                if el.name == "ul":
+                if el.tag == "ul":
                     depth += 1
                 el = el.parent
             bullets = self.options["bullets"]
@@ -809,46 +839,58 @@ def convert_style(self, el, text, parent_tags):
 
     convert_sup = abstract_inline_conversion(lambda self: self.options["sup_symbol"])
 
-    def convert_table(self, el, text, parent_tags):
+    def convert_table(self, el: LexborNode, text, parent_tags):
         return "\n\n" + text.strip() + "\n\n"
 
-    def convert_caption(self, el, text, parent_tags):
+    def convert_caption(self, el: LexborNode, text, parent_tags):
         return text.strip() + "\n\n"
 
-    def convert_figcaption(self, el, text, parent_tags):
+    def convert_figcaption(self, el: LexborNode, text, parent_tags):
         return "\n\n" + text.strip() + "\n\n"
 
-    def convert_td(self, el, text, parent_tags):
+    def convert_td(self, el: LexborNode, text, parent_tags):
         colspan = 1
-        if "colspan" in el.attrs and el["colspan"].isdigit():
-            colspan = max(1, min(1000, int(el["colspan"])))
+        el_colspan = el.attributes.get("colspan")
+        el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0
+        if el_colspan:
+            colspan = max(1, min(1000, el_colspan))
         return " " + text.strip().replace("\n", " ") + " |" * colspan
 
-    def convert_th(self, el, text, parent_tags):
+    def convert_th(self, el: LexborNode, text, parent_tags):
         colspan = 1
-        if "colspan" in el.attrs and el["colspan"].isdigit():
-            colspan = max(1, min(1000, int(el["colspan"])))
+        el_colspan = el.attributes.get("colspan")
+        el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0
+        if el_colspan:
+            colspan = max(1, min(1000, el_colspan))
         return " " + text.strip().replace("\n", " ") + " |" * colspan
 
-    def convert_tr(self, el, text, parent_tags):
-        cells = el.find_all(["td", "th"])
-        is_first_row = el.find_previous_sibling() is None
-        is_headrow = all([cell.name == "th" for cell in cells]) or (
-            el.parent.name == "thead"
+    def convert_tr(self, el: LexborNode, text, parent_tags):
+        if not el.parent or not el.parent.parent:
+            raise NotImplementedError(
+                "Found table row with no parent or sub-parent. Malformed document?"
+            )
+        cells = el.css("td,th")
+        is_first_row = el.prev is None
+        is_headrow = all([cell.tag == "th" for cell in cells]) or (
+            el.parent.tag == "thead"
             # avoid multiple tr in thead
-            and len(el.parent.find_all("tr")) == 1
+            and len(el.parent.css("tr")) == 1
         )
-        is_head_row_missing = (is_first_row and not el.parent.name == "tbody") or (
+        is_head_row_missing = (is_first_row and not el.parent.tag == "tbody") or (
             is_first_row
-            and el.parent.name == "tbody"
-            and len(el.parent.parent.find_all(["thead"])) < 1
+            and el.parent.tag == "tbody"
+            and len(el.parent.parent.css("thead")) < 1
         )
         overline = ""
         underline = ""
         full_colspan = 0
         for cell in cells:
-            if "colspan" in cell.attrs and cell["colspan"].isdigit():
-                full_colspan += max(1, min(1000, int(cell["colspan"])))
+            cell_colspan = cell.attributes.get("colspan")
+            cell_colspan = (
+                int(cell_colspan) if cell_colspan and cell_colspan.isdigit() else 0
+            )
+            if cell_colspan:
+                full_colspan += max(1, min(1000, cell_colspan))
             else:
                 full_colspan += 1
         if (
@@ -862,8 +904,8 @@ def convert_tr(self, el, text, parent_tags):
         elif (is_head_row_missing and not self.options["table_infer_header"]) or (
             is_first_row
             and (
-                el.parent.name == "table"
-                or (el.parent.name == "tbody" and not el.parent.find_previous_sibling())
+                el.parent.tag == "table"
+                or (el.parent.tag == "tbody" and not el.parent.prev)
             )
         ):
             # headline is missing and header inference is disabled or:
diff --git a/markdownify/__init__.pyi b/markdownify/__init__.pyi
index ccb587f..a9b8674 100644
--- a/markdownify/__init__.pyi
+++ b/markdownify/__init__.pyi
@@ -17,7 +17,6 @@ STRIP_ONE: str
 def markdownify(
     html: str,
     autolinks: bool = ...,
-    bs4_options: str = ...,
     bullets: str = ...,
     code_language: str = ...,
     code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
@@ -44,7 +43,6 @@ class MarkdownConverter:
     def __init__(
         self,
         autolinks: bool = ...,
-        bs4_options: str = ...,
         bullets: str = ...,
         code_language: str = ...,
         code_language_callback: Union[

From d72c80ab70981e347773db9178a62cc15bcf89bd Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 12:46:30 +0800
Subject: [PATCH 4/8] Increment version, fix dependants

---
 pyproject.toml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3df85eb..e8268bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "markdownify"
-version = "1.2.2"
+version = "2.0.0"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
@@ -23,8 +23,7 @@ classifiers = [
     "Topic :: Utilities",
 ]
 dependencies = [
-    "beautifulsoup4>=4.9,<5",
-    "six>=1.15,<2"
+    "selectolax>0.4"
 ]
 
 [project.urls]

From 5fc94a8d8f5098cc9b07d2ba74b4ed46d675e8be Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 13:18:46 +0800
Subject: [PATCH 5/8] Function Typing

---
 markdownify/__init__.py | 85 ++++++++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 35 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index bfbdbad..c2749a3 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -79,14 +79,14 @@ def find_previous_siblings(el: LexborNode | None, tag: str):
             yield el
 
 
-def strip1_pre(text):
+def strip1_pre(text: str):
     """Strip one leading and trailing newline from a <pre> string."""
     text = re_pre_lstrip1.sub("", text)
     text = re_pre_rstrip1.sub("", text)
     return text
 
 
-def strip_pre(text):
+def strip_pre(text: str):
     """Strip all leading and trailing newlines from a <pre> string."""
     text = re_pre_lstrip.sub("", text)
     text = re_pre_rstrip.sub("", text)
@@ -117,7 +117,7 @@ def chomp(text: str):
     return (prefix, suffix, text)
 
 
-def abstract_inline_conversion(markup_fn: Callable):
+def abstract_inline_conversion(markup_fn: Callable[["MarkdownConverter"], str]):
     """
     This abstracts all simple inline tags like b, em, del, ...
     Returns a function that wraps the chomped text in a pair of the string
@@ -258,7 +258,7 @@ class DefaultOptions:
     class Options(DefaultOptions):
         pass
 
-    def __init__(self, **options):
+    def __init__(self, **options: dict[str, Any]):
         # Create an options dictionary. Use DefaultOptions as a base so that
         # it doesn't have to be extended.
         self.options = _todict(self.DefaultOptions)
@@ -269,7 +269,6 @@ def __init__(self, **options):
                 "You may specify either tags to strip or tags to convert, but not both."
             )
 
-
         # Initialize the conversion function cache
         self.convert_fn_cache = {}
 
@@ -286,13 +285,13 @@ def convert_soup(self, soup: LexborHTMLParser | LexborNode) -> str | None:
             f"Unexpected type: {type(soup)} passed to convert_soup()."
         )
 
-    def process_element(self, el: LexborNode, parent_tags=None):
+    def process_element(self, el: LexborNode, parent_tags: set[str] | None = None):
         if el.tag and el.tag == "-text":
             return self.process_text(el, parent_tags=parent_tags)
         else:
             return self.process_tag(el, parent_tags=parent_tags)
 
-    def process_tag(self, el: LexborNode, parent_tags=None):
+    def process_tag(self, el: LexborNode, parent_tags: set[str] | None = None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
@@ -399,7 +398,7 @@ def _can_ignore(el: LexborNode):
 
         return text
 
-    def convert__document_(self, el: LexborNode, text, parent_tags):
+    def convert__document_(self, el: LexborNode, text: str, parent_tags: set[str]):
         """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
         if self.options["strip_document"] == LSTRIP:
             text = text.lstrip("\n")  # remove leading separation newlines
@@ -416,7 +415,7 @@ def convert__document_(self, el: LexborNode, text, parent_tags):
 
         return text
 
-    def process_text(self, el: LexborNode, parent_tags=None):
+    def process_text(self, el: LexborNode, parent_tags: set[str] | None = None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
@@ -494,7 +493,7 @@ def should_convert_tag(self, tag: str):
         else:
             return True
 
-    def escape(self, text, parent_tags):
+    def escape(self, text: str, parent_tags: set[str]):
         if not text:
             return ""
         if self.options["escape_misc"]:
@@ -509,11 +508,11 @@ def escape(self, text, parent_tags):
             text = text.replace("_", r"\_")
         return text
 
-    def underline(self, text, pad_char):
+    def underline(self, text: str, pad_char: str):
         text = (text or "").rstrip()
         return "\n\n%s\n%s\n\n" % (text, pad_char * len(text)) if text else ""
 
-    def convert_a(self, el: LexborNode, text, parent_tags):
+    def convert_a(self, el: LexborNode, text: str, parent_tags: set[str]):
         if "_noformat" in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
@@ -544,7 +543,7 @@ def convert_a(self, el: LexborNode, text, parent_tags):
         lambda self: 2 * self.options["strong_em_symbol"]
     )
 
-    def convert_blockquote(self, el: LexborNode, text, parent_tags):
+    def convert_blockquote(self, el: LexborNode, text: str, parent_tags: set[str]):
         # handle some early-exit scenarios
         text = (text or "").strip(" \t\r\n")
         if "_inline" in parent_tags:
@@ -561,7 +560,7 @@ def _indent_for_blockquote(match):
 
         return "\n" + text + "\n\n"
 
-    def convert_br(self, el: LexborNode, text, parent_tags):
+    def convert_br(self, el: LexborNode, text: str, parent_tags: set[str]):
         if "_inline" in parent_tags:
             return " "
 
@@ -570,7 +569,7 @@ def convert_br(self, el: LexborNode, text, parent_tags):
         else:
             return "  \n"
 
-    def convert_code(self, el: LexborNode, text, parent_tags):
+    def convert_code(self, el: LexborNode, text: str, parent_tags: set[str]):
         if "_noformat" in parent_tags:
             return text
 
@@ -594,7 +593,7 @@ def convert_code(self, el: LexborNode, text, parent_tags):
 
     convert_del = abstract_inline_conversion(lambda self: "~~")
 
-    def convert_div(self, el, text, parent_tags):
+    def convert_div(self, el: LexborNode, text: str, parent_tags: set[str]):
         if "_inline" in parent_tags:
             return " " + text.strip() + " "
         text = text.strip()
@@ -610,7 +609,7 @@ def convert_div(self, el, text, parent_tags):
 
     convert_kbd = convert_code
 
-    def convert_dd(self, el, text, parent_tags):
+    def convert_dd(self, el: LexborNode, text: str, parent_tags: set[str]):
         text = (text or "").strip()
         if "_inline" in parent_tags:
             return " " + text + " "
@@ -634,7 +633,7 @@ def _indent_for_dd(match):
     #   https://michelf.ca/projects/php-markdown/extra/#def-list
     convert_dl = convert_div
 
-    def convert_dt(self, el, text, parent_tags):
+    def convert_dt(self, el: LexborNode, text: str, parent_tags: set[str]):
         # remove newlines from term text
         text = (text or "").strip()
         text = re_all_whitespace.sub(" ", text)
@@ -648,7 +647,7 @@ def convert_dt(self, el, text, parent_tags):
 
         return "\n\n%s\n" % text
 
-    def convert_hN(self, n, el, text, parent_tags):
+    def convert_hN(self, n: int, el: LexborNode, text: str, parent_tags: set[str]):
         # convert_hN() converts <hN> tags, where N is any integer
         if "_inline" in parent_tags:
             return text
@@ -667,12 +666,12 @@ def convert_hN(self, n, el, text, parent_tags):
             return "\n\n%s %s %s\n\n" % (hashes, text, hashes)
         return "\n\n%s %s\n\n" % (hashes, text)
 
-    def convert_hr(self, el, text, parent_tags):
+    def convert_hr(self, el: LexborNode, text: str, parent_tags: set[str]):
         return "\n\n---\n\n"
 
     convert_i = convert_em
 
-    def convert_img(self, el: LexborNode, text, parent_tags):
+    def convert_img(self, el: LexborNode, text: str, parent_tags: set[str]):
         if not el.parent:
             raise NotImplementedError(
                 "img element does not have a children. Potentially malformed?"
@@ -690,7 +689,7 @@ def convert_img(self, el: LexborNode, text, parent_tags):
 
         return "![%s](%s%s)" % (alt, src, title_part)
 
-    def convert_video(self, el: LexborNode, text, parent_tags):
+    def convert_video(self, el: LexborNode, text: str, parent_tags: set[str]):
         if not el.parent:
             raise NotImplementedError(
                 "video element does not have a children. Potentially malformed?"
@@ -715,7 +714,7 @@ def convert_video(self, el: LexborNode, text, parent_tags):
             return "![%s](%s)" % (text, poster)
         return text
 
-    def convert_list(self, el: LexborNode, text, parent_tags):
+    def convert_list(self, el: LexborNode, text: str, parent_tags: set[str]):
         # Converting a list to inline is undefined.
         # Ignoring inline conversion parents for list.
 
@@ -731,7 +730,7 @@ def convert_list(self, el: LexborNode, text, parent_tags):
     convert_ul = convert_list
     convert_ol = convert_list
 
-    def convert_li(self, el: LexborNode, text, parent_tags):
+    def convert_li(self, el: LexborNode, text: str, parent_tags: set[str]):
         if not el.parent:
             raise NotImplementedError(
                 "li element does not have a children. Potentially malformed?"
@@ -774,7 +773,7 @@ def _indent_for_li(match):
 
         return "%s\n" % text
 
-    def convert_p(self, el, text, parent_tags):
+    def convert_p(self, el: LexborNode, text: str, parent_tags: set[str]):
         if "_inline" in parent_tags:
             return " " + text.strip(" \t\r\n") + " "
         text = text.strip(" \t\r\n")
@@ -799,7 +798,7 @@ def convert_p(self, el, text, parent_tags):
                 text = "\n".join(new_lines)
         return "\n\n%s\n\n" % text if text else ""
 
-    def convert_pre(self, el, text, parent_tags):
+    def convert_pre(self, el: LexborNode, text: str, parent_tags: set[str]):
         if not text:
             return ""
         code_language = self.options["code_language"]
@@ -820,13 +819,17 @@ def convert_pre(self, el, text, parent_tags):
 
         return "\n\n```%s\n%s\n```\n\n" % (code_language, text)
 
-    def convert_q(self, el, text, parent_tags):
+    def convert_q(self, el: LexborNode, text: str, parent_tags: set[str] | None = None):
         return '"' + text + '"'
 
-    def convert_script(self, el, text, parent_tags):
+    def convert_script(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         return ""
 
-    def convert_style(self, el, text, parent_tags):
+    def convert_style(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         return ""
 
     convert_s = convert_del
@@ -839,16 +842,24 @@ def convert_style(self, el, text, parent_tags):
 
     convert_sup = abstract_inline_conversion(lambda self: self.options["sup_symbol"])
 
-    def convert_table(self, el: LexborNode, text, parent_tags):
+    def convert_table(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         return "\n\n" + text.strip() + "\n\n"
 
-    def convert_caption(self, el: LexborNode, text, parent_tags):
+    def convert_caption(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         return text.strip() + "\n\n"
 
-    def convert_figcaption(self, el: LexborNode, text, parent_tags):
+    def convert_figcaption(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         return "\n\n" + text.strip() + "\n\n"
 
-    def convert_td(self, el: LexborNode, text, parent_tags):
+    def convert_td(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         colspan = 1
         el_colspan = el.attributes.get("colspan")
         el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0
@@ -856,7 +867,9 @@ def convert_td(self, el: LexborNode, text, parent_tags):
             colspan = max(1, min(1000, el_colspan))
         return " " + text.strip().replace("\n", " ") + " |" * colspan
 
-    def convert_th(self, el: LexborNode, text, parent_tags):
+    def convert_th(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         colspan = 1
         el_colspan = el.attributes.get("colspan")
         el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0
@@ -864,7 +877,9 @@ def convert_th(self, el: LexborNode, text, parent_tags):
             colspan = max(1, min(1000, el_colspan))
         return " " + text.strip().replace("\n", " ") + " |" * colspan
 
-    def convert_tr(self, el: LexborNode, text, parent_tags):
+    def convert_tr(
+        self, el: LexborNode, text: str, parent_tags: set[str] | None = None
+    ):
         if not el.parent or not el.parent.parent:
             raise NotImplementedError(
                 "Found table row with no parent or sub-parent. Malformed document?"

From b2922b20c7b1961ad832dbaad055920d4cff9b0a Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 13:19:49 +0800
Subject: [PATCH 6/8] removed unused regex

---
 markdownify/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index c2749a3..75058ec 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -5,7 +5,6 @@
 from selectolax.lexbor import LexborHTMLParser, LexborNode
 
 # General-purpose regex patterns
-re_convert_heading = re.compile(r"convert_h(\d+)")
 re_line_with_content = re.compile(r"^(.*)", flags=re.MULTILINE)
 re_whitespace = re.compile(r"[\t ]+")
 re_all_whitespace = re.compile(r"[\t \r\n]+")

From 330b0ad2ddf39eb5b4b4bc30072d0fe46ac71af6 Mon Sep 17 00:00:00 2001
From: Shinon 
Date: Mon, 17 Nov 2025 13:40:23 +0800
Subject: [PATCH 7/8] Fix tests

---
 markdownify/__init__.py        | 23 ++++++++---------------
 tests/test_custom_converter.py |  4 ++--
 tests/test_escaping.py         |  2 --
 tests/types.py                 |  7 +++----
 tests/utils.py                 |  5 +++--
 5 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 75058ec..7fe47f6 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -173,7 +173,7 @@ def should_remove_whitespace_inside(el: LexborNode | None):
     """Return to remove whitespace immediately inside a block-level element."""
     if not el or not el.tag:
         return False
-    if is_header_tag(el.tag) is not None:
+    if is_header_tag(el.tag):
         return True
     return el.tag in WHITESPACE_ABLE
 
@@ -212,19 +212,10 @@ def _is_block_content_element(el: LexborNode | None):
         return False
 
 
-def _prev_block_content_sibling(el):
-    """Returns the first previous sibling that is a content element, else None."""
-    while el is not None:
-        el = el.previous_sibling
-        if _is_block_content_element(el):
-            return el
-    return None
-
-
-def _next_block_content_sibling(el):
+def _next_block_content_sibling(el:LexborNode|None):
     """Returns the first next sibling that is a content element, else None."""
     while el is not None:
-        el = el.next_sibling
+        el = el.next
         if _is_block_content_element(el):
             return el
     return None
@@ -329,7 +320,7 @@ def _can_ignore(el: LexborNode):
                 raise ValueError("Unexpected element type: %s" % type(el))
 
         children_to_convert = [
-            el for el in el.iter(include_text=True) if not _can_ignore(el) and el != el
+            el for el in el.iter(include_text=True) if not _can_ignore(el)
         ]
 
         # Create a copy of this tag's parent context, then update it to include this tag
@@ -398,7 +389,9 @@ def _can_ignore(el: LexborNode):
         return text
 
     def convert__document_(self, el: LexborNode, text: str, parent_tags: set[str]):
-        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
+        """Final document-level formatting for lexbor (node.tag == "[document]")"""
+        # XXX: I believe this is not needed.
+
         if self.options["strip_document"] == LSTRIP:
             text = text.lstrip("\n")  # remove leading separation newlines
         elif self.options["strip_document"] == RSTRIP:
@@ -719,7 +712,7 @@ def convert_list(self, el: LexborNode, text: str, parent_tags: set[str]):
 
         before_paragraph = False
         next_sibling = _next_block_content_sibling(el)
-        if next_sibling and next_sibling.name not in ["ul", "ol"]:
+        if next_sibling and next_sibling.tag not in ["ul", "ol"]:
             before_paragraph = True
         if "li" in parent_tags:
             # remove trailing newline if we're in a nested list
diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
index 00a83fc..51b1170 100644
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -1,5 +1,5 @@
 from markdownify import MarkdownConverter
-from bs4 import BeautifulSoup
+from selectolax.lexbor import LexborHTMLParser
 
 
 class UnitTestConverter(MarkdownConverter):
@@ -40,5 +40,5 @@ def md(html, **options):
 
 def test_soup():
     html = '<b>test</b>'
-    soup = BeautifulSoup(html, 'html.parser')
+    soup = LexborHTMLParser(html)
     assert MarkdownConverter().convert_soup(soup) == '**test**'
diff --git a/tests/test_escaping.py b/tests/test_escaping.py
index bab4d11..af828e4 100644
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,5 +1,4 @@
 import warnings
-from bs4 import MarkupResemblesLocatorWarning
 from .utils import md
 
 
@@ -32,7 +31,6 @@ def test_single_escaping_entities():
 
 def test_misc():
     # ignore the bs4 warning that "1.2" or "*" looks like a filename
-    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
 
     assert md('\\*', escape_misc=True) == r'\\\*'
     assert md('&lt;foo>', escape_misc=True) == r'\<foo\>'
diff --git a/tests/types.py b/tests/types.py
index 7424978..90951de 100644
--- a/tests/types.py
+++ b/tests/types.py
@@ -1,5 +1,5 @@
 from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
-from bs4 import BeautifulSoup
+from selectolax.lexbor import LexborHTMLParser, LexborNode
 from typing import Union
 
 markdownify("

Hello

") == "Hello" # test default of STRIP @@ -11,7 +11,6 @@ # default options MarkdownConverter( autolinks=True, - bs4_options='html.parser', bullets='*+-', code_language='', code_language_callback=None, @@ -55,11 +54,11 @@ ).convert("") html = 'test' -soup = BeautifulSoup(html, 'html.parser') +soup = LexborHTMLParser(html) MarkdownConverter().convert_soup(soup) == '**test**' -def callback(el: BeautifulSoup) -> Union[str, None]: +def callback(el: LexborNode) -> Union[str, None]: return el['class'][0] if el.has_attr('class') else None diff --git a/tests/utils.py b/tests/utils.py index 0dac580..8e455d5 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,9 +1,10 @@ from markdownify import MarkdownConverter +from selectolax.lexbor import LexborHTMLParser # for unit testing, disable document-level stripping by default so that # separation newlines are included in testing -def md(html, **options): +def md(html: str, **options): options = {"strip_document": None, **options} - return MarkdownConverter(**options).convert(html) + return MarkdownConverter(**options).convert_soup(LexborHTMLParser(html).body) From ed93b1e2691658629fe39fa89c9d17351748c6a2 Mon Sep 17 00:00:00 2001 From: Shinon Date: Mon, 17 Nov 2025 13:46:54 +0800 Subject: [PATCH 8/8] Fix failing tests --- markdownify/__init__.py | 6 +++--- tests/utils.py | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 7fe47f6..bf2de77 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -330,7 +330,7 @@ def _can_ignore(el: LexborNode): # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag if ( - (node_tag and is_header_tag(node_tag) is not None) # headings + (node_tag and is_header_tag(node_tag)) # headings or node_tag in {"td", "th"} # table cells ): parent_tags_for_children.add("_inline") @@ -464,8 +464,8 @@ def get_conv_fn(self, tag_name: str): return convert_fn # If tag is any heading, handle with convert_hN() 
function - match = is_header_tag(tag_name) - if match: + is_header = is_header_tag(tag_name) + if is_header: n = int(tag_name[1:]) # get value of N from return lambda el, text, parent_tags: self.convert_hN( n, el, text, parent_tags diff --git a/tests/utils.py b/tests/utils.py index 8e455d5..83837f6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -1,5 +1,4 @@ from markdownify import MarkdownConverter -from selectolax.lexbor import LexborHTMLParser # for unit testing, disable document-level stripping by default so that @@ -7,4 +6,4 @@ def md(html: str, **options): options = {"strip_document": None, **options} - return MarkdownConverter(**options).convert_soup(LexborHTMLParser(html).body) + return MarkdownConverter(**options).convert(html)