diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 148d340..bf2de77 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,98 +1,122 @@ -from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag -from textwrap import fill import re -import six +from textwrap import fill +from typing import Any, Callable +from selectolax.lexbor import LexborHTMLParser, LexborNode # General-purpose regex patterns -re_convert_heading = re.compile(r'convert_h(\d+)') -re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE) -re_whitespace = re.compile(r'[\t ]+') -re_all_whitespace = re.compile(r'[\t \r\n]+') -re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') -re_html_heading = re.compile(r'h(\d+)') -re_pre_lstrip1 = re.compile(r'^ *\n') -re_pre_rstrip1 = re.compile(r'\n *$') -re_pre_lstrip = re.compile(r'^[ \n]*\n') -re_pre_rstrip = re.compile(r'[ \n]*$') +re_line_with_content = re.compile(r"^(.*)", flags=re.MULTILINE) +re_whitespace = re.compile(r"[\t ]+") +re_all_whitespace = re.compile(r"[\t \r\n]+") +re_newline_whitespace = re.compile(r"[\t \r\n]*[\r\n][\t \r\n]*") +re_pre_lstrip1 = re.compile(r"^ *\n") +re_pre_rstrip1 = re.compile(r"\n *$") +re_pre_lstrip = re.compile(r"^[ \n]*\n") +re_pre_rstrip = re.compile(r"[ \n]*$") # Pattern for creating convert_ function names from tag names -re_make_convert_fn_name = re.compile(r'[\[\]:-]') +re_make_convert_fn_name = re.compile(r"[\[\]:-]") # Extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) -re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) +re_extract_newlines = re.compile(r"^(\n*)((?:.*[^\n])?)(\n*)$", flags=re.DOTALL) # Escape miscellaneous special Markdown characters -re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])') +re_escape_misc_chars = re.compile(r"([]\\&<`[>~=+|])") # Escape sequence of one or more consecutive '-', preceded # 
and followed by whitespace or start/end of fragment, as it # might be confused with an underline of a header, or with a # list marker -re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))') +re_escape_misc_dash_sequences = re.compile(r"(\s|^)(-+(?:\s|$))") # Escape sequence of up to six consecutive '#', preceded # and followed by whitespace or start/end of fragment, as # it might be confused with an ATX heading -re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))') +re_escape_misc_hashes = re.compile(r"(\s|^)(#{1,6}(?:\s|$))") # Escape '.' or ')' preceded by up to nine digits, as it might be # confused with a list item -re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))') +re_escape_misc_list_items = re.compile(r"((?:\s|^)[0-9]{1,9})([.)](?:\s|$))") # Find consecutive backtick sequences in a string -re_backtick_runs = re.compile(r'`+') +re_backtick_runs = re.compile(r"`+") # Heading styles -ATX = 'atx' -ATX_CLOSED = 'atx_closed' -UNDERLINED = 'underlined' +ATX = "atx" +ATX_CLOSED = "atx_closed" +UNDERLINED = "underlined" SETEXT = UNDERLINED # Newline style -SPACES = 'spaces' -BACKSLASH = 'backslash' +SPACES = "spaces" +BACKSLASH = "backslash" # Strong and emphasis style -ASTERISK = '*' -UNDERSCORE = '_' +ASTERISK = "*" +UNDERSCORE = "_" # Document/pre strip styles -LSTRIP = 'lstrip' -RSTRIP = 'rstrip' -STRIP = 'strip' -STRIP_ONE = 'strip_one' +LSTRIP = "lstrip" +RSTRIP = "rstrip" +STRIP = "strip" +STRIP_ONE = "strip_one" + + +def is_header_tag(tag_name: str): + """Returns True if the tag is a header (h1, h2, h3 ...)""" + tag_name = tag_name.lower() + # XXX: isdigit() is the fastest, but can be inaccurate + return tag_name[0] == "h" and tag_name[1:].isdigit() + + +def find_previous_siblings(el: LexborNode | None, tag: str): + """Finds a previous element with specified tag""" + while el: + el = el.prev + if el and el.tag == tag: + yield el -def strip1_pre(text): +def strip1_pre(text: str): """Strip one leading and trailing 
newline from a <pre> string."""
-    text = re_pre_lstrip1.sub('', text)
-    text = re_pre_rstrip1.sub('', text)
+    text = re_pre_lstrip1.sub("", text)
+    text = re_pre_rstrip1.sub("", text)
     return text
 
 
-def strip_pre(text):
+def strip_pre(text: str):
     """Strip all leading and trailing newlines from a <pre> string."""
-    text = re_pre_lstrip.sub('', text)
-    text = re_pre_rstrip.sub('', text)
+    text = re_pre_lstrip.sub("", text)
+    text = re_pre_rstrip.sub("", text)
     return text
 
 
-def chomp(text):
+def find_parent(el: LexborNode | None, node_tag: str):
+    """Finds a parent with the specified tag"""
+    while el:
+        el = el.parent
+        if el is None:
+            break
+        if el.tag == node_tag:
+            return el
+    return el
+
+
+def chomp(text: str):
     """
     If the text in an inline tag like b, a, or em contains a leading or trailing
     space, strip the string and return a space as suffix of prefix, if needed.
     This function is used to prevent conversions like
          foo => ** foo**
     """
-    prefix = ' ' if text and text[0] == ' ' else ''
-    suffix = ' ' if text and text[-1] == ' ' else ''
+    prefix = " " if text and text[0] == " " else ""
+    suffix = " " if text and text[-1] == " " else ""
     text = text.strip()
     return (prefix, suffix, text)
 
 
-def abstract_inline_conversion(markup_fn):
+def abstract_inline_conversion(markup_fn: Callable[["MarkdownConverter"], str]):
     """
     This abstracts all simple inline tags like b, em, del, ...
     Returns a function that wraps the chomped text in a pair of the string
@@ -100,85 +124,108 @@ def abstract_inline_conversion(markup_fn):
     the text if it looks like an HTML tag. markup_fn is necessary to allow for
     references to self.strong_em_symbol etc.
     """
+
     def implementation(self, el, text, parent_tags):
         markup_prefix = markup_fn(self)
-        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
-            markup_suffix = '"):
+            markup_suffix = " str | None:
+        soup = LexborHTMLParser(html)
         return self.convert_soup(soup)
 
-    def convert_soup(self, soup):
-        return self.process_tag(soup, parent_tags=set())
+    def convert_soup(self, soup: LexborHTMLParser | LexborNode) -> str | None:
+        if isinstance(soup, LexborHTMLParser) and soup.root:
+            return self.process_tag(soup.root, parent_tags=set())
+        elif isinstance(soup, LexborNode):
+            return self.process_tag(soup, parent_tags=set())
+        raise NotImplementedError(
+            f"Unexpected type: {type(soup)} passed to convert_soup()."
+        )
 
-    def process_element(self, node, parent_tags=None):
-        if isinstance(node, NavigableString):
-            return self.process_text(node, parent_tags=parent_tags)
+    def process_element(self, el: LexborNode, parent_tags: set[str] | None = None):
+        if el.tag and el.tag == "-text":
+            return self.process_text(el, parent_tags=parent_tags)
         else:
-            return self.process_tag(node, parent_tags=parent_tags)
+            return self.process_tag(el, parent_tags=parent_tags)
 
-    def process_tag(self, node, parent_tags=None):
+    def process_tag(self, el: LexborNode, parent_tags: set[str] | None = None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
 
+        node_tag = el.tag
+
         # Collect child elements to process, ignoring whitespace-only text elements
         # adjacent to the inner/outer boundaries of block elements.
-        should_remove_inside = should_remove_whitespace_inside(node)
+        should_remove_inside = should_remove_whitespace_inside(el)
 
-        def _can_ignore(el):
-            if isinstance(el, Tag):
+        def _can_ignore(el: LexborNode):
+            if is_tag(el):
                 # Tags are always processed.
                 return False
-            elif isinstance(el, (Comment, Doctype)):
+            elif el.tag in ["-comment", "-doctype"]:
                 # Comment and Doctype elements are always ignored.
                 # (subclasses of NavigableString, must test first)
                 return True
-            elif isinstance(el, NavigableString):
-                if six.text_type(el).strip() != '':
+            elif el.tag == "-text":
+                if el.text_content and el.text_content.strip():
                     # Non-whitespace text nodes are always processed.
                     return False
-                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
+                elif should_remove_inside and (not el.prev or not el.next):
                     # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                     return True
-                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
+                elif should_remove_whitespace_outside(
+                    el.prev
+                ) or should_remove_whitespace_outside(el.next):
                     # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                     return True
                 else:
@@ -263,25 +317,27 @@ def _can_ignore(el):
             elif el is None:
                 return True
             else:
-                raise ValueError('Unexpected element type: %s' % type(el))
+                raise ValueError("Unexpected element type: %s" % type(el))
 
-        children_to_convert = [el for el in node.children if not _can_ignore(el)]
+        children_to_convert = [
+            el for el in el.iter(include_text=True) if not _can_ignore(el)
+        ]
 
         # Create a copy of this tag's parent context, then update it to include this tag
         # to propagate down into the children.
         parent_tags_for_children = set(parent_tags)
-        parent_tags_for_children.add(node.name)
+        parent_tags_for_children.add(el.tag)
 
         # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
         if (
-            re_html_heading.match(node.name) is not None  # headings
-            or node.name in {'td', 'th'}  # table cells
+            (node_tag and is_header_tag(node_tag))  # headings
+            or node_tag in {"td", "th"}  # table cells
         ):
-            parent_tags_for_children.add('_inline')
+            parent_tags_for_children.add("_inline")
 
         # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
-        if node.name in {'pre', 'code', 'kbd', 'samp'}:
-            parent_tags_for_children.add('_noformat')
+        if node_tag in {"pre", "code", "kbd", "samp"}:
+            parent_tags_for_children.add("_noformat")
 
         # Convert the children elements into a list of result strings.
         child_strings = [
@@ -293,22 +349,26 @@ def _can_ignore(el):
         child_strings = [s for s in child_strings if s]
 
         # Collapse newlines at child element boundaries, if needed.
-        if node.name == 'pre' or node.find_parent('pre'):
+        if node_tag == "pre" or find_parent(el, "pre"):
             # Inside <pre> blocks, do not collapse newlines.
             pass
         else:
             # Collapse newlines at child element boundaries.
-            updated_child_strings = ['']  # so the first lookback works
+            updated_child_strings = [""]  # so the first lookback works
             for child_string in child_strings:
                 # Separate the leading/trailing newlines from the content.
-                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()
+                leading_nl, content, trailing_nl = re_extract_newlines.match(
+                    child_string
+                ).groups()
 
                 # If the last child had trailing newlines and this child has leading newlines,
                 # use the larger newline count, limited to 2.
                 if updated_child_strings[-1] and leading_nl:
-                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
+                    prev_trailing_nl = (
+                        updated_child_strings.pop()
+                    )  # will be replaced by the collapsed value
                     num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
-                    leading_nl = '\n' * num_newlines
+                    leading_nl = "\n" * num_newlines
 
                 # Add the results to the updated child string list.
                 updated_child_strings.extend([leading_nl, content, trailing_nl])
@@ -316,64 +376,71 @@ def _can_ignore(el):
             child_strings = updated_child_strings
 
         # Join all child text strings into a single string.
-        text = ''.join(child_strings)
+        text = "".join(child_strings)
 
+        # Ensure el.tag is valid.
+        if el.tag is None:
+            raise NotImplementedError("Expected tag to be valid. Got None.")
         # apply this tag's final conversion function
-        convert_fn = self.get_conv_fn_cached(node.name)
+        convert_fn = self.get_conv_fn_cached(el.tag)
         if convert_fn is not None:
-            text = convert_fn(node, text, parent_tags=parent_tags)
+            text = convert_fn(el, text, parent_tags=parent_tags)
 
         return text
 
-    def convert__document_(self, el, text, parent_tags):
-        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
-        if self.options['strip_document'] == LSTRIP:
-            text = text.lstrip('\n')  # remove leading separation newlines
-        elif self.options['strip_document'] == RSTRIP:
-            text = text.rstrip('\n')  # remove trailing separation newlines
-        elif self.options['strip_document'] == STRIP:
-            text = text.strip('\n')  # remove leading and trailing separation newlines
-        elif self.options['strip_document'] is None:
+    def convert__document_(self, el: LexborNode, text: str, parent_tags: set[str]):
+        """Final document-level formatting for lexbor (node.tag == "[document]")"""
+        # XXX: I believe this is not needed.
+
+        if self.options["strip_document"] == LSTRIP:
+            text = text.lstrip("\n")  # remove leading separation newlines
+        elif self.options["strip_document"] == RSTRIP:
+            text = text.rstrip("\n")  # remove trailing separation newlines
+        elif self.options["strip_document"] == STRIP:
+            text = text.strip("\n")  # remove leading and trailing separation newlines
+        elif self.options["strip_document"] is None:
             pass  # leave leading and trailing separation newlines as-is
         else:
-            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+            raise ValueError(
+                "Invalid value for strip_document: %s" % self.options["strip_document"]
+            )
 
         return text
 
-    def process_text(self, el, parent_tags=None):
+    def process_text(self, el: LexborNode, parent_tags: set[str] | None = None):
         # For the top-level element, initialize the parent context with an empty set.
         if parent_tags is None:
             parent_tags = set()
 
-        text = six.text_type(el) or ''
+        text = el.text_content or ""
 
         # normalize whitespace if we're not inside a preformatted element
-        if 'pre' not in parent_tags:
-            if self.options['wrap']:
-                text = re_all_whitespace.sub(' ', text)
+        if "pre" not in parent_tags:
+            if self.options["wrap"]:
+                text = re_all_whitespace.sub(" ", text)
             else:
-                text = re_newline_whitespace.sub('\n', text)
-                text = re_whitespace.sub(' ', text)
+                text = re_newline_whitespace.sub("\n", text)
+                text = re_whitespace.sub(" ", text)
 
         # escape special characters if we're not inside a preformatted or code element
-        if '_noformat' not in parent_tags:
+        if "_noformat" not in parent_tags:
             text = self.escape(text, parent_tags)
 
         # remove leading whitespace at the start or just after a
         # block-level element; remove traliing whitespace at the end
         # or just before a block-level element.
-        if (should_remove_whitespace_outside(el.previous_sibling)
-                or (should_remove_whitespace_inside(el.parent)
-                    and not el.previous_sibling)):
-            text = text.lstrip(' \t\r\n')
-        if (should_remove_whitespace_outside(el.next_sibling)
-                or (should_remove_whitespace_inside(el.parent)
-                    and not el.next_sibling)):
+        if should_remove_whitespace_outside(el.prev) or (
+            should_remove_whitespace_inside(el.parent) and not el.prev
+        ):
+            text = text.lstrip(" \t\r\n")
+        if should_remove_whitespace_outside(el.next) or (
+            should_remove_whitespace_inside(el.parent) and not el.next
+        ):
             text = text.rstrip()
 
         return text
 
-    def get_conv_fn_cached(self, tag_name):
+    def get_conv_fn_cached(self, tag_name: str):
         """Given a tag name, return the conversion function using the cache."""
         # If conversion function is not in cache, add it
         if tag_name not in self.convert_fn_cache:
@@ -382,7 +449,7 @@ def get_conv_fn_cached(self, tag_name):
         # Return the cached entry
         return self.convert_fn_cache[tag_name]
 
-    def get_conv_fn(self, tag_name):
+    def get_conv_fn(self, tag_name: str):
         """Given a tag name, find and return the conversion function."""
         tag_name = tag_name.lower()
 
@@ -397,18 +464,20 @@ def get_conv_fn(self, tag_name):
             return convert_fn
 
         # If tag is any heading, handle with convert_hN() function
-        match = re_html_heading.match(tag_name)
-        if match:
-            n = int(match.group(1))  # get value of N from <hN>
-            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
+        is_header = is_header_tag(tag_name)
+        if is_header:
+            n = int(tag_name[1:])  # get value of N from <hN>
+            return lambda el, text, parent_tags: self.convert_hN(
+                n, el, text, parent_tags
+            )
 
         # No conversion function was found
         return None
 
-    def should_convert_tag(self, tag):
+    def should_convert_tag(self, tag: str):
         """Given a tag name, return whether to convert based on strip/convert options."""
-        strip = self.options['strip']
-        convert = self.options['convert']
+        strip = self.options["strip"]
+        convert = self.options["convert"]
         if strip is not None:
             return tag not in strip
         elif convert is not None:
@@ -416,302 +485,344 @@ def should_convert_tag(self, tag):
         else:
             return True
 
-    def escape(self, text, parent_tags):
+    def escape(self, text: str, parent_tags: set[str]):
         if not text:
-            return ''
-        if self.options['escape_misc']:
-            text = re_escape_misc_chars.sub(r'\\\1', text)
-            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
-            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
-            text = re_escape_misc_list_items.sub(r'\1\\\2', text)
-
-        if self.options['escape_asterisks']:
-            text = text.replace('*', r'\*')
-        if self.options['escape_underscores']:
-            text = text.replace('_', r'\_')
+            return ""
+        if self.options["escape_misc"]:
+            text = re_escape_misc_chars.sub(r"\\\1", text)
+            text = re_escape_misc_dash_sequences.sub(r"\1\\\2", text)
+            text = re_escape_misc_hashes.sub(r"\1\\\2", text)
+            text = re_escape_misc_list_items.sub(r"\1\\\2", text)
+
+        if self.options["escape_asterisks"]:
+            text = text.replace("*", r"\*")
+        if self.options["escape_underscores"]:
+            text = text.replace("_", r"\_")
         return text
 
-    def underline(self, text, pad_char):
-        text = (text or '').rstrip()
-        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+    def underline(self, text: str, pad_char: str):
+        text = (text or "").rstrip()
+        return "\n\n%s\n%s\n\n" % (text, pad_char * len(text)) if text else ""
 
-    def convert_a(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+    def convert_a(self, el: LexborNode, text: str, parent_tags: set[str]):
+        if "_noformat" in parent_tags:
             return text
         prefix, suffix, text = chomp(text)
         if not text:
-            return ''
-        href = el.get('href')
-        title = el.get('title')
+            return ""
+        attributes = el.attributes
+        href = attributes.get("href")
+        title = attributes.get("title")
         # For the replacement see #29: text nodes underscores are escaped
-        if (self.options['autolinks']
-                and text.replace(r'\_', '_') == href
-                and not title
-                and not self.options['default_title']):
+        if (
+            self.options["autolinks"]
+            and text.replace(r"\_", "_") == href
+            and not title
+            and not self.options["default_title"]
+        ):
             # Shortcut syntax
-            return '<%s>' % href
-        if self.options['default_title'] and not title:
+            return "<%s>" % href
+        if self.options["default_title"] and not title:
             title = href
-        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
+        title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
+        return (
+            "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix)
+            if href
+            else text
+        )
 
-    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
+    convert_b = abstract_inline_conversion(
+        lambda self: 2 * self.options["strong_em_symbol"]
+    )
 
-    def convert_blockquote(self, el, text, parent_tags):
+    def convert_blockquote(self, el: LexborNode, text: str, parent_tags: set[str]):
         # handle some early-exit scenarios
-        text = (text or '').strip(' \t\r\n')
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+        text = (text or "").strip(" \t\r\n")
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
             return "\n"
 
         # indent lines with blockquote marker
         def _indent_for_blockquote(match):
             line_content = match.group(1)
-            return '> ' + line_content if line_content else '>'
+            return "> " + line_content if line_content else ">"
+
         text = re_line_with_content.sub(_indent_for_blockquote, text)
 
-        return '\n' + text + '\n\n'
+        return "\n" + text + "\n\n"
 
-    def convert_br(self, el, text, parent_tags):
-        if '_inline' in parent_tags:
-            return ' '
+    def convert_br(self, el: LexborNode, text: str, parent_tags: set[str]):
+        if "_inline" in parent_tags:
+            return " "
 
-        if self.options['newline_style'].lower() == BACKSLASH:
-            return '\\\n'
+        if self.options["newline_style"].lower() == BACKSLASH:
+            return "\\\n"
         else:
-            return '  \n'
+            return "  \n"
 
-    def convert_code(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+    def convert_code(self, el: LexborNode, text: str, parent_tags: set[str]):
+        if "_noformat" in parent_tags:
             return text
 
         prefix, suffix, text = chomp(text)
         if not text:
-            return ''
+            return ""
 
         # Find the maximum number of consecutive backticks in the text, then
         # delimit the code span with one more backtick than that
-        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
-        markup_delimiter = '`' * (max_backticks + 1)
+        max_backticks = max(
+            (len(match) for match in re.findall(re_backtick_runs, text)), default=0
+        )
+        markup_delimiter = "`" * (max_backticks + 1)
 
         # If the maximum number of backticks is greater than zero, add a space
         # to avoid interpretation of inside backticks as literals
         if max_backticks > 0:
             text = " " + text + " "
 
-        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
+        return "%s%s%s%s%s" % (prefix, markup_delimiter, text, markup_delimiter, suffix)
 
-    convert_del = abstract_inline_conversion(lambda self: '~~')
+    convert_del = abstract_inline_conversion(lambda self: "~~")
 
-    def convert_div(self, el, text, parent_tags):
-        if '_inline' in parent_tags:
-            return ' ' + text.strip() + ' '
+    def convert_div(self, el: LexborNode, text: str, parent_tags: set[str]):
+        if "_inline" in parent_tags:
+            return " " + text.strip() + " "
         text = text.strip()
-        return '\n\n%s\n\n' % text if text else ''
+        return "\n\n%s\n\n" % text if text else ""
 
     convert_article = convert_div
 
     convert_section = convert_div
 
-    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
+    convert_em = abstract_inline_conversion(
+        lambda self: self.options["strong_em_symbol"]
+    )
 
     convert_kbd = convert_code
 
-    def convert_dd(self, el, text, parent_tags):
-        text = (text or '').strip()
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+    def convert_dd(self, el: LexborNode, text: str, parent_tags: set[str]):
+        text = (text or "").strip()
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
-            return '\n'
+            return "\n"
 
         # indent definition content lines by four spaces
         def _indent_for_dd(match):
             line_content = match.group(1)
-            return '    ' + line_content if line_content else ''
+            return "    " + line_content if line_content else ""
+
         text = re_line_with_content.sub(_indent_for_dd, text)
 
         # insert definition marker into first-line indent whitespace
-        text = ':' + text[1:]
+        text = ":" + text[1:]
 
-        return '%s\n' % text
+        return "%s\n" % text
 
     # definition lists are formatted as follows:
     #   https://pandoc.org/MANUAL.html#definition-lists
     #   https://michelf.ca/projects/php-markdown/extra/#def-list
     convert_dl = convert_div
 
-    def convert_dt(self, el, text, parent_tags):
+    def convert_dt(self, el: LexborNode, text: str, parent_tags: set[str]):
         # remove newlines from term text
-        text = (text or '').strip()
-        text = re_all_whitespace.sub(' ', text)
-        if '_inline' in parent_tags:
-            return ' ' + text + ' '
+        text = (text or "").strip()
+        text = re_all_whitespace.sub(" ", text)
+        if "_inline" in parent_tags:
+            return " " + text + " "
         if not text:
-            return '\n'
+            return "\n"
 
         # TODO - format consecutive 
elements as directly adjacent lines): # https://michelf.ca/projects/php-markdown/extra/#def-list - return '\n\n%s\n' % text + return "\n\n%s\n" % text - def convert_hN(self, n, el, text, parent_tags): + def convert_hN(self, n: int, el: LexborNode, text: str, parent_tags: set[str]): # convert_hN() converts tags, where N is any integer - if '_inline' in parent_tags: + if "_inline" in parent_tags: return text # Markdown does not support heading depths of n > 6 n = max(1, min(6, n)) - style = self.options['heading_style'].lower() + style = self.options["heading_style"].lower() text = text.strip() if style == UNDERLINED and n <= 2: - line = '=' if n == 1 else '-' + line = "=" if n == 1 else "-" return self.underline(text, line) - text = re_all_whitespace.sub(' ', text) - hashes = '#' * n + text = re_all_whitespace.sub(" ", text) + hashes = "#" * n if style == ATX_CLOSED: - return '\n\n%s %s %s\n\n' % (hashes, text, hashes) - return '\n\n%s %s\n\n' % (hashes, text) + return "\n\n%s %s %s\n\n" % (hashes, text, hashes) + return "\n\n%s %s\n\n" % (hashes, text) - def convert_hr(self, el, text, parent_tags): - return '\n\n---\n\n' + def convert_hr(self, el: LexborNode, text: str, parent_tags: set[str]): + return "\n\n---\n\n" convert_i = convert_em - def convert_img(self, el, text, parent_tags): - alt = el.attrs.get('alt', None) or '' - src = el.attrs.get('src', None) or '' - title = el.attrs.get('title', None) or '' - title_part = ' "%s"' % title.replace('"', r'\"') if title else '' - if ('_inline' in parent_tags - and el.parent.name not in self.options['keep_inline_images_in']): + def convert_img(self, el: LexborNode, text: str, parent_tags: set[str]): + if not el.parent: + raise NotImplementedError( + "img element does not have a children. Potentially malformed?" 
+ ) + attrs = el.attributes + alt = attrs.get("alt", None) or "" + src = attrs.get("src", None) or "" + title = attrs.get("title", None) or "" + title_part = ' "%s"' % title.replace('"', r"\"") if title else "" + if ( + "_inline" in parent_tags + and el.parent.tag not in self.options["keep_inline_images_in"] + ): return alt - return '![%s](%s%s)' % (alt, src, title_part) + return "![%s](%s%s)" % (alt, src, title_part) - def convert_video(self, el, text, parent_tags): - if ('_inline' in parent_tags - and el.parent.name not in self.options['keep_inline_images_in']): + def convert_video(self, el: LexborNode, text: str, parent_tags: set[str]): + if not el.parent: + raise NotImplementedError( + "video element does not have a children. Potentially malformed?" + ) + if ( + "_inline" in parent_tags + and el.parent.tag not in self.options["keep_inline_images_in"] + ): return text - src = el.attrs.get('src', None) or '' + attrs = el.attributes + src = attrs.get("src", None) or "" if not src: - sources = el.find_all('source', attrs={'src': True}) + sources = el.css("source[src]") if sources: - src = sources[0].attrs.get('src', None) or '' - poster = el.attrs.get('poster', None) or '' + src = sources[0].attributes.get("src", None) or "" + poster = attrs.get("poster", None) or "" if src and poster: - return '[![%s](%s)](%s)' % (text, poster, src) + return "[![%s](%s)](%s)" % (text, poster, src) if src: - return '[%s](%s)' % (text, src) + return "[%s](%s)" % (text, src) if poster: - return '![%s](%s)' % (text, poster) + return "![%s](%s)" % (text, poster) return text - def convert_list(self, el, text, parent_tags): - + def convert_list(self, el: LexborNode, text: str, parent_tags: set[str]): # Converting a list to inline is undefined. # Ignoring inline conversion parents for list. 
before_paragraph = False next_sibling = _next_block_content_sibling(el) - if next_sibling and next_sibling.name not in ['ul', 'ol']: + if next_sibling and next_sibling.tag not in ["ul", "ol"]: before_paragraph = True - if 'li' in parent_tags: + if "li" in parent_tags: # remove trailing newline if we're in a nested list - return '\n' + text.rstrip() - return '\n\n' + text + ('\n' if before_paragraph else '') + return "\n" + text.rstrip() + return "\n\n" + text + ("\n" if before_paragraph else "") convert_ul = convert_list convert_ol = convert_list - def convert_li(self, el, text, parent_tags): + def convert_li(self, el: LexborNode, text: str, parent_tags: set[str]): + if not el.parent: + raise NotImplementedError( + "li element does not have a children. Potentially malformed?" + ) # handle some early-exit scenarios - text = (text or '').strip() + text = (text or "").strip() if not text: return "\n" # determine list item bullet character to use parent = el.parent - if parent is not None and parent.name == 'ol': - if parent.get("start") and str(parent.get("start")).isnumeric(): - start = int(parent.get("start")) + if parent is not None and parent.tag == "ol": + start_attribute = parent.attributes.get("start") + if start_attribute and str(start_attribute).isnumeric(): + start = int(start_attribute) else: start = 1 - bullet = '%s.' % (start + len(el.find_previous_siblings('li'))) + bullet = "%s." 
% (start + len(list(find_previous_siblings(el, "li")))) else: depth = -1 while el: - if el.name == 'ul': + if el.tag == "ul": depth += 1 el = el.parent - bullets = self.options['bullets'] + bullets = self.options["bullets"] bullet = bullets[depth % len(bullets)] - bullet = bullet + ' ' + bullet = bullet + " " bullet_width = len(bullet) - bullet_indent = ' ' * bullet_width + bullet_indent = " " * bullet_width # indent content lines by bullet width def _indent_for_li(match): line_content = match.group(1) - return bullet_indent + line_content if line_content else '' + return bullet_indent + line_content if line_content else "" + text = re_line_with_content.sub(_indent_for_li, text) # insert bullet into first-line indent whitespace text = bullet + text[bullet_width:] - return '%s\n' % text + return "%s\n" % text - def convert_p(self, el, text, parent_tags): - if '_inline' in parent_tags: - return ' ' + text.strip(' \t\r\n') + ' ' - text = text.strip(' \t\r\n') - if self.options['wrap']: + def convert_p(self, el: LexborNode, text: str, parent_tags: set[str]): + if "_inline" in parent_tags: + return " " + text.strip(" \t\r\n") + " " + text = text.strip(" \t\r\n") + if self.options["wrap"]: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been # replaced by spaces. - if self.options['wrap_width'] is not None: - lines = text.split('\n') + if self.options["wrap_width"] is not None: + lines = text.split("\n") new_lines = [] for line in lines: - line = line.lstrip(' \t\r\n') + line = line.lstrip(" \t\r\n") line_no_trailing = line.rstrip() - trailing = line[len(line_no_trailing):] - line = fill(line, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) + trailing = line[len(line_no_trailing) :] + line = fill( + line, + width=self.options["wrap_width"], + break_long_words=False, + break_on_hyphens=False, + ) new_lines.append(line + trailing) - text = '\n'.join(new_lines) - return '\n\n%s\n\n' % text if text else '' + text = "\n".join(new_lines) + return "\n\n%s\n\n" % text if text else "" - def convert_pre(self, el, text, parent_tags): + def convert_pre(self, el: LexborNode, text: str, parent_tags: set[str]): if not text: - return '' - code_language = self.options['code_language'] + return "" + code_language = self.options["code_language"] - if self.options['code_language_callback']: - code_language = self.options['code_language_callback'](el) or code_language + if self.options["code_language_callback"]: + code_language = self.options["code_language_callback"](el) or code_language - if self.options['strip_pre'] == STRIP: + if self.options["strip_pre"] == STRIP: text = strip_pre(text) # remove all leading/trailing newlines - elif self.options['strip_pre'] == STRIP_ONE: + elif self.options["strip_pre"] == STRIP_ONE: text = strip1_pre(text) # remove one leading/trailing newline - elif self.options['strip_pre'] is None: + elif self.options["strip_pre"] is None: pass # leave leading and trailing newlines as-is else: - raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre']) + raise ValueError( + "Invalid value for strip_pre: %s" % self.options["strip_pre"] + ) - return '\n\n```%s\n%s\n```\n\n' % (code_language, 
text) + return "\n\n```%s\n%s\n```\n\n" % (code_language, text) - def convert_q(self, el, text, parent_tags): + def convert_q(self, el: LexborNode, text: str, parent_tags: set[str] | None = None): return '"' + text + '"' - def convert_script(self, el, text, parent_tags): - return '' + def convert_script( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + return "" - def convert_style(self, el, text, parent_tags): - return '' + def convert_style( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + return "" convert_s = convert_del @@ -719,75 +830,99 @@ def convert_style(self, el, text, parent_tags): convert_samp = convert_code - convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol']) + convert_sub = abstract_inline_conversion(lambda self: self.options["sub_symbol"]) - convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) + convert_sup = abstract_inline_conversion(lambda self: self.options["sup_symbol"]) - def convert_table(self, el, text, parent_tags): - return '\n\n' + text.strip() + '\n\n' + def convert_table( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + return "\n\n" + text.strip() + "\n\n" - def convert_caption(self, el, text, parent_tags): - return text.strip() + '\n\n' + def convert_caption( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + return text.strip() + "\n\n" - def convert_figcaption(self, el, text, parent_tags): - return '\n\n' + text.strip() + '\n\n' + def convert_figcaption( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + return "\n\n" + text.strip() + "\n\n" - def convert_td(self, el, text, parent_tags): + def convert_td( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): colspan = 1 - if 'colspan' in el.attrs and el['colspan'].isdigit(): - colspan = max(1, min(1000, int(el['colspan']))) - return ' ' + 
text.strip().replace("\n", " ") + ' |' * colspan - - def convert_th(self, el, text, parent_tags): + el_colspan = el.attributes.get("colspan") + el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0 + if el_colspan: + colspan = max(1, min(1000, el_colspan)) + return " " + text.strip().replace("\n", " ") + " |" * colspan + + def convert_th( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): colspan = 1 - if 'colspan' in el.attrs and el['colspan'].isdigit(): - colspan = max(1, min(1000, int(el['colspan']))) - return ' ' + text.strip().replace("\n", " ") + ' |' * colspan - - def convert_tr(self, el, text, parent_tags): - cells = el.find_all(['td', 'th']) - is_first_row = el.find_previous_sibling() is None - is_headrow = ( - all([cell.name == 'th' for cell in cells]) - or (el.parent.name == 'thead' - # avoid multiple tr in thead - and len(el.parent.find_all('tr')) == 1) + el_colspan = el.attributes.get("colspan") + el_colspan = int(el_colspan) if el_colspan and el_colspan.isdigit() else 0 + if el_colspan: + colspan = max(1, min(1000, el_colspan)) + return " " + text.strip().replace("\n", " ") + " |" * colspan + + def convert_tr( + self, el: LexborNode, text: str, parent_tags: set[str] | None = None + ): + if not el.parent or not el.parent.parent: + raise NotImplementedError( + "Found table row with no parent or sub-parent. Malformed document?" 
+ ) + cells = el.css("td,th") + is_first_row = el.prev is None + is_headrow = all([cell.tag == "th" for cell in cells]) or ( + el.parent.tag == "thead" + # avoid multiple tr in thead + and len(el.parent.css("tr")) == 1 ) - is_head_row_missing = ( - (is_first_row and not el.parent.name == 'tbody') - or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) + is_head_row_missing = (is_first_row and not el.parent.tag == "tbody") or ( + is_first_row + and el.parent.tag == "tbody" + and len(el.parent.parent.css("thead")) < 1 ) - overline = '' - underline = '' + overline = "" + underline = "" full_colspan = 0 for cell in cells: - if 'colspan' in cell.attrs and cell['colspan'].isdigit(): - full_colspan += max(1, min(1000, int(cell['colspan']))) + cell_colspan = cell.attributes.get("colspan") + cell_colspan = ( + int(cell_colspan) if cell_colspan and cell_colspan.isdigit() else 0 + ) + if cell_colspan: + full_colspan += max(1, min(1000, cell_colspan)) else: full_colspan += 1 - if ((is_headrow - or (is_head_row_missing - and self.options['table_infer_header'])) - and is_first_row): + if ( + is_headrow or (is_head_row_missing and self.options["table_infer_header"]) + ) and is_first_row: # first row and: # - is headline or # - headline is missing and header inference is enabled # print headline underline - underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - elif ((is_head_row_missing - and not self.options['table_infer_header']) - or (is_first_row - and (el.parent.name == 'table' - or (el.parent.name == 'tbody' - and not el.parent.find_previous_sibling())))): + underline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n" + elif (is_head_row_missing and not self.options["table_infer_header"]) or ( + is_first_row + and ( + el.parent.tag == "table" + or (el.parent.tag == "tbody" and not el.parent.prev) + ) + ): # headline is missing and header inference is disabled or: # first row, not headline, and: # - the 
parent is table or # - the parent is tbody at the beginning of a table. # print empty headline above this row - overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n' - overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - return overline + '|' + text + '\n' + underline + overline += "| " + " | ".join([""] * full_colspan) + " |" + "\n" + overline += "| " + " | ".join(["---"] * full_colspan) + " |" + "\n" + return overline + "|" + text + "\n" + underline def markdownify(html, **options): diff --git a/markdownify/__init__.pyi b/markdownify/__init__.pyi index 5f9b852..a9b8674 100644 --- a/markdownify/__init__.pyi +++ b/markdownify/__init__.pyi @@ -14,11 +14,9 @@ RSTRIP: str STRIP: str STRIP_ONE: str - def markdownify( html: str, autolinks: bool = ..., - bs4_options: str = ..., bullets: str = ..., code_language: str = ..., code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ..., @@ -41,15 +39,15 @@ def markdownify( wrap_width: int = ..., ) -> str: ... - class MarkdownConverter: def __init__( self, autolinks: bool = ..., - bs4_options: str = ..., bullets: str = ..., code_language: str = ..., - code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ..., + code_language_callback: Union[ + Callable[[Incomplete], Union[str, None]], None + ] = ..., convert: Union[list[str], None] = ..., default_title: bool = ..., escape_asterisks: bool = ..., @@ -67,11 +65,6 @@ class MarkdownConverter: table_infer_header: bool = ..., wrap: bool = ..., wrap_width: int = ..., - ) -> None: - ... - - def convert(self, html: str) -> str: - ... - - def convert_soup(self, soup: Incomplete) -> str: - ... + ) -> None: ... + def convert(self, html: str) -> str: ... + def convert_soup(self, soup: Incomplete) -> str: ... 
diff --git a/markdownify/main.py b/markdownify/main.py index ba70671..6c32f28 100755 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -3,82 +3,145 @@ import argparse import sys -from markdownify import markdownify, ATX, ATX_CLOSED, UNDERLINED, \ - SPACES, BACKSLASH, ASTERISK, UNDERSCORE +from markdownify import ( + markdownify, + ATX, + ATX_CLOSED, + UNDERLINED, + SPACES, + BACKSLASH, + ASTERISK, + UNDERSCORE, +) def main(argv=sys.argv[1:]): parser = argparse.ArgumentParser( - prog='markdownify', - description='Converts html to markdown.', + prog="markdownify", + description="Converts html to markdown.", ) - parser.add_argument('html', nargs='?', type=argparse.FileType('r'), - default=sys.stdin, - help="The html file to convert. Defaults to STDIN if not " - "provided.") - parser.add_argument('-s', '--strip', nargs='*', - help="A list of tags to strip. This option can't be used with " - "the --convert option.") - parser.add_argument('-c', '--convert', nargs='*', - help="A list of tags to convert. 
This option can't be used with " - "the --strip option.") - parser.add_argument('-a', '--autolinks', action='store_true', - help="A boolean indicating whether the 'automatic link' style " - "should be used when a 'a' tag's contents match its href.") - parser.add_argument('--default-title', action='store_false', - help="A boolean to enable setting the title of a link to its " - "href, if no title is given.") - parser.add_argument('--heading-style', default=UNDERLINED, - choices=(ATX, ATX_CLOSED, UNDERLINED), - help="Defines how headings should be converted.") - parser.add_argument('-b', '--bullets', default='*+-', - help="A string of bullet styles to use; the bullet will " - "alternate based on nesting level.") - parser.add_argument('--strong-em-symbol', default=ASTERISK, - choices=(ASTERISK, UNDERSCORE), - help="Use * or _ to convert strong and italics text"), - parser.add_argument('--sub-symbol', default='', - help="Define the chars that surround ''.") - parser.add_argument('--sup-symbol', default='', - help="Define the chars that surround ''.") - parser.add_argument('--newline-style', default=SPACES, - choices=(SPACES, BACKSLASH), - help="Defines the style of
conversions: two spaces " - "or backslash at the and of the line thet should break.") - parser.add_argument('--code-language', default='', - help="Defines the language that should be assumed for all " - "'
' sections.")
-    parser.add_argument('--no-escape-asterisks', dest='escape_asterisks',
-                        action='store_false',
-                        help="Do not escape '*' to '\\*' in text.")
-    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
-                        action='store_false',
-                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in',
-                        default=[],
-                        nargs='*',
-                        help="Images are converted to their alt-text when the images are "
-                        "located inside headlines or table cells. If some inline images "
-                        "should be converted to markdown images instead, this option can "
-                        "be set to a list of parent tags that should be allowed to "
-                        "contain inline images.")
-    parser.add_argument('--table-infer-header', dest='table_infer_header',
-                        action='store_true',
-                        help="When a table has no header row (as indicated by '<thead>' "
-                        "or '<th>'), use the first body row as the header row.")
-    parser.add_argument('-w', '--wrap', action='store_true',
-                        help="Wrap all text paragraphs at --wrap-width characters.")
-    parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('--bs4-options',
-                        default='html.parser',
-                        help="Specifies the parser that BeautifulSoup should use to parse "
-                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
-                             "'html5lib'.")
+    parser.add_argument(
+        "html",
+        nargs="?",
+        type=argparse.FileType("r"),
+        default=sys.stdin,
+        help="The html file to convert. Defaults to STDIN if not provided.",
+    )
+    parser.add_argument(
+        "-s",
+        "--strip",
+        nargs="*",
+        help="A list of tags to strip. This option can't be used with "
+        "the --convert option.",
+    )
+    parser.add_argument(
+        "-c",
+        "--convert",
+        nargs="*",
+        help="A list of tags to convert. This option can't be used with "
+        "the --strip option.",
+    )
+    parser.add_argument(
+        "-a",
+        "--autolinks",
+        action="store_true",
+        help="A boolean indicating whether the 'automatic link' style "
+        "should be used when a 'a' tag's contents match its href.",
+    )
+    parser.add_argument(
+        "--default-title",
+        action="store_false",
+        help="A boolean to enable setting the title of a link to its "
+        "href, if no title is given.",
+    )
+    parser.add_argument(
+        "--heading-style",
+        default=UNDERLINED,
+        choices=(ATX, ATX_CLOSED, UNDERLINED),
+        help="Defines how headings should be converted.",
+    )
+    parser.add_argument(
+        "-b",
+        "--bullets",
+        default="*+-",
+        help="A string of bullet styles to use; the bullet will "
+        "alternate based on nesting level.",
+    )
+    (
+        parser.add_argument(
+            "--strong-em-symbol",
+            default=ASTERISK,
+            choices=(ASTERISK, UNDERSCORE),
+            help="Use * or _ to convert strong and italics text",
+        ),
+    )
+    parser.add_argument(
+        "--sub-symbol", default="", help="Define the chars that surround '<sub>'."
+    )
+    parser.add_argument(
+        "--sup-symbol", default="", help="Define the chars that surround '<sup>'."
+    )
+    parser.add_argument(
+        "--newline-style",
+        default=SPACES,
+        choices=(SPACES, BACKSLASH),
+        help="Defines the style of <br> conversions: two spaces "
+        "or backslash at the and of the line thet should break.",
+    )
+    parser.add_argument(
+        "--code-language",
+        default="",
+        help="Defines the language that should be assumed for all '<pre>' sections.",
+    )
+    parser.add_argument(
+        "--no-escape-asterisks",
+        dest="escape_asterisks",
+        action="store_false",
+        help="Do not escape '*' to '\\*' in text.",
+    )
+    parser.add_argument(
+        "--no-escape-underscores",
+        dest="escape_underscores",
+        action="store_false",
+        help="Do not escape '_' to '\\_' in text.",
+    )
+    parser.add_argument(
+        "-i",
+        "--keep-inline-images-in",
+        default=[],
+        nargs="*",
+        help="Images are converted to their alt-text when the images are "
+        "located inside headlines or table cells. If some inline images "
+        "should be converted to markdown images instead, this option can "
+        "be set to a list of parent tags that should be allowed to "
+        "contain inline images.",
+    )
+    parser.add_argument(
+        "--table-infer-header",
+        dest="table_infer_header",
+        action="store_true",
+        help="When a table has no header row (as indicated by '<thead>' "
+        "or '<th>'), use the first body row as the header row.",
+    )
+    parser.add_argument(
+        "-w",
+        "--wrap",
+        action="store_true",
+        help="Wrap all text paragraphs at --wrap-width characters.",
+    )
+    parser.add_argument("--wrap-width", type=int, default=80)
+    parser.add_argument(
+        "--bs4-options",
+        default="html.parser",
+        help="Specifies the parser that BeautifulSoup should use to parse "
+        "the HTML markup. Examples include 'html5.parser', 'lxml', and "
+        "'html5lib'.",
+    )
 
     args = parser.parse_args(argv)
     print(markdownify(**vars(args)))
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/pyproject.toml b/pyproject.toml
index 3df85eb..e8268bb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "markdownify"
-version = "1.2.2"
+version = "2.0.0"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
@@ -23,8 +23,7 @@ classifiers = [
     "Topic :: Utilities",
 ]
 dependencies = [
-    "beautifulsoup4>=4.9,<5",
-    "six>=1.15,<2"
+    "selectolax>0.4"
 ]
 
 [project.urls]
diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
index 00a83fc..51b1170 100644
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -1,5 +1,5 @@
 from markdownify import MarkdownConverter
-from bs4 import BeautifulSoup
+from selectolax.lexbor import LexborHTMLParser
 
 
 class UnitTestConverter(MarkdownConverter):
@@ -40,5 +40,5 @@ def md(html, **options):
 
 def test_soup():
     html = 'test'
-    soup = BeautifulSoup(html, 'html.parser')
+    soup = LexborHTMLParser(html)
     assert MarkdownConverter().convert_soup(soup) == '**test**'
diff --git a/tests/test_escaping.py b/tests/test_escaping.py
index bab4d11..af828e4 100644
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,5 +1,4 @@
 import warnings
-from bs4 import MarkupResemblesLocatorWarning
 from .utils import md
 
 
@@ -32,7 +31,6 @@ def test_single_escaping_entities():
 
 def test_misc():
     # ignore the bs4 warning that "1.2" or "*" looks like a filename
-    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
 
     assert md('\\*', escape_misc=True) == r'\\\*'
     assert md('<foo>', escape_misc=True) == r'\'
diff --git a/tests/types.py b/tests/types.py
index 7424978..90951de 100644
--- a/tests/types.py
+++ b/tests/types.py
@@ -1,5 +1,5 @@
 from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
-from bs4 import BeautifulSoup
+from selectolax.lexbor import LexborHTMLParser, LexborNode
 from typing import Union
 
 markdownify("

Hello

") == "Hello" # test default of STRIP @@ -11,7 +11,6 @@ # default options MarkdownConverter( autolinks=True, - bs4_options='html.parser', bullets='*+-', code_language='', code_language_callback=None, @@ -55,11 +54,11 @@ ).convert("") html = 'test' -soup = BeautifulSoup(html, 'html.parser') +soup = LexborHTMLParser(html) MarkdownConverter().convert_soup(soup) == '**test**' -def callback(el: BeautifulSoup) -> Union[str, None]: +def callback(el: LexborNode) -> Union[str, None]: return el['class'][0] if el.has_attr('class') else None diff --git a/tests/utils.py b/tests/utils.py index 0dac580..83837f6 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -3,7 +3,7 @@ # for unit testing, disable document-level stripping by default so that # separation newlines are included in testing -def md(html, **options): +def md(html: str, **options): options = {"strip_document": None, **options} return MarkdownConverter(**options).convert(html)