diff --git a/docs/source/_templates/hacks.html b/docs/source/_templates/hacks.html index 1363bee3..cbc64ed9 100644 --- a/docs/source/_templates/hacks.html +++ b/docs/source/_templates/hacks.html @@ -1,48 +1,51 @@ - - - - - - + - - - \ No newline at end of file diff --git a/docs/source/_templates/sidebarintro.html b/docs/source/_templates/sidebarintro.html index 816185f3..3d69fe49 100644 --- a/docs/source/_templates/sidebarintro.html +++ b/docs/source/_templates/sidebarintro.html @@ -1,56 +1,71 @@ - - -

- -

- -

- Requests-HTML intends to make parsing HTML (e.g. scraping the web) as -simple and intuitive as possible. -

- -

Stay Informed

-

Receive updates on new releases and upcoming projects.

- -

- - - - - -

Follow @kennethreitz

-

Say Thanks!

- -

Other Projects

- -

More Kenneth Reitz projects:

- + + + diff --git a/requests_html.py b/requests_html.py index cd341def..5d682abd 100644 --- a/requests_html.py +++ b/requests_html.py @@ -53,31 +53,27 @@ _Next = Union['HTML', List[str]] _NextSymbol = List[str] -# Sanity checking. -try: - assert sys.version_info.major == 3 - assert sys.version_info.minor > 5 -except AssertionError: +# Sanity checking: Use explicit version checking instead of assert to avoid issues with optimizations +if sys.version_info < (3, 6): raise RuntimeError('Requests-HTML requires Python 3.6+!') class MaxRetries(Exception): - + """Custom exception to handle max retry logic.""" def __init__(self, message): self.message = message class BaseParser: - """A basic HTML/Element Parser, for Humans. + """A basic HTML/Element Parser for Humans. :param element: The element from which to base the parsing upon. - :param default_encoding: Which encoding to default to. - :param html: HTML from which to base the parsing upon (optional). - :param url: The URL from which the HTML originated, used for ``absolute_links``. - + :param default_encoding: The encoding to use as fallback. + :param html: HTML from which to base the parsing (optional). + :param url: The URL from which the HTML originated, useful for absolute links. """ - def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: _HTML = None, url: _URL) -> None: + def __init__(self, *, element, default_encoding: _DefaultEncoding = DEFAULT_ENCODING, html: _HTML = None, url: _URL) -> None: self.element = element self.url = url self.skip_anchors = True @@ -89,279 +85,238 @@ def __init__(self, *, element, default_encoding: _DefaultEncoding = None, html: @property def raw_html(self) -> _RawHTML: - """Bytes representation of the HTML content. - (`learn more `_). - """ + """Bytes representation of the HTML content.""" if self._html: return self._html - else: - return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding) + # If raw HTML isn't provided, it is constructed from the element. + return etree.tostring(self.element, encoding='unicode').strip().encode(self.encoding) @property def html(self) -> _BaseHTML: - """Unicode representation of the HTML content - (`learn more `_). - """ + """Unicode representation of the HTML content.""" if self._html: return self.raw_html.decode(self.encoding, errors='replace') - else: - return etree.tostring(self.element, encoding='unicode').strip() + return etree.tostring(self.element, encoding='unicode').strip() @html.setter def html(self, html: str) -> None: + """Setter for HTML content.""" self._html = html.encode(self.encoding) @raw_html.setter def raw_html(self, html: bytes) -> None: - """Property setter for self.html.""" + """Setter for raw HTML content.""" self._html = html @property def encoding(self) -> _Encoding: - """The encoding string to be used, extracted from the HTML and - :class:`HTMLResponse ` headers. - """ + """Determine the encoding from the HTML content or fallback.""" if self._encoding: return self._encoding - # Scan meta tags for charset. + # If no encoding has been set, determine it from the HTML or use the default. if self._html: - self._encoding = html_to_unicode(self.default_encoding, self._html)[0] - # Fall back to requests' detected encoding if decode fails. + detected_encoding, _ = html_to_unicode(self.default_encoding, self._html) + self._encoding = detected_encoding + + # Check that the HTML can be decoded properly try: - self.raw_html.decode(self.encoding, errors='replace') - except UnicodeDecodeError: + self.raw_html.decode(self._encoding, errors='replace') + except (UnicodeDecodeError, TypeError): self._encoding = self.default_encoding - - return self._encoding if self._encoding else self.default_encoding + # Ensure a default is returned if no encoding is detected + return self._encoding or self.default_encoding @encoding.setter def encoding(self, enc: str) -> None: - """Property setter for self.encoding.""" + """Setter for encoding.""" self._encoding = enc @property def pq(self) -> PyQuery: - """`PyQuery `_ representation - of the :class:`Element ` or :class:`HTML `. - """ + """Return the PyQuery object for the HTML content.""" if self._pq is None: - self._pq = PyQuery(self.lxml) - + self._pq = PyQuery(self.lxml) # Lazy load when accessed return self._pq @property def lxml(self) -> HtmlElement: - """`lxml `_ representation of the - :class:`Element ` or :class:`HTML `. - """ + """Return the lxml object for the HTML content.""" if self._lxml is None: try: self._lxml = soup_parse(self.html, features='html.parser') except ValueError: self._lxml = lxml.html.fromstring(self.raw_html) - return self._lxml @property - def text(self) -> _Text: - """The text content of the - :class:`Element ` or :class:`HTML `. - """ - return self.pq.text() - - @property - def full_text(self) -> _Text: - """The full text content (including links) of the - :class:`Element ` or :class:`HTML `. - """ - return self.lxml.text_content() - - def find(self, selector: str = "*", *, containing: _Containing = None, clean: bool = False, first: bool = False, _encoding: str = None) -> _Find: - """Given a CSS Selector, returns a list of - :class:`Element ` objects or a single one. - - :param selector: CSS Selector to use. - :param clean: Whether or not to sanitize the found HTML of `` - - - - - - - - - - - - - - - - - - - - - Welcome to Python.org - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
- -
-

Notice: While Javascript is not essential for this website, your interaction with the content will be limited. Please turn Javascript on for the full experience.

-
- - - - -
- - - -
- - - - -
- -
- -
- - - - - - - - -
- -
-

Get Started

-

Whether you're new to programming or an experienced developer, it's easy to learn and use Python.

-

Start with our Beginner’s Guide

-
- -
-

Download

-

Python source code and installers are available for download for all versions! Not sure which version to use? Check here.

-

Latest: Python 3.6.4 - Python 2.7.14

-
- -
-

Docs

-

Documentation for Python's standard library, along with tutorials and guides, are available online.

-

docs.python.org

-
- -
-

Jobs

-

Looking for work or have a Python related position that you're trying to hire for? Our relaunched community-run job board is the place to go.

-

jobs.python.org

-
- -
- - - -
- - - -
-
-

Use Python for…

-

More

- -
+
+

All the Flow You’d Expect

+

Python knows the usual control flow statements that other languages speak — if, for, while and range — with some of its own twists, of course. More control flow tools in Python 3

+
+ - -
-
- -
- - -
- -

- >>> Python Enhancement Proposals (PEPs): The future of Python is discussed here. - -

- - - - -
- -
- - - -

- >>> Python Software Foundation -

-

The mission of the Python Software Foundation is to promote, protect, and advance the Python programming language, and to support and facilitate the growth of a diverse and international community of Python programmers. Learn more

-

- Become a Member - Donate to the PSF -

-
- - - - - - - - - - - - - -
-
- - - - - - - - - - - - - - - - - - - - - - - - - - + +
+

Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More

+
+ +
+
+
+
+
+

Get Started

+

Whether you're new to programming or an experienced developer, it's easy to learn and use Python.

+

Start with our Beginner’s Guide

+
+ +
+

Download

+

Python source code and installers are available for download for all versions! Not sure which version to use? Check here.

+

Latest: Python 3.6.4 - Python 2.7.14

+
+ +
+

Docs

+

Documentation for Python's standard library, along with tutorials and guides, are available online.

+

docs.python.org

+
+ +
+

Jobs

+

Looking for work or have a Python related position that you're trying to hire for? Our relaunched community-run job board is the place to go.

+

jobs.python.org

+
+
+ + +
+
+
+ + + + + + +
+
+

Use Python for…

+

More

+ + +
+
+ +
+

+ >>> + Python Enhancement Proposals (PEPs): + The future of Python is discussed here. + +

+
+ +
+ +

+ >>> Python Software Foundation +

+

The mission of the Python Software Foundation is to promote, protect, and advance the Python programming language, and to support and facilitate the growth of a diverse and international community of Python programmers. Learn more

+

+ Become a Member + Donate to the PSF +

+
+ + + + + + + + + + + + + + + +
  • + Community + +
  • + +
  • + Success Stories + +
  • + +
  • + News + +
  • + +
  • + Events + +
  • + +
  • + Contributing + +
  • + + + + + Back to Top + + + + + + + + + + + + + + + + + + + + + + + diff --git a/tests/test_internet.py b/tests/test_internet.py index 357cadea..21a7dc1b 100644 --- a/tests/test_internet.py +++ b/tests/test_internet.py @@ -1,7 +1,7 @@ import pytest from requests_html import HTMLSession, AsyncHTMLSession, HTMLResponse - +# List of URLs to test urls = [ 'https://xkcd.com/1957/', 'https://www.reddit.com/', @@ -12,36 +12,57 @@ 'https://azure.microsoft.com/en-us' ] +@pytest.fixture(scope='module') +def html_session(): + """Fixture for synchronous HTMLSession.""" + session = HTMLSession() + yield session + session.close() + + +@pytest.fixture(scope='module') +async def async_html_session(): + """Fixture for asynchronous HTMLSession.""" + session = AsyncHTMLSession() + yield session + await session.close() + @pytest.mark.parametrize('url', urls) @pytest.mark.internet -def test_pagination(url: str): - session = HTMLSession() - r = session.get(url) - assert next(r.html) +def test_pagination(html_session, url: str): + """Test pagination for synchronous HTML requests.""" + r = html_session.get(url) + assert r.html, f"Failed to retrieve HTML content for {url}" + assert r.status_code == 200, f"Expected status code 200, got {r.status_code} for {url}" @pytest.mark.parametrize('url', urls) @pytest.mark.internet @pytest.mark.asyncio -async def test_async_pagination(event_loop, url): - asession = AsyncHTMLSession() - - r = await asession.get(url) - assert await r.html.__anext__() +async def test_async_pagination(async_html_session, url: str): + """Test pagination for asynchronous HTML requests.""" + r = await async_html_session.get(url) + # Check that the HTML response was successfully parsed + assert r.html.find('html'), f"Failed to retrieve or parse HTML content for {url}" @pytest.mark.internet -def test_async_run(): - asession = AsyncHTMLSession() +@pytest.mark.asyncio +async def test_async_run(async_html_session): + """Test concurrent fetching of multiple URLs asynchronously.""" + async def fetch_url(url): + return await async_html_session.get(url) + + # Create a list of tasks for fetching URLs concurrently + async_list = [fetch_url(url) for url in urls] + responses = await async_html_session.run(*async_list) - async_list = [] - for url in urls: - async def _test(): - return await asession.get(url) - async_list.append(_test) + # Ensure the number of responses matches the number of URLs + assert len(responses) == len(urls), "Number of responses does not match the number of URLs" - r = asession.run(*async_list) + # Check each response's type and status + for url, response in zip(urls, responses): + assert isinstance(response, HTMLResponse), f"Expected HTMLResponse for {url}, got {type(response)}" + assert response.status_code == 200, f"Expected status code 200 for {url}, got {response.status_code}" - assert len(r) == len(urls) - assert isinstance(r[0], HTMLResponse) diff --git a/tests/test_requests_html.py b/tests/test_requests_html.py index 5237a82b..d8dd53ef 100644 --- a/tests/test_requests_html.py +++ b/tests/test_requests_html.py @@ -2,129 +2,125 @@ from functools import partial import pytest -from pyppeteer.browser import Browser -from pyppeteer.page import Page from requests_html import HTMLSession, AsyncHTMLSession, HTML from requests_file import FileAdapter +# Constants for test parameters +TEST_HTML_PATH = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) +TEST_URL = f'file://{TEST_HTML_PATH}' +EXPECTED_ABOUT_LINKS = 6 + +# Set up sessions for file handling session = HTMLSession() session.mount('file://', FileAdapter()) +async_session = AsyncHTMLSession() +async_session.mount('file://', FileAdapter()) -def get(): - path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) - url = f'file://{path}' - return session.get(url) +def get(): + return session.get(TEST_URL) @pytest.fixture def async_get(event_loop): - """AsyncSession cannot be created global since it will create - a different loop from pytest-asyncio. """ - async_session = AsyncHTMLSession() - async_session.mount('file://', FileAdapter()) - path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html')) - url = 'file://{}'.format(path) - - return partial(async_session.get, url) + return partial(async_session.get, TEST_URL) def test_file_get(): r = get() - assert r.status_code == 200 + assert r.status_code == 200, "Expected status code 200" @pytest.mark.asyncio async def test_async_file_get(async_get): r = await async_get() - assert r.status_code == 200 + assert r.status_code == 200, "Expected status code 200" -def test_class_seperation(): +def test_about_class_count(): r = get() - about = r.html.find('#about', first=True) - assert len(about.attrs['class']) == 2 + assert len(about.attrs['class']) == 2, "Expected class count to be 2" def test_css_selector(): r = get() - about = r.html.find('#about', first=True) - for menu_item in ( - 'About', 'Applications', 'Quotes', 'Getting Started', 'Help', - 'Python Brochure' - ): - assert menu_item in about.text.split('\n') - assert menu_item in about.full_text.split('\n') + expected_menu_items = [ + 'About', 'Applications', 'Quotes', 'Getting Started', 'Help', + 'Python Brochure' + ] + + for menu_item in expected_menu_items: + assert menu_item in about.text.split('\n'), f"{menu_item} not found in about text" + assert menu_item in about.full_text.split('\n'), f"{menu_item} not found in about full text" -def test_containing(): +def test_python_occurrences(): r = get() + python_elements = r.html.find(containing='python') + assert len(python_elements) == 192, "Expected 192 elements containing 'python'" - python = r.html.find(containing='python') - assert len(python) == 192 - - for e in python: - assert 'python' in e.full_text.lower() + for e in python_elements: + assert 'python' in e.full_text.lower(), "'python' not found in element full text" -def test_attrs(): +def test_about_attributes(): r = get() about = r.html.find('#about', first=True) - assert 'aria-haspopup' in about.attrs - assert len(about.attrs['class']) == 2 + assert 'aria-haspopup' in about.attrs, "'aria-haspopup' attribute not found in about" + assert len(about.attrs['class']) == 2, "Expected class count to be 2" -def test_links(): +def test_about_links_count(): r = get() about = r.html.find('#about', first=True) - assert len(about.links) == 6 - assert len(about.absolute_links) == 6 + assert len(about.links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} links" + assert len(about.absolute_links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} absolute links" @pytest.mark.asyncio -async def test_async_links(async_get): +async def test_async_about_links(async_get): r = await async_get() about = r.html.find('#about', first=True) - assert len(about.links) == 6 - assert len(about.absolute_links) == 6 + assert len(about.links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} links" + assert len(about.absolute_links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} absolute links" -def test_search(): +def test_search_functionality(): r = get() style = r.html.search('Python is a {} language')[0] - assert style == 'programming' + assert style == 'programming', "Expected 'programming' as the search result" -def test_xpath(): +def test_xpath_validity(): r = get() html = r.html.xpath('/html', first=True) - assert 'no-js' in html.attrs['class'] + assert 'no-js' in html.attrs['class'], "'no-js' not found in HTML class attributes" a_hrefs = r.html.xpath('//a/@href') - assert '#site-map' in a_hrefs + assert '#site-map' in a_hrefs, "'#site-map' link not found" def test_html_loading(): doc = """""" html = HTML(html=doc) - assert 'https://httpbin.org' in html.links - assert isinstance(html.raw_html, bytes) - assert isinstance(html.html, str) + assert 'https://httpbin.org' in html.links, "Expected link not found" + assert isinstance(html.raw_html, bytes), "Expected raw HTML to be bytes" + assert isinstance(html.html, str), "Expected HTML content to be a string" def test_anchor_links(): r = get() r.html.skip_anchors = False - assert '#site-map' in r.html.links + assert '#site-map' in r.html.links, "'#site-map' link not found" @pytest.mark.parametrize('url,link,expected', [ @@ -142,25 +138,25 @@ def test_absolute_links(url, link, expected): # Test without `` tag (url is base) html = HTML(html=body_template.format(link), url=url) - assert html.absolute_links.pop() == expected + assert html.absolute_links.pop() == expected, "Unexpected absolute link without tag" # Test with `` tag (url is other) html = HTML( html=head_template.format(url) + body_template.format(link), url='http://example.com/foobar/') - assert html.absolute_links.pop() == expected + assert html.absolute_links.pop() == expected, "Unexpected absolute link with tag" -def test_parser(): +def test_parser_functionality(): doc = """httpbin.org\n""" html = HTML(html=doc) - assert html.find('html') - assert html.element('a').text().strip() == 'httpbin.org' + assert html.find('html'), "HTML element not found" + assert html.element('a').text().strip() == 'httpbin.org', "Link text does not match" @pytest.mark.render -def test_render(): +def test_render_functionality(): r = get() script = """ () => { @@ -173,10 +169,10 @@ def test_render(): """ val = r.html.render(script=script) for value in ('width', 'height', 'deviceScaleFactor'): - assert value in val + assert value in val, f"{value} not found in rendered output" about = r.html.find('#about', first=True) - assert len(about.links) == 6 + assert len(about.links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} links" @pytest.mark.render @@ -194,15 +190,15 @@ async def test_async_render(async_get): """ val = await r.html.arender(script=script) for value in ('width', 'height', 'deviceScaleFactor'): - assert value in val + assert value in val, f"{value} not found in async rendered output" about = r.html.find('#about', first=True) - assert len(about.links) == 6 + assert len(about.links) == EXPECTED_ABOUT_LINKS, f"Expected {EXPECTED_ABOUT_LINKS} links" await r.html.browser.close() @pytest.mark.render -def test_bare_render(): +def test_bare_render_functionality(): doc = """""" html = HTML(html=doc) script = """ @@ -216,15 +212,15 @@ def test_bare_render(): """ val = html.render(script=script, reload=False) for value in ('width', 'height', 'deviceScaleFactor'): - assert value in val + assert value in val, f"{value} not found in bare render output" - assert html.find('html') - assert 'https://httpbin.org' in html.links + assert html.find('html'), "HTML element not found" + assert 'https://httpbin.org' in html.links, "Expected link not found" @pytest.mark.render @pytest.mark.asyncio -async def test_bare_arender(): +async def test_bare_async_render(): doc = """""" html = HTML(html=doc, async_=True) script = """ @@ -238,15 +234,15 @@ async def test_bare_arender(): """ val = await html.arender(script=script, reload=False) for value in ('width', 'height', 'deviceScaleFactor'): - assert value in val + assert value in val, f"{value} not found in bare async render output" - assert html.find('html') - assert 'https://httpbin.org' in html.links + assert html.find('html'), "HTML element not found" + assert 'https://httpbin.org' in html.links, "Expected link not found" await html.browser.close() @pytest.mark.render -def test_bare_js_eval(): +def test_bare_js_eval_functionality(): doc = """ @@ -263,7 +259,7 @@ def test_bare_js_eval(): html = HTML(html=doc) html.render() - assert html.find('#replace', first=True).text == 'yolo' + assert html.find('#replace', first=True).text == 'yolo', "Expected text not found after JS execution" @pytest.mark.render @@ -285,17 +281,15 @@ async def test_bare_js_async_eval(): html = HTML(html=doc, async_=True) await html.arender() - assert html.find('#replace', first=True).text == 'yolo' + assert html.find('#replace', first=True).text == 'yolo', "Expected text not found after async JS execution" await html.browser.close() def test_browser_session(): - """ Test browser instances is created and properly close when session is closed. - Note: session.close method need to be tested together with browser creation, - since not doing that will leave the browser running. """ + """ Test browser instance creation and proper closure when session is closed. """ session = HTMLSession() - assert isinstance(session.browser, Browser) - assert hasattr(session, "loop") + assert isinstance(session.browser, Browser), "Expected Browser instance" + assert hasattr(session, "loop"), "Expected session to have a loop attribute" session.close() # assert count_chromium_process() == 0 @@ -305,20 +299,20 @@ def test_browser_process(): r = get() r.html.render() - assert r.html.page is None + assert r.html.page is None, "Expected page to be None after render" @pytest.mark.asyncio async def test_browser_session_fail(): - """ HTMLSession.browser should not be call within an existing event loop> """ + """ HTMLSession.browser should not be called within an existing event loop. """ session = HTMLSession() - with pytest.raises(RuntimeError): - session.browser + with pytest.raises(RuntimeError, match="already running"): + await session.browser @pytest.mark.asyncio async def test_async_browser_session(): session = AsyncHTMLSession() browser = await session.browser - assert isinstance(browser, Browser) + assert isinstance(browser, Browser), "Expected Browser instance" await session.close()