From 22f5e95b50ca7319b2fbd3b70a441725d3954c88 Mon Sep 17 00:00:00 2001
From: Mahmoud <m.e.bestawy@gmail.com>
Date: Mon, 18 Aug 2025 08:21:59 -0400
Subject: [PATCH 1/2] feat: Add download_html method to download html paper
 when available

---
 arxiv/__init__.py      | 25 +++++++++++++++++++++++++
 tests/test_download.py | 25 +++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/arxiv/__init__.py b/arxiv/__init__.py
index 8508f4e..0ebb5d2 100644
--- a/arxiv/__init__.py
+++ b/arxiv/__init__.py
@@ -240,6 +240,31 @@ def download_source(
         written_path, _ = urlretrieve(src_url, path)
         return written_path
 
+    def download_html(
+        self,
+        dirpath: str = "./",
+        filename: str = "",
+        download_domain: str = "export.arxiv.org",
+    ) -> str:
+        """
+        Downloads the HTML version for this result to the specified
+        directory and returns the path of the written file.
+
+        If `filename` is empty, a default one is generated by calling
+        `self._get_default_filename("html")`.
+
+        Note: HTML versions are not available for all arXiv papers;
+        they mainly exist for newer papers submitted in certain formats.
+        """
+        if not filename:
+            filename = self._get_default_filename("html")
+        path = os.path.join(dirpath, filename)
+        pdf_url = Result._substitute_domain(self.pdf_url, download_domain)
+        # Swap only the first `/pdf/` segment so the rest of the URL is intact.
+        html_url = pdf_url.replace("/pdf/", "/html/", 1)
+        written_path, _ = urlretrieve(html_url, path)
+        return written_path
+
     def _get_pdf_url(links: List[Link]) -> str:
         """
         Finds the PDF link among a result's links and returns its URL.
diff --git a/tests/test_download.py b/tests/test_download.py
index fb3730a..6f26f23 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -51,7 +51,32 @@ def test_download_tarfile_from_query(self):
             )
         )
 
+    def test_download_html_from_query(self):
+        from urllib.error import HTTPError
+
+        try:
+            self.fetched_result.download_html(dirpath=self.temp_dir)
+        except HTTPError as err:
+            # Not every paper has an HTML rendering: report a skip instead
+            # of silently passing.
+            self.skipTest("HTML version unavailable: {}".format(err))
+        # Assert outside the `try` so a failed check cannot be swallowed.
+        expected = os.path.join(
+            self.temp_dir,
+            "1605.08386v1.Heat_bath_random_walks_with_Markov_bases.html",
+        )
+        self.assertTrue(os.path.exists(expected))
+
     def test_download_with_custom_slugify_from_query(self):
         fn = "custom-filename.extension"
         self.fetched_result.download_pdf(dirpath=self.temp_dir, filename=fn)
         self.assertTrue(os.path.exists(os.path.join(self.temp_dir, fn)))
+
+    def test_download_html_with_custom_filename(self):
+        from urllib.error import HTTPError
+        fn = "custom-html-filename.html"
+        try:
+            self.fetched_result.download_html(dirpath=self.temp_dir, filename=fn)
+        except HTTPError as err:  # no HTML rendering: skip, don't pass silently
+            self.skipTest("HTML version unavailable: {}".format(err))
+        self.assertTrue(os.path.exists(os.path.join(self.temp_dir, fn)))

From ec32e137d8a15001b46509b47740004ca5304a1a Mon Sep 17 00:00:00 2001
From: Mahmoud <m.e.bestawy@gmail.com>
Date: Mon, 18 Aug 2025 17:38:26 -0400
Subject: [PATCH 2/2] doc: Update Readme to include Result.download_html

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d74e7ec..bd4bea6 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,22 @@ paper.download_source(filename="downloaded-paper.tar.gz")
 paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")
 ```
 
+You can also download HTML versions of papers (when available):
+
+```python
+import arxiv
+
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
+# Download the HTML to the PWD with a default filename.
+paper.download_html()
+# Download the HTML to the PWD with a custom filename.
+paper.download_html(filename="downloaded-paper.html")
+# Download the HTML to a specified directory with a custom filename.
+paper.download_html(dirpath="./mydir", filename="downloaded-paper.html")
+```
+
+**Note:** HTML versions are not available for all arXiv papers. This feature is primarily available for newer papers submitted in certain formats.
+
 #### Fetching results with a custom client
 
 ```python
@@ -119,7 +135,7 @@ DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org
 DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979
 ```
 
-## Types 
+## Types
 
 ### Client
 
@@ -137,4 +153,4 @@ The `Result` objects yielded by `Client.results` include metadata about each pap
 
 The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned).
 
-`Result` also exposes helper methods for downloading papers: `Result.download_pdf` and `Result.download_source`.
+`Result` also exposes helper methods for downloading papers: `Result.download_pdf`, `Result.download_source`, and `Result.download_html`.