From 22f5e95b50ca7319b2fbd3b70a441725d3954c88 Mon Sep 17 00:00:00 2001
From: Mahmoud
Date: Mon, 18 Aug 2025 08:21:59 -0400
Subject: [PATCH 1/2] feat: Add download_html method to download html paper when available

---
 arxiv/__init__.py      | 25 +++++++++++++++++++++++++
 tests/test_download.py | 25 +++++++++++++++++++++++++
 2 files changed, 50 insertions(+)

diff --git a/arxiv/__init__.py b/arxiv/__init__.py
index 8508f4e..0ebb5d2 100644
--- a/arxiv/__init__.py
+++ b/arxiv/__init__.py
@@ -240,6 +240,31 @@ def download_source(
         written_path, _ = urlretrieve(src_url, path)
         return written_path
 
+    def download_html(
+        self,
+        dirpath: str = "./",
+        filename: str = "",
+        download_domain: str = "export.arxiv.org",
+    ) -> str:
+        """
+        Downloads the HTML version for this result to the specified
+        directory and returns the path of the written file.
+
+        An empty `filename` defaults to `self._get_default_filename("html")`.
+
+        Note: HTML versions are not available for all arXiv papers
+        (primarily newer papers submitted in certain formats). Any
+        error raised by `urlretrieve` propagates to the caller.
+        """
+        if not filename:
+            filename = self._get_default_filename("html")
+        path = os.path.join(dirpath, filename)
+        pdf_url = Result._substitute_domain(self.pdf_url, download_domain)
+        # Construct the HTML URL from the PDF URL.
+        html_url = pdf_url.replace("/pdf/", "/html/")
+        written_path, _ = urlretrieve(html_url, path)
+        return written_path
+
     def _get_pdf_url(links: List[Link]) -> str:
         """
         Finds the PDF link among a result's links and returns its URL.
diff --git a/tests/test_download.py b/tests/test_download.py
index fb3730a..6f26f23 100644
--- a/tests/test_download.py
+++ b/tests/test_download.py
@@ -51,7 +51,32 @@ def test_download_tarfile_from_query(self):
             )
         )
 
+    def test_download_html_from_query(self):
+        from urllib.error import HTTPError
+        try:
+            self.fetched_result.download_html(dirpath=self.temp_dir)
+        except HTTPError as err:
+            # Not every paper has an HTML rendition: report a skip rather
+            # than a pass, and keep the assertion outside the try block.
+            self.skipTest("HTML version unavailable: {}".format(err))
+        self.assertTrue(
+            os.path.exists(
+                os.path.join(
+                    self.temp_dir,
+                    "1605.08386v1.Heat_bath_random_walks_with_Markov_bases.html",
+                )
+            )
+        )
+
     def test_download_with_custom_slugify_from_query(self):
         fn = "custom-filename.extension"
         self.fetched_result.download_pdf(dirpath=self.temp_dir, filename=fn)
         self.assertTrue(os.path.exists(os.path.join(self.temp_dir, fn)))
+
+    def test_download_html_with_custom_filename(self):
+        from urllib.error import HTTPError
+        fn = "custom-html-filename.html"
+        try:
+            self.fetched_result.download_html(dirpath=self.temp_dir, filename=fn)
+        except HTTPError as err:
+            self.skipTest("HTML version unavailable: {}".format(err))
+        self.assertTrue(os.path.exists(os.path.join(self.temp_dir, fn)))
From ec32e137d8a15001b46509b47740004ca5304a1a Mon Sep 17 00:00:00 2001
From: Mahmoud
Date: Mon, 18 Aug 2025 17:38:26 -0400
Subject: [PATCH 2/2] doc: Update Readme to include Result.download_html

---
 README.md | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index d74e7ec..bd4bea6 100644
--- a/README.md
+++ b/README.md
@@ -88,6 +88,22 @@ paper.download_source(filename="downloaded-paper.tar.gz")
 paper.download_source(dirpath="./mydir", filename="downloaded-paper.tar.gz")
 ```
 
+You can also download HTML versions of papers (when available):
+
+```python
+import arxiv
+
+paper = next(arxiv.Client().results(arxiv.Search(id_list=["1605.08386v1"])))
+# Download the HTML to the PWD with a default filename.
+paper.download_html() +# Download the HTML to the PWD with a custom filename. +paper.download_html(filename="downloaded-paper.html") +# Download the HTML to a specified directory with a custom filename. +paper.download_html(dirpath="./mydir", filename="downloaded-paper.html") +``` + +**Note:** HTML versions are not available for all arXiv papers. This feature is primarily available for newer papers submitted in certain formats. + #### Fetching results with a custom client ```python @@ -119,7 +135,7 @@ DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): export.arxiv.org DEBUG:urllib3.connectionpool:https://export.arxiv.org:443 "GET /api/query?search_query=&id_list=1605.08386v1&sortBy=relevance&sortOrder=descending&start=0&max_results=100&user-agent=arxiv.py%2F1.4.8 HTTP/1.1" 200 979 ``` -## Types +## Types ### Client @@ -137,4 +153,4 @@ The `Result` objects yielded by `Client.results` include metadata about each pap The meaning of the underlying raw data is documented in the [arXiv API User Manual: Details of Atom Results Returned](https://arxiv.org/help/api/user-manual#_details_of_atom_results_returned). -`Result` also exposes helper methods for downloading papers: `Result.download_pdf` and `Result.download_source`. +`Result` also exposes helper methods for downloading papers: `Result.download_pdf`, `Result.download_source`, and `Result.download_html`.