From 17da20e5fc06dd01c91edd4dfd0dd49ebd5f8a3c Mon Sep 17 00:00:00 2001 From: abebus Date: Sat, 26 Jul 2025 20:24:52 +0300 Subject: [PATCH 1/6] Small micro optimisations --- w3lib/http.py | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/w3lib/http.py b/w3lib/http.py index 1791c98..02503cc 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,9 @@ from __future__ import annotations from base64 import b64encode +from collections import defaultdict from collections.abc import Mapping, MutableMapping, Sequence +from io import BytesIO from typing import Any, Union, overload from w3lib.util import to_bytes, to_unicode @@ -44,23 +46,22 @@ def headers_raw_to_dict(headers_raw: bytes | None) -> HeadersDictOutput | None: if headers_raw is None: return None - headers = headers_raw.splitlines() - headers_tuples = [header.split(b":", 1) for header in headers] - result_dict: HeadersDictOutput = {} - for header_item in headers_tuples: - if len(header_item) != 2: - continue + if not headers_raw: + return {} + + headers = iter(BytesIO(headers_raw).readline, b"") + result_dict = defaultdict(list) - item_key = header_item[0].strip() - item_value = header_item[1].strip() + for header in headers: + parts = header.split(b":", 1) + if len(parts) != 2: + continue - if item_key in result_dict: - result_dict[item_key].append(item_value) - else: - result_dict[item_key] = [item_value] + key, value = map(bytes.strip, parts) + result_dict[key].append(value) - return result_dict + return dict(result_dict) @overload @@ -93,13 +94,25 @@ def headers_dict_to_raw(headers_dict: HeadersDictInput | None) -> bytes | None: if headers_dict is None: return None - raw_lines = [] + + if not headers_dict: + return b"" + + parts = bytearray() + for key, value in headers_dict.items(): if isinstance(value, bytes): - raw_lines.append(b": ".join([key, value])) + if parts: + parts.extend(b"\r\n") + parts.extend(key + b": " + value) + elif 
isinstance(value, (list, tuple)): - raw_lines.extend(b": ".join([key, v]) for v in value) - return b"\r\n".join(raw_lines) + for v in value: + if parts: + parts.extend(b"\r\n") + parts.extend(key + b": " + v) + + return bytes(parts) def basic_auth_header( From d737f5addc87d01615cee7b330cb696c658ae17c Mon Sep 17 00:00:00 2001 From: abebus Date: Sat, 26 Jul 2025 21:00:32 +0300 Subject: [PATCH 2/6] `headers_raw_to_dict` actually regressed, fix --- w3lib/http.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/w3lib/http.py b/w3lib/http.py index 02503cc..becabfa 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,7 +1,6 @@ from __future__ import annotations from base64 import b64encode -from collections import defaultdict from collections.abc import Mapping, MutableMapping, Sequence from io import BytesIO from typing import Any, Union, overload @@ -51,17 +50,21 @@ def headers_raw_to_dict(headers_raw: bytes | None) -> HeadersDictOutput | None: return {} headers = iter(BytesIO(headers_raw).readline, b"") - result_dict = defaultdict(list) + result_dict = {} for header in headers: - parts = header.split(b":", 1) - if len(parts) != 2: + key, sep, value = header.partition(b":") + if not sep: continue - key, value = map(bytes.strip, parts) - result_dict[key].append(value) + key, value = key.strip(), value.strip() - return dict(result_dict) + if key in result_dict: + result_dict[key].append(value) + else: + result_dict[key] = [value] + + return result_dict @overload From 0db3f3fa283680a78d1cb7d77993e50e8cba5ade Mon Sep 17 00:00:00 2001 From: abebus Date: Sat, 26 Jul 2025 21:11:18 +0300 Subject: [PATCH 3/6] fix typing --- w3lib/http.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/w3lib/http.py b/w3lib/http.py index becabfa..f90871e 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -50,7 +50,7 @@ def headers_raw_to_dict(headers_raw: bytes | None) -> HeadersDictOutput | None: return {} headers = 
iter(BytesIO(headers_raw).readline, b"") - result_dict = {} + result_dict: HeadersDictOutput = {} for header in headers: key, sep, value = header.partition(b":") From 789e5de5f94ec5bf6894105e28861923108c6b3b Mon Sep 17 00:00:00 2001 From: abebus Date: Sun, 27 Jul 2025 14:22:03 +0300 Subject: [PATCH 4/6] even faster and cleaner `headers_raw_to_dict` --- w3lib/http.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/w3lib/http.py b/w3lib/http.py index f90871e..fe47748 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -49,10 +49,9 @@ def headers_raw_to_dict(headers_raw: bytes | None) -> HeadersDictOutput | None: if not headers_raw: return {} - headers = iter(BytesIO(headers_raw).readline, b"") result_dict: HeadersDictOutput = {} - for header in headers: + for header in BytesIO(headers_raw): key, sep, value = header.partition(b":") if not sep: continue From 7b163a703e36860d53ff0c7521664b0cb665f0ed Mon Sep 17 00:00:00 2001 From: abebus Date: Sun, 27 Jul 2025 15:27:40 +0300 Subject: [PATCH 5/6] add tests for new early exit cases, restore coverage, surprisingly reduce time of `headers_dict_to_raw` to 1.7883 seconds in benchmark --- tests/test_http.py | 4 ++++ w3lib/http.py | 12 ++++++------ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/test_http.py b/tests/test_http.py index 52263cb..e0ece39 100644 --- a/tests/test_http.py +++ b/tests/test_http.py @@ -35,6 +35,10 @@ def test_headers_raw_dict_none(self): assert headers_raw_to_dict(None) is None assert headers_dict_to_raw(None) is None + def test_headers_raw_dict_empty(self): + assert headers_raw_to_dict(b"") == {} + assert headers_dict_to_raw({}) == b"" + def test_headers_raw_to_dict(self): raw = b"Content-type: text/html\n\rAccept: gzip\n\r\ Cache-Control: no-cache\n\rCache-Control: no-store\n\n" diff --git a/w3lib/http.py b/w3lib/http.py index fe47748..5ef3cea 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -100,21 +100,21 @@ def headers_dict_to_raw(headers_dict:
HeadersDictInput | None) -> bytes | None: if not headers_dict: return b"" - parts = bytearray() + parts = b"" for key, value in headers_dict.items(): if isinstance(value, bytes): if parts: - parts.extend(b"\r\n") - parts.extend(key + b": " + value) + parts += b"\r\n" + parts += key + b": " + value elif isinstance(value, (list, tuple)): for v in value: if parts: - parts.extend(b"\r\n") - parts.extend(key + b": " + v) + parts += b"\r\n" + parts += key + b": " + v - return bytes(parts) + return parts def basic_auth_header( From a5d30c0d111aa3c7710dddc318c3488d9a2d6e43 Mon Sep 17 00:00:00 2001 From: abebus Date: Sun, 27 Jul 2025 21:00:50 +0300 Subject: [PATCH 6/6] stick with `bytearray`, see #247 --- w3lib/http.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/w3lib/http.py b/w3lib/http.py index 5ef3cea..fe47748 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -100,21 +100,21 @@ def headers_dict_to_raw(headers_dict: HeadersDictInput | None) -> bytes | None: if not headers_dict: return b"" - parts = b"" + parts = bytearray() for key, value in headers_dict.items(): if isinstance(value, bytes): if parts: - parts += b"\r\n" - parts += key + b": " + value + parts.extend(b"\r\n") + parts.extend(key + b": " + value) elif isinstance(value, (list, tuple)): for v in value: if parts: - parts += b"\r\n" - parts += key + b": " + v + parts.extend(b"\r\n") + parts.extend(key + b": " + v) - return parts + return bytes(parts) def basic_auth_header(