Skip to content

Commit 57fe01c

Browse files
authored
Merge pull request #141 from Gallaecio/fix-fragment_marker_2019
Do not decode %23 to # in URL paths during safe-URL quoting and canonicalization
2 parents 910ff63 + a26dd48 commit 57fe01c

File tree

2 files changed

+30
-3
lines changed

2 files changed

+30
-3
lines changed

tests/test_url.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -219,6 +219,17 @@ def test_safe_url_port_number(self):
219219
safe_url_string(u"http://www.example.com:/résumé?q=résumé"),
220220
"http://www.example.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9")
221221

222+
def test_safe_url_string_preserve_nonfragment_hash(self):
223+
# don't decode `%23` to `#`
224+
self.assertEqual(safe_url_string("http://www.example.com/path/to/%23/foo/bar"),
225+
"http://www.example.com/path/to/%23/foo/bar")
226+
self.assertEqual(safe_url_string("http://www.example.com/path/to/%23/foo/bar#frag"),
227+
"http://www.example.com/path/to/%23/foo/bar#frag")
228+
self.assertEqual(safe_url_string("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo"),
229+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo")
230+
self.assertEqual(safe_url_string("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag"),
231+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag")
232+
222233
def test_safe_download_url(self):
223234
self.assertEqual(safe_download_url('http://www.example.org'),
224235
'http://www.example.org/')
@@ -650,6 +661,21 @@ def test_canonicalize_url_idna_exceptions(self):
650661
"http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format(
651662
label=u"example"*11))
652663

664+
def test_preserve_nonfragment_hash(self):
665+
# don't decode `%23` to `#`
666+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar"),
667+
"http://www.example.com/path/to/%23/foo/bar")
668+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar#frag"),
669+
"http://www.example.com/path/to/%23/foo/bar")
670+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar#frag", keep_fragments=True),
671+
"http://www.example.com/path/to/%23/foo/bar#frag")
672+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo"),
673+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2Fpath%2Fto%2F%23%2Fbar%2Ffoo")
674+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag"),
675+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo")
676+
self.assertEqual(canonicalize_url("http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag", keep_fragments=True),
677+
"http://www.example.com/path/to/%23/foo/bar?url=http%3A%2F%2Fwww.example.com%2F%2Fpath%2Fto%2F%23%2Fbar%2Ffoo#frag")
678+
653679

654680
class DataURITests(unittest.TestCase):
655681

w3lib/url.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ def _quote_byte(error):
3333
EXTRA_SAFE_CHARS = b'|' # see https://github.com/scrapy/w3lib/pull/25
3434

3535
_safe_chars = RFC3986_RESERVED + RFC3986_UNRESERVED + EXTRA_SAFE_CHARS + b'%'
36+
_path_safe_chars = _safe_chars.replace(b'#', b'')
3637

3738
_ascii_tab_newline_re = re.compile(r'[\t\n\r]') # see https://infra.spec.whatwg.org/#ascii-tab-or-newline
3839

@@ -74,7 +75,7 @@ def safe_url_string(url, encoding='utf8', path_encoding='utf8', quote_path=True)
7475

7576
# default encoding for path component SHOULD be UTF-8
7677
if quote_path:
77-
path = quote(to_bytes(parts.path, path_encoding), _safe_chars)
78+
path = quote(to_bytes(parts.path, path_encoding), _path_safe_chars)
7879
else:
7980
path = to_native_str(parts.path)
8081

@@ -414,7 +415,7 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'):
414415
to_native_str(netloc),
415416

416417
# default encoding for path component SHOULD be UTF-8
417-
quote(to_bytes(parts.path, path_encoding), _safe_chars),
418+
quote(to_bytes(parts.path, path_encoding), _path_safe_chars),
418419
quote(to_bytes(parts.params, path_encoding), _safe_chars),
419420

420421
# encoding of query and fragment follows page encoding
@@ -502,7 +503,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
502503
# 2. decode percent-encoded sequences in path as UTF-8 (or keep raw bytes)
503504
# and percent-encode path again (this normalizes to upper-case %XX)
504505
uqp = _unquotepath(path)
505-
path = quote(uqp, _safe_chars) or '/'
506+
path = quote(uqp, _path_safe_chars) or '/'
506507

507508
fragment = '' if not keep_fragments else fragment
508509

0 commit comments

Comments
 (0)