From 41439a039dd362bbf19e540c8f01a0e8635d4214 Mon Sep 17 00:00:00 2001 From: Ratnadeep Debnath Date: Sun, 19 Feb 2017 14:19:03 +0530 Subject: [PATCH 1/4] bpo-16285: Update urllib quoting to RFC 3986 Initial work done by ctheune at http://bugs.python.org/file34950/0be3805cade1.diff. --- Doc/library/urllib.parse.rst | 6 +++++- Lib/test/test_urllib.py | 4 ++-- Lib/urllib/parse.py | 9 ++++++--- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst index 676321b46a2232..7a5b56f5da69b1 100644 --- a/Doc/library/urllib.parse.rst +++ b/Doc/library/urllib.parse.rst @@ -451,13 +451,17 @@ task isn't already covered by the URL parsing functions above. .. function:: quote(string, safe='/', encoding=None, errors=None) Replace special characters in *string* using the ``%xx`` escape. Letters, - digits, and the characters ``'_.-'`` are never quoted. By default, this + digits, and the characters ``'_.-~'`` are never quoted. By default, this function is intended for quoting the path section of URL. The optional *safe* parameter specifies additional ASCII characters that should not be quoted --- its default value is ``'/'``. *string* may be either a :class:`str` or a :class:`bytes`. + .. versionchanged:: 3.7 + Moved from RFC 2396 to RFC 3986 for quoting URL strings. "~" is now + included in the set of unreserved characters. + The optional *encoding* and *errors* parameters specify how to deal with non-ASCII characters, as accepted by the :meth:`str.encode` method. *encoding* defaults to ``'utf-8'``. 
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 5084486e5ab479..bffbb0a8d1e3d4 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -733,7 +733,7 @@ def test_short_content_raises_ContentTooShortError_without_reporthook(self): class QuotingTests(unittest.TestCase): r"""Tests for urllib.quote() and urllib.quote_plus() - According to RFC 2396 (Uniform Resource Identifiers), to escape a + According to RFC 3986 (Uniform Resource Identifiers), to escape a character you write it as '%' + <2 character US-ASCII hex value>. The Python code of ``'%' + hex(ord())[2:]`` escapes a character properly. Case does not matter on the hex letters. @@ -761,7 +761,7 @@ def test_never_quote(self): do_not_quote = '' .join(["ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz", "0123456789", - "_.-"]) + "_.-~"]) result = urllib.parse.quote(do_not_quote) self.assertEqual(do_not_quote, result, "using quote(): %r != %r" % (do_not_quote, result)) diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py index 1d08730a89fe8a..f3a309aacc2e99 100644 --- a/Lib/urllib/parse.py +++ b/Lib/urllib/parse.py @@ -704,7 +704,7 @@ def unquote_plus(string, encoding='utf-8', errors='replace'): _ALWAYS_SAFE = frozenset(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'abcdefghijklmnopqrstuvwxyz' b'0123456789' - b'_.-') + b'_.-~') _ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE) _safe_quoters = {} @@ -736,15 +736,18 @@ def quote(string, safe='/', encoding=None, errors=None): Each part of a URL, e.g. the path info, the query, etc., has a different set of reserved characters that must be quoted. - RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists + RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists the following reserved characters. reserved = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" | - "$" | "," + "$" | "," Each of these characters is reserved in some component of a URL, but not necessarily in all of them. 
+ Python 3.7 updates from using RFC 2396 to RFC 3986 to quote URL strings. + Now, "~" is included in the set of unreserved characters. + By default, the quote function is intended for quoting the path section of a URL. Thus, it will not encode '/'. This character is reserved, but in typical usage the quote function is being From 3e8ac1b09607be9bda40952be1ae79d529550d3e Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 25 Feb 2017 15:30:35 +1000 Subject: [PATCH 2/4] Update ACKS --- Misc/ACKS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Misc/ACKS b/Misc/ACKS index 5ab6411688c7d1..7ed9a7e15e2269 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -344,6 +344,7 @@ Kushal Das Jonathan Dasteel Pierre-Yves David A. Jesse Jiryu Davis +Ratnadeep Debnath Merlijn van Deen John DeGood Ned Deily @@ -1517,6 +1518,7 @@ Mikhail Terekhov Victor Terrón Richard M. Tew Tobias Thelen +Christian Theune Févry Thibault Lowe Thiderman Nicolas M. Thiéry @@ -1527,7 +1529,7 @@ Stephen Thorne Jeremy Thurgood Eric Tiedemann July Tikhonov -Tracy Tims +Tracy Tims Oren Tirosh Tim Tisdall Jason Tishler From a4bd54b6af54155fce19b1aa0bb05a797d1c84f6 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 25 Feb 2017 15:35:27 +1000 Subject: [PATCH 3/4] Update What's New in 3.7 guide --- Doc/whatsnew/3.7.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/whatsnew/3.7.rst b/Doc/whatsnew/3.7.rst index 21621c5ee4eac9..4e72efbb9c7939 100644 --- a/Doc/whatsnew/3.7.rst +++ b/Doc/whatsnew/3.7.rst @@ -100,6 +100,13 @@ The :const:`~unittest.mock.sentinel` attributes now preserve their identity when they are :mod:`copied ` or :mod:`pickled `. (Contributed by Serhiy Storchaka in :issue:`20804`.) +urllib.parse +------------ + +:func:`urllib.parse.quote` has been updated from RFC 2396 to RFC 3986, +adding ``~`` to the set of characters that is never quoted by default. +(Contributed by Christian Theune and Ratnadeep Debnath in :issue:`16285`.) 
+ Optimizations ============= From c67169caade7d874ebea4025997588055c41b912 Mon Sep 17 00:00:00 2001 From: Nick Coghlan Date: Sat, 25 Feb 2017 18:26:59 +1000 Subject: [PATCH 4/4] Add NEWS entry --- Misc/NEWS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Misc/NEWS b/Misc/NEWS index e7ab3df8d773c0..74ec8c3bdf26e3 100644 --- a/Misc/NEWS +++ b/Misc/NEWS @@ -249,6 +249,10 @@ Extension Modules Library ------- +- Issue #16285: urllib.parse.quote is now based on RFC 3986 and hence includes + '~' in the set of characters that is not quoted by default. Patch by + Christian Theune and Ratnadeep Debnath. + - bpo-29532: Altering a kwarg dictionary passed to functools.partial() no longer affects a partial object after creation.