You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

150 lines
6.3 KiB

  1. From 30a779770fe690584456970b602ea16ec3f74ce7 Mon Sep 17 00:00:00 2001
  2. From: Steve Dower <steve.dower@python.org>
  3. Date: Thu, 7 Mar 2019 08:05:31 -0800
  4. Subject: [PATCH] bpo-36216: Add check for characters in netloc that normalize
  5. to separators (GH-12201)
  6. ---
  7. Doc/library/urllib.parse.rst | 18 +++++++++++++++
  8. Lib/test/test_urlparse.py | 23 +++++++++++++++++++
  9. Lib/urllib/parse.py | 17 ++++++++++++++
  10. .../2019-03-06-09-38-40.bpo-36216.6q1m4a.rst | 3 +++
  11. 4 files changed, 61 insertions(+)
  12. create mode 100644 Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
  13. diff --git a/Doc/library/urllib.parse.rst b/Doc/library/urllib.parse.rst
  14. index 0c8f0f607314..b565e1edd321 100644
  15. --- a/Doc/library/urllib.parse.rst
  16. +++ b/Doc/library/urllib.parse.rst
  17. @@ -124,6 +124,11 @@ or on combining URL components into a URL string.
  18. Unmatched square brackets in the :attr:`netloc` attribute will raise a
  19. :exc:`ValueError`.
  20. + Characters in the :attr:`netloc` attribute that decompose under NFKC
  21. + normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
  22. + ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
  23. + decomposed before parsing, no error will be raised.
  24. +
  25. .. versionchanged:: 3.2
  26. Added IPv6 URL parsing capabilities.
  27. @@ -136,6 +141,10 @@ or on combining URL components into a URL string.
  28. Out-of-range port numbers now raise :exc:`ValueError`, instead of
  29. returning :const:`None`.
  30. + .. versionchanged:: 3.7.3
  31. + Characters that affect netloc parsing under NFKC normalization will
  32. + now raise :exc:`ValueError`.
  33. +
  34. .. function:: parse_qs(qs, keep_blank_values=False, strict_parsing=False, encoding='utf-8', errors='replace', max_num_fields=None)
  35. @@ -257,10 +266,19 @@ or on combining URL components into a URL string.
  36. Unmatched square brackets in the :attr:`netloc` attribute will raise a
  37. :exc:`ValueError`.
  38. + Characters in the :attr:`netloc` attribute that decompose under NFKC
  39. + normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
  40. + ``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
  41. + decomposed before parsing, no error will be raised.
  42. +
  43. .. versionchanged:: 3.6
  44. Out-of-range port numbers now raise :exc:`ValueError`, instead of
  45. returning :const:`None`.
  46. + .. versionchanged:: 3.7.3
  47. + Characters that affect netloc parsing under NFKC normalization will
  48. + now raise :exc:`ValueError`.
  49. +
  50. .. function:: urlunsplit(parts)
  51. diff --git a/Lib/test/test_urlparse.py b/Lib/test/test_urlparse.py
  52. index be50b47603aa..e6638aee2244 100644
  53. --- a/Lib/test/test_urlparse.py
  54. +++ b/Lib/test/test_urlparse.py
  55. @@ -1,3 +1,5 @@
  56. +import sys
  57. +import unicodedata
  58. import unittest
  59. import urllib.parse
  60. @@ -984,6 +986,27 @@ def test_all(self):
  61. expected.append(name)
  62. self.assertCountEqual(urllib.parse.__all__, expected)
  63. + def test_urlsplit_normalization(self):
  64. + # Certain characters should never occur in the netloc,
  65. + # including under normalization.
  66. + # Ensure that ALL of them are detected and cause an error
  67. + illegal_chars = '/:#?@'
  68. + hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
  69. + denorm_chars = [
  70. + c for c in map(chr, range(128, sys.maxunicode))
  71. + if (hex_chars & set(unicodedata.decomposition(c).split()))
  72. + and c not in illegal_chars
  73. + ]
  74. + # Sanity check that we found at least one such character
  75. + self.assertIn('\u2100', denorm_chars)
  76. + self.assertIn('\uFF03', denorm_chars)
  77. +
  78. + for scheme in ["http", "https", "ftp"]:
  79. + for c in denorm_chars:
  80. + url = "{}://netloc{}false.netloc/path".format(scheme, c)
  81. + with self.subTest(url=url, char='{:04X}'.format(ord(c))):
  82. + with self.assertRaises(ValueError):
  83. + urllib.parse.urlsplit(url)
  84. class Utility_Tests(unittest.TestCase):
  85. """Testcase to test the various utility functions in the urllib."""
  86. diff --git a/Lib/urllib/parse.py b/Lib/urllib/parse.py
  87. index f691ab74f87f..39c5d6a80824 100644
  88. --- a/Lib/urllib/parse.py
  89. +++ b/Lib/urllib/parse.py
  90. @@ -391,6 +391,21 @@ def _splitnetloc(url, start=0):
  91. delim = min(delim, wdelim) # use earliest delim position
  92. return url[start:delim], url[delim:] # return (domain, rest)
  93. +def _checknetloc(netloc):
  94. + if not netloc or netloc.isascii():
  95. + return
  96. + # looking for characters like \u2100 that expand to 'a/c'
  97. + # IDNA uses NFKC equivalence, so normalize for this check
  98. + import unicodedata
  99. + netloc2 = unicodedata.normalize('NFKC', netloc)
  100. + if netloc == netloc2:
  101. + return
  102. + _, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
  103. + for c in '/?#@:':
  104. + if c in netloc2:
  105. + raise ValueError("netloc '" + netloc2 + "' contains invalid " +
  106. + "characters under NFKC normalization")
  107. +
  108. def urlsplit(url, scheme='', allow_fragments=True):
  109. """Parse a URL into 5 components:
  110. <scheme>://<netloc>/<path>?<query>#<fragment>
  111. @@ -419,6 +434,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
  112. url, fragment = url.split('#', 1)
  113. if '?' in url:
  114. url, query = url.split('?', 1)
  115. + _checknetloc(netloc)
  116. v = SplitResult('http', netloc, url, query, fragment)
  117. _parse_cache[key] = v
  118. return _coerce_result(v)
  119. @@ -442,6 +458,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
  120. url, fragment = url.split('#', 1)
  121. if '?' in url:
  122. url, query = url.split('?', 1)
  123. + _checknetloc(netloc)
  124. v = SplitResult(scheme, netloc, url, query, fragment)
  125. _parse_cache[key] = v
  126. return _coerce_result(v)
  127. diff --git a/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
  128. new file mode 100644
  129. index 000000000000..5546394157f9
  130. --- /dev/null
  131. +++ b/Misc/NEWS.d/next/Security/2019-03-06-09-38-40.bpo-36216.6q1m4a.rst
  132. @@ -0,0 +1,3 @@
  133. +Changes urlsplit() to raise ValueError when the URL contains characters that
  134. +decompose under IDNA encoding (NFKC-normalization) into characters that
  135. +affect how the URL is parsed.