Browse Source

[utils] Encode hostnames before passing to urllib

With an IDN (Internationalized Domain Name) and a proxy, non-ASCII URLs
are passed down to urllib/urllib2 unencoded, causing a UnicodeEncodeError.

Fixes #8890
totalwebcasting
Yen Chi Hsuan 9 years ago
parent
commit
efbed08dc2
2 changed files with 11 additions and 0 deletions
  1. +10
    -0
      test/test_http.py
  2. +1
    -0
      youtube_dl/utils.py

+ 10
- 0
test/test_http.py View File

@ -1,4 +1,5 @@
#!/usr/bin/env python #!/usr/bin/env python
# coding: utf-8
from __future__ import unicode_literals from __future__ import unicode_literals
# Allow direct execution # Allow direct execution
@ -120,5 +121,14 @@ class TestProxy(unittest.TestCase):
response = ydl.urlopen(req).read().decode('utf-8') response = ydl.urlopen(req).read().decode('utf-8')
self.assertEqual(response, 'cn: {0}'.format(url)) self.assertEqual(response, 'cn: {0}'.format(url))
def test_proxy_with_idn(self):
    """Fetch an IDN URL through the configured proxy.

    The response body is expected to name the host in its IDNA-encoded
    (ASCII) form rather than the original non-ASCII form.
    """
    proxy_address = 'localhost:{0}'.format(self.port)
    ydl = YoutubeDL({'proxy': proxy_address})
    idn_url = 'http://中文.tw/'
    raw_body = ydl.urlopen(idn_url).read()
    # b'xn--fiq228c' is '中文'.encode('idna')
    expected = 'normal: http://xn--fiq228c.tw/'
    self.assertEqual(raw_body.decode('utf-8'), expected)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

+ 1
- 0
youtube_dl/utils.py View File

@ -1746,6 +1746,7 @@ def escape_url(url):
"""Escape URL as suggested by RFC 3986""" """Escape URL as suggested by RFC 3986"""
url_parsed = compat_urllib_parse_urlparse(url) url_parsed = compat_urllib_parse_urlparse(url)
return url_parsed._replace( return url_parsed._replace(
netloc=url_parsed.netloc.encode('idna').decode('ascii'),
path=escape_rfc3986(url_parsed.path), path=escape_rfc3986(url_parsed.path),
params=escape_rfc3986(url_parsed.params), params=escape_rfc3986(url_parsed.params),
query=escape_rfc3986(url_parsed.query), query=escape_rfc3986(url_parsed.query),


Loading…
Cancel
Save