From 7198063d96003050eccb0ea59cc938f0388c0606 Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 24 May 2015 15:26:59 -0500 Subject: [PATCH 01/29] [pinkbike] new extractor --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/pinkbike.py | 78 ++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 youtube_dl/extractor/pinkbike.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 79bcd9106..80bec39da 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -394,6 +394,7 @@ from .pbs import PBSIE from .philharmoniedeparis import PhilharmonieDeParisIE from .phoenix import PhoenixIE from .photobucket import PhotobucketIE +from .pinkbike import PinkbikeIE from .planetaplay import PlanetaPlayIE from .pladform import PladformIE from .played import PlayedIE diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py new file mode 100644 index 000000000..4a15c1835 --- /dev/null +++ b/youtube_dl/extractor/pinkbike.py @@ -0,0 +1,78 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class PinkbikeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pinkbike\.com/video/(?P[0-9]+)' + _TESTS = [{ + 'url': 'http://www.pinkbike.com/video/402811/', + 'md5': '4814b8ca7651034cd87e3361d5c2155a', + 'info_dict': { + 'id': '402811', + 'ext': 'mp4', + 'title': 'Brandon Semenuk - RAW 100', + 'thumbnail': 're:^https?://.*\.jpg$', + 'location': 'Victoria, British Columbia, Canada', + 'uploader_id': 'revelco', + 'upload_date': '20150406', + 'description': 'Official release: www.redbull.ca/rupertwalker', + 'duration': '100' + } + }, { + 'url': 'http://www.pinkbike.com/video/406629/', + 'md5': 'c7a3e19a2bd5cde5a1cda6b2b46caa74', + 'info_dict': { + 'id': '406629', + 'ext': 'mp4', + 'title': 'Chromag: Reece Wallace in Utah', + 'thumbnail': 're:^https?://.*\.jpg$', + 'location': 'Whistler, British Columbia, Canada', + 'uploader_id': 'Chromagbikes', + 'upload_date': '20150505', + 'description': 'Reece Wallace shredding Virgin, Utah. Video by Virtu Media.', + 'duration': '180' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'(.*?)', webpage, 'title') + title = title[:-len(' Video - Pinkbike')] + + description = self._html_search_meta('description', webpage, 'description') + description = description[len(title + '. '):] + + uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id') + + upload_date = self._html_search_regex( + r'class="fullTime"\s*title="([0-9]{4}(?:-[0-9]{2}){2})"', + webpage, 'upload_date') + upload_date = upload_date.replace('-', '') + + location = self._html_search_regex( + r'
Location
\n?\s*
\n?(.*?)\s*', + webpage) + + formats = [{'url': fmt[1], 'height': fmt[0]} for fmt in formats] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': self._html_search_meta('video:duration', webpage, 'duration'), + 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), + 'uploader_id': uploader_id, + 'upload_date': upload_date, + 'location': location, + 'formats': formats + } From 2c935c0c7224a3332ff9f0fd83e8c074cfbe2c9d Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 24 May 2015 16:30:03 -0500 Subject: [PATCH 02/29] [pinkbike] converted duration to int --- youtube_dl/extractor/pinkbike.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 4a15c1835..66605ddbe 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -20,7 +20,7 @@ class PinkbikeIE(InfoExtractor): 'uploader_id': 'revelco', 'upload_date': '20150406', 'description': 'Official release: www.redbull.ca/rupertwalker', - 'duration': '100' + 'duration': 100 } }, { 'url': 'http://www.pinkbike.com/video/406629/', @@ -34,7 +34,7 @@ class PinkbikeIE(InfoExtractor): 'uploader_id': 'Chromagbikes', 'upload_date': '20150505', 'description': 'Reece Wallace shredding Virgin, Utah. Video by Virtu Media.', - 'duration': '180' + 'duration': 180 } }] @@ -69,7 +69,7 @@ class PinkbikeIE(InfoExtractor): 'id': video_id, 'title': title, 'description': description, - 'duration': self._html_search_meta('video:duration', webpage, 'duration'), + 'duration': int(self._html_search_meta('video:duration', webpage, 'duration')), 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), 'uploader_id': uploader_id, 'upload_date': upload_date, From 680f9744c4e010ad5111c7711c58c341d5ba24dd Mon Sep 17 00:00:00 2001 From: Mister Hat Date: Sun, 24 May 2015 16:45:10 -0500 Subject: [PATCH 03/29] [pinkbike] used proper conversion methods --- youtube_dl/extractor/pinkbike.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 66605ddbe..45c0b1377 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -4,6 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_end, + remove_start +) class PinkbikeIE(InfoExtractor): @@ -43,10 +48,13 @@ class PinkbikeIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'(.*?)', webpage, 'title') - title = title[:-len(' Video - Pinkbike')] + title = remove_end(title, ' Video - Pinkbike') description = self._html_search_meta('description', webpage, 'description') - description = description[len(title + '. '):] + description = remove_start(description, title + '. ') + + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration')) uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id') @@ -63,13 +71,13 @@ class PinkbikeIE(InfoExtractor): r'', webpage) - formats = [{'url': fmt[1], 'height': fmt[0]} for fmt in formats] + formats = [{'url': fmt[1], 'height': int_or_none(fmt[0])} for fmt in formats] return { 'id': video_id, 'title': title, 'description': description, - 'duration': int(self._html_search_meta('video:duration', webpage, 'duration')), + 'duration': duration, 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), 'uploader_id': uploader_id, 'upload_date': upload_date, From c9bebed294dd29d9188265c8f7bfb0e1b43406ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Jun 2015 20:52:44 +0600 Subject: [PATCH 04/29] [youtube] Add itag 59 and 78 (Closes #5979) --- youtube_dl/extractor/youtube.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9e2671192..a3da56c14 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -234,6 +234,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '44': {'ext': 'webm', 'width': 854, 'height': 480}, '45': {'ext': 'webm', 'width': 1280, 'height': 720}, '46': {'ext': 'webm', 'width': 1920, 'height': 1080}, + '59': {'ext': 'mp4', 'width': 854, 'height': 480}, + '78': {'ext': 'mp4', 'width': 854, 'height': 480}, # 3d videos From cbcd1a5474dd8b39e68b0d2bbc493701c655a2d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Jun 2015 21:57:31 +0600 Subject: [PATCH 05/29] [dramafever] Add support for authentication (Closes #6017) --- youtube_dl/extractor/dramafever.py | 38 ++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index a34aad486..cfbcddcef 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -6,6 +6,8 @@ import itertools from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_urllib_parse, + compat_urllib_request, compat_urlparse, ) from ..utils import ( @@ -17,7 +19,39 @@ from ..utils import ( ) -class DramaFeverIE(InfoExtractor): +class DramaFeverBaseIE(InfoExtractor): + _LOGIN_URL = 'https://www.dramafever.com/accounts/login/' + _NETRC_MACHINE = 'dramafever' + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_form = { + 'username': username, + 'password': password, + } + + request = compat_urllib_request.Request( + self._LOGIN_URL, compat_urllib_parse.urlencode(login_form).encode('utf-8')) + response = self._download_webpage( + request, None, 'Logging in as %s' % username) + + if all(logout_pattern not in response + for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): + error = self._html_search_regex( + r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class DramaFeverIE(DramaFeverBaseIE): IE_NAME = 'dramafever' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+/[0-9]+)(?:/|$)' _TEST = { @@ -97,7 +131,7 @@ class DramaFeverIE(InfoExtractor): } -class DramaFeverSeriesIE(InfoExtractor): +class DramaFeverSeriesIE(DramaFeverBaseIE): IE_NAME = 'dramafever:series' _VALID_URL = r'https?://(?:www\.)?dramafever\.com/drama/(?P[0-9]+)(?:/(?:(?!\d+(?:/|$)).+)?)?$' _TESTS = [{ From 10464af5d1d03a3461286a601ae7db91c5a8141c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Fri, 19 Jun 2015 22:02:07 +0600 Subject: [PATCH 06/29] [dramafever:series] Fix extraction while authenticated --- youtube_dl/extractor/dramafever.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index cfbcddcef..ca41a3abf 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -185,8 +185,11 @@ class DramaFeverSeriesIE(DramaFeverBaseIE): % (consumer_secret, series_id, self._PAGE_SIZE, page_num), series_id, 'Downloading episodes JSON page #%d' % page_num) for episode in episodes.get('value', []): + episode_url = episode.get('episode_url') + if not episode_url: + continue entries.append(self.url_result( - compat_urlparse.urljoin(url, episode['episode_url']), + compat_urlparse.urljoin(url, episode_url), 'DramaFever', episode.get('guid'))) if page_num == episodes['num_pages']: break From 385c3e5e91680dcc6573f05e6b30fdf45048503e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Jun 2015 00:10:08 +0600 Subject: [PATCH 07/29] [pinkbike] Improve and simplify --- youtube_dl/extractor/pinkbike.py | 86 ++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 45c0b1377..745433b48 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -7,12 +7,14 @@ from .common import InfoExtractor from ..utils import ( int_or_none, remove_end, - remove_start + remove_start, + str_to_int, + unified_strdate, ) class PinkbikeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pinkbike\.com/video/(?P[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.)?pinkbike\.com/video/|es\.pinkbike\.org/i/kvid/kvid-y5\.swf\?id=)(?P[0-9]+)' _TESTS = [{ 'url': 'http://www.pinkbike.com/video/402811/', 'md5': '4814b8ca7651034cd87e3361d5c2155a', @@ -20,67 +22,75 @@ class PinkbikeIE(InfoExtractor): 'id': '402811', 'ext': 'mp4', 'title': 'Brandon Semenuk - RAW 100', + 'description': 'Official release: www.redbull.ca/rupertwalker', 'thumbnail': 're:^https?://.*\.jpg$', - 'location': 'Victoria, British Columbia, Canada', - 'uploader_id': 'revelco', + 'duration': 100, 'upload_date': '20150406', - 'description': 'Official release: www.redbull.ca/rupertwalker', - 'duration': 100 + 'uploader': 'revelco', + 'location': 'Victoria, British Columbia, Canada', + 'view_count': int, + 'comment_count': int, } }, { - 'url': 'http://www.pinkbike.com/video/406629/', - 'md5': 'c7a3e19a2bd5cde5a1cda6b2b46caa74', - 'info_dict': { - 'id': '406629', - 'ext': 'mp4', - 'title': 'Chromag: Reece Wallace in Utah', - 'thumbnail': 're:^https?://.*\.jpg$', - 'location': 'Whistler, British Columbia, Canada', - 'uploader_id': 'Chromagbikes', - 'upload_date': '20150505', - 'description': 'Reece Wallace shredding Virgin, Utah. Video by Virtu Media.', - 'duration': 180 - } + 'url': 'http://es.pinkbike.org/i/kvid/kvid-y5.swf?id=406629', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'(.*?)', webpage, 'title') - title = remove_end(title, ' Video - Pinkbike') + webpage = self._download_webpage( + 'http://www.pinkbike.com/video/%s' % video_id, video_id) - description = self._html_search_meta('description', webpage, 'description') - description = remove_start(description, title + '. ') + formats = [] + for _, format_id, src in re.findall( + r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): + height = int_or_none(self._search_regex( + r'^(\d+)[pP]$', format_id, 'height', default=None)) + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + }) + self._sort_formats(formats) + title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') + description = self._html_search_regex( + r'(?s)id="media-description"[^>]*>(.+?)<', + webpage, 'description', default=None) or remove_start( + self._og_search_description(webpage), title + '. ') + thumbnail = self._og_search_thumbnail(webpage) duration = int_or_none(self._html_search_meta( 'video:duration', webpage, 'duration')) - uploader_id = self._html_search_regex(r'un:\s*"(.*?)"', webpage, 'uploader_id') - - upload_date = self._html_search_regex( - r'class="fullTime"\s*title="([0-9]{4}(?:-[0-9]{2}){2})"', - webpage, 'upload_date') - upload_date = upload_date.replace('-', '') + uploader = self._search_regex( + r'un:\s*"([^"]+)"', webpage, 'uploader', fatal=False) + upload_date = unified_strdate(self._search_regex( + r'class="fullTime"[^>]+title="([^"]+)"', + webpage, 'upload date', fatal=False)) location = self._html_search_regex( - r'
Location
\n?\s*
\n?(.*?)\s*Location\s*
(.+?)<', + webpage, 'location', fatal=False) - formats = re.findall( - r'', - webpage) + def extract_count(webpage, label): + return str_to_int(self._search_regex( + r']+class="stat-num"[^>]*>([\d,.]+)\s*]+class="stat-label"[^>]*>%s' % label, + webpage, label, fatal=False)) - formats = [{'url': fmt[1], 'height': int_or_none(fmt[0])} for fmt in formats] + view_count = extract_count(webpage, 'Views') + comment_count = extract_count(webpage, 'Comments') return { 'id': video_id, 'title': title, 'description': description, + 'thumbnail': thumbnail, 'duration': duration, - 'thumbnail': self._html_search_meta('og:image', webpage, 'thumbnail'), - 'uploader_id': uploader_id, 'upload_date': upload_date, + 'uploader': uploader, 'location': location, + 'view_count': view_count, + 'comment_count': comment_count, 'formats': formats } From 16d6973f8a9eb5a70c12d82aa40f57c2b4aa8c6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 20 Jun 2015 00:49:28 +0600 Subject: [PATCH 08/29] [viki] Pass session token around (#6005) --- youtube_dl/extractor/viki.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index 52d10d242..51cdc6b65 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -28,11 +28,15 @@ class VikiBaseIE(InfoExtractor): _NETRC_MACHINE = 'viki' + _token = None + def _prepare_call(self, path, timestamp=None, post_data=None): path += '?' if '?' not in path else '&' if not timestamp: timestamp = int(time.time()) query = self._API_QUERY_TEMPLATE % (path, self._APP, timestamp) + if self._token: + query += '&token=%s' % self._token sig = hmac.new( self._APP_SECRET.encode('ascii'), query.encode('ascii'), @@ -76,10 +80,14 @@ class VikiBaseIE(InfoExtractor): 'password': password, } - self._call_api( + login = self._call_api( 'sessions.json', None, 'Logging in as %s' % username, post_data=login_form) + self._token = login.get('token') + if not self._token: + self.report_warning('Unable to get session token, login has probably failed') + class VikiIE(VikiBaseIE): IE_NAME = 'viki' From 964afd0689bdd7140b8ab182273d6379fe7b0548 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 03:12:17 +0800 Subject: [PATCH 09/29] [xvideos] Support lower-quality formats found on Android Closes #5968 --- youtube_dl/extractor/xvideos.py | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 2a45dc574..d8415bed4 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -5,10 +5,12 @@ import re from .common import InfoExtractor from ..compat import ( compat_urllib_parse, + compat_urllib_request, ) from ..utils import ( clean_html, ExtractorError, + determine_ext, ) @@ -25,6 +27,8 @@ class XVideosIE(InfoExtractor): } } + _ANDROID_USER_AGENT = 'Mozilla/5.0 (Linux; Android 4.0.4; Galaxy Nexus Build/IMM76B) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.133 Mobile Safari/535.19' + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -40,9 +44,30 @@ class XVideosIE(InfoExtractor): video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + formats = [{ + 'url': video_url, + }] + + android_req = compat_urllib_request.Request(url) + android_req.add_header('User-Agent', self._ANDROID_USER_AGENT) + android_webpage = self._download_webpage(android_req, video_id, fatal=False) + + if android_webpage is not None: + player_params_str = self._search_regex( + 'mobileReplacePlayerDivTwoQual\(([^)]+)\)', + android_webpage, 'player parameters', default='') + player_params = list(map(lambda s: s.strip(' \''), player_params_str.split(','))) + if player_params: + formats.extend([{ + 'url': param, + 'preference': -10, + } for param in player_params if determine_ext(param) == 'mp4']) + + self._sort_formats(formats) + return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'title': video_title, 'ext': 'flv', 'thumbnail': video_thumbnail, From c9ac7fa909fb969ac21a6d168d09803119b018c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Jun 2015 04:17:54 +0600 Subject: [PATCH 10/29] [imdb] Fix extraction --- youtube_dl/extractor/imdb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f29df36b5..4bb574cf3 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -46,7 +46,7 @@ class ImdbIE(InfoExtractor): format_info = info['videoPlayerObject']['video'] formats.append({ 'format_id': f_id, - 'url': format_info['url'], + 'url': format_info['videoInfoList'][0]['videoUrl'], }) return { From 6a745c2c0fa2d627b46f2d4d8013fa69276c4fac Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 18:22:19 +0800 Subject: [PATCH 11/29] [pinkbike] PEP8 --- youtube_dl/extractor/pinkbike.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/pinkbike.py b/youtube_dl/extractor/pinkbike.py index 745433b48..a52210fab 100644 --- a/youtube_dl/extractor/pinkbike.py +++ b/youtube_dl/extractor/pinkbike.py @@ -44,7 +44,7 @@ class PinkbikeIE(InfoExtractor): formats = [] for _, format_id, src in re.findall( - r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): + r'data-quality=((?:\\)?["\'])(.+?)\1[^>]+src=\1(.+?)\1', webpage): height = int_or_none(self._search_regex( r'^(\d+)[pP]$', format_id, 'height', default=None)) formats.append({ From b407e173e44041b1a92fb61e316f92d19834a40a Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 18:23:58 +0800 Subject: [PATCH 12/29] [vimeo/generic] Move detection logic from GenericIE to VimeoIE --- youtube_dl/extractor/generic.py | 16 ++++------------ youtube_dl/extractor/vimeo.py | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index f6b984300..bf689f531 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -43,6 +43,7 @@ from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE +from .vimeo import VimeoIE class GenericIE(InfoExtractor): @@ -1089,18 +1090,9 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, ie='RtlNl') - # Look for embedded (iframe) Vimeo player - mobj = re.search( - r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) - if mobj: - player_url = unescapeHTML(mobj.group('url')) - surl = smuggle_url(player_url, {'Referer': url}) - return self.url_result(surl) - # Look for embedded (swf embed) Vimeo player - mobj = re.search( - r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) - if mobj: - return self.url_result(mobj.group(1)) + vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) + if vimeo_url is not None: + return self.url_result(vimeo_url) # Look for embedded YouTube player matches = re.findall(r'''(?x) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index f300c7ca4..cae90205d 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -22,6 +22,7 @@ from ..utils import ( unified_strdate, unsmuggle_url, urlencode_postdata, + unescapeHTML, ) @@ -173,6 +174,21 @@ class VimeoIE(VimeoBaseInfoExtractor): }, ] + @staticmethod + def _extract_vimeo_url(url, webpage): + # Look for embedded (iframe) Vimeo player + mobj = re.search( + r']+?src=(["\'])(?P(?:https?:)?//player\.vimeo\.com/video/.+?)\1', webpage) + if mobj: + player_url = unescapeHTML(mobj.group('url')) + surl = smuggle_url(player_url, {'Referer': url}) + return surl + # Look for embedded (swf embed) Vimeo player + mobj = re.search( + r']+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) + if mobj: + return mobj.group(1) + def _verify_video_password(self, url, video_id, webpage): password = self._downloader.params.get('videopassword', None) if password is None: From c5895d5dbdc33fbad1c91f448704d7711448220d Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 18:30:38 +0800 Subject: [PATCH 13/29] [tumblr] Support Vimeo embeds (fixes #5969) --- youtube_dl/extractor/tumblr.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/youtube_dl/extractor/tumblr.py b/youtube_dl/extractor/tumblr.py index 63c20310d..9ead13a91 100644 --- a/youtube_dl/extractor/tumblr.py +++ b/youtube_dl/extractor/tumblr.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from .pornhub import PornHubIE +from .vimeo import VimeoIE class TumblrIE(InfoExtractor): @@ -40,6 +41,17 @@ class TumblrIE(InfoExtractor): 'timestamp': 1430931613, }, 'add_ie': ['Vidme'], + }, { + 'url': 'http://camdamage.tumblr.com/post/98846056295/', + 'md5': 'a9e0c8371ea1ca306d6554e3fecf50b6', + 'info_dict': { + 'id': '105463834', + 'ext': 'mp4', + 'title': 'Cam Damage-HD 720p', + 'uploader': 'John Moyer', + 'uploader_id': 'user32021558', + }, + 'add_ie': ['Vimeo'], }] def _real_extract(self, url): @@ -60,6 +72,10 @@ class TumblrIE(InfoExtractor): if pornhub_url: return self.url_result(pornhub_url, 'PornHub') + vimeo_url = VimeoIE._extract_vimeo_url(url, webpage) + if vimeo_url: + return self.url_result(vimeo_url, 'Vimeo') + iframe_url = self._search_regex( r'src=\'(https?://www\.tumblr\.com/video/[^\']+)\'', webpage, 'iframe url') From 396726244a9096f142f5420ba5f3a1a36abb9a86 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 18:53:17 +0800 Subject: [PATCH 14/29] [utils/ffmpeg] Move ISO 639 related codes to utils --- youtube_dl/postprocessor/ffmpeg.py | 196 +--------------------------- youtube_dl/utils.py | 202 +++++++++++++++++++++++++++++ 2 files changed, 204 insertions(+), 194 deletions(-) diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py index cc65b34e7..fe7e0a8ee 100644 --- a/youtube_dl/postprocessor/ffmpeg.py +++ b/youtube_dl/postprocessor/ffmpeg.py @@ -21,6 +21,7 @@ from ..utils import ( shell_quote, subtitles_filename, dfxp2srt, + ISO639Utils, ) @@ -307,199 +308,6 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): - # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt - _lang_map = { - 'aa': 'aar', - 'ab': 'abk', - 'ae': 'ave', - 'af': 'afr', - 'ak': 'aka', - 'am': 'amh', - 'an': 'arg', - 'ar': 'ara', - 'as': 'asm', - 'av': 'ava', - 'ay': 'aym', - 'az': 'aze', - 'ba': 'bak', - 'be': 'bel', - 'bg': 'bul', - 'bh': 'bih', - 'bi': 'bis', - 'bm': 'bam', - 'bn': 'ben', - 'bo': 'bod', - 'br': 'bre', - 'bs': 'bos', - 'ca': 'cat', - 'ce': 'che', - 'ch': 'cha', - 'co': 'cos', - 'cr': 'cre', - 'cs': 'ces', - 'cu': 'chu', - 'cv': 'chv', - 'cy': 'cym', - 'da': 'dan', - 'de': 'deu', - 'dv': 'div', - 'dz': 'dzo', - 'ee': 'ewe', - 'el': 'ell', - 'en': 'eng', - 'eo': 'epo', - 'es': 'spa', - 'et': 'est', - 'eu': 'eus', - 'fa': 'fas', - 'ff': 'ful', - 'fi': 'fin', - 'fj': 'fij', - 'fo': 'fao', - 'fr': 'fra', - 'fy': 'fry', - 'ga': 'gle', - 'gd': 'gla', - 'gl': 'glg', - 'gn': 'grn', - 'gu': 'guj', - 'gv': 'glv', - 'ha': 'hau', - 'he': 'heb', - 'hi': 'hin', - 'ho': 'hmo', - 'hr': 'hrv', - 'ht': 'hat', - 'hu': 'hun', - 'hy': 'hye', - 'hz': 'her', - 'ia': 'ina', - 'id': 'ind', - 'ie': 'ile', - 'ig': 'ibo', - 'ii': 'iii', - 'ik': 'ipk', - 'io': 'ido', - 'is': 'isl', - 'it': 'ita', - 'iu': 'iku', - 'ja': 'jpn', - 'jv': 'jav', - 'ka': 'kat', - 'kg': 'kon', - 'ki': 'kik', - 'kj': 'kua', - 'kk': 'kaz', - 'kl': 'kal', - 'km': 'khm', - 'kn': 'kan', - 'ko': 'kor', - 'kr': 'kau', - 'ks': 'kas', - 'ku': 'kur', - 'kv': 'kom', - 'kw': 'cor', - 'ky': 'kir', - 'la': 'lat', - 'lb': 'ltz', - 'lg': 'lug', - 'li': 'lim', - 'ln': 'lin', - 'lo': 'lao', - 'lt': 'lit', - 'lu': 'lub', - 'lv': 'lav', - 'mg': 'mlg', - 'mh': 'mah', - 'mi': 'mri', - 'mk': 'mkd', - 'ml': 'mal', - 'mn': 'mon', - 'mr': 'mar', - 'ms': 'msa', - 'mt': 'mlt', - 'my': 'mya', - 'na': 'nau', - 'nb': 'nob', - 'nd': 'nde', - 'ne': 'nep', - 'ng': 'ndo', - 'nl': 'nld', - 'nn': 'nno', - 'no': 'nor', - 'nr': 'nbl', - 'nv': 'nav', - 'ny': 'nya', - 'oc': 'oci', - 'oj': 'oji', - 'om': 'orm', - 'or': 'ori', - 'os': 'oss', - 'pa': 'pan', - 'pi': 'pli', - 'pl': 'pol', - 'ps': 'pus', - 'pt': 'por', - 'qu': 'que', - 'rm': 'roh', - 'rn': 'run', - 'ro': 'ron', - 'ru': 'rus', - 'rw': 'kin', - 'sa': 'san', - 'sc': 'srd', - 'sd': 'snd', - 'se': 'sme', - 'sg': 'sag', - 'si': 'sin', - 'sk': 'slk', - 'sl': 'slv', - 'sm': 'smo', - 'sn': 'sna', - 'so': 'som', - 'sq': 'sqi', - 'sr': 'srp', - 'ss': 'ssw', - 'st': 'sot', - 'su': 'sun', - 'sv': 'swe', - 'sw': 'swa', - 'ta': 'tam', - 'te': 'tel', - 'tg': 'tgk', - 'th': 'tha', - 'ti': 'tir', - 'tk': 'tuk', - 'tl': 'tgl', - 'tn': 'tsn', - 'to': 'ton', - 'tr': 'tur', - 'ts': 'tso', - 'tt': 'tat', - 'tw': 'twi', - 'ty': 'tah', - 'ug': 'uig', - 'uk': 'ukr', - 'ur': 'urd', - 'uz': 'uzb', - 've': 'ven', - 'vi': 'vie', - 'vo': 'vol', - 'wa': 'wln', - 'wo': 'wol', - 'xh': 'xho', - 'yi': 'yid', - 'yo': 'yor', - 'za': 'zha', - 'zh': 'zho', - 'zu': 'zul', - } - - @classmethod - def _conver_lang_code(cls, code): - """Convert language code from ISO 639-1 to ISO 639-2/T""" - return cls._lang_map.get(code[:2]) - def run(self, information): if information['ext'] not in ['mp4', 'mkv']: self._downloader.to_screen('[ffmpeg] Subtitles can only be embedded in mp4 or mkv files') @@ -525,7 +333,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): opts += ['-c:s', 'mov_text'] for (i, lang) in enumerate(sub_langs): opts.extend(['-map', '%d:0' % (i + 1)]) - lang_code = self._conver_lang_code(lang) + lang_code = ISO639Utils.short2long(lang) if lang_code is not None: opts.extend(['-metadata:s:s:%d' % i, 'language=%s' % lang_code]) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 52d198fa3..259a9d634 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1879,6 +1879,208 @@ def dfxp2srt(dfxp_data): return ''.join(out) +class ISO639Utils(object): + # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt + _lang_map = { + 'aa': 'aar', + 'ab': 'abk', + 'ae': 'ave', + 'af': 'afr', + 'ak': 'aka', + 'am': 'amh', + 'an': 'arg', + 'ar': 'ara', + 'as': 'asm', + 'av': 'ava', + 'ay': 'aym', + 'az': 'aze', + 'ba': 'bak', + 'be': 'bel', + 'bg': 'bul', + 'bh': 'bih', + 'bi': 'bis', + 'bm': 'bam', + 'bn': 'ben', + 'bo': 'bod', + 'br': 'bre', + 'bs': 'bos', + 'ca': 'cat', + 'ce': 'che', + 'ch': 'cha', + 'co': 'cos', + 'cr': 'cre', + 'cs': 'ces', + 'cu': 'chu', + 'cv': 'chv', + 'cy': 'cym', + 'da': 'dan', + 'de': 'deu', + 'dv': 'div', + 'dz': 'dzo', + 'ee': 'ewe', + 'el': 'ell', + 'en': 'eng', + 'eo': 'epo', + 'es': 'spa', + 'et': 'est', + 'eu': 'eus', + 'fa': 'fas', + 'ff': 'ful', + 'fi': 'fin', + 'fj': 'fij', + 'fo': 'fao', + 'fr': 'fra', + 'fy': 'fry', + 'ga': 'gle', + 'gd': 'gla', + 'gl': 'glg', + 'gn': 'grn', + 'gu': 'guj', + 'gv': 'glv', + 'ha': 'hau', + 'he': 'heb', + 'hi': 'hin', + 'ho': 'hmo', + 'hr': 'hrv', + 'ht': 'hat', + 'hu': 'hun', + 'hy': 'hye', + 'hz': 'her', + 'ia': 'ina', + 'id': 'ind', + 'ie': 'ile', + 'ig': 'ibo', + 'ii': 'iii', + 'ik': 'ipk', + 'io': 'ido', + 'is': 'isl', + 'it': 'ita', + 'iu': 'iku', + 'ja': 'jpn', + 'jv': 'jav', + 'ka': 'kat', + 'kg': 'kon', + 'ki': 'kik', + 'kj': 'kua', + 'kk': 'kaz', + 'kl': 'kal', + 'km': 'khm', + 'kn': 'kan', + 'ko': 'kor', + 'kr': 'kau', + 'ks': 'kas', + 'ku': 'kur', + 'kv': 'kom', + 'kw': 'cor', + 'ky': 'kir', + 'la': 'lat', + 'lb': 'ltz', + 'lg': 'lug', + 'li': 'lim', + 'ln': 'lin', + 'lo': 'lao', + 'lt': 'lit', + 'lu': 'lub', + 'lv': 'lav', + 'mg': 'mlg', + 'mh': 'mah', + 'mi': 'mri', + 'mk': 'mkd', + 'ml': 'mal', + 'mn': 'mon', + 'mr': 'mar', + 'ms': 'msa', + 'mt': 'mlt', + 'my': 'mya', + 'na': 'nau', + 'nb': 'nob', + 'nd': 'nde', + 'ne': 'nep', + 'ng': 'ndo', + 'nl': 'nld', + 'nn': 'nno', + 'no': 'nor', + 'nr': 'nbl', + 'nv': 'nav', + 'ny': 'nya', + 'oc': 'oci', + 'oj': 'oji', + 'om': 'orm', + 'or': 'ori', + 'os': 'oss', + 'pa': 'pan', + 'pi': 'pli', + 'pl': 'pol', + 'ps': 'pus', + 'pt': 'por', + 'qu': 'que', + 'rm': 'roh', + 'rn': 'run', + 'ro': 'ron', + 'ru': 'rus', + 'rw': 'kin', + 'sa': 'san', + 'sc': 'srd', + 'sd': 'snd', + 'se': 'sme', + 'sg': 'sag', + 'si': 'sin', + 'sk': 'slk', + 'sl': 'slv', + 'sm': 'smo', + 'sn': 'sna', + 'so': 'som', + 'sq': 'sqi', + 'sr': 'srp', + 'ss': 'ssw', + 'st': 'sot', + 'su': 'sun', + 'sv': 'swe', + 'sw': 'swa', + 'ta': 'tam', + 'te': 'tel', + 'tg': 'tgk', + 'th': 'tha', + 'ti': 'tir', + 'tk': 'tuk', + 'tl': 'tgl', + 'tn': 'tsn', + 'to': 'ton', + 'tr': 'tur', + 'ts': 'tso', + 'tt': 'tat', + 'tw': 'twi', + 'ty': 'tah', + 'ug': 'uig', + 'uk': 'ukr', + 'ur': 'urd', + 'uz': 'uzb', + 've': 'ven', + 'vi': 'vie', + 'vo': 'vol', + 'wa': 'wln', + 'wo': 'wol', + 'xh': 'xho', + 'yi': 'yid', + 'yo': 'yor', + 'za': 'zha', + 'zh': 'zho', + 'zu': 'zul', + } + + @classmethod + def short2long(cls, code): + """Convert language code from ISO 639-1 to ISO 639-2/T""" + return cls._lang_map.get(code[:2]) + + @classmethod + def long2short(cls, code): + """Convert language code from ISO 639-2/T to ISO 639-1""" + for short_name, long_name in cls._lang_map.items(): + if long_name == code: + return short_name + + class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers From 607841af64d308eaf577e528fd7317a8b382b8e6 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 18:55:26 +0800 Subject: [PATCH 15/29] [adobetv] Support embeddable videos (closes #6039) --- youtube_dl/extractor/__init__.py | 5 ++- youtube_dl/extractor/adobetv.py | 59 ++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 6c548d8e9..0f4af88f0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -4,7 +4,10 @@ from .abc import ABCIE from .abc7news import Abc7NewsIE from .academicearth import AcademicEarthCourseIE from .addanime import AddAnimeIE -from .adobetv import AdobeTVIE +from .adobetv import ( + AdobeTVIE, + AdobeTVVideoIE, +) from .adultswim import AdultSwimIE from .aftenposten import AftenpostenIE from .aftonbladet import AftonbladetIE diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 97d128560..695a4a15c 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -5,6 +5,8 @@ from ..utils import ( parse_duration, unified_strdate, str_to_int, + float_or_none, + ISO639Utils, ) @@ -69,3 +71,60 @@ class AdobeTVIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class AdobeTVVideoIE(InfoExtractor): + _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' + + _TEST = { + 'url': 'https://video.tv.adobe.com/v/2456/', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'var\s+bridge\s*=\s*([^;]+);', webpage, 'player parameters'), + video_id) + + formats = [{ + 'url': source['src'], + 'width': source.get('width'), + 'height': source.get('height'), + 'tbr': source.get('bitrate'), + } for source in player_params['sources']] + + # For both metadata and downloaded files the duration varies among + # formats. I just pick the max one + duration = max(filter(None, [ + float_or_none(source.get('duration'), scale=1000) + for source in player_params['sources']])) + + subtitles = {} + for translation in player_params.get('translations', []): + lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium']) + if lang_id not in subtitles: + subtitles[lang_id] = [] + subtitles[lang_id].append({ + 'url': translation['vttPath'], + 'ext': 'vtt', + }) + + return { + 'id': video_id, + 'formats': formats, + 'title': player_params['title'], + 'description': self._og_search_description(webpage), + 'duration': duration, + 'subtitles': subtitles, + } From 4e3357717312ac56145ba166a1ae2806f6db8337 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 19:16:59 +0800 Subject: [PATCH 16/29] [utils] Support ttaf1 namespace in TTML It's found in bbc.co.uk. See #6038 --- youtube_dl/utils.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 259a9d634..a2746b2d1 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1841,7 +1841,10 @@ def srt_subtitles_timecode(seconds): def dfxp2srt(dfxp_data): - _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'}) + _x = functools.partial(xpath_with_ns, ns_map={ + 'ttml': 'http://www.w3.org/ns/ttml', + 'ttaf1': 'http://www.w3.org/2006/10/ttaf1', + }) def parse_node(node): str_or_empty = functools.partial(str_or_none, default='') @@ -1849,9 +1852,9 @@ def dfxp2srt(dfxp_data): out = str_or_empty(node.text) for child in node: - if child.tag in (_x('ttml:br'), 'br'): + if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'): out += '\n' + str_or_empty(child.tail) - elif child.tag in (_x('ttml:span'), 'span'): + elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'): out += str_or_empty(parse_node(child)) else: out += str_or_empty(xml.etree.ElementTree.tostring(child)) @@ -1860,7 +1863,7 @@ def dfxp2srt(dfxp_data): dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8')) out = [] - paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') + paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall('.//p') if not paras: raise ValueError('Invalid dfxp/TTML subtitle') From 78294e6a9ce2c9a294d663ac79936df7353b9980 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 19:22:26 +0800 Subject: [PATCH 17/29] [bbccouk] Remove TTML to srt conversion codes It's broken. See #6038 --- youtube_dl/extractor/bbccouk.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/youtube_dl/extractor/bbccouk.py b/youtube_dl/extractor/bbccouk.py index 0305f88b5..5825d2867 100644 --- a/youtube_dl/extractor/bbccouk.py +++ b/youtube_dl/extractor/bbccouk.py @@ -251,26 +251,11 @@ class BBCCoUkIE(InfoExtractor): for connection in self._extract_connections(media): captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions') lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en') - ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}')) - srt = '' - - def _extract_text(p): - if p.text is not None: - stripped_text = p.text.strip() - if stripped_text: - return stripped_text - return ' '.join(span.text.strip() for span in p.findall('{http://www.w3.org/2006/10/ttaf1}span')) - for pos, p in enumerate(ps): - srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'), _extract_text(p)) subtitles[lang] = [ { 'url': connection.get('href'), 'ext': 'ttml', }, - { - 'data': srt, - 'ext': 'srt', - }, ] return subtitles From 756f574e4e7160ca5b39c6e18ec5168beb4a8eb1 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 21:30:34 +0800 Subject: [PATCH 18/29] [dailymotion/generic] Add DailymotionCloudIE --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/dailymotion.py | 42 +++++++++++++++++++++++++++++ youtube_dl/extractor/generic.py | 17 ++++++++++++ 3 files changed, 60 insertions(+) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 0f4af88f0..bd3c3193f 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -106,6 +106,7 @@ from .dailymotion import ( DailymotionIE, DailymotionPlaylistIE, DailymotionUserIE, + DailymotionCloudIE, ) from .daum import DaumIE from .dbtv import DBTVIE diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 70aa4333c..96f0ed9ad 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -251,3 +251,45 @@ class DailymotionUserIE(DailymotionPlaylistIE): 'title': full_user, 'entries': self._extract_entries(user), } + + +class DailymotionCloudIE(DailymotionBaseInfoExtractor): + _VALID_URL = r'http://api\.dmcloud\.net/embed/[^/]+/(?P[^/?]+)' + + _TEST = { + # From http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html + # Tested at FranceTvInfo_2 + 'url': 'http://api.dmcloud.net/embed/4e7343f894a6f677b10006b4/556e03339473995ee145930c?auth=1464865870-0-jyhsm84b-ead4c701fb750cf9367bf4447167a3db&autoplay=1', + 'only_matching': True, + } + + @classmethod + def _extract_dmcloud_url(self, webpage): + mobj = re.search(r']+src=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage) + if mobj: + return mobj.group(1) + + mobj = re.search(r']+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=[\'"](http://api\.dmcloud\.net/embed/[^/]+/[^\'"]+)[\'"]', webpage) + if mobj: + return mobj.group(1) + + def _real_extract(self, url): + video_id = self._match_id(url) + + request = self._build_request(url) + webpage = self._download_webpage(request, video_id) + + title = self._html_search_regex(r'([^>]+)', webpage, 'title') + + video_info = self._parse_json(self._search_regex( + r'var\s+info\s*=\s*([^;]+);', webpage, 'video info'), video_id) + + # TODO: parse ios_url, which is in fact a manifest + video_url = video_info['mp4_url'] + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': video_info.get('thumbnail_url'), + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index bf689f531..07939b196 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -44,6 +44,7 @@ from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE from .vimeo import VimeoIE +from .dailymotion import DailymotionCloudIE class GenericIE(InfoExtractor): @@ -813,6 +814,17 @@ class GenericIE(InfoExtractor): 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', 'uploader': 'Rogers Sportsnet', }, + }, + # Dailymotion Cloud video + { + 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', + 'md5': '49444254273501a64675a7e68c502681', + 'info_dict': { + 'id': '5585de919473990de4bee11b', + 'ext': 'mp4', + 'title': 'Le débat', + 'thumbnail': 're:^https?://.*\.jpe?g$', + } } ] @@ -1486,6 +1498,11 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') + # Look for Dailymotion Cloud videos + dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) + if dmcloud_url: + return self.url_result(dmcloud_url, 'DailymotionCloud') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From 6f96e308d0fa7674ac88e1e80fc602413f9a6b31 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 21:31:33 +0800 Subject: [PATCH 19/29] [francetvinfo.fr] Support dmcloud embeds (fixes #6034) --- youtube_dl/extractor/francetv.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index db0bbec1e..b2c984bf2 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -18,6 +18,7 @@ from ..utils import ( parse_duration, determine_ext, ) +from .dailymotion import DailymotionCloudIE class FranceTVBaseInfoExtractor(InfoExtractor): @@ -131,12 +132,26 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'skip_download': 'HLS (reqires ffmpeg)' }, 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', + }, { + 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', + 'md5': 'f485bda6e185e7d15dbc69b72bae993e', + 'info_dict': { + 'id': '556e03339473995ee145930c', + 'ext': 'mp4', + 'title': 'Les entreprises familiales : le secret de la réussite', + 'thumbnail': 're:^https?://.*\.jpe?g$', + } }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) page_title = mobj.group('title') webpage = self._download_webpage(url, page_title) + + dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) + if dmcloud_url: + return self.url_result(dmcloud_url, 'DailymotionCloud') + video_id, catalogue = self._search_regex( r'id-video=([^@]+@[^"]+)', webpage, 'video id').split('@') return self._extract_video(video_id, catalogue) From 3f3308cd75fc068e4d67d00aa7d7892e02ab16e9 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Sun, 21 Jun 2015 23:29:40 +0800 Subject: [PATCH 20/29] Revert "[sohu] Update extractor" This reverts commit 32060c6d6b618fa858b2ce43db34d02fd43bc542. --- youtube_dl/extractor/sohu.py | 44 ++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 29bd9ce6f..7644cc02d 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,7 +8,10 @@ from ..compat import ( compat_str, compat_urllib_request ) -from ..utils import ExtractorError +from ..utils import ( + sanitize_url_path_consecutive_slashes, + ExtractorError, +) class SohuIE(InfoExtractor): @@ -26,7 +29,7 @@ class SohuIE(InfoExtractor): 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': 'ac9a5d322b4bf9ae184d53e4711e4f1a', + 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -34,7 +37,7 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '49308ff6dafde5ece51137d04aec311e', + 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -48,7 +51,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': '492923eac023ba2f13ff69617c32754a', + 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -56,7 +59,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': 'de604848c0e8e9c4a4dde7e1347c0637', + 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -64,7 +67,7 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '93584716ee0657c0b205b8aa3d27aa13', + 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', @@ -139,21 +142,24 @@ class SohuIE(InfoExtractor): for i in range(part_count): formats = [] for format_id, format_data in formats_json.items(): + allot = format_data['allot'] + prot = format_data['prot'] + data = format_data['data'] + clips_url = data['clipsURL'] + su = data['su'] + + part_str = self._download_webpage( + 'http://%s/?prot=%s&file=%s&new=%s' % + (allot, prot, clips_url[i], su[i]), + video_id, + 'Downloading %s video URL part %d of %d' + % (format_id, i + 1, part_count)) + + part_info = part_str.split('|') - # URLs starts with http://newflv.sohu.ccgslb.net/ is not usable - # so retry until got a working URL - video_url = 'newflv.sohu.ccgslb.net' - retries = 0 - while 'newflv.sohu.ccgslb.net' in video_url and retries < 5: - download_note = 'Download information from CDN gateway for format ' + format_id - if retries > 0: - download_note += ' (retry #%d)' % retries - retries += 1 - cdn_info = self._download_json( - 'http://data.vod.itc.cn/cdnList?new=' + data['su'][i], - video_id, download_note) - video_url = cdn_info['url'] + video_url = sanitize_url_path_consecutive_slashes( + '%s%s?key=%s' % (part_info[0], su[i], part_info[3])) formats.append({ 'url': video_url, From 98ca102441624c2a1a66114c32e0142899f78dd3 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 22 Jun 2015 00:59:55 +0800 Subject: [PATCH 21/29] [sohu] Fix extraction again --- youtube_dl/extractor/sohu.py | 42 +++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 7644cc02d..ba2d5e19b 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -6,10 +6,10 @@ import re from .common import InfoExtractor from ..compat import ( compat_str, - compat_urllib_request + compat_urllib_request, + compat_urllib_parse, ) from ..utils import ( - sanitize_url_path_consecutive_slashes, ExtractorError, ) @@ -143,23 +143,41 @@ class SohuIE(InfoExtractor): formats = [] for format_id, format_data in formats_json.items(): allot = format_data['allot'] - prot = format_data['prot'] data = format_data['data'] clips_url = data['clipsURL'] su = data['su'] - part_str = self._download_webpage( - 'http://%s/?prot=%s&file=%s&new=%s' % - (allot, prot, clips_url[i], su[i]), - video_id, - 'Downloading %s video URL part %d of %d' - % (format_id, i + 1, part_count)) + video_url = 'newflv.sohu.ccgslb.net' + cdnId = None + retries = 0 - part_info = part_str.split('|') + while 'newflv.sohu.ccgslb.net' in video_url: + params = { + 'prot': 9, + 'file': clips_url[i], + 'new': su[i], + 'prod': 'flash', + } - video_url = sanitize_url_path_consecutive_slashes( - '%s%s?key=%s' % (part_info[0], su[i], part_info[3])) + if cdnId is not None: + params['idc'] = cdnId + + download_note = 'Downloading %s video URL part %d of %d' % ( + format_id, i + 1, part_count) + + if retries > 0: + download_note += ' (retry #%d)' % retries + part_info = self._parse_json(self._download_webpage( + 'http://%s/?%s' % (allot, compat_urllib_parse.urlencode(params)), + video_id, download_note), video_id) + + video_url = part_info['url'] + cdnId = part_info.get('nid') + + retries += 1 + if retries > 5: + raise ExtractorError('Failed to get video URL') formats.append({ 'url': video_url, From 0bbba43ed0c68b612fcafbdad460a93b733b9f87 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Jun 2015 23:10:38 +0600 Subject: [PATCH 22/29] [xhamster:embed] Add extractor (Closes #6032) --- youtube_dl/extractor/__init__.py | 5 ++++- youtube_dl/extractor/xhamster.py | 28 +++++++++++++++++++++++++++- 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index bd3c3193f..dc1a302e6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -701,7 +701,10 @@ from .wrzuta import WrzutaIE from .wsj import WSJIE from .xbef import XBefIE from .xboxclips import XboxClipsIE -from .xhamster import XHamsterIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, +) from .xminus import XMinusIE from .xnxx import XNXXIE from .xstream import XstreamIE diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 4527567f8..ae3c5962b 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -13,7 +13,6 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - """Information Extractor for xHamster""" _VALID_URL = r'(?Phttps?)://(?:.+?\.)?xhamster\.com/movies/(?P[0-9]+)/(?P.+?)\.html(?:\?.*)?' _TESTS = [ { @@ -133,3 +132,30 @@ class XHamsterIE(InfoExtractor): 'age_limit': age_limit, 'formats': formats, } + + +class XHamsterEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?xhamster\.com/xembed\.php\?video=(?P\d+)' + _TEST = { + 'url': 'http://xhamster.com/xembed.php?video=3328539', + 'info_dict': { + 'id': '3328539', + 'ext': 'mp4', + 'title': 'Pen Masturbation', + 'upload_date': '20140728', + 'uploader_id': 'anonymous', + 'duration': 5, + 'age_limit': 18, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, + webpage, 'xhamster url') + + return self.url_result(video_url, 'XHamster'); From 2bb5b6d0a1671957c7a2e6d6433901b2a1b8f48f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Jun 2015 23:11:25 +0600 Subject: [PATCH 23/29] [generic] Add support for xhamster embeds --- youtube_dl/extractor/generic.py | 6 ++++++ youtube_dl/extractor/xhamster.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 07939b196..e108bde66 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -43,6 +43,7 @@ from .senateisvp import SenateISVPIE from .bliptv import BlipTVIE from .svt import SVTIE from .pornhub import PornHubIE +from .xhamster import XHamsterEmbedIE from .vimeo import VimeoIE from .dailymotion import DailymotionCloudIE @@ -1331,6 +1332,11 @@ class GenericIE(InfoExtractor): if pornhub_url: return self.url_result(pornhub_url, 'PornHub') + # Look for embedded XHamster player + xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) + if xhamster_urls: + return _playlist_from_matches(xhamster_urls, ie='XHamsterEmbed') + # Look for embedded Tvigle player mobj = re.search( r']+?src=(["\'])(?P(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index ae3c5962b..725e01ac3 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -149,6 +149,12 @@ class XHamsterEmbedIE(InfoExtractor): } } + @staticmethod + def _extract_urls(webpage): + return [url for _, url in re.findall( + r']+?src=(["\'])(?P(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) From c76799c5553b3b48bb7cc73dec452c3637a8670a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sun, 21 Jun 2015 23:18:28 +0600 Subject: [PATCH 24/29] [extractor/generic] Add test for xhamster embed --- youtube_dl/extractor/generic.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index e108bde66..20fcd8170 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -336,6 +336,15 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + # XHamster embed + { + 'url': 'http://www.numisc.com/forum/showthread.php?11696-FM15-which-pumiscer-was-this-%28-vid-%29-%28-alfa-as-fuck-srx-%29&s=711f5db534502e22260dec8c5e2d66d8', + 'info_dict': { + 'id': 'showthread', + 'title': '[NSFL] [FM15] which pumiscer was this ( vid ) ( alfa as fuck srx )', + }, + 'playlist_mincount': 7, + }, # Embedded TED video { 'url': 'http://en.support.wordpress.com/videos/ted-talks/', From a5158f38a31e863a39de8f66c26469a5d4469280 Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Mon, 22 Jun 2015 15:02:53 +0800 Subject: [PATCH 25/29] [generic/adobetv] Support AdobeTVVideo embeds (#6039) --- youtube_dl/extractor/adobetv.py | 1 + youtube_dl/extractor/generic.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py index 695a4a15c..5e43adc51 100644 --- a/youtube_dl/extractor/adobetv.py +++ b/youtube_dl/extractor/adobetv.py @@ -77,6 +77,7 @@ class AdobeTVVideoIE(InfoExtractor): _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P\d+)' _TEST = { + # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners 'url': 'https://video.tv.adobe.com/v/2456/', 'md5': '43662b577c018ad707a63766462b1e87', 'info_dict': { diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 20fcd8170..5c03fddc6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -835,6 +835,18 @@ class GenericIE(InfoExtractor): 'title': 'Le débat', 'thumbnail': 're:^https?://.*\.jpe?g$', } + }, + # AdobeTVVideo embed + { + 'url': 'https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners', + 'md5': '43662b577c018ad707a63766462b1e87', + 'info_dict': { + 'id': '2456', + 'ext': 'mp4', + 'title': 'New experience with Acrobat DC', + 'description': 'New experience with Acrobat DC', + 'duration': 248.667, + }, } ] @@ -1518,6 +1530,15 @@ class GenericIE(InfoExtractor): if dmcloud_url: return self.url_result(dmcloud_url, 'DailymotionCloud') + # Look for AdobeTVVideo embeds + mobj = re.search( + r']+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), + 'AdobeTVVideo') + def check_video(vurl): if YoutubeIE.suitable(vurl): return True From 25701d5a2ca8c7a58c91e11c6a30d4e61b02e89c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Mon, 22 Jun 2015 11:18:52 +0200 Subject: [PATCH 26/29] [xhamster] pep8: remove trailing ';' --- youtube_dl/extractor/xhamster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 725e01ac3..b4ad513a0 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -164,4 +164,4 @@ class XHamsterEmbedIE(InfoExtractor): r'href="(https?://xhamster\.com/movies/%s/[^"]+\.html[^"]*)"' % video_id, webpage, 'xhamster url') - return self.url_result(video_url, 'XHamster'); + return self.url_result(video_url, 'XHamster') From 255f5694aabe07fa7f33978c6b97ced469e172db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Mon, 22 Jun 2015 20:11:15 +0600 Subject: [PATCH 27/29] [faz] Extend _VALID_URL (Closes #6050) --- youtube_dl/extractor/faz.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/faz.py b/youtube_dl/extractor/faz.py index 3c39ca451..cebdd0193 100644 --- a/youtube_dl/extractor/faz.py +++ b/youtube_dl/extractor/faz.py @@ -6,9 +6,9 @@ from .common import InfoExtractor class FazIE(InfoExtractor): IE_NAME = 'faz.net' - _VALID_URL = r'https?://www\.faz\.net/multimedia/videos/.*?-(?P\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?faz\.net/(?:[^/]+/)*.*?-(?P\d+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.faz.net/multimedia/videos/stockholm-chemie-nobelpreis-fuer-drei-amerikanische-forscher-12610585.html', 'info_dict': { 'id': '12610585', @@ -16,7 +16,22 @@ class FazIE(InfoExtractor): 'title': 'Stockholm: Chemie-Nobelpreis für drei amerikanische Forscher', 'description': 'md5:1453fbf9a0d041d985a47306192ea253', }, - } + }, { + 'url': 'http://www.faz.net/aktuell/politik/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/berlin-gabriel-besteht-zerreissprobe-ueber-datenspeicherung-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/aktuell/politik/-13659345.html', + 'only_matching': True, + }, { + 'url': 'http://www.faz.net/foobarblafasel-13659345.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) From e20d0c1e69f66a82dd493680351538ea92393fd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Tue, 23 Jun 2015 21:34:29 +0600 Subject: [PATCH 28/29] [brightcove] Use `compat_xml_parse_error` (Closes #6060) --- youtube_dl/extractor/brightcove.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d768f99e6..4721c2293 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -13,6 +13,7 @@ from ..compat import ( compat_urllib_parse_urlparse, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, ) from ..utils import ( determine_ext, @@ -119,7 +120,7 @@ class BrightcoveIE(InfoExtractor): try: object_doc = xml.etree.ElementTree.fromstring(object_str.encode('utf-8')) - except xml.etree.ElementTree.ParseError: + except compat_xml_parse_error: return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') From 18b5e1e5348ba3a6d1b6a98e97217eebb3d32a1e Mon Sep 17 00:00:00 2001 From: Yen Chi Hsuan Date: Wed, 24 Jun 2015 16:00:12 +0800 Subject: [PATCH 29/29] [drbonanza] Fix extraction of videos --- youtube_dl/extractor/drbonanza.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/drbonanza.py b/youtube_dl/extractor/drbonanza.py index 7626219ba..8b98b013a 100644 --- a/youtube_dl/extractor/drbonanza.py +++ b/youtube_dl/extractor/drbonanza.py @@ -15,7 +15,6 @@ class DRBonanzaIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.dr.dk/bonanza/serie/portraetter/Talkshowet.htm?assetId=65517', - 'md5': 'fe330252ddea607635cf2eb2c99a0af3', 'info_dict': { 'id': '65517', 'ext': 'mp4', @@ -26,6 +25,9 @@ class DRBonanzaIE(InfoExtractor): 'upload_date': '20110120', 'duration': 3664, }, + 'params': { + 'skip_download': True, # requires rtmp + }, }, { 'url': 'http://www.dr.dk/bonanza/radio/serie/sport/fodbold.htm?assetId=59410', 'md5': '6dfe039417e76795fb783c52da3de11d', @@ -93,6 +95,11 @@ class DRBonanzaIE(InfoExtractor): 'format_id': file['Type'].replace('Video', ''), 'preference': preferencemap.get(file['Type'], -10), }) + if format['url'].startswith('rtmp'): + rtmp_url = format['url'] + format['rtmp_live'] = True # --resume does not work + if '/bonanza/' in rtmp_url: + format['play_path'] = rtmp_url.split('/bonanza/')[1] formats.append(format) elif file['Type'] == "Thumb": thumbnail = file['Location'] @@ -111,9 +118,6 @@ class DRBonanzaIE(InfoExtractor): description = '%s\n%s\n%s\n' % ( info['Description'], info['Actors'], info['Colophon']) - for f in formats: - f['url'] = f['url'].replace('rtmp://vod-bonanza.gss.dr.dk/bonanza/', 'http://vodfiles.dr.dk/') - f['url'] = f['url'].replace('mp4:bonanza', 'bonanza') self._sort_formats(formats) display_id = re.sub(r'[^\w\d-]', '', re.sub(r' ', '-', title.lower())) + '-' + asset_id