From 2ca7ed41fed73cf37581b07d0c67d3bad8a6acc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergey=20M=E2=80=A4?= Date: Sat, 30 Dec 2017 07:28:18 +0700 Subject: [PATCH] [mediasite] Improve extraction and code style, add support for DASH (closes #11185, closes #14343, refs #5428) --- youtube_dl/extractor/generic.py | 29 +++++-- youtube_dl/extractor/mediasite.py | 128 ++++++++++++++++++------------ 2 files changed, 100 insertions(+), 57 deletions(-) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index d5622c823..cc4c90b8c 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -100,6 +100,7 @@ from .megaphone import MegaphoneIE from .vzaar import VzaarIE from .channel9 import Channel9IE from .vshare import VShareIE +from .mediasite import MediasiteIE class GenericIE(InfoExtractor): @@ -1925,6 +1926,18 @@ class GenericIE(InfoExtractor): 'title': 'vl14062007715967', 'ext': 'mp4', } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } } # { # # TODO: find another test @@ -2884,14 +2897,14 @@ class GenericIE(InfoExtractor): vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) # Look for Mediasite embeds - mobj = re.search(r'''(?xi) - ]+src="((?:https?://[a-z0-9\-\.:\[\]]+)? - /Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)" - ''', webpage) - if mobj is not None: - return self.url_result(smuggle_url( - compat_urlparse.urljoin(url, unescapeHTML(mobj.group(1))), - { 'UrlReferrer': url }), 'Livestream') + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) def merge_dicts(dict1, dict2): merged = {} diff --git a/youtube_dl/extractor/mediasite.py b/youtube_dl/extractor/mediasite.py index 5d281684e..0e2645c55 100644 --- a/youtube_dl/extractor/mediasite.py +++ b/youtube_dl/extractor/mediasite.py @@ -5,21 +5,22 @@ import re import json from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, - unsmuggle_url, - mimetype2ext, float_or_none, + mimetype2ext, + unescapeHTML, + unsmuggle_url, + urljoin, ) class MediasiteIE(InfoExtractor): - _VALID_URL = r'''(?xi) - https?://[a-z0-9\-\.:\[\]]+/Mediasite/Play/ - (?P[0-9a-f]{32,34}) - (?P\?[^#]+|) - ''' + _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/Play/(?P[0-9a-f]{32,34})(?P\?[^#]+|)' _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -87,67 +88,96 @@ class MediasiteIE(InfoExtractor): # look in Mediasite.Core.js (Mediasite.ContentStreamType[*]) _STREAM_TYPES = { - 0: 'video1', # the main video + 0: 'video1', # the main video 2: 'slide', 3: 'presentation', - 4: 'video2', # screencast? + 4: 'video2', # screencast? 5: 'video3', } + @staticmethod + def _extract_urls(webpage): + return [ + unescapeHTML(mobj.group('url')) + for mobj in re.finditer( + r'(?xi)]+\bsrc=(["\'])(?P(?:(?:https?:)?//[^/]+)?/Mediasite/Play/[0-9a-f]{32,34}(?:\?.*?)?)\1', + webpage)] + def _real_extract(self, url): url, data = unsmuggle_url(url, {}) mobj = re.match(self._VALID_URL, url) - ResourceId = mobj.group('id') - QueryString = mobj.group('QueryString') + resource_id = mobj.group('id') + query = mobj.group('query') - webpage = self._download_webpage(url, ResourceId) # XXX: add UrlReferrer? + webpage, urlh = self._download_webpage_handle(url, resource_id) # XXX: add UrlReferrer? + redirect_url = compat_str(urlh.geturl()) # XXX: might have also extracted UrlReferrer and QueryString from the html - ServicePath = compat_urlparse.urljoin(url, self._html_search_regex( - r'
(.+?)
', webpage, ResourceId, + service_path = compat_urlparse.urljoin(redirect_url, self._html_search_regex( + r']+\bid=["\']ServicePath[^>]+>(.+?)', webpage, resource_id, default='/Mediasite/PlayerService/PlayerService.svc/json')) - PlayerOptions = self._download_json( - '%s/GetPlayerOptions' % (ServicePath), ResourceId, + player_options = self._download_json( + '%s/GetPlayerOptions' % service_path, resource_id, headers={ 'Content-type': 'application/json; charset=utf-8', 'X-Requested-With': 'XMLHttpRequest', }, data=json.dumps({ 'getPlayerOptionsRequest': { - 'ResourceId': ResourceId, - 'QueryString': QueryString, + 'ResourceId': resource_id, + 'QueryString': query, 'UrlReferrer': data.get('UrlReferrer', ''), 'UseScreenReader': False, } - }).encode('utf-8')) - Presentation = PlayerOptions['d']['Presentation'] - if Presentation is None: - raise ExtractorError('Mediasite says: %s' % - (PlayerOptions['d']['PlayerPresentationStatusMessage'],), + }).encode('utf-8'))['d'] + + presentation = player_options['Presentation'] + title = presentation['Title'] + + if presentation is None: + raise ExtractorError( + 'Mediasite says: %s' % player_options['PlayerPresentationStatusMessage'], expected=True) thumbnails = [] formats = [] - for snum, Stream in enumerate(Presentation['Streams']): - stream_type = self._STREAM_TYPES.get( - Stream['StreamType'], 'type%u' % Stream['StreamType']) + for snum, Stream in enumerate(presentation['Streams']): + stream_type = Stream.get('StreamType') + if stream_type is None: + continue + + video_urls = Stream.get('VideoUrls') + if not isinstance(video_urls, list): + video_urls = [] + + stream_id = self._STREAM_TYPES.get( + stream_type, 'type%u' % stream_type) stream_formats = [] - for unum, VideoUrl in enumerate(Stream['VideoUrls']): - url = VideoUrl['Location'] + for unum, VideoUrl in enumerate(video_urls): + video_url = VideoUrl.get('Location') + if not video_url or not isinstance(video_url, compat_str): + continue # XXX: if Stream.get('CanChangeScheme', False), switch scheme to HTTP/HTTPS - if VideoUrl['MediaType'] == 'SS': + media_type = VideoUrl.get('MediaType') + if media_type == 'SS': stream_formats.extend(self._extract_ism_formats( - url, ResourceId, ism_id='%s-%u.%u' % (stream_type, snum, unum))) - continue - - stream_formats.append({ - 'format_id': '%s-%u.%u' % (stream_type, snum, unum), - 'url': url, - 'ext': mimetype2ext(VideoUrl['MimeType']), - }) + video_url, resource_id, + ism_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + elif media_type == 'Dash': + stream_formats.extend(self._extract_mpd_formats( + video_url, resource_id, + mpd_id='%s-%u.%u' % (stream_id, snum, unum), + fatal=False)) + else: + stream_formats.append({ + 'format_id': '%s-%u.%u' % (stream_id, snum, unum), + 'url': video_url, + 'ext': mimetype2ext(VideoUrl.get('MimeType')), + }) # TODO: if Stream['HasSlideContent']: # synthesise an MJPEG video stream '%s-%u.slides' % (stream_type, snum) @@ -155,16 +185,16 @@ class MediasiteIE(InfoExtractor): # this will require writing a custom downloader... # disprefer 'secondary' streams - if Stream['StreamType'] != 0: + if stream_type != 0: for fmt in stream_formats: fmt['preference'] = -1 - ThumbnailUrl = Stream.get('ThumbnailUrl') - if ThumbnailUrl: + thumbnail_url = Stream.get('ThumbnailUrl') + if thumbnail_url: thumbnails.append({ - 'id': '%s-%u' % (stream_type, snum), - 'url': compat_urlparse.urljoin(url, ThumbnailUrl), - 'preference': -1 if Stream['StreamType'] != 0 else 0, + 'id': '%s-%u' % (stream_id, snum), + 'url': urljoin(redirect_url, thumbnail_url), + 'preference': -1 if stream_type != 0 else 0, }) formats.extend(stream_formats) @@ -174,11 +204,11 @@ class MediasiteIE(InfoExtractor): # XXX: Presentation['Transcript'] return { - 'id': ResourceId, - 'title': Presentation['Title'], - 'description': Presentation.get('Description'), - 'duration': float_or_none(Presentation.get('Duration'), 1000), - 'timestamp': float_or_none(Presentation.get('UnixTime'), 1000), + 'id': resource_id, + 'title': title, + 'description': presentation.get('Description'), + 'duration': float_or_none(presentation.get('Duration'), 1000), + 'timestamp': float_or_none(presentation.get('UnixTime'), 1000), 'formats': formats, 'thumbnails': thumbnails, }