You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

319 lines
12 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from .gigya import GigyaBaseIE
  6. from ..compat import compat_HTTPError
  7. from ..utils import (
  8. ExtractorError,
  9. strip_or_none,
  10. float_or_none,
  11. int_or_none,
  12. merge_dicts,
  13. parse_iso8601,
  14. )
  15. class CanvasIE(InfoExtractor):
  16. _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
  17. _TESTS = [{
  18. 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  19. 'md5': '90139b746a0a9bd7bb631283f6e2a64e',
  20. 'info_dict': {
  21. 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  22. 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  23. 'ext': 'flv',
  24. 'title': 'Nachtwacht: De Greystook',
  25. 'description': 'md5:1db3f5dc4c7109c821261e7512975be7',
  26. 'thumbnail': r're:^https?://.*\.jpg$',
  27. 'duration': 1468.03,
  28. },
  29. 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
  30. }, {
  31. 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  32. 'only_matching': True,
  33. }]
  34. _HLS_ENTRY_PROTOCOLS_MAP = {
  35. 'HLS': 'm3u8_native',
  36. 'HLS_AES': 'm3u8',
  37. }
  38. def _real_extract(self, url):
  39. mobj = re.match(self._VALID_URL, url)
  40. site_id, video_id = mobj.group('site_id'), mobj.group('id')
  41. data = self._download_json(
  42. 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  43. % (site_id, video_id), video_id)
  44. title = data['title']
  45. description = data.get('description')
  46. formats = []
  47. for target in data['targetUrls']:
  48. format_url, format_type = target.get('url'), target.get('type')
  49. if not format_url or not format_type:
  50. continue
  51. if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  52. formats.extend(self._extract_m3u8_formats(
  53. format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  54. m3u8_id=format_type, fatal=False))
  55. elif format_type == 'HDS':
  56. formats.extend(self._extract_f4m_formats(
  57. format_url, video_id, f4m_id=format_type, fatal=False))
  58. elif format_type == 'MPEG_DASH':
  59. formats.extend(self._extract_mpd_formats(
  60. format_url, video_id, mpd_id=format_type, fatal=False))
  61. elif format_type == 'HSS':
  62. formats.extend(self._extract_ism_formats(
  63. format_url, video_id, ism_id='mss', fatal=False))
  64. else:
  65. formats.append({
  66. 'format_id': format_type,
  67. 'url': format_url,
  68. })
  69. self._sort_formats(formats)
  70. subtitles = {}
  71. subtitle_urls = data.get('subtitleUrls')
  72. if isinstance(subtitle_urls, list):
  73. for subtitle in subtitle_urls:
  74. subtitle_url = subtitle.get('url')
  75. if subtitle_url and subtitle.get('type') == 'CLOSED':
  76. subtitles.setdefault('nl', []).append({'url': subtitle_url})
  77. return {
  78. 'id': video_id,
  79. 'display_id': video_id,
  80. 'title': title,
  81. 'description': description,
  82. 'formats': formats,
  83. 'duration': float_or_none(data.get('duration'), 1000),
  84. 'thumbnail': data.get('posterImageUrl'),
  85. 'subtitles': subtitles,
  86. }
  87. class CanvasEenIE(InfoExtractor):
  88. IE_DESC = 'canvas.be and een.be'
  89. _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  90. _TESTS = [{
  91. 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
  92. 'md5': 'ed66976748d12350b118455979cca293',
  93. 'info_dict': {
  94. 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  95. 'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
  96. 'ext': 'flv',
  97. 'title': 'De afspraak veilt voor de Warmste Week',
  98. 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
  99. 'thumbnail': r're:^https?://.*\.jpg$',
  100. 'duration': 49.02,
  101. },
  102. 'expected_warnings': ['is not a supported codec'],
  103. }, {
  104. # with subtitles
  105. 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
  106. 'info_dict': {
  107. 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
  108. 'display_id': 'pieter-0167',
  109. 'ext': 'mp4',
  110. 'title': 'Pieter 0167',
  111. 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
  112. 'thumbnail': r're:^https?://.*\.jpg$',
  113. 'duration': 2553.08,
  114. 'subtitles': {
  115. 'nl': [{
  116. 'ext': 'vtt',
  117. }],
  118. },
  119. },
  120. 'params': {
  121. 'skip_download': True,
  122. },
  123. 'skip': 'Pagina niet gevonden',
  124. }, {
  125. 'url': 'https://www.een.be/sorry-voor-alles/herbekijk-sorry-voor-alles',
  126. 'info_dict': {
  127. 'id': 'mz-ast-11a587f8-b921-4266-82e2-0bce3e80d07f',
  128. 'display_id': 'herbekijk-sorry-voor-alles',
  129. 'ext': 'mp4',
  130. 'title': 'Herbekijk Sorry voor alles',
  131. 'description': 'md5:8bb2805df8164e5eb95d6a7a29dc0dd3',
  132. 'thumbnail': r're:^https?://.*\.jpg$',
  133. 'duration': 3788.06,
  134. },
  135. 'params': {
  136. 'skip_download': True,
  137. },
  138. 'skip': 'Episode no longer available',
  139. }, {
  140. 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
  141. 'only_matching': True,
  142. }]
  143. def _real_extract(self, url):
  144. mobj = re.match(self._VALID_URL, url)
  145. site_id, display_id = mobj.group('site_id'), mobj.group('id')
  146. webpage = self._download_webpage(url, display_id)
  147. title = strip_or_none(self._search_regex(
  148. r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
  149. webpage, 'title', default=None) or self._og_search_title(
  150. webpage, default=None))
  151. video_id = self._html_search_regex(
  152. r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
  153. group='id')
  154. return {
  155. '_type': 'url_transparent',
  156. 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
  157. 'ie_key': CanvasIE.ie_key(),
  158. 'id': video_id,
  159. 'display_id': display_id,
  160. 'title': title,
  161. 'description': self._og_search_description(webpage),
  162. }
  163. class VrtNUIE(GigyaBaseIE):
  164. IE_DESC = 'VrtNU.be'
  165. _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  166. _TESTS = [{
  167. 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
  168. 'info_dict': {
  169. 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
  170. 'ext': 'flv',
  171. 'title': 'De zwarte weduwe',
  172. 'description': 'md5:d90c21dced7db869a85db89a623998d4',
  173. 'duration': 1457.04,
  174. 'thumbnail': r're:^https?://.*\.jpg$',
  175. 'season': '1',
  176. 'season_number': 1,
  177. 'episode_number': 1,
  178. },
  179. 'skip': 'This video is only available for registered users'
  180. }]
  181. _NETRC_MACHINE = 'vrtnu'
  182. _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
  183. _CONTEXT_ID = 'R3595707040'
  184. def _real_initialize(self):
  185. self._login()
  186. def _login(self):
  187. username, password = self._get_login_info()
  188. if username is None:
  189. return
  190. auth_data = {
  191. 'APIKey': self._APIKEY,
  192. 'targetEnv': 'jssdk',
  193. 'loginID': username,
  194. 'password': password,
  195. 'authMode': 'cookie',
  196. }
  197. auth_info = self._gigya_login(auth_data)
  198. # Sometimes authentication fails for no good reason, retry
  199. login_attempt = 1
  200. while login_attempt <= 3:
  201. try:
  202. # When requesting a token, no actual token is returned, but the
  203. # necessary cookies are set.
  204. self._request_webpage(
  205. 'https://token.vrt.be',
  206. None, note='Requesting a token', errnote='Could not get a token',
  207. headers={
  208. 'Content-Type': 'application/json',
  209. 'Referer': 'https://www.vrt.be/vrtnu/',
  210. },
  211. data=json.dumps({
  212. 'uid': auth_info['UID'],
  213. 'uidsig': auth_info['UIDSignature'],
  214. 'ts': auth_info['signatureTimestamp'],
  215. 'email': auth_info['profile']['email'],
  216. }).encode('utf-8'))
  217. except ExtractorError as e:
  218. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  219. login_attempt += 1
  220. self.report_warning('Authentication failed')
  221. self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
  222. else:
  223. raise e
  224. else:
  225. break
  226. def _real_extract(self, url):
  227. display_id = self._match_id(url)
  228. webpage, urlh = self._download_webpage_handle(url, display_id)
  229. info = self._search_json_ld(webpage, display_id, default={})
  230. # title is optional here since it may be extracted by extractor
  231. # that is delegated from here
  232. title = strip_or_none(self._html_search_regex(
  233. r'(?ms)<h1 class="content__heading">(.+?)</h1>',
  234. webpage, 'title', default=None))
  235. description = self._html_search_regex(
  236. r'(?ms)<div class="content__description">(.+?)</div>',
  237. webpage, 'description', default=None)
  238. season = self._html_search_regex(
  239. [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
  240. <span>seizoen\ (.+?)</span>\s*
  241. </div>''',
  242. r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
  243. webpage, 'season', default=None)
  244. season_number = int_or_none(season)
  245. episode_number = int_or_none(self._html_search_regex(
  246. r'''(?xms)<div\ class="content__episode">\s*
  247. <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
  248. </div>''',
  249. webpage, 'episode_number', default=None))
  250. release_date = parse_iso8601(self._html_search_regex(
  251. r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
  252. webpage, 'release_date', default=None))
  253. # If there's a ? or a # in the URL, remove them and everything after
  254. clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
  255. securevideo_url = clean_url + '.mssecurevideo.json'
  256. try:
  257. video = self._download_json(securevideo_url, display_id)
  258. except ExtractorError as e:
  259. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  260. self.raise_login_required()
  261. raise
  262. # We are dealing with a '../<show>.relevant' URL
  263. redirect_url = video.get('url')
  264. if redirect_url:
  265. return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
  266. # There is only one entry, but with an unknown key, so just get
  267. # the first one
  268. video_id = list(video.values())[0].get('videoid')
  269. return merge_dicts(info, {
  270. '_type': 'url_transparent',
  271. 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
  272. 'ie_key': CanvasIE.ie_key(),
  273. 'id': video_id,
  274. 'display_id': display_id,
  275. 'title': title,
  276. 'description': description,
  277. 'season': season,
  278. 'season_number': season_number,
  279. 'episode_number': episode_number,
  280. 'release_date': release_date,
  281. })