You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

368 lines
14 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from .gigya import GigyaBaseIE
  6. from ..compat import compat_HTTPError
  7. from ..utils import (
  8. ExtractorError,
  9. strip_or_none,
  10. float_or_none,
  11. int_or_none,
  12. merge_dicts,
  13. parse_iso8601,
  14. str_or_none,
  15. url_or_none,
  16. )
  17. class CanvasIE(InfoExtractor):
  18. _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrt(?:video|nieuws)|sporza)/assets/(?P<id>[^/?#&]+)'
  19. _TESTS = [{
  20. 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  21. 'md5': '68993eda72ef62386a15ea2cf3c93107',
  22. 'info_dict': {
  23. 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  24. 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475',
  25. 'ext': 'mp4',
  26. 'title': 'Nachtwacht: De Greystook',
  27. 'description': 'Nachtwacht: De Greystook',
  28. 'thumbnail': r're:^https?://.*\.jpg$',
  29. 'duration': 1468.04,
  30. },
  31. 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'],
  32. }, {
  33. 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  34. 'only_matching': True,
  35. }]
  36. _HLS_ENTRY_PROTOCOLS_MAP = {
  37. 'HLS': 'm3u8_native',
  38. 'HLS_AES': 'm3u8',
  39. }
  40. _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1'
  41. def _real_extract(self, url):
  42. mobj = re.match(self._VALID_URL, url)
  43. site_id, video_id = mobj.group('site_id'), mobj.group('id')
  44. # Old API endpoint, serves more formats but may fail for some videos
  45. data = self._download_json(
  46. 'https://mediazone.vrt.be/api/v1/%s/assets/%s'
  47. % (site_id, video_id), video_id, 'Downloading asset JSON',
  48. 'Unable to download asset JSON', fatal=False)
  49. # New API endpoint
  50. if not data:
  51. token = self._download_json(
  52. '%s/tokens' % self._REST_API_BASE, video_id,
  53. 'Downloading token', data=b'',
  54. headers={'Content-Type': 'application/json'})['vrtPlayerToken']
  55. data = self._download_json(
  56. '%s/videos/%s' % (self._REST_API_BASE, video_id),
  57. video_id, 'Downloading video JSON', fatal=False, query={
  58. 'vrtPlayerToken': token,
  59. 'client': '%s@PROD' % site_id,
  60. }, expected_status=400)
  61. message = data.get('message')
  62. if message and not data.get('title'):
  63. if data.get('code') == 'AUTHENTICATION_REQUIRED':
  64. self.raise_login_required(message)
  65. raise ExtractorError(message, expected=True)
  66. title = data['title']
  67. description = data.get('description')
  68. formats = []
  69. for target in data['targetUrls']:
  70. format_url, format_type = url_or_none(target.get('url')), str_or_none(target.get('type'))
  71. if not format_url or not format_type:
  72. continue
  73. format_type = format_type.upper()
  74. if format_type in self._HLS_ENTRY_PROTOCOLS_MAP:
  75. formats.extend(self._extract_m3u8_formats(
  76. format_url, video_id, 'mp4', self._HLS_ENTRY_PROTOCOLS_MAP[format_type],
  77. m3u8_id=format_type, fatal=False))
  78. elif format_type == 'HDS':
  79. formats.extend(self._extract_f4m_formats(
  80. format_url, video_id, f4m_id=format_type, fatal=False))
  81. elif format_type == 'MPEG_DASH':
  82. formats.extend(self._extract_mpd_formats(
  83. format_url, video_id, mpd_id=format_type, fatal=False))
  84. elif format_type == 'HSS':
  85. formats.extend(self._extract_ism_formats(
  86. format_url, video_id, ism_id='mss', fatal=False))
  87. else:
  88. formats.append({
  89. 'format_id': format_type,
  90. 'url': format_url,
  91. })
  92. self._sort_formats(formats)
  93. subtitles = {}
  94. subtitle_urls = data.get('subtitleUrls')
  95. if isinstance(subtitle_urls, list):
  96. for subtitle in subtitle_urls:
  97. subtitle_url = subtitle.get('url')
  98. if subtitle_url and subtitle.get('type') == 'CLOSED':
  99. subtitles.setdefault('nl', []).append({'url': subtitle_url})
  100. return {
  101. 'id': video_id,
  102. 'display_id': video_id,
  103. 'title': title,
  104. 'description': description,
  105. 'formats': formats,
  106. 'duration': float_or_none(data.get('duration'), 1000),
  107. 'thumbnail': data.get('posterImageUrl'),
  108. 'subtitles': subtitles,
  109. }
  110. class CanvasEenIE(InfoExtractor):
  111. IE_DESC = 'canvas.be and een.be'
  112. _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  113. _TESTS = [{
  114. 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
  115. 'md5': 'ed66976748d12350b118455979cca293',
  116. 'info_dict': {
  117. 'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
  118. 'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
  119. 'ext': 'flv',
  120. 'title': 'De afspraak veilt voor de Warmste Week',
  121. 'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
  122. 'thumbnail': r're:^https?://.*\.jpg$',
  123. 'duration': 49.02,
  124. },
  125. 'expected_warnings': ['is not a supported codec'],
  126. }, {
  127. # with subtitles
  128. 'url': 'http://www.canvas.be/video/panorama/2016/pieter-0167',
  129. 'info_dict': {
  130. 'id': 'mz-ast-5240ff21-2d30-4101-bba6-92b5ec67c625',
  131. 'display_id': 'pieter-0167',
  132. 'ext': 'mp4',
  133. 'title': 'Pieter 0167',
  134. 'description': 'md5:943cd30f48a5d29ba02c3a104dc4ec4e',
  135. 'thumbnail': r're:^https?://.*\.jpg$',
  136. 'duration': 2553.08,
  137. 'subtitles': {
  138. 'nl': [{
  139. 'ext': 'vtt',
  140. }],
  141. },
  142. },
  143. 'params': {
  144. 'skip_download': True,
  145. },
  146. 'skip': 'Pagina niet gevonden',
  147. }, {
  148. 'url': 'https://www.een.be/thuis/emma-pakt-thilly-aan',
  149. 'info_dict': {
  150. 'id': 'md-ast-3a24ced2-64d7-44fb-b4ed-ed1aafbf90b8',
  151. 'display_id': 'emma-pakt-thilly-aan',
  152. 'ext': 'mp4',
  153. 'title': 'Emma pakt Thilly aan',
  154. 'description': 'md5:c5c9b572388a99b2690030afa3f3bad7',
  155. 'thumbnail': r're:^https?://.*\.jpg$',
  156. 'duration': 118.24,
  157. },
  158. 'params': {
  159. 'skip_download': True,
  160. },
  161. 'expected_warnings': ['is not a supported codec'],
  162. }, {
  163. 'url': 'https://www.canvas.be/check-point/najaar-2016/de-politie-uw-vriend',
  164. 'only_matching': True,
  165. }]
  166. def _real_extract(self, url):
  167. mobj = re.match(self._VALID_URL, url)
  168. site_id, display_id = mobj.group('site_id'), mobj.group('id')
  169. webpage = self._download_webpage(url, display_id)
  170. title = strip_or_none(self._search_regex(
  171. r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
  172. webpage, 'title', default=None) or self._og_search_title(
  173. webpage, default=None))
  174. video_id = self._html_search_regex(
  175. r'data-video=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage, 'video id',
  176. group='id')
  177. return {
  178. '_type': 'url_transparent',
  179. 'url': 'https://mediazone.vrt.be/api/v1/%s/assets/%s' % (site_id, video_id),
  180. 'ie_key': CanvasIE.ie_key(),
  181. 'id': video_id,
  182. 'display_id': display_id,
  183. 'title': title,
  184. 'description': self._og_search_description(webpage),
  185. }
  186. class VrtNUIE(GigyaBaseIE):
  187. IE_DESC = 'VrtNU.be'
  188. _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
  189. _TESTS = [{
  190. # Available via old API endpoint
  191. 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/',
  192. 'info_dict': {
  193. 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de',
  194. 'ext': 'mp4',
  195. 'title': 'De zwarte weduwe',
  196. 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4',
  197. 'duration': 1457.04,
  198. 'thumbnail': r're:^https?://.*\.jpg$',
  199. 'season': 'Season 1',
  200. 'season_number': 1,
  201. 'episode_number': 1,
  202. },
  203. 'skip': 'This video is only available for registered users',
  204. 'params': {
  205. 'username': '<snip>',
  206. 'password': '<snip>',
  207. },
  208. 'expected_warnings': ['is not a supported codec'],
  209. }, {
  210. # Only available via new API endpoint
  211. 'url': 'https://www.vrt.be/vrtnu/a-z/kamp-waes/1/kamp-waes-s1a5/',
  212. 'info_dict': {
  213. 'id': 'pbs-pub-0763b56c-64fb-4d38-b95b-af60bf433c71$vid-ad36a73c-4735-4f1f-b2c0-a38e6e6aa7e1',
  214. 'ext': 'mp4',
  215. 'title': 'Aflevering 5',
  216. 'description': 'Wie valt door de mand tijdens een missie?',
  217. 'duration': 2967.06,
  218. 'season': 'Season 1',
  219. 'season_number': 1,
  220. 'episode_number': 5,
  221. },
  222. 'skip': 'This video is only available for registered users',
  223. 'params': {
  224. 'username': '<snip>',
  225. 'password': '<snip>',
  226. },
  227. 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'],
  228. }]
  229. _NETRC_MACHINE = 'vrtnu'
  230. _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy'
  231. _CONTEXT_ID = 'R3595707040'
  232. def _real_initialize(self):
  233. self._login()
  234. def _login(self):
  235. username, password = self._get_login_info()
  236. if username is None:
  237. return
  238. auth_data = {
  239. 'APIKey': self._APIKEY,
  240. 'targetEnv': 'jssdk',
  241. 'loginID': username,
  242. 'password': password,
  243. 'authMode': 'cookie',
  244. }
  245. auth_info = self._gigya_login(auth_data)
  246. # Sometimes authentication fails for no good reason, retry
  247. login_attempt = 1
  248. while login_attempt <= 3:
  249. try:
  250. # When requesting a token, no actual token is returned, but the
  251. # necessary cookies are set.
  252. self._request_webpage(
  253. 'https://token.vrt.be',
  254. None, note='Requesting a token', errnote='Could not get a token',
  255. headers={
  256. 'Content-Type': 'application/json',
  257. 'Referer': 'https://www.vrt.be/vrtnu/',
  258. },
  259. data=json.dumps({
  260. 'uid': auth_info['UID'],
  261. 'uidsig': auth_info['UIDSignature'],
  262. 'ts': auth_info['signatureTimestamp'],
  263. 'email': auth_info['profile']['email'],
  264. }).encode('utf-8'))
  265. except ExtractorError as e:
  266. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  267. login_attempt += 1
  268. self.report_warning('Authentication failed')
  269. self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again')
  270. else:
  271. raise e
  272. else:
  273. break
  274. def _real_extract(self, url):
  275. display_id = self._match_id(url)
  276. webpage, urlh = self._download_webpage_handle(url, display_id)
  277. info = self._search_json_ld(webpage, display_id, default={})
  278. # title is optional here since it may be extracted by extractor
  279. # that is delegated from here
  280. title = strip_or_none(self._html_search_regex(
  281. r'(?ms)<h1 class="content__heading">(.+?)</h1>',
  282. webpage, 'title', default=None))
  283. description = self._html_search_regex(
  284. r'(?ms)<div class="content__description">(.+?)</div>',
  285. webpage, 'description', default=None)
  286. season = self._html_search_regex(
  287. [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s*
  288. <span>seizoen\ (.+?)</span>\s*
  289. </div>''',
  290. r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'],
  291. webpage, 'season', default=None)
  292. season_number = int_or_none(season)
  293. episode_number = int_or_none(self._html_search_regex(
  294. r'''(?xms)<div\ class="content__episode">\s*
  295. <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span>
  296. </div>''',
  297. webpage, 'episode_number', default=None))
  298. release_date = parse_iso8601(self._html_search_regex(
  299. r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"',
  300. webpage, 'release_date', default=None))
  301. # If there's a ? or a # in the URL, remove them and everything after
  302. clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/')
  303. securevideo_url = clean_url + '.mssecurevideo.json'
  304. try:
  305. video = self._download_json(securevideo_url, display_id)
  306. except ExtractorError as e:
  307. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
  308. self.raise_login_required()
  309. raise
  310. # We are dealing with a '../<show>.relevant' URL
  311. redirect_url = video.get('url')
  312. if redirect_url:
  313. return self.url_result(self._proto_relative_url(redirect_url, 'https:'))
  314. # There is only one entry, but with an unknown key, so just get
  315. # the first one
  316. video_id = list(video.values())[0].get('videoid')
  317. return merge_dicts(info, {
  318. '_type': 'url_transparent',
  319. 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id,
  320. 'ie_key': CanvasIE.ie_key(),
  321. 'id': video_id,
  322. 'display_id': display_id,
  323. 'title': title,
  324. 'description': description,
  325. 'season': season,
  326. 'season_number': season_number,
  327. 'episode_number': episode_number,
  328. 'release_date': release_date,
  329. })