You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

86 lines
3.1 KiB

  1. import re
  2. import json
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. # This is used by the not implemented extractLiveStream method
  6. compat_urllib_parse,
  7. ExtractorError,
  8. unified_strdate,
  9. )
  10. class ArteTvIE(InfoExtractor):
  11. _VALID_URL = r'(?:http://)?www\.arte.tv/guide/(?:fr|de)/(?:(?:sendungen|emissions)/)?(?P<id>.*?)/(?P<name>.*?)(\?.*)?'
  12. _LIVE_URL = r'index-[0-9]+\.html$'
  13. IE_NAME = u'arte.tv'
  14. # TODO implement Live Stream
  15. # def extractLiveStream(self, url):
  16. # video_lang = url.split('/')[-4]
  17. # info = self.grep_webpage(
  18. # url,
  19. # r'src="(.*?/videothek_js.*?\.js)',
  20. # 0,
  21. # [
  22. # (1, 'url', u'Invalid URL: %s' % url)
  23. # ]
  24. # )
  25. # http_host = url.split('/')[2]
  26. # next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
  27. # info = self.grep_webpage(
  28. # next_url,
  29. # r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
  30. # '(http://.*?\.swf).*?' +
  31. # '(rtmp://.*?)\'',
  32. # re.DOTALL,
  33. # [
  34. # (1, 'path', u'could not extract video path: %s' % url),
  35. # (2, 'player', u'could not extract video player: %s' % url),
  36. # (3, 'url', u'could not extract video url: %s' % url)
  37. # ]
  38. # )
  39. # video_url = u'%s/%s' % (info.get('url'), info.get('path'))
  40. def _real_extract(self, url):
  41. mobj = re.match(self._VALID_URL, url)
  42. name = mobj.group('name')
  43. # This is not a real id, it can be for example AJT for the news
  44. # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
  45. video_id = mobj.group('id')
  46. if re.search(self._LIVE_URL, video_id) is not None:
  47. raise ExtractorError(u'Arte live streams are not yet supported, sorry')
  48. # self.extractLiveStream(url)
  49. # return
  50. webpage = self._download_webpage(url, video_id)
  51. json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url')
  52. json_info = self._download_webpage(json_url, video_id, 'Downloading info json')
  53. self.report_extraction(video_id)
  54. info = json.loads(json_info)
  55. player_info = info['videoJsonPlayer']
  56. info_dict = {'id': player_info['VID'],
  57. 'title': player_info['VTI'],
  58. 'description': player_info['VDE'],
  59. 'upload_date': unified_strdate(player_info['VDA'].split(' ')[0]),
  60. 'thumbnail': player_info['programImage'],
  61. }
  62. formats = player_info['VSR'].values()
  63. # We order the formats by quality
  64. formats = sorted(formats, key=lambda f: int(f['height']))
  65. # Pick the best quality
  66. format_info = formats[-1]
  67. if format_info['mediaType'] == u'rtmp':
  68. info_dict['url'] = format_info['streamer']
  69. info_dict['play_path'] = 'mp4:' + format_info['url']
  70. info_dict['ext'] = 'mp4'
  71. else:
  72. info_dict['url'] = format_info['url']
  73. info_dict['ext'] = 'mp4'
  74. return info_dict