You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

204 lines
7.9 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. int_or_none,
  6. unified_strdate,
  7. xpath_text,
  8. determine_ext,
  9. qualities,
  10. float_or_none,
  11. ExtractorError,
  12. )
  13. class DreiSatIE(InfoExtractor):
  14. IE_NAME = '3sat'
  15. _VALID_URL = r'(?:https?://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php|mediathek\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$'
  16. _TESTS = [
  17. {
  18. 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918',
  19. 'md5': 'be37228896d30a88f315b638900a026e',
  20. 'info_dict': {
  21. 'id': '45918',
  22. 'ext': 'mp4',
  23. 'title': 'Waidmannsheil',
  24. 'description': 'md5:cce00ca1d70e21425e72c86a98a56817',
  25. 'uploader': 'SCHWEIZWEIT',
  26. 'uploader_id': '100000210',
  27. 'upload_date': '20140913'
  28. },
  29. 'params': {
  30. 'skip_download': True, # m3u8 downloads
  31. }
  32. },
  33. {
  34. 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066',
  35. 'only_matching': True,
  36. },
  37. ]
  38. def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
  39. param_groups = {}
  40. for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)):
  41. group_id = param_group.attrib.get(self._xpath_ns('id', 'http://www.w3.org/XML/1998/namespace'))
  42. params = {}
  43. for param in param_group:
  44. params[param.get('name')] = param.get('value')
  45. param_groups[group_id] = params
  46. formats = []
  47. for video in smil.findall(self._xpath_ns('.//video', namespace)):
  48. src = video.get('src')
  49. if not src:
  50. continue
  51. bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
  52. group_id = video.get('paramGroup')
  53. param_group = param_groups[group_id]
  54. for proto in param_group['protocols'].split(','):
  55. formats.append({
  56. 'url': '%s://%s' % (proto, param_group['host']),
  57. 'app': param_group['app'],
  58. 'play_path': src,
  59. 'ext': 'flv',
  60. 'format_id': '%s-%d' % (proto, bitrate),
  61. 'tbr': bitrate,
  62. })
  63. self._sort_formats(formats)
  64. return formats
  65. def extract_from_xml_url(self, video_id, xml_url):
  66. doc = self._download_xml(
  67. xml_url, video_id,
  68. note='Downloading video info',
  69. errnote='Failed to download video info')
  70. status_code = doc.find('./status/statuscode')
  71. if status_code is not None and status_code.text != 'ok':
  72. code = status_code.text
  73. if code == 'notVisibleAnymore':
  74. message = 'Video %s is not available' % video_id
  75. else:
  76. message = '%s returned error: %s' % (self.IE_NAME, code)
  77. raise ExtractorError(message, expected=True)
  78. title = doc.find('.//information/title').text
  79. description = xpath_text(doc, './/information/detail', 'description')
  80. duration = int_or_none(xpath_text(doc, './/details/lengthSec', 'duration'))
  81. uploader = xpath_text(doc, './/details/originChannelTitle', 'uploader')
  82. uploader_id = xpath_text(doc, './/details/originChannelId', 'uploader id')
  83. upload_date = unified_strdate(xpath_text(doc, './/details/airtime', 'upload date'))
  84. def xml_to_thumbnails(fnode):
  85. thumbnails = []
  86. for node in fnode:
  87. thumbnail_url = node.text
  88. if not thumbnail_url:
  89. continue
  90. thumbnail = {
  91. 'url': thumbnail_url,
  92. }
  93. if 'key' in node.attrib:
  94. m = re.match('^([0-9]+)x([0-9]+)$', node.attrib['key'])
  95. if m:
  96. thumbnail['width'] = int(m.group(1))
  97. thumbnail['height'] = int(m.group(2))
  98. thumbnails.append(thumbnail)
  99. return thumbnails
  100. thumbnails = xml_to_thumbnails(doc.findall('.//teaserimages/teaserimage'))
  101. format_nodes = doc.findall('.//formitaeten/formitaet')
  102. quality = qualities(['veryhigh', 'high', 'med', 'low'])
  103. def get_quality(elem):
  104. return quality(xpath_text(elem, 'quality'))
  105. format_nodes.sort(key=get_quality)
  106. format_ids = []
  107. formats = []
  108. for fnode in format_nodes:
  109. video_url = fnode.find('url').text
  110. is_available = 'http://www.metafilegenerator' not in video_url
  111. if not is_available:
  112. continue
  113. format_id = fnode.attrib['basetype']
  114. quality = xpath_text(fnode, './quality', 'quality')
  115. format_m = re.match(r'''(?x)
  116. (?P<vcodec>[^_]+)_(?P<acodec>[^_]+)_(?P<container>[^_]+)_
  117. (?P<proto>[^_]+)_(?P<index>[^_]+)_(?P<indexproto>[^_]+)
  118. ''', format_id)
  119. ext = determine_ext(video_url, None) or format_m.group('container')
  120. if ext not in ('smil', 'f4m', 'm3u8'):
  121. format_id = format_id + '-' + quality
  122. if format_id in format_ids:
  123. continue
  124. if ext == 'meta':
  125. continue
  126. elif ext == 'smil':
  127. formats.extend(self._extract_smil_formats(
  128. video_url, video_id, fatal=False))
  129. elif ext == 'm3u8':
  130. # the certificates are misconfigured (see
  131. # https://github.com/rg3/youtube-dl/issues/8665)
  132. if video_url.startswith('https://'):
  133. continue
  134. formats.extend(self._extract_m3u8_formats(
  135. video_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
  136. elif ext == 'f4m':
  137. formats.extend(self._extract_f4m_formats(
  138. video_url, video_id, f4m_id=format_id, fatal=False))
  139. else:
  140. proto = format_m.group('proto').lower()
  141. abr = int_or_none(xpath_text(fnode, './audioBitrate', 'abr'), 1000)
  142. vbr = int_or_none(xpath_text(fnode, './videoBitrate', 'vbr'), 1000)
  143. width = int_or_none(xpath_text(fnode, './width', 'width'))
  144. height = int_or_none(xpath_text(fnode, './height', 'height'))
  145. filesize = int_or_none(xpath_text(fnode, './filesize', 'filesize'))
  146. format_note = ''
  147. if not format_note:
  148. format_note = None
  149. formats.append({
  150. 'format_id': format_id,
  151. 'url': video_url,
  152. 'ext': ext,
  153. 'acodec': format_m.group('acodec'),
  154. 'vcodec': format_m.group('vcodec'),
  155. 'abr': abr,
  156. 'vbr': vbr,
  157. 'width': width,
  158. 'height': height,
  159. 'filesize': filesize,
  160. 'format_note': format_note,
  161. 'protocol': proto,
  162. '_available': is_available,
  163. })
  164. format_ids.append(format_id)
  165. self._sort_formats(formats)
  166. return {
  167. 'id': video_id,
  168. 'title': title,
  169. 'description': description,
  170. 'duration': duration,
  171. 'thumbnails': thumbnails,
  172. 'uploader': uploader,
  173. 'uploader_id': uploader_id,
  174. 'upload_date': upload_date,
  175. 'formats': formats,
  176. }
  177. def _real_extract(self, url):
  178. mobj = re.match(self._VALID_URL, url)
  179. video_id = mobj.group('id')
  180. details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id
  181. return self.extract_from_xml_url(video_id, details_url)