You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

120 lines
4.8 KiB

  1. # -*- coding: utf-8 -*-
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import parse_filesize
  6. class TagesschauIE(InfoExtractor):
  7. _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?:sendung/ts|video/video)(?P<id>-?[0-9]+)\.html'
  8. _TESTS = [{
  9. 'url': 'http://www.tagesschau.de/multimedia/video/video1399128.html',
  10. 'md5': 'bcdeac2194fb296d599ce7929dfa4009',
  11. 'info_dict': {
  12. 'id': '1399128',
  13. 'ext': 'mp4',
  14. 'title': 'Harald Range, Generalbundesanwalt, zu den Ermittlungen',
  15. 'description': 'md5:69da3c61275b426426d711bde96463ab',
  16. 'thumbnail': 're:^http:.*\.jpg$',
  17. },
  18. }, {
  19. 'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
  20. 'md5': '3c54c1f6243d279b706bde660ceec633',
  21. 'info_dict': {
  22. 'id': '5727',
  23. 'ext': 'mp4',
  24. 'description': 'md5:695c01bfd98b7e313c501386327aea59',
  25. 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
  26. 'thumbnail': 're:^http:.*\.jpg$',
  27. }
  28. }]
  29. _FORMATS = {
  30. 's': {'width': 256, 'height': 144, 'quality': 1},
  31. 'm': {'width': 512, 'height': 288, 'quality': 2},
  32. 'l': {'width': 960, 'height': 544, 'quality': 3},
  33. }
  34. def _real_extract(self, url):
  35. video_id = self._match_id(url)
  36. display_id = video_id.lstrip('-')
  37. webpage = self._download_webpage(url, display_id)
  38. player_url = self._html_search_meta(
  39. 'twitter:player', webpage, 'player URL', default=None)
  40. if player_url:
  41. playerpage = self._download_webpage(
  42. player_url, display_id, 'Downloading player page')
  43. medias = re.findall(
  44. r'"(http://media.+?)", type:"video/(.+?)", quality:"(.+?)"',
  45. playerpage)
  46. formats = []
  47. for url, ext, res in medias:
  48. f = {
  49. 'format_id': res + '_' + ext,
  50. 'url': url,
  51. 'ext': ext,
  52. }
  53. f.update(self._FORMATS.get(res, {}))
  54. formats.append(f)
  55. thumbnail_fn = re.findall(r'"(/multimedia/.+?\.jpg)"', playerpage)[-1]
  56. title = self._og_search_title(webpage).strip()
  57. description = self._og_search_description(webpage).strip()
  58. else:
  59. download_text = self._search_regex(
  60. r'(?s)<p>Wir bieten dieses Video in folgenden Formaten zum Download an:</p>\s*<div class="controls">(.*?)</div>\s*<p>',
  61. webpage, 'download links')
  62. links = re.finditer(
  63. r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
  64. download_text)
  65. formats = []
  66. for l in links:
  67. format_id = self._search_regex(
  68. r'.*/[^/.]+\.([^/]+)\.[^/.]+', l.group('url'), 'format ID')
  69. format = {
  70. 'format_id': format_id,
  71. 'url': l.group('url'),
  72. 'format_name': l.group('name'),
  73. }
  74. m = re.match(
  75. r'''(?x)
  76. Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
  77. (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
  78. (?P<vbr>[0-9]+)kbps&\#10;
  79. Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
  80. Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
  81. l.group('title'))
  82. if m:
  83. format.update({
  84. 'format_note': m.group('audio_desc'),
  85. 'vcodec': m.group('vcodec'),
  86. 'width': int(m.group('width')),
  87. 'height': int(m.group('height')),
  88. 'abr': int(m.group('abr')),
  89. 'vbr': int(m.group('vbr')),
  90. 'filesize_approx': parse_filesize(m.group('filesize_approx')),
  91. })
  92. formats.append(format)
  93. thumbnail_fn = self._search_regex(
  94. r'(?s)<img alt="Sendungsbild".*?src="([^"]+)"',
  95. webpage, 'thumbnail', fatal=False)
  96. description = self._html_search_regex(
  97. r'(?s)<p class="teasertext">(.*?)</p>',
  98. webpage, 'description', fatal=False)
  99. title = self._html_search_regex(
  100. r'<span class="headline".*?>(.*?)</span>', webpage, 'title')
  101. self._sort_formats(formats)
  102. thumbnail = 'http://www.tagesschau.de' + thumbnail_fn
  103. return {
  104. 'id': display_id,
  105. 'title': title,
  106. 'thumbnail': thumbnail,
  107. 'formats': formats,
  108. 'description': description,
  109. }