You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

118 lines
4.8 KiB

11 years ago
11 years ago
  1. import json
  2. import re
  3. from .subtitles import SubtitlesInfoExtractor
  4. from ..utils import (
  5. compat_str,
  6. RegexNotFoundError,
  7. )
  8. class TEDIE(SubtitlesInfoExtractor):
  9. _VALID_URL=r'''http://www\.ted\.com/
  10. (
  11. ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
  12. |
  13. ((?P<type_talk>talks)) # We have a simple talk
  14. )
  15. (/lang/(.*?))? # The url may contain the language
  16. /(?P<name>\w+) # Here goes the name and then ".html"
  17. '''
  18. _TEST = {
  19. u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  20. u'file': u'102.mp4',
  21. u'md5': u'2d76ee1576672e0bd8f187513267adf6',
  22. u'info_dict': {
  23. u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922",
  24. u"title": u"Dan Dennett: The illusion of consciousness"
  25. }
  26. }
  27. @classmethod
  28. def suitable(cls, url):
  29. """Receives a URL and returns True if suitable for this IE."""
  30. return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  31. def _real_extract(self, url):
  32. m=re.match(self._VALID_URL, url, re.VERBOSE)
  33. if m.group('type_talk'):
  34. return self._talk_info(url)
  35. else :
  36. playlist_id=m.group('playlist_id')
  37. name=m.group('name')
  38. self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
  39. return [self._playlist_videos_info(url,name,playlist_id)]
  40. def _playlist_videos_info(self, url, name, playlist_id):
  41. '''Returns the videos of the playlist'''
  42. webpage = self._download_webpage(
  43. url, playlist_id, u'Downloading playlist webpage')
  44. matches = re.finditer(
  45. r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
  46. webpage)
  47. playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
  48. webpage, 'playlist title')
  49. playlist_entries = [
  50. self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
  51. for m in matches
  52. ]
  53. return self.playlist_result(
  54. playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
  55. def _talk_info(self, url, video_id=0):
  56. """Return the video for the talk in the url"""
  57. m = re.match(self._VALID_URL, url,re.VERBOSE)
  58. video_name = m.group('name')
  59. webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
  60. self.report_extraction(video_name)
  61. # If the url includes the language we get the title translated
  62. title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
  63. webpage, 'title')
  64. json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
  65. webpage, 'json data')
  66. info = json.loads(json_data)
  67. desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
  68. webpage, 'description', flags = re.DOTALL)
  69. thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
  70. webpage, 'thumbnail')
  71. formats = [{
  72. 'ext': 'mp4',
  73. 'url': stream['file'],
  74. 'format': stream['id']
  75. } for stream in info['htmlStreams']]
  76. video_id = info['id']
  77. # subtitles
  78. video_subtitles = self.extract_subtitles(video_id, webpage)
  79. if self._downloader.params.get('listsubtitles', False):
  80. self._list_available_subtitles(video_id, webpage)
  81. return
  82. return {
  83. 'id': video_id,
  84. 'title': title,
  85. 'thumbnail': thumbnail,
  86. 'description': desc,
  87. 'subtitles': video_subtitles,
  88. 'formats': formats,
  89. }
  90. def _get_available_subtitles(self, video_id, webpage):
  91. try:
  92. options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
  93. languages = re.findall(r'(?:<option value=")(\S+)"', options)
  94. if languages:
  95. sub_lang_list = {}
  96. for l in languages:
  97. url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
  98. sub_lang_list[l] = url
  99. return sub_lang_list
  100. except RegexNotFoundError as err:
  101. self._downloader.report_warning(u'video doesn\'t have subtitles')
  102. return {}