You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

117 lines
4.7 KiB

11 years ago
11 years ago
  1. import json
  2. import re
  3. from .subtitles import SubtitlesInfoExtractor
  4. from ..utils import (
  5. RegexNotFoundError,
  6. )
  7. class TEDIE(SubtitlesInfoExtractor):
  8. _VALID_URL=r'''http://www\.ted\.com/
  9. (
  10. ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
  11. |
  12. ((?P<type_talk>talks)) # We have a simple talk
  13. )
  14. (/lang/(.*?))? # The url may contain the language
  15. /(?P<name>\w+) # Here goes the name and then ".html"
  16. '''
  17. _TEST = {
  18. u'url': u'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html',
  19. u'file': u'102.mp4',
  20. u'md5': u'2d76ee1576672e0bd8f187513267adf6',
  21. u'info_dict': {
  22. u"description": u"md5:c6fa72e6eedbd938c9caf6b2702f5922",
  23. u"title": u"Dan Dennett: The illusion of consciousness"
  24. }
  25. }
  26. @classmethod
  27. def suitable(cls, url):
  28. """Receives a URL and returns True if suitable for this IE."""
  29. return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  30. def _real_extract(self, url):
  31. m=re.match(self._VALID_URL, url, re.VERBOSE)
  32. if m.group('type_talk'):
  33. return self._talk_info(url)
  34. else :
  35. playlist_id=m.group('playlist_id')
  36. name=m.group('name')
  37. self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
  38. return [self._playlist_videos_info(url,name,playlist_id)]
  39. def _playlist_videos_info(self, url, name, playlist_id):
  40. '''Returns the videos of the playlist'''
  41. webpage = self._download_webpage(
  42. url, playlist_id, u'Downloading playlist webpage')
  43. matches = re.finditer(
  44. r'<p\s+class="talk-title[^"]*"><a\s+href="(?P<talk_url>/talks/[^"]+\.html)">[^<]*</a></p>',
  45. webpage)
  46. playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
  47. webpage, 'playlist title')
  48. playlist_entries = [
  49. self.url_result(u'http://www.ted.com' + m.group('talk_url'), 'TED')
  50. for m in matches
  51. ]
  52. return self.playlist_result(
  53. playlist_entries, playlist_id=playlist_id, playlist_title=playlist_title)
  54. def _talk_info(self, url, video_id=0):
  55. """Return the video for the talk in the url"""
  56. m = re.match(self._VALID_URL, url,re.VERBOSE)
  57. video_name = m.group('name')
  58. webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
  59. self.report_extraction(video_name)
  60. # If the url includes the language we get the title translated
  61. title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
  62. webpage, 'title')
  63. json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
  64. webpage, 'json data')
  65. info = json.loads(json_data)
  66. desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
  67. webpage, 'description', flags = re.DOTALL)
  68. thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
  69. webpage, 'thumbnail')
  70. formats = [{
  71. 'ext': 'mp4',
  72. 'url': stream['file'],
  73. 'format': stream['id']
  74. } for stream in info['htmlStreams']]
  75. video_id = info['id']
  76. # subtitles
  77. video_subtitles = self.extract_subtitles(video_id, webpage)
  78. if self._downloader.params.get('listsubtitles', False):
  79. self._list_available_subtitles(video_id, webpage)
  80. return
  81. return {
  82. 'id': video_id,
  83. 'title': title,
  84. 'thumbnail': thumbnail,
  85. 'description': desc,
  86. 'subtitles': video_subtitles,
  87. 'formats': formats,
  88. }
  89. def _get_available_subtitles(self, video_id, webpage):
  90. try:
  91. options = self._search_regex(r'(?:<select name="subtitles_language_select" id="subtitles_language_select">)(.*?)(?:</select>)', webpage, 'subtitles_language_select', flags=re.DOTALL)
  92. languages = re.findall(r'(?:<option value=")(\S+)"', options)
  93. if languages:
  94. sub_lang_list = {}
  95. for l in languages:
  96. url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
  97. sub_lang_list[l] = url
  98. return sub_lang_list
  99. except RegexNotFoundError:
  100. self._downloader.report_warning(u'video doesn\'t have subtitles')
  101. return {}