import re
import json
import itertools

from .common import InfoExtractor
from .subtitles import SubtitlesInfoExtractor

from ..utils import (
    compat_urllib_request,
    compat_str,
    get_element_by_attribute,
    get_element_by_id,

    ExtractorError,
)


class DailymotionBaseInfoExtractor(InfoExtractor):
    @staticmethod
    def _build_request(url):
        """Build a request with the family filter disabled"""
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        return request


class DailymotionIE(DailymotionBaseInfoExtractor, SubtitlesInfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/(?:embed/)?video/([^/]+)'
    IE_NAME = u'dailymotion'
    _TEST = {
        u'url': u'http://www.dailymotion.com/video/x33vw9_tutoriel-de-youtubeur-dl-des-video_tech',
        u'file': u'x33vw9.mp4',
        u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
        u'info_dict': {
            u"uploader": u"Amphora Alex and Van .",
            u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
        }
    }

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'
        url = 'http://www.dailymotion.com/video/%s' % video_id

        # Retrieve video webpage to extract further information
        request = self._build_request(url)
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)

        video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                             # Looking for official user
                                             r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
                                            webpage, 'video uploader')

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
        embed_page = self._download_webpage(embed_url, video_id,
                                            u'Downloading embed page')
        info = self._search_regex(r'var info = ({.*?}),$', embed_page,
                                  'video info', flags=re.MULTILINE)
        info = json.loads(info)
        if info.get('error') is not None:
            msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
            raise ExtractorError(msg, expected=True)

        # TODO: support choosing qualities
        for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
                    'stream_h264_hq_url', 'stream_h264_url',
                    'stream_h264_ld_url']:
            if info.get(key):
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = info[max_quality]

        # subtitles
        video_subtitles = self.extract_subtitles(video_id)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id)
            return

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': self._og_search_title(webpage),
            'ext': video_extension,
            'subtitles': video_subtitles,
            'thumbnail': info['thumbnail_url']
        }]

    def _get_available_subtitles(self, video_id):
        try:
            sub_list = self._download_webpage(
                'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
                video_id, note=False)
        except ExtractorError as err:
            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
            return {}
        info = json.loads(sub_list)
        if (info['total'] > 0):
            sub_lang_list = dict((l['language'], l['url']) for l in info['list'])
            return sub_lang_list
        self._downloader.report_warning(u'video doesn\'t have subtitles')
        return {}


class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
    IE_NAME = u'dailymotion:playlist'
    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
    _PAGE_TEMPLATE = 'https://www.dailymotion.com/playlist/%s/%s'

    def _extract_entries(self, id):
        video_ids = []
        for pagenum in itertools.count(1):
            request = self._build_request(self._PAGE_TEMPLATE % (id, pagenum))
            webpage = self._download_webpage(request,
                                             id, u'Downloading page %s' % pagenum)
            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
            video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el))

            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
                break
        return [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
                for video_id in video_ids]

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        webpage = self._download_webpage(url, playlist_id)

        return {'_type': 'playlist',
                'id': playlist_id,
                'title': get_element_by_id(u'playlist_name', webpage),
                'entries': self._extract_entries(playlist_id),
                }


class DailymotionUserIE(DailymotionPlaylistIE):
    IE_NAME = u'dailymotion:user'
    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/user/(?P<user>[^/]+)'
    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/user/.+?".*?>.*?</a>.*?</div>'
    _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        user = mobj.group('user')
        webpage = self._download_webpage(url, user)

        full_user = self._html_search_regex(
            r'<a class="label" href="/%s".*?>(.*?)</' % re.escape(user),
            webpage, u'user', flags=re.DOTALL)

        return {
            '_type': 'playlist',
            'id': user,
            'title': full_user,
            'entries': self._extract_entries(user),
        }