You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

218 lines
8.9 KiB

  1. import re
  2. import xml.etree.ElementTree
  3. from .common import InfoExtractor
  4. from .mtv import MTVIE, _media_xml_tag
  5. from ..utils import (
  6. compat_str,
  7. compat_urllib_parse,
  8. ExtractorError,
  9. unified_strdate,
  10. )
  11. class ComedyCentralIE(MTVIE):
  12. _VALID_URL = r'http://www.comedycentral.com/(video-clips|episodes|cc-studios)/(?P<title>.*)'
  13. _FEED_URL = u'http://comedycentral.com/feeds/mrss/'
  14. _TEST = {
  15. u'url': u'http://www.comedycentral.com/video-clips/kllhuv/stand-up-greg-fitzsimmons--uncensored---too-good-of-a-mother',
  16. u'md5': u'4167875aae411f903b751a21f357f1ee',
  17. u'info_dict': {
  18. u'id': u'cef0cbb3-e776-4bc9-b62e-8016deccb354',
  19. u'ext': u'mp4',
  20. u'title': u'Uncensored - Greg Fitzsimmons - Too Good of a Mother',
  21. u'description': u'After a certain point, breastfeeding becomes c**kblocking.',
  22. },
  23. }
  24. # Overwrite MTVIE properties we don't want
  25. _TESTS = []
  26. def _get_thumbnail_url(self, uri, itemdoc):
  27. search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
  28. return itemdoc.find(search_path).attrib['url']
  29. def _real_extract(self, url):
  30. mobj = re.match(self._VALID_URL, url)
  31. title = mobj.group('title')
  32. webpage = self._download_webpage(url, title)
  33. mgid = self._search_regex(r'data-mgid="(?P<mgid>mgid:.*?)"',
  34. webpage, u'mgid')
  35. return self._get_videos_info(mgid)
  36. class ComedyCentralShowsIE(InfoExtractor):
  37. IE_DESC = u'The Daily Show / Colbert Report'
  38. # urls can be abbreviations like :thedailyshow or :colbert
  39. # urls for episodes like:
  40. # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
  41. # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
  42. # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
  43. _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
  44. |(https?://)?(www\.)?
  45. (?P<showname>thedailyshow|colbertnation)\.com/
  46. (full-episodes/(?P<episode>.*)|
  47. (?P<clip>
  48. (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
  49. |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
  50. (?P<interview>
  51. extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
  52. $"""
  53. _TEST = {
  54. u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
  55. u'file': u'422212.mp4',
  56. u'md5': u'4e2f5cb088a83cd8cdb7756132f9739d',
  57. u'info_dict': {
  58. u"upload_date": u"20121214",
  59. u"description": u"Kristen Stewart",
  60. u"uploader": u"thedailyshow",
  61. u"title": u"thedailyshow-kristen-stewart part 1"
  62. }
  63. }
  64. _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
  65. _video_extensions = {
  66. '3500': 'mp4',
  67. '2200': 'mp4',
  68. '1700': 'mp4',
  69. '1200': 'mp4',
  70. '750': 'mp4',
  71. '400': 'mp4',
  72. }
  73. _video_dimensions = {
  74. '3500': (1280, 720),
  75. '2200': (960, 540),
  76. '1700': (768, 432),
  77. '1200': (640, 360),
  78. '750': (512, 288),
  79. '400': (384, 216),
  80. }
  81. @classmethod
  82. def suitable(cls, url):
  83. """Receives a URL and returns True if suitable for this IE."""
  84. return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  85. @staticmethod
  86. def _transform_rtmp_url(rtmp_video_url):
  87. m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
  88. if not m:
  89. raise ExtractorError(u'Cannot transform RTMP url')
  90. base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
  91. return base + m.group('finalid')
  92. def _real_extract(self, url):
  93. mobj = re.match(self._VALID_URL, url, re.VERBOSE)
  94. if mobj is None:
  95. raise ExtractorError(u'Invalid URL: %s' % url)
  96. if mobj.group('shortname'):
  97. if mobj.group('shortname') in ('tds', 'thedailyshow'):
  98. url = u'http://www.thedailyshow.com/full-episodes/'
  99. else:
  100. url = u'http://www.colbertnation.com/full-episodes/'
  101. mobj = re.match(self._VALID_URL, url, re.VERBOSE)
  102. assert mobj is not None
  103. if mobj.group('clip'):
  104. if mobj.group('showname') == 'thedailyshow':
  105. epTitle = mobj.group('tdstitle')
  106. else:
  107. epTitle = mobj.group('cntitle')
  108. dlNewest = False
  109. elif mobj.group('interview'):
  110. epTitle = mobj.group('interview_title')
  111. dlNewest = False
  112. else:
  113. dlNewest = not mobj.group('episode')
  114. if dlNewest:
  115. epTitle = mobj.group('showname')
  116. else:
  117. epTitle = mobj.group('episode')
  118. self.report_extraction(epTitle)
  119. webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
  120. if dlNewest:
  121. url = htmlHandle.geturl()
  122. mobj = re.match(self._VALID_URL, url, re.VERBOSE)
  123. if mobj is None:
  124. raise ExtractorError(u'Invalid redirected URL: ' + url)
  125. if mobj.group('episode') == '':
  126. raise ExtractorError(u'Redirected URL is still not specific: ' + url)
  127. epTitle = mobj.group('episode')
  128. mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
  129. if len(mMovieParams) == 0:
  130. # The Colbert Report embeds the information in a without
  131. # a URL prefix; so extract the alternate reference
  132. # and then add the URL prefix manually.
  133. altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
  134. if len(altMovieParams) == 0:
  135. raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
  136. else:
  137. mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
  138. uri = mMovieParams[0][1]
  139. indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
  140. indexXml = self._download_webpage(indexUrl, epTitle,
  141. u'Downloading show index',
  142. u'unable to download episode index')
  143. results = []
  144. idoc = xml.etree.ElementTree.fromstring(indexXml)
  145. itemEls = idoc.findall('.//item')
  146. for partNum,itemEl in enumerate(itemEls):
  147. mediaId = itemEl.findall('./guid')[0].text
  148. shortMediaId = mediaId.split(':')[-1]
  149. showId = mediaId.split(':')[-2].replace('.com', '')
  150. officialTitle = itemEl.findall('./title')[0].text
  151. officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
  152. configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
  153. compat_urllib_parse.urlencode({'uri': mediaId}))
  154. configXml = self._download_webpage(configUrl, epTitle,
  155. u'Downloading configuration for %s' % shortMediaId)
  156. cdoc = xml.etree.ElementTree.fromstring(configXml)
  157. turls = []
  158. for rendition in cdoc.findall('.//rendition'):
  159. finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
  160. turls.append(finfo)
  161. if len(turls) == 0:
  162. self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
  163. continue
  164. formats = []
  165. for format, rtmp_video_url in turls:
  166. w, h = self._video_dimensions.get(format, (None, None))
  167. formats.append({
  168. 'url': self._transform_rtmp_url(rtmp_video_url),
  169. 'ext': self._video_extensions.get(format, 'mp4'),
  170. 'format_id': format,
  171. 'height': h,
  172. 'width': w,
  173. })
  174. effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
  175. info = {
  176. 'id': shortMediaId,
  177. 'formats': formats,
  178. 'uploader': showId,
  179. 'upload_date': officialDate,
  180. 'title': effTitle,
  181. 'thumbnail': None,
  182. 'description': compat_str(officialTitle),
  183. }
  184. # TODO: Remove when #980 has been merged
  185. info.update(info['formats'][-1])
  186. results.append(info)
  187. return results