You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

176 lines
6.6 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
  1. from __future__ import unicode_literals
  2. import itertools
  3. import json
  4. import re
  5. from .common import InfoExtractor, SearchInfoExtractor
  6. from ..utils import (
  7. compat_urllib_parse,
  8. compat_urlparse,
  9. clean_html,
  10. int_or_none,
  11. )
  12. class YahooIE(InfoExtractor):
  13. IE_DESC = 'Yahoo screen and movies'
  14. _VALID_URL = r'(?P<url>https?://(?:screen|movies)\.yahoo\.com/.*?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
  15. _TESTS = [
  16. {
  17. 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  18. 'md5': '4962b075c08be8690a922ee026d05e69',
  19. 'info_dict': {
  20. 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
  21. 'ext': 'mp4',
  22. 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
  23. 'description': 'Julian and Travis watch Julian Smith',
  24. },
  25. },
  26. {
  27. 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  28. 'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
  29. 'info_dict': {
  30. 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
  31. 'ext': 'mp4',
  32. 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
  33. 'description': 'Agent Topple\'s mustache does its dirty work, and Nicole brokers a deal for peace. But why is the NSA collecting millions of Instagram brunch photos? And if your waffles have nothing to hide, what are they so worried about?',
  34. },
  35. },
  36. {
  37. 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
  38. 'md5': '60e8ac193d8fb71997caa8fce54c6460',
  39. 'info_dict': {
  40. 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
  41. 'ext': 'mp4',
  42. 'title': "Yahoo Saves 'Community'",
  43. 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
  44. }
  45. },
  46. ]
  47. def _real_extract(self, url):
  48. mobj = re.match(self._VALID_URL, url)
  49. video_id = mobj.group('id')
  50. url = mobj.group('url')
  51. webpage = self._download_webpage(url, video_id)
  52. items_json = self._search_regex(
  53. r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
  54. default=None)
  55. if items_json is None:
  56. CONTENT_ID_REGEXES = [
  57. r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
  58. r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
  59. r'"first_videoid"\s*:\s*"([^"]+)"',
  60. ]
  61. long_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
  62. video_id = long_id
  63. else:
  64. items = json.loads(items_json)
  65. info = items['mediaItems']['query']['results']['mediaObj'][0]
  66. # The 'meta' field is not always in the video webpage, we request it
  67. # from another page
  68. long_id = info['id']
  69. return self._get_info(long_id, video_id, webpage)
  70. def _get_info(self, long_id, video_id, webpage):
  71. query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
  72. ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="US"'
  73. ' AND protocol="http"' % long_id)
  74. data = compat_urllib_parse.urlencode({
  75. 'q': query,
  76. 'env': 'prod',
  77. 'format': 'json',
  78. })
  79. query_result = self._download_json(
  80. 'http://video.query.yahoo.com/v1/public/yql?' + data,
  81. video_id, 'Downloading video info')
  82. info = query_result['query']['results']['mediaObj'][0]
  83. meta = info['meta']
  84. formats = []
  85. for s in info['streams']:
  86. format_info = {
  87. 'width': int_or_none(s.get('width')),
  88. 'height': int_or_none(s.get('height')),
  89. 'tbr': int_or_none(s.get('bitrate')),
  90. }
  91. host = s['host']
  92. path = s['path']
  93. if host.startswith('rtmp'):
  94. format_info.update({
  95. 'url': host,
  96. 'play_path': path,
  97. 'ext': 'flv',
  98. })
  99. else:
  100. format_url = compat_urlparse.urljoin(host, path)
  101. format_info['url'] = format_url
  102. formats.append(format_info)
  103. self._sort_formats(formats)
  104. return {
  105. 'id': video_id,
  106. 'title': meta['title'],
  107. 'formats': formats,
  108. 'description': clean_html(meta['description']),
  109. 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
  110. }
  111. class YahooNewsIE(YahooIE):
  112. IE_NAME = 'yahoo:news'
  113. _VALID_URL = r'http://news\.yahoo\.com/video/.*?-(?P<id>\d*?)\.html'
  114. _TESTS = [{
  115. 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
  116. 'md5': '67010fdf3a08d290e060a4dd96baa07b',
  117. 'info_dict': {
  118. 'id': '104538833',
  119. 'ext': 'mp4',
  120. 'title': 'China Moses Is Crazy About the Blues',
  121. 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
  122. },
  123. }]
  124. def _real_extract(self, url):
  125. mobj = re.match(self._VALID_URL, url)
  126. video_id = mobj.group('id')
  127. webpage = self._download_webpage(url, video_id)
  128. long_id = self._search_regex(r'contentId: \'(.+?)\',', webpage, 'long id')
  129. return self._get_info(long_id, video_id, webpage)
  130. class YahooSearchIE(SearchInfoExtractor):
  131. IE_DESC = 'Yahoo screen search'
  132. _MAX_RESULTS = 1000
  133. IE_NAME = 'screen.yahoo:search'
  134. _SEARCH_KEY = 'yvsearch'
  135. def _get_n_results(self, query, n):
  136. """Get a specified number of results for a query"""
  137. entries = []
  138. for pagenum in itertools.count(0):
  139. result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  140. info = self._download_json(result_url, query,
  141. note='Downloading results page '+str(pagenum+1))
  142. m = info['m']
  143. results = info['results']
  144. for (i, r) in enumerate(results):
  145. if (pagenum * 30) + i >= n:
  146. break
  147. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  148. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  149. entries.append(e)
  150. if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
  151. break
  152. return {
  153. '_type': 'playlist',
  154. 'id': query,
  155. 'entries': entries,
  156. }