You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

113 lines
5.5 KiB

  1. import datetime
  2. import itertools
  3. import json
  4. import re
  5. from .common import InfoExtractor, SearchInfoExtractor
  6. from ..utils import (
  7. compat_urllib_parse,
  8. ExtractorError,
  9. )
  10. class YahooIE(InfoExtractor):
  11. """Information extractor for screen.yahoo.com."""
  12. _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
  13. def _real_extract(self, url):
  14. mobj = re.match(self._VALID_URL, url)
  15. if mobj is None:
  16. raise ExtractorError(u'Invalid URL: %s' % url)
  17. video_id = mobj.group('id')
  18. webpage = self._download_webpage(url, video_id)
  19. m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
  20. if m_id is None:
  21. # TODO: Check which url parameters are required
  22. info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
  23. webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
  24. info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
  25. <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
  26. <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
  27. <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
  28. '''
  29. self.report_extraction(video_id)
  30. m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
  31. if m_info is None:
  32. raise ExtractorError(u'Unable to extract video info')
  33. video_title = m_info.group('title')
  34. video_description = m_info.group('description')
  35. video_thumb = m_info.group('thumb')
  36. video_date = m_info.group('date')
  37. video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
  38. # TODO: Find a way to get mp4 videos
  39. rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
  40. webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
  41. m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
  42. video_url = m_rest.group('url')
  43. video_path = m_rest.group('path')
  44. if m_rest is None:
  45. raise ExtractorError(u'Unable to extract video url')
  46. else: # We have to use a different method if another id is defined
  47. long_id = m_id.group('new_id')
  48. info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
  49. webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
  50. json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
  51. info = json.loads(json_str)
  52. res = info[u'query'][u'results'][u'mediaObj'][0]
  53. stream = res[u'streams'][0]
  54. video_path = stream[u'path']
  55. video_url = stream[u'host']
  56. meta = res[u'meta']
  57. video_title = meta[u'title']
  58. video_description = meta[u'description']
  59. video_thumb = meta[u'thumbnail']
  60. video_date = None # I can't find it
  61. info_dict = {
  62. 'id': video_id,
  63. 'url': video_url,
  64. 'play_path': video_path,
  65. 'title':video_title,
  66. 'description': video_description,
  67. 'thumbnail': video_thumb,
  68. 'upload_date': video_date,
  69. 'ext': 'flv',
  70. }
  71. return info_dict
  72. class YahooSearchIE(SearchInfoExtractor):
  73. """Information Extractor for Yahoo! Video search queries."""
  74. _MAX_RESULTS = 1000
  75. IE_NAME = u'screen.yahoo:search'
  76. _SEARCH_KEY = 'yvsearch'
  77. def _get_n_results(self, query, n):
  78. """Get a specified number of results for a query"""
  79. res = {
  80. '_type': 'playlist',
  81. 'id': query,
  82. 'entries': []
  83. }
  84. for pagenum in itertools.count(0):
  85. result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  86. webpage = self._download_webpage(result_url, query,
  87. note='Downloading results page '+str(pagenum+1))
  88. info = json.loads(webpage)
  89. m = info[u'm']
  90. results = info[u'results']
  91. for (i, r) in enumerate(results):
  92. if (pagenum * 30) +i >= n:
  93. break
  94. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  95. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  96. res['entries'].append(e)
  97. if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
  98. break
  99. return res