You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

122 lines
5.9 KiB

  1. import datetime
  2. import itertools
  3. import json
  4. import re
  5. from .common import InfoExtractor, SearchInfoExtractor
  6. from ..utils import (
  7. compat_urllib_parse,
  8. ExtractorError,
  9. )
  10. class YahooIE(InfoExtractor):
  11. """Information extractor for screen.yahoo.com."""
  12. _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'
  13. _TEST = {
  14. u'url': u'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  15. u'file': u'214727115.flv',
  16. u'md5': u'2e717f169c1be93d84d3794a00d4a325',
  17. u'info_dict': {
  18. u"title": u"Julian Smith & Travis Legg Watch Julian Smith"
  19. },
  20. u'skip': u'Requires rtmpdump'
  21. }
  22. def _real_extract(self, url):
  23. mobj = re.match(self._VALID_URL, url)
  24. if mobj is None:
  25. raise ExtractorError(u'Invalid URL: %s' % url)
  26. video_id = mobj.group('id')
  27. webpage = self._download_webpage(url, video_id)
  28. m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)
  29. if m_id is None:
  30. # TODO: Check which url parameters are required
  31. info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
  32. webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
  33. info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
  34. <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
  35. <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
  36. <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
  37. '''
  38. self.report_extraction(video_id)
  39. m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
  40. if m_info is None:
  41. raise ExtractorError(u'Unable to extract video info')
  42. video_title = m_info.group('title')
  43. video_description = m_info.group('description')
  44. video_thumb = m_info.group('thumb')
  45. video_date = m_info.group('date')
  46. video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')
  47. # TODO: Find a way to get mp4 videos
  48. rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
  49. webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
  50. m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
  51. video_url = m_rest.group('url')
  52. video_path = m_rest.group('path')
  53. if m_rest is None:
  54. raise ExtractorError(u'Unable to extract video url')
  55. else: # We have to use a different method if another id is defined
  56. long_id = m_id.group('new_id')
  57. info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
  58. webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
  59. json_str = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage).group(1)
  60. info = json.loads(json_str)
  61. res = info[u'query'][u'results'][u'mediaObj'][0]
  62. stream = res[u'streams'][0]
  63. video_path = stream[u'path']
  64. video_url = stream[u'host']
  65. meta = res[u'meta']
  66. video_title = meta[u'title']
  67. video_description = meta[u'description']
  68. video_thumb = meta[u'thumbnail']
  69. video_date = None # I can't find it
  70. info_dict = {
  71. 'id': video_id,
  72. 'url': video_url,
  73. 'play_path': video_path,
  74. 'title':video_title,
  75. 'description': video_description,
  76. 'thumbnail': video_thumb,
  77. 'upload_date': video_date,
  78. 'ext': 'flv',
  79. }
  80. return info_dict
  81. class YahooSearchIE(SearchInfoExtractor):
  82. """Information Extractor for Yahoo! Video search queries."""
  83. _MAX_RESULTS = 1000
  84. IE_NAME = u'screen.yahoo:search'
  85. _SEARCH_KEY = 'yvsearch'
  86. def _get_n_results(self, query, n):
  87. """Get a specified number of results for a query"""
  88. res = {
  89. '_type': 'playlist',
  90. 'id': query,
  91. 'entries': []
  92. }
  93. for pagenum in itertools.count(0):
  94. result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  95. webpage = self._download_webpage(result_url, query,
  96. note='Downloading results page '+str(pagenum+1))
  97. info = json.loads(webpage)
  98. m = info[u'm']
  99. results = info[u'results']
  100. for (i, r) in enumerate(results):
  101. if (pagenum * 30) +i >= n:
  102. break
  103. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  104. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  105. res['entries'].append(e)
  106. if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
  107. break
  108. return res