You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

251 lines
10 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import itertools
  4. import json
  5. import re
  6. from .common import InfoExtractor, SearchInfoExtractor
  7. from ..compat import (
  8. compat_urllib_parse,
  9. compat_urlparse,
  10. )
  11. from ..utils import (
  12. clean_html,
  13. ExtractorError,
  14. int_or_none,
  15. )
  16. class YahooIE(InfoExtractor):
  17. IE_DESC = 'Yahoo screen and movies'
  18. _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+?)-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
  19. _TESTS = [
  20. {
  21. 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  22. 'md5': '4962b075c08be8690a922ee026d05e69',
  23. 'info_dict': {
  24. 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
  25. 'ext': 'mp4',
  26. 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
  27. 'description': 'Julian and Travis watch Julian Smith',
  28. 'duration': 6863,
  29. },
  30. },
  31. {
  32. 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  33. 'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
  34. 'info_dict': {
  35. 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
  36. 'ext': 'mp4',
  37. 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
  38. 'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
  39. 'duration': 151,
  40. },
  41. },
  42. {
  43. 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
  44. 'md5': '60e8ac193d8fb71997caa8fce54c6460',
  45. 'info_dict': {
  46. 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
  47. 'ext': 'mp4',
  48. 'title': "Yahoo Saves 'Community'",
  49. 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
  50. 'duration': 170,
  51. }
  52. },
  53. {
  54. 'url': 'https://tw.screen.yahoo.com/taipei-opinion-poll/選情站報-街頭民調-台北市篇-102823042.html',
  55. 'md5': '92a7fdd8a08783c68a174d7aa067dde8',
  56. 'info_dict': {
  57. 'id': '7a23b569-7bea-36cb-85b9-bd5301a0a1fb',
  58. 'ext': 'mp4',
  59. 'title': '選情站報 街頭民調 台北市篇',
  60. 'description': '選情站報 街頭民調 台北市篇',
  61. 'duration': 429,
  62. }
  63. },
  64. {
  65. 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
  66. 'md5': '0b51660361f0e27c9789e7037ef76f4b',
  67. 'info_dict': {
  68. 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
  69. 'ext': 'mp4',
  70. 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
  71. 'description': 'md5:f66c890e1490f4910a9953c941dee944',
  72. 'duration': 97,
  73. }
  74. },
  75. {
  76. 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
  77. 'md5': '57e06440778b1828a6079d2f744212c4',
  78. 'info_dict': {
  79. 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
  80. 'ext': 'mp4',
  81. 'title': 'Program that makes hockey more affordable not offered in Manitoba',
  82. 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
  83. 'duration': 121,
  84. }
  85. }, {
  86. 'url': 'https://ca.finance.yahoo.com/news/20-most-valuable-brands-world-112600775.html',
  87. 'md5': '3e401e4eed6325aa29d9b96125fd5b4f',
  88. 'info_dict': {
  89. 'id': 'c1b4c09c-8ed8-3b65-8b05-169c55358a83',
  90. 'ext': 'mp4',
  91. 'title': "Apple Is The World's Most Valuable Brand",
  92. 'description': 'md5:73eabc1a11c6f59752593b2ceefa1262',
  93. 'duration': 21,
  94. }
  95. }, {
  96. 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
  97. 'md5': '67010fdf3a08d290e060a4dd96baa07b',
  98. 'info_dict': {
  99. 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
  100. 'ext': 'mp4',
  101. 'title': 'China Moses Is Crazy About the Blues',
  102. 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
  103. 'duration': 128,
  104. }
  105. }, {
  106. 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
  107. 'md5': 'd9a083ccf1379127bf25699d67e4791b',
  108. 'info_dict': {
  109. 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
  110. 'ext': 'mp4',
  111. 'title': 'Connect the Dots: Dark Side of Virgo',
  112. 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
  113. 'duration': 201,
  114. }
  115. }, {
  116. 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
  117. 'only_matching': True,
  118. }
  119. ]
  120. def _real_extract(self, url):
  121. mobj = re.match(self._VALID_URL, url)
  122. display_id = mobj.group('display_id')
  123. url = mobj.group('url')
  124. host = mobj.group('host')
  125. webpage = self._download_webpage(url, display_id)
  126. # Look for iframed media first
  127. iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
  128. if iframe_m:
  129. iframepage = self._download_webpage(
  130. host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
  131. items_json = self._search_regex(
  132. r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
  133. if items_json:
  134. items = json.loads(items_json)
  135. video_id = items[0]['id']
  136. return self._get_info(video_id, display_id, webpage)
  137. items_json = self._search_regex(
  138. r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
  139. default=None)
  140. if items_json is None:
  141. CONTENT_ID_REGEXES = [
  142. r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
  143. r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
  144. r'"first_videoid"\s*:\s*"([^"]+)"',
  145. ]
  146. video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
  147. else:
  148. items = json.loads(items_json)
  149. info = items['mediaItems']['query']['results']['mediaObj'][0]
  150. # The 'meta' field is not always in the video webpage, we request it
  151. # from another page
  152. video_id = info['id']
  153. return self._get_info(video_id, display_id, webpage)
  154. def _get_info(self, video_id, display_id, webpage):
  155. region = self._search_regex(
  156. r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
  157. webpage, 'region', fatal=False, default='US')
  158. query = ('SELECT * FROM yahoo.media.video.streams WHERE id="%s"'
  159. ' AND plrs="86Gj0vCaSzV_Iuf6hNylf2" AND region="%s"'
  160. ' AND protocol="http"' % (video_id, region))
  161. data = compat_urllib_parse.urlencode({
  162. 'q': query,
  163. 'env': 'prod',
  164. 'format': 'json',
  165. })
  166. query_result = self._download_json(
  167. 'http://video.query.yahoo.com/v1/public/yql?' + data,
  168. display_id, 'Downloading video info')
  169. info = query_result['query']['results']['mediaObj'][0]
  170. meta = info.get('meta')
  171. if not meta:
  172. msg = info['status'].get('msg')
  173. if msg:
  174. raise ExtractorError(
  175. '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
  176. raise ExtractorError('Unable to extract media object meta')
  177. formats = []
  178. for s in info['streams']:
  179. format_info = {
  180. 'width': int_or_none(s.get('width')),
  181. 'height': int_or_none(s.get('height')),
  182. 'tbr': int_or_none(s.get('bitrate')),
  183. }
  184. host = s['host']
  185. path = s['path']
  186. if host.startswith('rtmp'):
  187. format_info.update({
  188. 'url': host,
  189. 'play_path': path,
  190. 'ext': 'flv',
  191. })
  192. else:
  193. format_url = compat_urlparse.urljoin(host, path)
  194. format_info['url'] = format_url
  195. formats.append(format_info)
  196. self._sort_formats(formats)
  197. return {
  198. 'id': video_id,
  199. 'display_id': display_id,
  200. 'title': meta['title'],
  201. 'formats': formats,
  202. 'description': clean_html(meta['description']),
  203. 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
  204. 'duration': int_or_none(meta.get('duration')),
  205. }
  206. class YahooSearchIE(SearchInfoExtractor):
  207. IE_DESC = 'Yahoo screen search'
  208. _MAX_RESULTS = 1000
  209. IE_NAME = 'screen.yahoo:search'
  210. _SEARCH_KEY = 'yvsearch'
  211. def _get_n_results(self, query, n):
  212. """Get a specified number of results for a query"""
  213. entries = []
  214. for pagenum in itertools.count(0):
  215. result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  216. info = self._download_json(result_url, query,
  217. note='Downloading results page ' + str(pagenum + 1))
  218. m = info['m']
  219. results = info['results']
  220. for (i, r) in enumerate(results):
  221. if (pagenum * 30) + i >= n:
  222. break
  223. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  224. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  225. entries.append(e)
  226. if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
  227. break
  228. return {
  229. '_type': 'playlist',
  230. 'id': query,
  231. 'entries': entries,
  232. }