You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

327 lines
14 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import itertools
  4. import json
  5. import re
  6. from .common import InfoExtractor, SearchInfoExtractor
  7. from ..compat import (
  8. compat_urllib_parse,
  9. compat_urlparse,
  10. )
  11. from ..utils import (
  12. clean_html,
  13. unescapeHTML,
  14. ExtractorError,
  15. int_or_none,
  16. mimetype2ext,
  17. )
  18. from .nbc import NBCSportsVPlayerIE
  19. class YahooIE(InfoExtractor):
  20. IE_DESC = 'Yahoo screen and movies'
  21. _VALID_URL = r'(?P<url>(?P<host>https?://(?:[a-zA-Z]{2}\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?P<display_id>.+)?-(?P<id>[0-9]+)(?:-[a-z]+)?\.html)'
  22. _TESTS = [
  23. {
  24. 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  25. 'info_dict': {
  26. 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
  27. 'ext': 'mp4',
  28. 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
  29. 'description': 'Julian and Travis watch Julian Smith',
  30. 'duration': 6863,
  31. },
  32. },
  33. {
  34. 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  35. 'md5': 'd6e6fc6e1313c608f316ddad7b82b306',
  36. 'info_dict': {
  37. 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
  38. 'ext': 'mp4',
  39. 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
  40. 'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
  41. 'duration': 151,
  42. },
  43. },
  44. {
  45. 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
  46. 'md5': '60e8ac193d8fb71997caa8fce54c6460',
  47. 'info_dict': {
  48. 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
  49. 'ext': 'mp4',
  50. 'title': "Yahoo Saves 'Community'",
  51. 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
  52. 'duration': 170,
  53. }
  54. },
  55. {
  56. 'url': 'https://tw.screen.yahoo.com/election-2014-askmayor/敢問市長-黃秀霜批賴清德-非常高傲-033009720.html',
  57. 'md5': '3a09cf59349cfaddae1797acc3c087fc',
  58. 'info_dict': {
  59. 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
  60. 'ext': 'mp4',
  61. 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
  62. 'description': '直言台南沒捷運 交通居五都之末',
  63. 'duration': 396,
  64. }
  65. },
  66. {
  67. 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
  68. 'md5': '0b51660361f0e27c9789e7037ef76f4b',
  69. 'info_dict': {
  70. 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
  71. 'ext': 'mp4',
  72. 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
  73. 'description': 'md5:f66c890e1490f4910a9953c941dee944',
  74. 'duration': 97,
  75. }
  76. },
  77. {
  78. 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
  79. 'md5': '57e06440778b1828a6079d2f744212c4',
  80. 'info_dict': {
  81. 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
  82. 'ext': 'mp4',
  83. 'title': 'Program that makes hockey more affordable not offered in Manitoba',
  84. 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
  85. 'duration': 121,
  86. }
  87. }, {
  88. 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
  89. 'md5': '226a895aae7e21b0129e2a2006fe9690',
  90. 'info_dict': {
  91. 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
  92. 'ext': 'mp4',
  93. 'title': '\'The Interview\' TV Spot: War',
  94. 'description': 'The Interview',
  95. 'duration': 30,
  96. }
  97. }, {
  98. 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
  99. 'md5': '88e209b417f173d86186bef6e4d1f160',
  100. 'info_dict': {
  101. 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
  102. 'ext': 'mp4',
  103. 'title': 'China Moses Is Crazy About the Blues',
  104. 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
  105. 'duration': 128,
  106. }
  107. }, {
  108. 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
  109. 'md5': 'd9a083ccf1379127bf25699d67e4791b',
  110. 'info_dict': {
  111. 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
  112. 'ext': 'mp4',
  113. 'title': 'Connect the Dots: Dark Side of Virgo',
  114. 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
  115. 'duration': 201,
  116. }
  117. }, {
  118. 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
  119. 'md5': '989396ae73d20c6f057746fb226aa215',
  120. 'info_dict': {
  121. 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
  122. 'ext': 'mp4',
  123. 'title': '\'True Story\' Trailer',
  124. 'description': 'True Story',
  125. 'duration': 150,
  126. },
  127. }, {
  128. 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
  129. 'only_matching': True,
  130. }, {
  131. 'note': 'NBC Sports embeds',
  132. 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
  133. 'info_dict': {
  134. 'id': '9CsDKds0kvHI',
  135. 'ext': 'flv',
  136. 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
  137. 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
  138. }
  139. }, {
  140. 'url': 'https://tw.news.yahoo.com/-100120367.html',
  141. 'only_matching': True,
  142. }, {
  143. # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
  144. 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
  145. 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
  146. 'info_dict': {
  147. 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
  148. 'ext': 'mp4',
  149. 'title': 'Communitary - Community Episode 1: Ladders',
  150. 'description': 'md5:8fc39608213295748e1e289807838c97',
  151. 'duration': 1646,
  152. },
  153. }
  154. ]
  155. def _real_extract(self, url):
  156. mobj = re.match(self._VALID_URL, url)
  157. display_id = mobj.group('display_id') or self._match_id(url)
  158. page_id = mobj.group('id')
  159. url = mobj.group('url')
  160. host = mobj.group('host')
  161. webpage = self._download_webpage(url, display_id)
  162. # Look for iframed media first
  163. iframe_m = re.search(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
  164. if iframe_m:
  165. iframepage = self._download_webpage(
  166. host + iframe_m.group(1), display_id, 'Downloading iframe webpage')
  167. items_json = self._search_regex(
  168. r'mediaItems: (\[.+?\])$', iframepage, 'items', flags=re.MULTILINE, default=None)
  169. if items_json:
  170. items = json.loads(items_json)
  171. video_id = items[0]['id']
  172. return self._get_info(video_id, display_id, webpage)
  173. # Look for NBCSports iframes
  174. nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
  175. if nbc_sports_url:
  176. return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
  177. # Query result is often embedded in webpage as JSON. Sometimes explicit requests
  178. # to video API results in a failure with geo restriction reason therefore using
  179. # embedded query result when present sounds reasonable.
  180. config_json = self._search_regex(
  181. r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)',
  182. webpage, 'videoplayer applet', default=None)
  183. if config_json:
  184. config = self._parse_json(config_json, display_id, fatal=False)
  185. if config:
  186. sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
  187. if sapi:
  188. return self._extract_info(display_id, sapi, webpage)
  189. items_json = self._search_regex(
  190. r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
  191. default=None)
  192. if items_json is None:
  193. CONTENT_ID_REGEXES = [
  194. r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
  195. r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
  196. r'"first_videoid"\s*:\s*"([^"]+)"',
  197. r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
  198. ]
  199. video_id = self._search_regex(CONTENT_ID_REGEXES, webpage, 'content ID')
  200. else:
  201. items = json.loads(items_json)
  202. info = items['mediaItems']['query']['results']['mediaObj'][0]
  203. # The 'meta' field is not always in the video webpage, we request it
  204. # from another page
  205. video_id = info['id']
  206. return self._get_info(video_id, display_id, webpage)
  207. def _extract_info(self, display_id, query, webpage):
  208. info = query['query']['results']['mediaObj'][0]
  209. meta = info.get('meta')
  210. video_id = info.get('id')
  211. if not meta:
  212. msg = info['status'].get('msg')
  213. if msg:
  214. raise ExtractorError(
  215. '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
  216. raise ExtractorError('Unable to extract media object meta')
  217. formats = []
  218. for s in info['streams']:
  219. format_info = {
  220. 'width': int_or_none(s.get('width')),
  221. 'height': int_or_none(s.get('height')),
  222. 'tbr': int_or_none(s.get('bitrate')),
  223. }
  224. host = s['host']
  225. path = s['path']
  226. if host.startswith('rtmp'):
  227. format_info.update({
  228. 'url': host,
  229. 'play_path': path,
  230. 'ext': 'flv',
  231. })
  232. else:
  233. if s.get('format') == 'm3u8_playlist':
  234. format_info['protocol'] = 'm3u8_native'
  235. format_info['ext'] = 'mp4'
  236. format_url = compat_urlparse.urljoin(host, path)
  237. format_info['url'] = format_url
  238. formats.append(format_info)
  239. self._sort_formats(formats)
  240. closed_captions = self._html_search_regex(
  241. r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
  242. default='[]')
  243. cc_json = self._parse_json(closed_captions, video_id, fatal=False)
  244. subtitles = {}
  245. if cc_json:
  246. for closed_caption in cc_json:
  247. lang = closed_caption['lang']
  248. if lang not in subtitles:
  249. subtitles[lang] = []
  250. subtitles[lang].append({
  251. 'url': closed_caption['url'],
  252. 'ext': mimetype2ext(closed_caption['content_type']),
  253. })
  254. return {
  255. 'id': video_id,
  256. 'display_id': display_id,
  257. 'title': unescapeHTML(meta['title']),
  258. 'formats': formats,
  259. 'description': clean_html(meta['description']),
  260. 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
  261. 'duration': int_or_none(meta.get('duration')),
  262. 'subtitles': subtitles,
  263. }
  264. def _get_info(self, video_id, display_id, webpage):
  265. region = self._search_regex(
  266. r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
  267. webpage, 'region', fatal=False, default='US')
  268. data = compat_urllib_parse.urlencode({
  269. 'protocol': 'http',
  270. 'region': region,
  271. })
  272. query_url = (
  273. 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/'
  274. '{id}?{data}'.format(id=video_id, data=data))
  275. query_result = self._download_json(
  276. query_url, display_id, 'Downloading video info')
  277. return self._extract_info(display_id, query_result, webpage)
  278. class YahooSearchIE(SearchInfoExtractor):
  279. IE_DESC = 'Yahoo screen search'
  280. _MAX_RESULTS = 1000
  281. IE_NAME = 'screen.yahoo:search'
  282. _SEARCH_KEY = 'yvsearch'
  283. def _get_n_results(self, query, n):
  284. """Get a specified number of results for a query"""
  285. entries = []
  286. for pagenum in itertools.count(0):
  287. result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  288. info = self._download_json(result_url, query,
  289. note='Downloading results page ' + str(pagenum + 1))
  290. m = info['m']
  291. results = info['results']
  292. for (i, r) in enumerate(results):
  293. if (pagenum * 30) + i >= n:
  294. break
  295. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  296. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  297. entries.append(e)
  298. if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
  299. break
  300. return {
  301. '_type': 'playlist',
  302. 'id': query,
  303. 'entries': entries,
  304. }