You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

479 lines
20 KiB

11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
11 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import itertools
  4. import json
  5. import re
  6. from .common import InfoExtractor, SearchInfoExtractor
  7. from ..compat import (
  8. compat_urllib_parse,
  9. compat_urlparse,
  10. )
  11. from ..utils import (
  12. clean_html,
  13. determine_ext,
  14. ExtractorError,
  15. extract_attributes,
  16. int_or_none,
  17. mimetype2ext,
  18. smuggle_url,
  19. unescapeHTML,
  20. )
  21. from .brightcove import (
  22. BrightcoveLegacyIE,
  23. BrightcoveNewIE,
  24. )
  25. from .nbc import NBCSportsVPlayerIE
  26. class YahooIE(InfoExtractor):
  27. IE_DESC = 'Yahoo screen and movies'
  28. _VALID_URL = r'(?P<host>https?://(?:(?P<country>[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P<display_id>.+)?-)?(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?'
  29. _TESTS = [
  30. {
  31. 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
  32. 'info_dict': {
  33. 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
  34. 'ext': 'mp4',
  35. 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
  36. 'description': 'Julian and Travis watch Julian Smith',
  37. 'duration': 6863,
  38. },
  39. },
  40. {
  41. 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
  42. 'md5': '251af144a19ebc4a033e8ba91ac726bb',
  43. 'info_dict': {
  44. 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
  45. 'ext': 'mp4',
  46. 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
  47. 'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
  48. 'duration': 151,
  49. },
  50. 'skip': 'HTTP Error 404',
  51. },
  52. {
  53. 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
  54. 'md5': '7993e572fac98e044588d0b5260f4352',
  55. 'info_dict': {
  56. 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
  57. 'ext': 'mp4',
  58. 'title': "Yahoo Saves 'Community'",
  59. 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
  60. 'duration': 170,
  61. }
  62. },
  63. {
  64. 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html',
  65. 'md5': '45c024bad51e63e9b6f6fad7a43a8c23',
  66. 'info_dict': {
  67. 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
  68. 'ext': 'mp4',
  69. 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
  70. 'description': '直言台南沒捷運 交通居五都之末',
  71. 'duration': 396,
  72. },
  73. },
  74. {
  75. 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
  76. 'md5': '71298482f7c64cbb7fa064e4553ff1c1',
  77. 'info_dict': {
  78. 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
  79. 'ext': 'webm',
  80. 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
  81. 'description': 'md5:f66c890e1490f4910a9953c941dee944',
  82. 'duration': 97,
  83. }
  84. },
  85. {
  86. 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
  87. 'md5': '57e06440778b1828a6079d2f744212c4',
  88. 'info_dict': {
  89. 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
  90. 'ext': 'mp4',
  91. 'title': 'Program that makes hockey more affordable not offered in Manitoba',
  92. 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
  93. 'duration': 121,
  94. },
  95. 'skip': 'Video gone',
  96. }, {
  97. 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
  98. 'info_dict': {
  99. 'id': '154609075',
  100. },
  101. 'playlist': [{
  102. 'md5': '000887d0dc609bc3a47c974151a40fb8',
  103. 'info_dict': {
  104. 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
  105. 'ext': 'mp4',
  106. 'title': '\'The Interview\' TV Spot: War',
  107. 'description': 'The Interview',
  108. 'duration': 30,
  109. },
  110. }, {
  111. 'md5': '81bc74faf10750fe36e4542f9a184c66',
  112. 'info_dict': {
  113. 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9',
  114. 'ext': 'mp4',
  115. 'title': '\'The Interview\' TV Spot: Guys',
  116. 'description': 'The Interview',
  117. 'duration': 30,
  118. },
  119. }],
  120. }, {
  121. 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
  122. 'md5': '88e209b417f173d86186bef6e4d1f160',
  123. 'info_dict': {
  124. 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
  125. 'ext': 'mp4',
  126. 'title': 'China Moses Is Crazy About the Blues',
  127. 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
  128. 'duration': 128,
  129. }
  130. }, {
  131. 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
  132. 'md5': 'd9a083ccf1379127bf25699d67e4791b',
  133. 'info_dict': {
  134. 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
  135. 'ext': 'mp4',
  136. 'title': 'Connect the Dots: Dark Side of Virgo',
  137. 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
  138. 'duration': 201,
  139. },
  140. 'skip': 'Domain name in.lifestyle.yahoo.com gone',
  141. }, {
  142. 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
  143. 'md5': '989396ae73d20c6f057746fb226aa215',
  144. 'info_dict': {
  145. 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
  146. 'ext': 'mp4',
  147. 'title': '\'True Story\' Trailer',
  148. 'description': 'True Story',
  149. 'duration': 150,
  150. },
  151. }, {
  152. 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
  153. 'only_matching': True,
  154. }, {
  155. 'note': 'NBC Sports embeds',
  156. 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
  157. 'info_dict': {
  158. 'id': '9CsDKds0kvHI',
  159. 'ext': 'flv',
  160. 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
  161. 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
  162. 'upload_date': '20150313',
  163. 'uploader': 'NBCU-SPORTS',
  164. 'timestamp': 1426270238,
  165. }
  166. }, {
  167. 'url': 'https://tw.news.yahoo.com/-100120367.html',
  168. 'only_matching': True,
  169. }, {
  170. # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
  171. 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
  172. 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
  173. 'info_dict': {
  174. 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
  175. 'ext': 'mp4',
  176. 'title': 'Communitary - Community Episode 1: Ladders',
  177. 'description': 'md5:8fc39608213295748e1e289807838c97',
  178. 'duration': 1646,
  179. },
  180. }, {
  181. # it uses an alias to get the video_id
  182. 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html',
  183. 'info_dict': {
  184. 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737',
  185. 'ext': 'mp4',
  186. 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking',
  187. 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.',
  188. },
  189. },
  190. {
  191. # config['models']['applet_model']['data']['sapi'] has no query
  192. 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016',
  193. 'md5': 'dac0c72d502bc5facda80c9e6d5c98db',
  194. 'info_dict': {
  195. 'id': 'a6015640-e9e5-3efb-bb60-05589a183919',
  196. 'ext': 'mp4',
  197. 'description': 'Galactic',
  198. 'title': 'Dolla Diva (feat. Maggie Koerner)',
  199. },
  200. 'skip': 'redirect to https://www.yahoo.com/music',
  201. },
  202. {
  203. # yahoo://article/
  204. 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html',
  205. 'info_dict': {
  206. 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
  207. 'ext': 'mp4',
  208. 'title': "'True Story' Trailer",
  209. 'description': 'True Story',
  210. },
  211. 'params': {
  212. 'skip_download': True,
  213. },
  214. },
  215. {
  216. # ytwnews://cavideo/
  217. 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html',
  218. 'info_dict': {
  219. 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff',
  220. 'ext': 'mp4',
  221. 'title': '單車天使 - 中文版預',
  222. 'description': '中文版預',
  223. },
  224. 'params': {
  225. 'skip_download': True,
  226. },
  227. },
  228. {
  229. # custom brightcove
  230. 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/',
  231. 'info_dict': {
  232. 'id': '5575377707001',
  233. 'ext': 'mp4',
  234. 'title': "Clown entertainers say 'It' is hurting their business",
  235. 'description': 'Stephen King s horror film has much to answer for. Jelby and Mr Loopy the Clowns join us.',
  236. 'timestamp': 1505341164,
  237. 'upload_date': '20170913',
  238. 'uploader_id': '2376984109001',
  239. },
  240. 'params': {
  241. 'skip_download': True,
  242. },
  243. },
  244. {
  245. # custom brightcove, geo-restricted to Australia, bypassable
  246. 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/',
  247. 'only_matching': True,
  248. }
  249. ]
  250. def _real_extract(self, url):
  251. mobj = re.match(self._VALID_URL, url)
  252. page_id = mobj.group('id')
  253. display_id = mobj.group('display_id') or page_id
  254. host = mobj.group('host')
  255. webpage, urlh = self._download_webpage_handle(url, display_id)
  256. if 'err=404' in urlh.geturl():
  257. raise ExtractorError('Video gone', expected=True)
  258. # Look for iframed media first
  259. entries = []
  260. iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
  261. for idx, iframe_url in enumerate(iframe_urls):
  262. entries.append(self.url_result(host + iframe_url, 'Yahoo'))
  263. if entries:
  264. return self.playlist_result(entries, page_id)
  265. # Look for NBCSports iframes
  266. nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
  267. if nbc_sports_url:
  268. return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
  269. # Look for Brightcove Legacy Studio embeds
  270. bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
  271. if bc_url:
  272. return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
  273. def brightcove_url_result(bc_url):
  274. return self.url_result(
  275. smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}),
  276. BrightcoveNewIE.ie_key())
  277. # Look for Brightcove New Studio embeds
  278. bc_url = BrightcoveNewIE._extract_url(self, webpage)
  279. if bc_url:
  280. return brightcove_url_result(bc_url)
  281. brightcove_iframe = self._search_regex(
  282. r'(<iframe[^>]+data-video-id=["\']\d+[^>]+>)', webpage,
  283. 'brightcove iframe', default=None)
  284. if brightcove_iframe:
  285. attr = extract_attributes(brightcove_iframe)
  286. src = attr.get('src')
  287. if src:
  288. parsed_src = compat_urlparse.urlparse(src)
  289. qs = compat_urlparse.parse_qs(parsed_src.query)
  290. account_id = qs.get('accountId', ['2376984109001'])[0]
  291. brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0]
  292. if account_id and brightcove_id:
  293. return brightcove_url_result(
  294. 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
  295. % (account_id, brightcove_id))
  296. # Query result is often embedded in webpage as JSON. Sometimes explicit requests
  297. # to video API results in a failure with geo restriction reason therefore using
  298. # embedded query result when present sounds reasonable.
  299. config_json = self._search_regex(
  300. r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)',
  301. webpage, 'videoplayer applet', default=None)
  302. if config_json:
  303. config = self._parse_json(config_json, display_id, fatal=False)
  304. if config:
  305. sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
  306. if sapi and 'query' in sapi:
  307. info = self._extract_info(display_id, sapi, webpage)
  308. self._sort_formats(info['formats'])
  309. return info
  310. items_json = self._search_regex(
  311. r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
  312. default=None)
  313. if items_json is None:
  314. alias = self._search_regex(
  315. r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None)
  316. if alias is not None:
  317. alias_info = self._download_json(
  318. 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias,
  319. display_id, 'Downloading alias info')
  320. video_id = alias_info[0]['id']
  321. else:
  322. CONTENT_ID_REGEXES = [
  323. r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
  324. r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
  325. r'"first_videoid"\s*:\s*"([^"]+)"',
  326. r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
  327. r'<article[^>]data-uuid=["\']([^"\']+)',
  328. r'<meta[^<>]+yahoo://article/view\?.*\buuid=([^&"\']+)',
  329. r'<meta[^<>]+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']',
  330. ]
  331. video_id = self._search_regex(
  332. CONTENT_ID_REGEXES, webpage, 'content ID')
  333. else:
  334. items = json.loads(items_json)
  335. info = items['mediaItems']['query']['results']['mediaObj'][0]
  336. # The 'meta' field is not always in the video webpage, we request it
  337. # from another page
  338. video_id = info['id']
  339. return self._get_info(video_id, display_id, webpage)
  340. def _extract_info(self, display_id, query, webpage):
  341. info = query['query']['results']['mediaObj'][0]
  342. meta = info.get('meta')
  343. video_id = info.get('id')
  344. if not meta:
  345. msg = info['status'].get('msg')
  346. if msg:
  347. raise ExtractorError(
  348. '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
  349. raise ExtractorError('Unable to extract media object meta')
  350. formats = []
  351. for s in info['streams']:
  352. tbr = int_or_none(s.get('bitrate'))
  353. format_info = {
  354. 'width': int_or_none(s.get('width')),
  355. 'height': int_or_none(s.get('height')),
  356. 'tbr': tbr,
  357. }
  358. host = s['host']
  359. path = s['path']
  360. if host.startswith('rtmp'):
  361. fmt = 'rtmp'
  362. format_info.update({
  363. 'url': host,
  364. 'play_path': path,
  365. 'ext': 'flv',
  366. })
  367. else:
  368. if s.get('format') == 'm3u8_playlist':
  369. fmt = 'hls'
  370. format_info.update({
  371. 'protocol': 'm3u8_native',
  372. 'ext': 'mp4',
  373. })
  374. else:
  375. fmt = format_info['ext'] = determine_ext(path)
  376. format_url = compat_urlparse.urljoin(host, path)
  377. format_info['url'] = format_url
  378. format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '')
  379. formats.append(format_info)
  380. closed_captions = self._html_search_regex(
  381. r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
  382. default='[]')
  383. cc_json = self._parse_json(closed_captions, video_id, fatal=False)
  384. subtitles = {}
  385. if cc_json:
  386. for closed_caption in cc_json:
  387. lang = closed_caption['lang']
  388. if lang not in subtitles:
  389. subtitles[lang] = []
  390. subtitles[lang].append({
  391. 'url': closed_caption['url'],
  392. 'ext': mimetype2ext(closed_caption['content_type']),
  393. })
  394. return {
  395. 'id': video_id,
  396. 'display_id': display_id,
  397. 'title': unescapeHTML(meta['title']),
  398. 'formats': formats,
  399. 'description': clean_html(meta['description']),
  400. 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
  401. 'duration': int_or_none(meta.get('duration')),
  402. 'subtitles': subtitles,
  403. }
  404. def _get_info(self, video_id, display_id, webpage):
  405. region = self._search_regex(
  406. r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
  407. webpage, 'region', fatal=False, default='US').upper()
  408. formats = []
  409. info = {}
  410. for fmt in ('webm', 'mp4'):
  411. query_result = self._download_json(
  412. 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
  413. display_id, 'Downloading %s video info' % fmt, query={
  414. 'protocol': 'http',
  415. 'region': region,
  416. 'format': fmt,
  417. })
  418. info = self._extract_info(display_id, query_result, webpage)
  419. formats.extend(info['formats'])
  420. formats.extend(self._extract_m3u8_formats(
  421. 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region),
  422. video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
  423. self._sort_formats(formats)
  424. info['formats'] = formats
  425. return info
  426. class YahooSearchIE(SearchInfoExtractor):
  427. IE_DESC = 'Yahoo screen search'
  428. _MAX_RESULTS = 1000
  429. IE_NAME = 'screen.yahoo:search'
  430. _SEARCH_KEY = 'yvsearch'
  431. def _get_n_results(self, query, n):
  432. """Get a specified number of results for a query"""
  433. entries = []
  434. for pagenum in itertools.count(0):
  435. result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
  436. info = self._download_json(result_url, query,
  437. note='Downloading results page ' + str(pagenum + 1))
  438. m = info['m']
  439. results = info['results']
  440. for (i, r) in enumerate(results):
  441. if (pagenum * 30) + i >= n:
  442. break
  443. mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
  444. e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
  445. entries.append(e)
  446. if (pagenum * 30 + i >= n) or (m['last'] >= (m['total'] - 1)):
  447. break
  448. return {
  449. '_type': 'playlist',
  450. 'id': query,
  451. 'entries': entries,
  452. }