You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

449 lines
17 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import json
  4. import datetime
  5. from .common import InfoExtractor
  6. from ..compat import (
  7. compat_parse_qs,
  8. compat_urlparse,
  9. )
  10. from ..utils import (
  11. determine_ext,
  12. dict_get,
  13. ExtractorError,
  14. int_or_none,
  15. float_or_none,
  16. parse_duration,
  17. parse_iso8601,
  18. remove_start,
  19. try_get,
  20. unified_timestamp,
  21. urlencode_postdata,
  22. xpath_text,
  23. )
  24. class NiconicoIE(InfoExtractor):
  25. IE_NAME = 'niconico'
  26. IE_DESC = 'ニコニコ動画'
  27. _TESTS = [{
  28. 'url': 'http://www.nicovideo.jp/watch/sm22312215',
  29. 'md5': 'd1a75c0823e2f629128c43e1212760f9',
  30. 'info_dict': {
  31. 'id': 'sm22312215',
  32. 'ext': 'mp4',
  33. 'title': 'Big Buck Bunny',
  34. 'thumbnail': r're:https?://.*',
  35. 'uploader': 'takuya0301',
  36. 'uploader_id': '2698420',
  37. 'upload_date': '20131123',
  38. 'timestamp': 1385182762,
  39. 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
  40. 'duration': 33,
  41. 'view_count': int,
  42. 'comment_count': int,
  43. },
  44. 'skip': 'Requires an account',
  45. }, {
  46. # File downloaded with and without credentials are different, so omit
  47. # the md5 field
  48. 'url': 'http://www.nicovideo.jp/watch/nm14296458',
  49. 'info_dict': {
  50. 'id': 'nm14296458',
  51. 'ext': 'swf',
  52. 'title': '【鏡音リン】Dance on media【オリジナル】take2!',
  53. 'description': 'md5:689f066d74610b3b22e0f1739add0f58',
  54. 'thumbnail': r're:https?://.*',
  55. 'uploader': 'りょうた',
  56. 'uploader_id': '18822557',
  57. 'upload_date': '20110429',
  58. 'timestamp': 1304065916,
  59. 'duration': 209,
  60. },
  61. 'skip': 'Requires an account',
  62. }, {
  63. # 'video exists but is marked as "deleted"
  64. # md5 is unstable
  65. 'url': 'http://www.nicovideo.jp/watch/sm10000',
  66. 'info_dict': {
  67. 'id': 'sm10000',
  68. 'ext': 'unknown_video',
  69. 'description': 'deleted',
  70. 'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
  71. 'thumbnail': r're:https?://.*',
  72. 'upload_date': '20071224',
  73. 'timestamp': int, # timestamp field has different value if logged in
  74. 'duration': 304,
  75. 'view_count': int,
  76. },
  77. 'skip': 'Requires an account',
  78. }, {
  79. 'url': 'http://www.nicovideo.jp/watch/so22543406',
  80. 'info_dict': {
  81. 'id': '1388129933',
  82. 'ext': 'mp4',
  83. 'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
  84. 'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
  85. 'thumbnail': r're:https?://.*',
  86. 'timestamp': 1388851200,
  87. 'upload_date': '20140104',
  88. 'uploader': 'アニメロチャンネル',
  89. 'uploader_id': '312',
  90. },
  91. 'skip': 'The viewing period of the video you were searching for has expired.',
  92. }, {
  93. # video not available via `getflv`; "old" HTML5 video
  94. 'url': 'http://www.nicovideo.jp/watch/sm1151009',
  95. 'md5': '8fa81c364eb619d4085354eab075598a',
  96. 'info_dict': {
  97. 'id': 'sm1151009',
  98. 'ext': 'mp4',
  99. 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
  100. 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
  101. 'thumbnail': r're:https?://.*',
  102. 'duration': 184,
  103. 'timestamp': 1190868283,
  104. 'upload_date': '20070927',
  105. 'uploader': 'denden2',
  106. 'uploader_id': '1392194',
  107. 'view_count': int,
  108. 'comment_count': int,
  109. },
  110. 'skip': 'Requires an account',
  111. }, {
  112. # "New" HTML5 video
  113. 'url': 'http://www.nicovideo.jp/watch/sm31464864',
  114. 'md5': '351647b4917660986dc0fa8864085135',
  115. 'info_dict': {
  116. 'id': 'sm31464864',
  117. 'ext': 'mp4',
  118. 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
  119. 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
  120. 'timestamp': 1498514060,
  121. 'upload_date': '20170626',
  122. 'uploader': 'ゲス',
  123. 'uploader_id': '40826363',
  124. 'thumbnail': r're:https?://.*',
  125. 'duration': 198,
  126. 'view_count': int,
  127. 'comment_count': int,
  128. },
  129. 'skip': 'Requires an account',
  130. }, {
  131. 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
  132. 'only_matching': True,
  133. }]
  134. _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)'
  135. _NETRC_MACHINE = 'niconico'
  136. def _real_initialize(self):
  137. self._login()
  138. def _login(self):
  139. (username, password) = self._get_login_info()
  140. # No authentication to be performed
  141. if not username:
  142. return True
  143. # Log in
  144. login_ok = True
  145. login_form_strs = {
  146. 'mail_tel': username,
  147. 'password': password,
  148. }
  149. urlh = self._request_webpage(
  150. 'https://account.nicovideo.jp/api/v1/login', None,
  151. note='Logging in', errnote='Unable to log in',
  152. data=urlencode_postdata(login_form_strs))
  153. if urlh is False:
  154. login_ok = False
  155. else:
  156. parts = compat_urlparse.urlparse(urlh.geturl())
  157. if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
  158. login_ok = False
  159. if not login_ok:
  160. self._downloader.report_warning('unable to log in: bad username or password')
  161. return login_ok
  162. def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
  163. def yesno(boolean):
  164. return 'yes' if boolean else 'no'
  165. session_api_data = api_data['video']['dmcInfo']['session_api']
  166. session_api_endpoint = session_api_data['urls'][0]
  167. format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
  168. session_response = self._download_json(
  169. session_api_endpoint['url'], video_id,
  170. query={'_format': 'json'},
  171. headers={'Content-Type': 'application/json'},
  172. note='Downloading JSON metadata for %s' % format_id,
  173. data=json.dumps({
  174. 'session': {
  175. 'client_info': {
  176. 'player_id': session_api_data['player_id'],
  177. },
  178. 'content_auth': {
  179. 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
  180. 'content_key_timeout': session_api_data['content_key_timeout'],
  181. 'service_id': 'nicovideo',
  182. 'service_user_id': session_api_data['service_user_id']
  183. },
  184. 'content_id': session_api_data['content_id'],
  185. 'content_src_id_sets': [{
  186. 'content_src_ids': [{
  187. 'src_id_to_mux': {
  188. 'audio_src_ids': [audio_quality['id']],
  189. 'video_src_ids': [video_quality['id']],
  190. }
  191. }]
  192. }],
  193. 'content_type': 'movie',
  194. 'content_uri': '',
  195. 'keep_method': {
  196. 'heartbeat': {
  197. 'lifetime': session_api_data['heartbeat_lifetime']
  198. }
  199. },
  200. 'priority': session_api_data['priority'],
  201. 'protocol': {
  202. 'name': 'http',
  203. 'parameters': {
  204. 'http_parameters': {
  205. 'parameters': {
  206. 'http_output_download_parameters': {
  207. 'use_ssl': yesno(session_api_endpoint['is_ssl']),
  208. 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
  209. }
  210. }
  211. }
  212. }
  213. },
  214. 'recipe_id': session_api_data['recipe_id'],
  215. 'session_operation_auth': {
  216. 'session_operation_auth_by_signature': {
  217. 'signature': session_api_data['signature'],
  218. 'token': session_api_data['token'],
  219. }
  220. },
  221. 'timing_constraint': 'unlimited'
  222. }
  223. }))
  224. resolution = video_quality.get('resolution', {})
  225. return {
  226. 'url': session_response['data']['session']['content_uri'],
  227. 'format_id': format_id,
  228. 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
  229. 'abr': float_or_none(audio_quality.get('bitrate'), 1000),
  230. 'vbr': float_or_none(video_quality.get('bitrate'), 1000),
  231. 'height': resolution.get('height'),
  232. 'width': resolution.get('width'),
  233. }
  234. def _real_extract(self, url):
  235. video_id = self._match_id(url)
  236. # Get video webpage. We are not actually interested in it for normal
  237. # cases, but need the cookies in order to be able to download the
  238. # info webpage
  239. webpage, handle = self._download_webpage_handle(
  240. 'http://www.nicovideo.jp/watch/' + video_id, video_id)
  241. if video_id.startswith('so'):
  242. video_id = self._match_id(handle.geturl())
  243. api_data = self._parse_json(self._html_search_regex(
  244. 'data-api-data="([^"]+)"', webpage,
  245. 'API data', default='{}'), video_id)
  246. def _format_id_from_url(video_url):
  247. return 'economy' if video_real_url.endswith('low') else 'normal'
  248. try:
  249. video_real_url = api_data['video']['smileInfo']['url']
  250. except KeyError: # Flash videos
  251. # Get flv info
  252. flv_info_webpage = self._download_webpage(
  253. 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
  254. video_id, 'Downloading flv info')
  255. flv_info = compat_urlparse.parse_qs(flv_info_webpage)
  256. if 'url' not in flv_info:
  257. if 'deleted' in flv_info:
  258. raise ExtractorError('The video has been deleted.',
  259. expected=True)
  260. elif 'closed' in flv_info:
  261. raise ExtractorError('Niconico videos now require logging in',
  262. expected=True)
  263. elif 'error' in flv_info:
  264. raise ExtractorError('%s reports error: %s' % (
  265. self.IE_NAME, flv_info['error'][0]), expected=True)
  266. else:
  267. raise ExtractorError('Unable to find video URL')
  268. video_info_xml = self._download_xml(
  269. 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
  270. video_id, note='Downloading video info page')
  271. def get_video_info(items):
  272. if not isinstance(items, list):
  273. items = [items]
  274. for item in items:
  275. ret = xpath_text(video_info_xml, './/' + item)
  276. if ret:
  277. return ret
  278. video_real_url = flv_info['url'][0]
  279. extension = get_video_info('movie_type')
  280. if not extension:
  281. extension = determine_ext(video_real_url)
  282. formats = [{
  283. 'url': video_real_url,
  284. 'ext': extension,
  285. 'format_id': _format_id_from_url(video_real_url),
  286. }]
  287. else:
  288. formats = []
  289. dmc_info = api_data['video'].get('dmcInfo')
  290. if dmc_info: # "New" HTML5 videos
  291. quality_info = dmc_info['quality']
  292. for audio_quality in quality_info['audios']:
  293. for video_quality in quality_info['videos']:
  294. if not audio_quality['available'] or not video_quality['available']:
  295. continue
  296. formats.append(self._extract_format_for_quality(
  297. api_data, video_id, audio_quality, video_quality))
  298. self._sort_formats(formats)
  299. else: # "Old" HTML5 videos
  300. formats = [{
  301. 'url': video_real_url,
  302. 'ext': 'mp4',
  303. 'format_id': _format_id_from_url(video_real_url),
  304. }]
  305. def get_video_info(items):
  306. return dict_get(api_data['video'], items)
  307. # Start extracting information
  308. title = get_video_info('title')
  309. if not title:
  310. title = self._og_search_title(webpage, default=None)
  311. if not title:
  312. title = self._html_search_regex(
  313. r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>',
  314. webpage, 'video title')
  315. watch_api_data_string = self._html_search_regex(
  316. r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>',
  317. webpage, 'watch api data', default=None)
  318. watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
  319. video_detail = watch_api_data.get('videoDetail', {})
  320. thumbnail = (
  321. get_video_info(['thumbnail_url', 'thumbnailURL']) or
  322. self._html_search_meta('image', webpage, 'thumbnail', default=None) or
  323. video_detail.get('thumbnail'))
  324. description = get_video_info('description')
  325. timestamp = (parse_iso8601(get_video_info('first_retrieve')) or
  326. unified_timestamp(get_video_info('postedDateTime')))
  327. if not timestamp:
  328. match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
  329. if match:
  330. timestamp = parse_iso8601(match.replace('+', ':00+'))
  331. if not timestamp and video_detail.get('postedAt'):
  332. timestamp = parse_iso8601(
  333. video_detail['postedAt'].replace('/', '-'),
  334. delimiter=' ', timezone=datetime.timedelta(hours=9))
  335. view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
  336. if not view_count:
  337. match = self._html_search_regex(
  338. r'>Views: <strong[^>]*>([^<]+)</strong>',
  339. webpage, 'view count', default=None)
  340. if match:
  341. view_count = int_or_none(match.replace(',', ''))
  342. view_count = view_count or video_detail.get('viewCount')
  343. comment_count = (int_or_none(get_video_info('comment_num')) or
  344. video_detail.get('commentCount') or
  345. try_get(api_data, lambda x: x['thread']['commentCount']))
  346. if not comment_count:
  347. match = self._html_search_regex(
  348. r'>Comments: <strong[^>]*>([^<]+)</strong>',
  349. webpage, 'comment count', default=None)
  350. if match:
  351. comment_count = int_or_none(match.replace(',', ''))
  352. duration = (parse_duration(
  353. get_video_info('length') or
  354. self._html_search_meta(
  355. 'video:duration', webpage, 'video duration', default=None)) or
  356. video_detail.get('length') or
  357. get_video_info('duration'))
  358. webpage_url = get_video_info('watch_url') or url
  359. owner = api_data.get('owner', {})
  360. uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
  361. uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
  362. return {
  363. 'id': video_id,
  364. 'title': title,
  365. 'formats': formats,
  366. 'thumbnail': thumbnail,
  367. 'description': description,
  368. 'uploader': uploader,
  369. 'timestamp': timestamp,
  370. 'uploader_id': uploader_id,
  371. 'view_count': view_count,
  372. 'comment_count': comment_count,
  373. 'duration': duration,
  374. 'webpage_url': webpage_url,
  375. }
  376. class NiconicoPlaylistIE(InfoExtractor):
  377. _VALID_URL = r'https?://(?:www\.)?nicovideo\.jp/mylist/(?P<id>\d+)'
  378. _TEST = {
  379. 'url': 'http://www.nicovideo.jp/mylist/27411728',
  380. 'info_dict': {
  381. 'id': '27411728',
  382. 'title': 'AKB48のオールナイトニッポン',
  383. },
  384. 'playlist_mincount': 225,
  385. }
  386. def _real_extract(self, url):
  387. list_id = self._match_id(url)
  388. webpage = self._download_webpage(url, list_id)
  389. entries_json = self._search_regex(r'Mylist\.preload\(\d+, (\[.*\])\);',
  390. webpage, 'entries')
  391. entries = json.loads(entries_json)
  392. entries = [{
  393. '_type': 'url',
  394. 'ie_key': NiconicoIE.ie_key(),
  395. 'url': ('http://www.nicovideo.jp/watch/%s' %
  396. entry['item_data']['video_id']),
  397. } for entry in entries]
  398. return {
  399. '_type': 'playlist',
  400. 'title': self._search_regex(r'\s+name: "(.*?)"', webpage, 'title'),
  401. 'id': list_id,
  402. 'entries': entries,
  403. }