You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

323 lines
11 KiB

9 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import time
  5. import itertools
  6. from .common import InfoExtractor
  7. from ..compat import (
  8. compat_urllib_parse_urlencode,
  9. compat_str,
  10. )
  11. from ..utils import (
  12. dict_get,
  13. ExtractorError,
  14. float_or_none,
  15. int_or_none,
  16. remove_start,
  17. try_get,
  18. urlencode_postdata,
  19. )
  20. class VLiveIE(InfoExtractor):
  21. IE_NAME = 'vlive'
  22. _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
  23. _TESTS = [{
  24. 'url': 'http://www.vlive.tv/video/1326',
  25. 'md5': 'cc7314812855ce56de70a06a27314983',
  26. 'info_dict': {
  27. 'id': '1326',
  28. 'ext': 'mp4',
  29. 'title': "[V LIVE] Girl's Day's Broadcast",
  30. 'creator': "Girl's Day",
  31. 'view_count': int,
  32. },
  33. }, {
  34. 'url': 'http://www.vlive.tv/video/16937',
  35. 'info_dict': {
  36. 'id': '16937',
  37. 'ext': 'mp4',
  38. 'title': '[V LIVE] 첸백시 걍방',
  39. 'creator': 'EXO',
  40. 'view_count': int,
  41. 'subtitles': 'mincount:12',
  42. },
  43. 'params': {
  44. 'skip_download': True,
  45. },
  46. }]
  47. @classmethod
  48. def suitable(cls, url):
  49. return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
  50. def _real_extract(self, url):
  51. video_id = self._match_id(url)
  52. webpage = self._download_webpage(
  53. 'https://www.vlive.tv/video/%s' % video_id, video_id)
  54. VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
  55. VIDEO_PARAMS_FIELD = 'video params'
  56. params = self._parse_json(self._search_regex(
  57. VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
  58. transform_source=lambda s: '[' + s + ']', fatal=False)
  59. if not params or len(params) < 7:
  60. params = self._search_regex(
  61. VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
  62. params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]
  63. status, long_video_id, key = params[2], params[5], params[6]
  64. status = remove_start(status, 'PRODUCT_')
  65. if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
  66. return self._live(video_id, webpage)
  67. elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
  68. if long_video_id and key:
  69. return self._replay(video_id, webpage, long_video_id, key)
  70. else:
  71. status = 'COMING_SOON'
  72. if status == 'LIVE_END':
  73. raise ExtractorError('Uploading for replay. Please wait...',
  74. expected=True)
  75. elif status == 'COMING_SOON':
  76. raise ExtractorError('Coming soon!', expected=True)
  77. elif status == 'CANCELED':
  78. raise ExtractorError('We are sorry, '
  79. 'but the live broadcast has been canceled.',
  80. expected=True)
  81. else:
  82. raise ExtractorError('Unknown status %s' % status)
  83. def _get_common_fields(self, webpage):
  84. title = self._og_search_title(webpage)
  85. creator = self._html_search_regex(
  86. r'<div[^>]+class="info_area"[^>]*>\s*<a\s+[^>]*>([^<]+)',
  87. webpage, 'creator', fatal=False)
  88. thumbnail = self._og_search_thumbnail(webpage)
  89. return {
  90. 'title': title,
  91. 'creator': creator,
  92. 'thumbnail': thumbnail,
  93. }
  94. def _live(self, video_id, webpage):
  95. init_page = self._download_webpage(
  96. 'https://www.vlive.tv/video/init/view',
  97. video_id, note='Downloading live webpage',
  98. data=urlencode_postdata({'videoSeq': video_id}),
  99. headers={
  100. 'Referer': 'https://www.vlive.tv/video/%s' % video_id,
  101. 'Content-Type': 'application/x-www-form-urlencoded'
  102. })
  103. live_params = self._search_regex(
  104. r'"liveStreamInfo"\s*:\s*(".*"),',
  105. init_page, 'live stream info')
  106. live_params = self._parse_json(live_params, video_id)
  107. live_params = self._parse_json(live_params, video_id)
  108. formats = []
  109. for vid in live_params.get('resolutions', []):
  110. formats.extend(self._extract_m3u8_formats(
  111. vid['cdnUrl'], video_id, 'mp4',
  112. m3u8_id=vid.get('name'),
  113. fatal=False, live=True))
  114. self._sort_formats(formats)
  115. info = self._get_common_fields(webpage)
  116. info.update({
  117. 'title': self._live_title(info['title']),
  118. 'id': video_id,
  119. 'formats': formats,
  120. 'is_live': True,
  121. })
  122. return info
  123. def _replay(self, video_id, webpage, long_video_id, key):
  124. playinfo = self._download_json(
  125. 'http://global.apis.naver.com/rmcnmv/rmcnmv/vod_play_videoInfo.json?%s'
  126. % compat_urllib_parse_urlencode({
  127. 'videoId': long_video_id,
  128. 'key': key,
  129. 'ptc': 'http',
  130. 'doct': 'json', # document type (xml or json)
  131. 'cpt': 'vtt', # captions type (vtt or ttml)
  132. }), video_id)
  133. formats = [{
  134. 'url': vid['source'],
  135. 'format_id': vid.get('encodingOption', {}).get('name'),
  136. 'abr': float_or_none(vid.get('bitrate', {}).get('audio')),
  137. 'vbr': float_or_none(vid.get('bitrate', {}).get('video')),
  138. 'width': int_or_none(vid.get('encodingOption', {}).get('width')),
  139. 'height': int_or_none(vid.get('encodingOption', {}).get('height')),
  140. 'filesize': int_or_none(vid.get('size')),
  141. } for vid in playinfo.get('videos', {}).get('list', []) if vid.get('source')]
  142. self._sort_formats(formats)
  143. view_count = int_or_none(playinfo.get('meta', {}).get('count'))
  144. subtitles = {}
  145. for caption in playinfo.get('captions', {}).get('list', []):
  146. lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
  147. if lang and caption.get('source'):
  148. subtitles[lang] = [{
  149. 'ext': 'vtt',
  150. 'url': caption['source']}]
  151. info = self._get_common_fields(webpage)
  152. info.update({
  153. 'id': video_id,
  154. 'formats': formats,
  155. 'view_count': view_count,
  156. 'subtitles': subtitles,
  157. })
  158. return info
  159. class VLiveChannelIE(InfoExtractor):
  160. IE_NAME = 'vlive:channel'
  161. _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
  162. _TEST = {
  163. 'url': 'http://channels.vlive.tv/FCD4B',
  164. 'info_dict': {
  165. 'id': 'FCD4B',
  166. 'title': 'MAMAMOO',
  167. },
  168. 'playlist_mincount': 110
  169. }
  170. _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
  171. def _real_extract(self, url):
  172. channel_code = self._match_id(url)
  173. webpage = self._download_webpage(
  174. 'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
  175. app_id = None
  176. app_js_url = self._search_regex(
  177. r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
  178. webpage, 'app js', default=None, group='url')
  179. if app_js_url:
  180. app_js = self._download_webpage(
  181. app_js_url, channel_code, 'Downloading app JS', fatal=False)
  182. if app_js:
  183. app_id = self._search_regex(
  184. r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
  185. app_js, 'app id', default=None)
  186. app_id = app_id or self._APP_ID
  187. channel_info = self._download_json(
  188. 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
  189. channel_code, note='Downloading decode channel code',
  190. query={
  191. 'app_id': app_id,
  192. 'channelCode': channel_code,
  193. '_': int(time.time())
  194. })
  195. channel_seq = channel_info['result']['channelSeq']
  196. channel_name = None
  197. entries = []
  198. for page_num in itertools.count(1):
  199. video_list = self._download_json(
  200. 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
  201. channel_code, note='Downloading channel list page #%d' % page_num,
  202. query={
  203. 'app_id': app_id,
  204. 'channelSeq': channel_seq,
  205. # Large values of maxNumOfRows (~300 or above) may cause
  206. # empty responses (see [1]), e.g. this happens for [2] that
  207. # has more than 300 videos.
  208. # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
  209. # 2. http://channels.vlive.tv/EDBF.
  210. 'maxNumOfRows': 100,
  211. '_': int(time.time()),
  212. 'pageNo': page_num
  213. }
  214. )
  215. if not channel_name:
  216. channel_name = try_get(
  217. video_list,
  218. lambda x: x['result']['channelInfo']['channelName'],
  219. compat_str)
  220. videos = try_get(
  221. video_list, lambda x: x['result']['videoList'], list)
  222. if not videos:
  223. break
  224. for video in videos:
  225. video_id = video.get('videoSeq')
  226. if not video_id:
  227. continue
  228. video_id = compat_str(video_id)
  229. entries.append(
  230. self.url_result(
  231. 'http://www.vlive.tv/video/%s' % video_id,
  232. ie=VLiveIE.ie_key(), video_id=video_id))
  233. return self.playlist_result(
  234. entries, channel_code, channel_name)
  235. class VLivePlaylistIE(InfoExtractor):
  236. IE_NAME = 'vlive:playlist'
  237. _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
  238. _TEST = {
  239. 'url': 'http://www.vlive.tv/video/22867/playlist/22912',
  240. 'info_dict': {
  241. 'id': '22912',
  242. 'title': 'Valentine Day Message from TWICE'
  243. },
  244. 'playlist_mincount': 9
  245. }
  246. def _real_extract(self, url):
  247. mobj = re.match(self._VALID_URL, url)
  248. video_id, playlist_id = mobj.group('video_id', 'id')
  249. VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
  250. if self._downloader.params.get('noplaylist'):
  251. self.to_screen(
  252. 'Downloading just video %s because of --no-playlist' % video_id)
  253. return self.url_result(
  254. VIDEO_URL_TEMPLATE % video_id,
  255. ie=VLiveIE.ie_key(), video_id=video_id)
  256. self.to_screen(
  257. 'Downloading playlist %s - add --no-playlist to just download video'
  258. % playlist_id)
  259. webpage = self._download_webpage(
  260. 'http://www.vlive.tv/video/%s/playlist/%s'
  261. % (video_id, playlist_id), playlist_id)
  262. item_ids = self._parse_json(
  263. self._search_regex(
  264. r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
  265. 'playlist video seqs'),
  266. playlist_id)
  267. entries = [
  268. self.url_result(
  269. VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
  270. video_id=compat_str(item_id))
  271. for item_id in item_ids]
  272. playlist_name = self._html_search_regex(
  273. r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
  274. webpage, 'playlist title', fatal=False)
  275. return self.playlist_result(entries, playlist_id, playlist_name)