You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

338 lines
11 KiB

Switch codebase to use sanitized_Request instead of compat_urllib_request.Request [downloader/dash] Use sanitized_Request [downloader/http] Use sanitized_Request [atresplayer] Use sanitized_Request [bambuser] Use sanitized_Request [bliptv] Use sanitized_Request [brightcove] Use sanitized_Request [cbs] Use sanitized_Request [ceskatelevize] Use sanitized_Request [collegerama] Use sanitized_Request [extractor/common] Use sanitized_Request [crunchyroll] Use sanitized_Request [dailymotion] Use sanitized_Request [dcn] Use sanitized_Request [dramafever] Use sanitized_Request [dumpert] Use sanitized_Request [eitb] Use sanitized_Request [escapist] Use sanitized_Request [everyonesmixtape] Use sanitized_Request [extremetube] Use sanitized_Request [facebook] Use sanitized_Request [fc2] Use sanitized_Request [flickr] Use sanitized_Request [4tube] Use sanitized_Request [gdcvault] Use sanitized_Request [extractor/generic] Use sanitized_Request [hearthisat] Use sanitized_Request [hotnewhiphop] Use sanitized_Request [hypem] Use sanitized_Request [iprima] Use sanitized_Request [ivi] Use sanitized_Request [keezmovies] Use sanitized_Request [letv] Use sanitized_Request [lynda] Use sanitized_Request [metacafe] Use sanitized_Request [minhateca] Use sanitized_Request [miomio] Use sanitized_Request [meovideo] Use sanitized_Request [mofosex] Use sanitized_Request [moniker] Use sanitized_Request [mooshare] Use sanitized_Request [movieclips] Use sanitized_Request [mtv] Use sanitized_Request [myvideo] Use sanitized_Request [neteasemusic] Use sanitized_Request [nfb] Use sanitized_Request [niconico] Use sanitized_Request [noco] Use sanitized_Request [nosvideo] Use sanitized_Request [novamov] Use sanitized_Request [nowness] Use sanitized_Request [nuvid] Use sanitized_Request [played] Use sanitized_Request [pluralsight] Use sanitized_Request [pornhub] Use sanitized_Request [pornotube] Use sanitized_Request [primesharetv] Use sanitized_Request [promptfile] Use sanitized_Request [qqmusic] Use 
sanitized_Request [rtve] Use sanitized_Request [safari] Use sanitized_Request [sandia] Use sanitized_Request [shared] Use sanitized_Request [sharesix] Use sanitized_Request [sina] Use sanitized_Request [smotri] Use sanitized_Request [sohu] Use sanitized_Request [spankwire] Use sanitized_Request [sportdeutschland] Use sanitized_Request [streamcloud] Use sanitized_Request [streamcz] Use sanitized_Request [tapely] Use sanitized_Request [tube8] Use sanitized_Request [tubitv] Use sanitized_Request [twitch] Use sanitized_Request [twitter] Use sanitized_Request [udemy] Use sanitized_Request [vbox7] Use sanitized_Request [veoh] Use sanitized_Request [vessel] Use sanitized_Request [vevo] Use sanitized_Request [viddler] Use sanitized_Request [videomega] Use sanitized_Request [viewvster] Use sanitized_Request [viki] Use sanitized_Request [vk] Use sanitized_Request [vodlocker] Use sanitized_Request [voicerepublic] Use sanitized_Request [wistia] Use sanitized_Request [xfileshare] Use sanitized_Request [xtube] Use sanitized_Request [xvideos] Use sanitized_Request [yandexmusic] Use sanitized_Request [youku] Use sanitized_Request [youporn] Use sanitized_Request [youtube] Use sanitized_Request [patreon] Use sanitized_Request [extractor/common] Remove unused import [nfb] PEP 8
9 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import base64
  4. import itertools
  5. import random
  6. import re
  7. import string
  8. import time
  9. from .common import InfoExtractor
  10. from ..compat import (
  11. compat_ord,
  12. compat_str,
  13. compat_urllib_parse_urlencode,
  14. )
  15. from ..utils import (
  16. ExtractorError,
  17. get_element_by_attribute,
  18. try_get,
  19. )
  class YoukuIE(InfoExtractor):
      """Extractor for a single Youku video.

      Metadata is requested from ``play.youku.com/play/get.json``. Every stream
      in the response is split into segments; each segment index becomes one
      entry of the returned ``multi_video`` result, and each stream contributes
      one format to the entries it has a segment for.
      """

      IE_NAME = 'youku'
      IE_DESC = '优酷'
      _VALID_URL = r'''(?x)
          (?:
              http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)|
              youku:)
          (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|)
      '''

      _TESTS = [{
          # MD5 is unstable
          'url': 'http://v.youku.com/v_show/id_XMTc1ODE5Njcy.html',
          'info_dict': {
              'id': 'XMTc1ODE5Njcy_part1',
              'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.',
              'ext': 'flv'
          }
      }, {
          'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf',
          'only_matching': True,
      }, {
          'url': 'http://v.youku.com/v_show/id_XODgxNjg1Mzk2_ev_1.html',
          'info_dict': {
              'id': 'XODgxNjg1Mzk2',
              'title': '武媚娘传奇 85',
          },
          'playlist_count': 11,
          'skip': 'Available in China only',
      }, {
          'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html',
          'info_dict': {
              'id': 'XMTI1OTczNDM5Mg',
              'title': '花千骨 04',
          },
          'playlist_count': 13,
      }, {
          'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html',
          'note': 'Video protected with password',
          'info_dict': {
              'id': 'XNjA1NzA2Njgw',
              'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起',
          },
          'playlist_count': 19,
          'params': {
              'videopassword': '100600',
          },
      }, {
          # /play/get.json contains streams with "channel_type":"tail"
          'url': 'http://v.youku.com/v_show/id_XOTUxMzg4NDMy.html',
          'info_dict': {
              'id': 'XOTUxMzg4NDMy',
              'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft',
          },
          'playlist_count': 6,
      }]

      # Lookup tables keyed by the ``stream_type`` value of a stream. They are
      # class-level so they are built once instead of on every lookup.

      # Value sent as the ``hd`` query parameter of a segment URL.
      _HD_IDS = {
          '3gp': '0',
          '3gphd': '1',
          'flv': '0',
          'flvhd': '0',
          'mp4': '1',
          'mp4hd': '1',
          'mp4hd2': '1',
          'mp4hd3': '1',
          'hd2': '2',
          'hd3': '3',
      }

      # Container extension; also used as the ``/st/`` path component.
      _EXTENSIONS = {
          '3gp': 'flv',
          '3gphd': 'mp4',
          'flv': 'flv',
          'flvhd': 'flv',
          'mp4': 'mp4',
          'mp4hd': 'mp4',
          'mp4hd2': 'flv',
          'mp4hd3': 'flv',
          'hd2': 'flv',
          'hd3': 'flv',
      }

      # Value reported as ``format_id``.
      _FORMAT_NAMES = {
          '3gp': 'h6',
          '3gphd': 'h5',
          'flv': 'h4',
          'flvhd': 'h4',
          'mp4': 'h3',
          'mp4hd': 'h3',
          'mp4hd2': 'h4',
          'mp4hd3': 'h4',
          'hd2': 'h2',
          'hd3': 'h1',
      }

      @staticmethod
      def _usable_streams(data):
          """Return the streams of *data* whose ``channel_type`` is not ``tail``."""
          return [
              stream for stream in data.get('stream') or []
              if stream.get('channel_type') != 'tail']

      def construct_video_urls(self, data):
          """Build the segment URLs of every non-``tail`` stream.

          *data* is the ``data`` object of the get.json response. This method
          reads ``data['security']['encrypt_string']``,
          ``data['security']['ip']`` and the stream list.

          Returns a dict mapping each ``stream_type`` to the list of its
          segment URLs, in segment order.

          Raises KeyError when an expected key is missing or when a
          ``stream_type`` is not present in the lookup tables.
          """
          def yk_t(s1, s2):
              # RC4: key scheduling with key s1, then XOR of s2 with the
              # generated keystream.
              ls = list(range(256))
              t = 0
              for i in range(256):
                  t = (t + ls[i] + compat_ord(s1[i % len(s1)])) % 256
                  ls[i], ls[t] = ls[t], ls[i]
              out = bytearray()
              x, y = 0, 0
              for ch in s2:
                  y = (y + 1) % 256
                  x = (x + ls[y]) % 256
                  ls[x], ls[y] = ls[y], ls[x]
                  out.append(compat_ord(ch) ^ ls[(ls[x] + ls[y]) % 256])
              return bytes(out)

          # The decrypted security string has the form '<sid>_<token>'.
          sid, token = yk_t(
              b'becaf9be',
              base64.b64decode(data['security']['encrypt_string'].encode('ascii'))
          ).decode('ascii').split('_')

          oip = data['security']['ip']

          streams = self._usable_streams(data)

          # File id template per stream type: the id of the first segment when
          # it is a string, otherwise the stream-level 'stream_fileid'.
          fileid_dict = {}
          for stream in streams:
              fileid_dict[stream.get('stream_type')] = try_get(
                  stream, lambda x: x['segs'][0]['fileid'],
                  compat_str) or stream['stream_fileid']

          def get_fileid(fmt, n):
              # Characters 8-9 of the template are replaced by the segment
              # index as (at least) two upper-case hex digits.
              template = fileid_dict[fmt]
              return template[0:8] + '%02X' % int(n) + template[10:]

          def generate_ep(fileid):
              # 'ep' is the base64 of the RC4-encrypted '<sid>_<fileid>_<token>'.
              ep_t = yk_t(
                  b'bf7e5f01',
                  ('%s_%s_%s' % (sid, fileid, token)).encode('ascii'))
              return base64.b64encode(ep_t).decode('ascii')

          video_urls_dict = {}
          for stream in streams:
              fmt = stream.get('stream_type')
              video_urls = []
              # enumerate() gives the positional index directly; list.index()
              # is quadratic and returns the first *equal* segment instead.
              for n, seg in enumerate(stream['segs']):
                  fileid = get_fileid(fmt, n)
                  param = {
                      'K': seg['key'],
                      'hd': self.get_hd(fmt),
                      'myp': 0,
                      'ypp': 0,
                      'ctype': 12,
                      'ev': 1,
                      'token': token,
                      'oip': oip,
                      'ep': generate_ep(fileid),
                  }
                  video_urls.append(
                      'http://k.youku.com/player/getFlvPath/sid/%s_00/st/%s/fileid/%s?%s' % (
                          sid, self.parse_ext_l(fmt), fileid,
                          compat_urllib_parse_urlencode(param)))
              video_urls_dict[fmt] = video_urls

          return video_urls_dict

      @staticmethod
      def get_ysuid():
          """Return a value for the ``__ysuid`` cookie.

          The value is the current Unix time in seconds followed by three
          random ASCII letters.
          """
          return '%d%s' % (int(time.time()), ''.join([
              random.choice(string.ascii_letters) for _ in range(3)]))

      def get_hd(self, fm):
          """Return the ``hd`` query value for stream type *fm* (KeyError if unknown)."""
          return self._HD_IDS[fm]

      def parse_ext_l(self, fm):
          """Return the file extension for stream type *fm* (KeyError if unknown)."""
          return self._EXTENSIONS[fm]

      def get_format_name(self, fm):
          """Return the ``format_id`` for stream type *fm* (KeyError if unknown)."""
          return self._FORMAT_NAMES[fm]

      def _real_extract(self, url):
          """Return a ``multi_video`` result with one entry per stream segment.

          Raises ExtractorError when the server reports an error or when the
          response contains no non-``tail`` stream.
          """
          video_id = self._match_id(url)

          self._set_cookie('youku.com', '__ysuid', self.get_ysuid())

          def retrieve_data(req_url, note):
              headers = {
                  'Referer': req_url,
              }
              headers.update(self.geo_verification_headers())
              self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com')

              raw_data = self._download_json(req_url, video_id, note=note, headers=headers)

              return raw_data['data']

          video_password = self._downloader.params.get('videopassword')

          # request basic data
          basic_data_url = 'http://play.youku.com/play/get.json?vid=%s&ct=12' % video_id
          if video_password:
              # URL-encode the password so that characters such as '&', '#'
              # or '%' cannot corrupt the query string.
              basic_data_url += '&' + compat_urllib_parse_urlencode({'pwd': video_password})

          data = retrieve_data(basic_data_url, 'Downloading JSON metadata')

          error = data.get('error')
          if error:
              error_note = error.get('note')
              if error_note and '因版权原因无法观看此视频' in error_note:
                  raise ExtractorError(
                      'Youku said: Sorry, this video is available in China only', expected=True)
              if error_note and '该视频被设为私密' in error_note:
                  raise ExtractorError(
                      'Youku said: Sorry, this video is private', expected=True)
              # The code may be absent or non-numeric; never let the message
              # formatting itself fail.
              msg = 'Youku server reported error'
              error_code = error.get('code')
              if error_code is not None:
                  msg += ' %s' % error_code
              if error_note is not None:
                  msg += ': ' + error_note
              raise ExtractorError(msg)

          # get video title
          title = data['video']['title']

          # 'tail' streams are skipped everywhere else, so they must not
          # influence the number of parts either.
          streams = self._usable_streams(data)
          if not streams:
              raise ExtractorError('Youku returned no playable streams')

          # generate video_urls_dict
          video_urls_dict = self.construct_video_urls(data)

          # Some formats are not available for all parts, so create as many
          # parts as the longest stream has segments.
          part_count = max(len(stream['segs']) for stream in streams)
          entries = [{
              'id': '%s_part%d' % (video_id, i + 1),
              'title': title,
              'formats': [],
          } for i in range(part_count)]

          for stream in streams:
              fm = stream.get('stream_type')
              video_urls = video_urls_dict[fm]
              for video_url, seg, entry in zip(video_urls, stream['segs'], entries):
                  entry['formats'].append({
                      'url': video_url,
                      'format_id': self.get_format_name(fm),
                      'ext': self.parse_ext_l(fm),
                      'filesize': int(seg['size']),
                      'width': stream.get('width'),
                      'height': stream.get('height'),
                  })

          return {
              '_type': 'multi_video',
              'id': video_id,
              'title': title,
              'entries': entries,
          }
  class YoukuShowIE(InfoExtractor):
      """Playlist extractor for Youku show pages.

      The videos linked from the show page are collected first; further
      episode pages are then downloaded until a page yields fewer than
      ``_PAGE_SIZE`` videos.
      """

      IE_NAME = 'youku:show'
      _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html'
      # A page with fewer links than this ends the pagination.
      _PAGE_SIZE = 40

      _TEST = {
          'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html',
          'info_dict': {
              'id': 'zc7c670be07ff11e48b3f',
              'title': '花千骨 未删减版',
              'description': 'md5:578d4f2145ae3f9128d9d4d863312910',
          },
          'playlist_count': 50,
      }

      def _find_videos_in_page(self, webpage):
          """Return a url_result for every v.youku.com list link found in *webpage*."""
          found = []
          for mobj in re.finditer(
                  r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"',
                  webpage):
              found.append(self.url_result(
                  mobj.group('url'), YoukuIE.ie_key(), mobj.group('title')))
          return found

      def _real_extract(self, url):
          """Return a playlist of all episodes of the show at *url*."""
          show_id = self._match_id(url)

          webpage = self._download_webpage(url, show_id)
          entries = self._find_videos_in_page(webpage)

          # Title and description are optional (fatal=False).
          playlist_title = self._html_search_regex(
              r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False)

          detail_div = get_element_by_attribute('class', 'detail', webpage)
          if not detail_div:
              detail_div = ''
          playlist_description = self._html_search_regex(
              r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>',
              detail_div, 'playlist description', fatal=False)

          episodes_url = 'http://www.youku.com/show_episode/id_%s.html' % show_id
          page_num = 1
          while True:
              divid = 'reload_%d' % (page_num * self._PAGE_SIZE + 1)
              episodes_page = self._download_webpage(
                  episodes_url, show_id, query={'divid': divid},
                  note='Downloading episodes page %d' % page_num)
              page_entries = self._find_videos_in_page(episodes_page)
              entries.extend(page_entries)

              # A short page means there is nothing left to fetch.
              if len(page_entries) < self._PAGE_SIZE:
                  break
              page_num += 1

          return self.playlist_result(entries, show_id, playlist_title, playlist_description)