You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

367 lines
15 KiB

10 years ago
10 years ago
10 years ago
  1. # encoding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import json
  5. import base64
  6. import zlib
  7. import xml.etree.ElementTree
  8. from hashlib import sha1
  9. from math import pow, sqrt, floor
  10. from .common import InfoExtractor
  11. from ..compat import (
  12. compat_urllib_parse,
  13. compat_urllib_parse_unquote,
  14. compat_urllib_request,
  15. compat_urlparse,
  16. )
  17. from ..utils import (
  18. ExtractorError,
  19. bytes_to_intlist,
  20. intlist_to_bytes,
  21. int_or_none,
  22. remove_end,
  23. unified_strdate,
  24. urlencode_postdata,
  25. xpath_text,
  26. )
  27. from ..aes import (
  28. aes_cbc_decrypt,
  29. )
  30. class CrunchyrollIE(InfoExtractor):
  31. _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
  32. _NETRC_MACHINE = 'crunchyroll'
  33. _TESTS = [{
  34. 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
  35. 'info_dict': {
  36. 'id': '645513',
  37. 'ext': 'flv',
  38. 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!',
  39. 'description': 'md5:2d17137920c64f2f49981a7797d275ef',
  40. 'thumbnail': 'http://img1.ak.crunchyroll.com/i/spire1-tmb/20c6b5e10f1a47b10516877d3c039cae1380951166_full.jpg',
  41. 'uploader': 'Yomiuri Telecasting Corporation (YTV)',
  42. 'upload_date': '20131013',
  43. 'url': 're:(?!.*&amp)',
  44. },
  45. 'params': {
  46. # rtmp
  47. 'skip_download': True,
  48. },
  49. }, {
  50. 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1',
  51. 'info_dict': {
  52. 'id': '589804',
  53. 'ext': 'flv',
  54. 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11',
  55. 'description': 'md5:fe2743efedb49d279552926d0bd0cd9e',
  56. 'thumbnail': 're:^https?://.*\.jpg$',
  57. 'uploader': 'Danny Choo Network',
  58. 'upload_date': '20120213',
  59. },
  60. 'params': {
  61. # rtmp
  62. 'skip_download': True,
  63. },
  64. }, {
  65. 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
  66. 'only_matching': True,
  67. }]
  68. _FORMAT_IDS = {
  69. '360': ('60', '106'),
  70. '480': ('61', '106'),
  71. '720': ('62', '106'),
  72. '1080': ('80', '108'),
  73. }
  74. def _login(self):
  75. (username, password) = self._get_login_info()
  76. if username is None:
  77. return
  78. self.report_login()
  79. login_url = 'https://www.crunchyroll.com/?a=formhandler'
  80. data = urlencode_postdata({
  81. 'formname': 'RpcApiUser_Login',
  82. 'name': username,
  83. 'password': password,
  84. })
  85. login_request = compat_urllib_request.Request(login_url, data)
  86. login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
  87. self._download_webpage(login_request, None, False, 'Wrong login info')
  88. def _real_initialize(self):
  89. self._login()
  90. def _decrypt_subtitles(self, data, iv, id):
  91. data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
  92. iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
  93. id = int(id)
  94. def obfuscate_key_aux(count, modulo, start):
  95. output = list(start)
  96. for _ in range(count):
  97. output.append(output[-1] + output[-2])
  98. # cut off start values
  99. output = output[2:]
  100. output = list(map(lambda x: x % modulo + 33, output))
  101. return output
  102. def obfuscate_key(key):
  103. num1 = int(floor(pow(2, 25) * sqrt(6.9)))
  104. num2 = (num1 ^ key) << 5
  105. num3 = key ^ num1
  106. num4 = num3 ^ (num3 >> 3) ^ num2
  107. prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2)))
  108. shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest())
  109. # Extend 160 Bit hash to 256 Bit
  110. return shaHash + [0] * 12
  111. key = obfuscate_key(id)
  112. decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv))
  113. return zlib.decompress(decrypted_data)
  114. def _convert_subtitles_to_srt(self, sub_root):
  115. output = ''
  116. for i, event in enumerate(sub_root.findall('./events/event'), 1):
  117. start = event.attrib['start'].replace('.', ',')
  118. end = event.attrib['end'].replace('.', ',')
  119. text = event.attrib['text'].replace('\\N', '\n')
  120. output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text)
  121. return output
  122. def _convert_subtitles_to_ass(self, sub_root):
  123. output = ''
  124. def ass_bool(strvalue):
  125. assvalue = '0'
  126. if strvalue == '1':
  127. assvalue = '-1'
  128. return assvalue
  129. output = '[Script Info]\n'
  130. output += 'Title: %s\n' % sub_root.attrib["title"]
  131. output += 'ScriptType: v4.00+\n'
  132. output += 'WrapStyle: %s\n' % sub_root.attrib["wrap_style"]
  133. output += 'PlayResX: %s\n' % sub_root.attrib["play_res_x"]
  134. output += 'PlayResY: %s\n' % sub_root.attrib["play_res_y"]
  135. output += """ScaledBorderAndShadow: yes
  136. [V4+ Styles]
  137. Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
  138. """
  139. for style in sub_root.findall('./styles/style'):
  140. output += 'Style: ' + style.attrib["name"]
  141. output += ',' + style.attrib["font_name"]
  142. output += ',' + style.attrib["font_size"]
  143. output += ',' + style.attrib["primary_colour"]
  144. output += ',' + style.attrib["secondary_colour"]
  145. output += ',' + style.attrib["outline_colour"]
  146. output += ',' + style.attrib["back_colour"]
  147. output += ',' + ass_bool(style.attrib["bold"])
  148. output += ',' + ass_bool(style.attrib["italic"])
  149. output += ',' + ass_bool(style.attrib["underline"])
  150. output += ',' + ass_bool(style.attrib["strikeout"])
  151. output += ',' + style.attrib["scale_x"]
  152. output += ',' + style.attrib["scale_y"]
  153. output += ',' + style.attrib["spacing"]
  154. output += ',' + style.attrib["angle"]
  155. output += ',' + style.attrib["border_style"]
  156. output += ',' + style.attrib["outline"]
  157. output += ',' + style.attrib["shadow"]
  158. output += ',' + style.attrib["alignment"]
  159. output += ',' + style.attrib["margin_l"]
  160. output += ',' + style.attrib["margin_r"]
  161. output += ',' + style.attrib["margin_v"]
  162. output += ',' + style.attrib["encoding"]
  163. output += '\n'
  164. output += """
  165. [Events]
  166. Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
  167. """
  168. for event in sub_root.findall('./events/event'):
  169. output += 'Dialogue: 0'
  170. output += ',' + event.attrib["start"]
  171. output += ',' + event.attrib["end"]
  172. output += ',' + event.attrib["style"]
  173. output += ',' + event.attrib["name"]
  174. output += ',' + event.attrib["margin_l"]
  175. output += ',' + event.attrib["margin_r"]
  176. output += ',' + event.attrib["margin_v"]
  177. output += ',' + event.attrib["effect"]
  178. output += ',' + event.attrib["text"]
  179. output += '\n'
  180. return output
  181. def _extract_subtitles(self, subtitle):
  182. sub_root = xml.etree.ElementTree.fromstring(subtitle)
  183. return [{
  184. 'ext': 'srt',
  185. 'data': self._convert_subtitles_to_srt(sub_root),
  186. }, {
  187. 'ext': 'ass',
  188. 'data': self._convert_subtitles_to_ass(sub_root),
  189. }]
  190. def _get_subtitles(self, video_id, webpage):
  191. subtitles = {}
  192. for sub_id, sub_name in re.findall(r'\?ssid=([0-9]+)" title="([^"]+)', webpage):
  193. sub_page = self._download_webpage(
  194. 'http://www.crunchyroll.com/xml/?req=RpcApiSubtitle_GetXml&subtitle_script_id=' + sub_id,
  195. video_id, note='Downloading subtitles for ' + sub_name)
  196. id = self._search_regex(r'id=\'([0-9]+)', sub_page, 'subtitle_id', fatal=False)
  197. iv = self._search_regex(r'<iv>([^<]+)', sub_page, 'subtitle_iv', fatal=False)
  198. data = self._search_regex(r'<data>([^<]+)', sub_page, 'subtitle_data', fatal=False)
  199. if not id or not iv or not data:
  200. continue
  201. subtitle = self._decrypt_subtitles(data, iv, id).decode('utf-8')
  202. lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False)
  203. if not lang_code:
  204. continue
  205. subtitles[lang_code] = self._extract_subtitles(subtitle)
  206. return subtitles
  207. def _real_extract(self, url):
  208. mobj = re.match(self._VALID_URL, url)
  209. video_id = mobj.group('video_id')
  210. if mobj.group('prefix') == 'm':
  211. mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage')
  212. webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url')
  213. else:
  214. webpage_url = 'http://www.' + mobj.group('url')
  215. webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
  216. note_m = self._html_search_regex(
  217. r'<div class="showmedia-trailer-notice">(.+?)</div>',
  218. webpage, 'trailer-notice', default='')
  219. if note_m:
  220. raise ExtractorError(note_m)
  221. mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage)
  222. if mobj:
  223. msg = json.loads(mobj.group('msg'))
  224. if msg.get('type') == 'error':
  225. raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True)
  226. if 'To view this, please log in to verify you are 18 or older.' in webpage:
  227. self.raise_login_required()
  228. video_title = self._html_search_regex(r'<h1[^>]*>(.+?)</h1>', webpage, 'video_title', flags=re.DOTALL)
  229. video_title = re.sub(r' {2,}', ' ', video_title)
  230. video_description = self._html_search_regex(r'"description":"([^"]+)', webpage, 'video_description', default='')
  231. if not video_description:
  232. video_description = None
  233. video_upload_date = self._html_search_regex(r'<div>Availability for free users:(.+?)</div>', webpage, 'video_upload_date', fatal=False, flags=re.DOTALL)
  234. if video_upload_date:
  235. video_upload_date = unified_strdate(video_upload_date)
  236. video_uploader = self._html_search_regex(r'<div>\s*Publisher:(.+?)</div>', webpage, 'video_uploader', fatal=False, flags=re.DOTALL)
  237. playerdata_url = compat_urllib_parse_unquote(self._html_search_regex(r'"config_url":"([^"]+)', webpage, 'playerdata_url'))
  238. playerdata_req = compat_urllib_request.Request(playerdata_url)
  239. playerdata_req.data = compat_urllib_parse.urlencode({'current_page': webpage_url})
  240. playerdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  241. playerdata = self._download_webpage(playerdata_req, video_id, note='Downloading media info')
  242. stream_id = self._search_regex(r'<media_id>([^<]+)', playerdata, 'stream_id')
  243. video_thumbnail = self._search_regex(r'<episode_image_url>([^<]+)', playerdata, 'thumbnail', fatal=False)
  244. formats = []
  245. for fmt in re.findall(r'showmedia\.([0-9]{3,4})p', webpage):
  246. stream_quality, stream_format = self._FORMAT_IDS[fmt]
  247. video_format = fmt + 'p'
  248. streamdata_req = compat_urllib_request.Request(
  249. 'http://www.crunchyroll.com/xml/?req=RpcApiVideoPlayer_GetStandardConfig&media_id=%s&video_format=%s&video_quality=%s'
  250. % (stream_id, stream_format, stream_quality),
  251. compat_urllib_parse.urlencode({'current_page': url}).encode('utf-8'))
  252. streamdata_req.add_header('Content-Type', 'application/x-www-form-urlencoded')
  253. streamdata = self._download_xml(
  254. streamdata_req, video_id,
  255. note='Downloading media info for %s' % video_format)
  256. stream_info = streamdata.find('./{default}preload/stream_info')
  257. video_url = stream_info.find('./host').text
  258. video_play_path = stream_info.find('./file').text
  259. metadata = stream_info.find('./metadata')
  260. format_info = {
  261. 'format': video_format,
  262. 'format_id': video_format,
  263. 'height': int_or_none(xpath_text(metadata, './height')),
  264. 'width': int_or_none(xpath_text(metadata, './width')),
  265. }
  266. if '.fplive.net/' in video_url:
  267. video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip())
  268. parsed_video_url = compat_urlparse.urlparse(video_url)
  269. direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace(
  270. netloc='v.lvlt.crcdn.net',
  271. path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1])))
  272. if self._is_valid_url(direct_video_url, video_id, video_format):
  273. format_info.update({
  274. 'url': direct_video_url,
  275. })
  276. formats.append(format_info)
  277. continue
  278. format_info.update({
  279. 'url': video_url,
  280. 'play_path': video_play_path,
  281. 'ext': 'flv',
  282. })
  283. formats.append(format_info)
  284. subtitles = self.extract_subtitles(video_id, webpage)
  285. return {
  286. 'id': video_id,
  287. 'title': video_title,
  288. 'description': video_description,
  289. 'thumbnail': video_thumbnail,
  290. 'uploader': video_uploader,
  291. 'upload_date': video_upload_date,
  292. 'subtitles': subtitles,
  293. 'formats': formats,
  294. }
  295. class CrunchyrollShowPlaylistIE(InfoExtractor):
  296. IE_NAME = "crunchyroll:playlist"
  297. _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
  298. _TESTS = [{
  299. 'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
  300. 'info_dict': {
  301. 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
  302. 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
  303. },
  304. 'playlist_count': 13,
  305. }]
  306. def _real_extract(self, url):
  307. show_id = self._match_id(url)
  308. webpage = self._download_webpage(url, show_id)
  309. title = self._html_search_regex(
  310. r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
  311. webpage, 'title')
  312. episode_paths = re.findall(
  313. r'(?s)<li id="showview_videos_media_[0-9]+"[^>]+>.*?<a href="([^"]+)"',
  314. webpage)
  315. entries = [
  316. self.url_result('http://www.crunchyroll.com' + ep, 'Crunchyroll')
  317. for ep in episode_paths
  318. ]
  319. entries.reverse()
  320. return {
  321. '_type': 'playlist',
  322. 'id': show_id,
  323. 'title': title,
  324. 'entries': entries,
  325. }