Browse Source

[bilibili] Support new Bangumi URLs (closes #11845)

To reduce complexity, I don't support old Bangumi URLs directly via
_VALID_URL. Instead, I choose to let it go to generic redirection. An
example can be found in #10190:

http://bangumi.bilibili.com/anime/v/40062
master-ytdl-org
Yen Chi Hsuan 8 years ago
parent
commit
bd8f48c78b
No known key found for this signature in database GPG Key ID: 7F902A182457CA23
3 changed files with 134 additions and 11 deletions
  1. +5
    -0
      ChangeLog
  2. +125
    -10
      youtube_dl/extractor/bilibili.py
  3. +4
    -1
      youtube_dl/extractor/extractors.py

+ 5
- 0
ChangeLog View File

@ -1,3 +1,8 @@
version <unreleased>
Extractors
+ [bilibili] Support new Bangumi URLs (#11845)
version 2017.02.01 version 2017.02.01
Extractors Extractors


+ 125
- 10
youtube_dl/extractor/bilibili.py View File

@ -5,19 +5,27 @@ import hashlib
import re import re
from .common import InfoExtractor from .common import InfoExtractor
from ..compat import compat_parse_qs
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
from ..utils import ( from ..utils import (
ExtractorError,
int_or_none, int_or_none,
float_or_none, float_or_none,
parse_iso8601,
smuggle_url,
strip_jsonp,
unified_timestamp, unified_timestamp,
unsmuggle_url,
urlencode_postdata, urlencode_postdata,
) )
class BiliBiliIE(InfoExtractor): class BiliBiliIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/v/)(?P<id>\d+)'
_VALID_URL = r'https?://(?:www\.|bangumi\.|)bilibili\.(?:tv|com)/(?:video/av|anime/(?P<anime_id>\d+)/play#)(?P<id>\d+)'
_TEST = {
_TESTS = [{
'url': 'http://www.bilibili.tv/video/av1074402/', 'url': 'http://www.bilibili.tv/video/av1074402/',
'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e',
'info_dict': { 'info_dict': {
@ -32,25 +40,61 @@ class BiliBiliIE(InfoExtractor):
'uploader': '菊子桑', 'uploader': '菊子桑',
'uploader_id': '156160', 'uploader_id': '156160',
}, },
}
}, {
# Tested in BiliBiliBangumiIE
'url': 'http://bangumi.bilibili.com/anime/1869/play#40062',
'only_matching': True,
}, {
'url': 'http://bangumi.bilibili.com/anime/5802/play#100643',
'md5': '3f721ad1e75030cc06faf73587cfec57',
'info_dict': {
'id': '100643',
'ext': 'mp4',
'title': 'CHAOS;CHILD',
'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...',
},
'skip': 'Geo-restricted to China',
}]
_APP_KEY = '84956560bc028eb7' _APP_KEY = '84956560bc028eb7'
_BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e' _BILIBILI_KEY = '94aba54af9065f71de72f5508f1cd42e'
def _report_error(self, result):
if 'message' in result:
raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True)
elif 'code' in result:
raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True)
else:
raise ExtractorError('Can\'t extract Bangumi episode ID')
def _real_extract(self, url): def _real_extract(self, url):
video_id = self._match_id(url)
url, smuggled_data = unsmuggle_url(url, {})
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
anime_id = mobj.group('anime_id')
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
if 'anime/v' not in url:
if 'anime/' not in url:
cid = compat_parse_qs(self._search_regex( cid = compat_parse_qs(self._search_regex(
[r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)',
r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'],
webpage, 'player parameters'))['cid'][0] webpage, 'player parameters'))['cid'][0]
else: else:
if 'no_bangumi_tip' not in smuggled_data:
self.to_screen('Downloading episode %s. To download all videos in anime %s, re-run youtube-dl with %s' % (
video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id)))
headers = {
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}
headers.update(self.geo_verification_headers())
js = self._download_json( js = self._download_json(
'http://bangumi.bilibili.com/web_api/get_source', video_id, 'http://bangumi.bilibili.com/web_api/get_source', video_id,
data=urlencode_postdata({'episode_id': video_id}), data=urlencode_postdata({'episode_id': video_id}),
headers={'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8'})
headers=headers)
if 'result' not in js:
self._report_error(js)
cid = js['result']['cid'] cid = js['result']['cid']
payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid) payload = 'appkey=%s&cid=%s&otype=json&quality=2&type=mp4' % (self._APP_KEY, cid)
@ -58,7 +102,11 @@ class BiliBiliIE(InfoExtractor):
video_info = self._download_json( video_info = self._download_json(
'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign), 'http://interface.bilibili.com/playurl?%s&sign=%s' % (payload, sign),
video_id, note='Downloading video info page')
video_id, note='Downloading video info page',
headers=self.geo_verification_headers())
if 'durl' not in video_info:
self._report_error(video_info)
entries = [] entries = []
@ -85,7 +133,7 @@ class BiliBiliIE(InfoExtractor):
title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title') title = self._html_search_regex('<h1[^>]+title="([^"]+)">', webpage, 'title')
description = self._html_search_meta('description', webpage) description = self._html_search_meta('description', webpage)
timestamp = unified_timestamp(self._html_search_regex( timestamp = unified_timestamp(self._html_search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', fatal=False))
r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', default=None))
thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage)
# TODO 'view_count' requires deobfuscating Javascript # TODO 'view_count' requires deobfuscating Javascript
@ -99,7 +147,7 @@ class BiliBiliIE(InfoExtractor):
} }
uploader_mobj = re.search( uploader_mobj = re.search(
r'<a[^>]+href="https?://space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]+title="(?P<name>[^"]+)"',
webpage) webpage)
if uploader_mobj: if uploader_mobj:
info.update({ info.update({
@ -123,3 +171,70 @@ class BiliBiliIE(InfoExtractor):
'description': description, 'description': description,
'entries': entries, 'entries': entries,
} }
class BiliBiliBangumiIE(InfoExtractor):
_VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)'
IE_NAME = 'bangumi.bilibili.com'
IE_DESC = 'BiliBili番剧'
_TESTS = [{
'url': 'http://bangumi.bilibili.com/anime/1869',
'info_dict': {
'id': '1869',
'title': '混沌武士',
'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
},
'playlist_count': 26,
}, {
'url': 'http://bangumi.bilibili.com/anime/1869',
'info_dict': {
'id': '1869',
'title': '混沌武士',
'description': 'md5:6a9622b911565794c11f25f81d6a97d2',
},
'playlist': [{
'md5': '91da8621454dd58316851c27c68b0c13',
'info_dict': {
'id': '40062',
'ext': 'mp4',
'title': '混沌武士',
'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...',
'timestamp': 1414538739,
'upload_date': '20141028',
'episode': '疾风怒涛 Tempestuous Temperaments',
'episode_number': 1,
},
}],
'params': {
'playlist_items': '1',
},
}]
@classmethod
def suitable(cls, url):
return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url)
def _real_extract(self, url):
bangumi_id = self._match_id(url)
# Sometimes this API returns a JSONP response
season_info = self._download_json(
'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id,
bangumi_id, transform_source=strip_jsonp)['result']
entries = [{
'_type': 'url_transparent',
'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}),
'ie_key': BiliBiliIE.ie_key(),
'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '),
'episode': episode.get('index_title'),
'episode_number': int_or_none(episode.get('index')),
} for episode in season_info['episodes']]
entries = sorted(entries, key=lambda entry: entry.get('episode_number'))
return self.playlist_result(
entries, bangumi_id,
season_info.get('bangumi_title'), season_info.get('evaluate'))

+ 4
- 1
youtube_dl/extractor/extractors.py View File

@ -103,7 +103,10 @@ from .beatport import BeatportIE
from .bet import BetIE from .bet import BetIE
from .bigflix import BigflixIE from .bigflix import BigflixIE
from .bild import BildIE from .bild import BildIE
from .bilibili import BiliBiliIE
from .bilibili import (
BiliBiliIE,
BiliBiliBangumiIE,
)
from .biobiochiletv import BioBioChileTVIE from .biobiochiletv import BioBioChileTVIE
from .biqle import BIQLEIE from .biqle import BIQLEIE
from .bleacherreport import ( from .bleacherreport import (


Loading…
Cancel
Save