You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

142 lines
4.6 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import itertools
  5. import json
  6. import xml.etree.ElementTree as ET
  7. from .common import InfoExtractor
  8. from ..utils import (
  9. int_or_none,
  10. unified_strdate,
  11. ExtractorError,
  12. )
  13. class BiliBiliIE(InfoExtractor):
  14. _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/'
  15. _TESTS = [{
  16. 'url': 'http://www.bilibili.tv/video/av1074402/',
  17. 'md5': '2c301e4dab317596e837c3e7633e7d86',
  18. 'info_dict': {
  19. 'id': '1074402_part1',
  20. 'ext': 'flv',
  21. 'title': '【金坷垃】金泡沫',
  22. 'duration': 308,
  23. 'upload_date': '20140420',
  24. 'thumbnail': 're:^https?://.+\.jpg',
  25. },
  26. }, {
  27. 'url': 'http://www.bilibili.com/video/av1041170/',
  28. 'info_dict': {
  29. 'id': '1041170',
  30. 'title': '【BD1080P】刀语【诸神&异域】',
  31. },
  32. 'playlist_count': 9,
  33. }]
  34. def _real_extract(self, url):
  35. video_id = self._match_id(url)
  36. webpage = self._download_webpage(url, video_id)
  37. if '(此视频不存在或被删除)' in webpage:
  38. raise ExtractorError(
  39. 'The video does not exist or was deleted', expected=True)
  40. if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
  41. raise ExtractorError(
  42. 'The video is not available in your region due to copyright reasons',
  43. expected=True)
  44. video_code = self._search_regex(
  45. r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
  46. title = self._html_search_meta(
  47. 'media:title', video_code, 'title', fatal=True)
  48. duration_str = self._html_search_meta(
  49. 'duration', video_code, 'duration')
  50. if duration_str is None:
  51. duration = None
  52. else:
  53. duration_mobj = re.match(
  54. r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$',
  55. duration_str)
  56. duration = (
  57. int_or_none(duration_mobj.group('hours'), default=0) * 3600 +
  58. int(duration_mobj.group('minutes')) * 60 +
  59. int(duration_mobj.group('seconds')))
  60. upload_date = unified_strdate(self._html_search_meta(
  61. 'uploadDate', video_code, fatal=False))
  62. thumbnail = self._html_search_meta(
  63. 'thumbnailUrl', video_code, 'thumbnail', fatal=False)
  64. cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
  65. entries = []
  66. lq_page = self._download_webpage(
  67. 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
  68. video_id,
  69. note='Downloading LQ video info'
  70. )
  71. try:
  72. err_info = json.loads(lq_page)
  73. raise ExtractorError(
  74. 'BiliBili said: ' + err_info['error_text'], expected=True)
  75. except ValueError:
  76. pass
  77. lq_doc = ET.fromstring(lq_page)
  78. lq_durls = lq_doc.findall('./durl')
  79. hq_doc = self._download_xml(
  80. 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
  81. video_id,
  82. note='Downloading HQ video info',
  83. fatal=False,
  84. )
  85. if hq_doc is not False:
  86. hq_durls = hq_doc.findall('./durl')
  87. assert len(lq_durls) == len(hq_durls)
  88. else:
  89. hq_durls = itertools.repeat(None)
  90. i = 1
  91. for lq_durl, hq_durl in zip(lq_durls, hq_durls):
  92. formats = [{
  93. 'format_id': 'lq',
  94. 'quality': 1,
  95. 'url': lq_durl.find('./url').text,
  96. 'filesize': int_or_none(
  97. lq_durl.find('./size'), get_attr='text'),
  98. }]
  99. if hq_durl is not None:
  100. formats.append({
  101. 'format_id': 'hq',
  102. 'quality': 2,
  103. 'ext': 'flv',
  104. 'url': hq_durl.find('./url').text,
  105. 'filesize': int_or_none(
  106. hq_durl.find('./size'), get_attr='text'),
  107. })
  108. self._sort_formats(formats)
  109. entries.append({
  110. 'id': '%s_part%d' % (video_id, i),
  111. 'title': title,
  112. 'formats': formats,
  113. 'duration': duration,
  114. 'upload_date': upload_date,
  115. 'thumbnail': thumbnail,
  116. })
  117. i += 1
  118. return {
  119. '_type': 'multi_video',
  120. 'entries': entries,
  121. 'id': video_id,
  122. 'title': title
  123. }