You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

144 lines
4.6 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. import itertools
  5. import json
  6. from .common import InfoExtractor
  7. from ..compat import (
  8. compat_etree_fromstring,
  9. )
  10. from ..utils import (
  11. int_or_none,
  12. unified_strdate,
  13. ExtractorError,
  14. )
  15. class BiliBiliIE(InfoExtractor):
  16. _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/'
  17. _TESTS = [{
  18. 'url': 'http://www.bilibili.tv/video/av1074402/',
  19. 'md5': '2c301e4dab317596e837c3e7633e7d86',
  20. 'info_dict': {
  21. 'id': '1074402_part1',
  22. 'ext': 'flv',
  23. 'title': '【金坷垃】金泡沫',
  24. 'duration': 308,
  25. 'upload_date': '20140420',
  26. 'thumbnail': 're:^https?://.+\.jpg',
  27. },
  28. }, {
  29. 'url': 'http://www.bilibili.com/video/av1041170/',
  30. 'info_dict': {
  31. 'id': '1041170',
  32. 'title': '【BD1080P】刀语【诸神&异域】',
  33. },
  34. 'playlist_count': 9,
  35. }]
  36. def _real_extract(self, url):
  37. video_id = self._match_id(url)
  38. webpage = self._download_webpage(url, video_id)
  39. if '(此视频不存在或被删除)' in webpage:
  40. raise ExtractorError(
  41. 'The video does not exist or was deleted', expected=True)
  42. if '>你没有权限浏览! 由于版权相关问题 我们不对您所在的地区提供服务<' in webpage:
  43. raise ExtractorError(
  44. 'The video is not available in your region due to copyright reasons',
  45. expected=True)
  46. video_code = self._search_regex(
  47. r'(?s)<div itemprop="video".*?>(.*?)</div>', webpage, 'video code')
  48. title = self._html_search_meta(
  49. 'media:title', video_code, 'title', fatal=True)
  50. duration_str = self._html_search_meta(
  51. 'duration', video_code, 'duration')
  52. if duration_str is None:
  53. duration = None
  54. else:
  55. duration_mobj = re.match(
  56. r'^T(?:(?P<hours>[0-9]+)H)?(?P<minutes>[0-9]+)M(?P<seconds>[0-9]+)S$',
  57. duration_str)
  58. duration = (
  59. int_or_none(duration_mobj.group('hours'), default=0) * 3600 +
  60. int(duration_mobj.group('minutes')) * 60 +
  61. int(duration_mobj.group('seconds')))
  62. upload_date = unified_strdate(self._html_search_meta(
  63. 'uploadDate', video_code, fatal=False))
  64. thumbnail = self._html_search_meta(
  65. 'thumbnailUrl', video_code, 'thumbnail', fatal=False)
  66. cid = self._search_regex(r'cid=(\d+)', webpage, 'cid')
  67. entries = []
  68. lq_page = self._download_webpage(
  69. 'http://interface.bilibili.com/v_cdn_play?appkey=1&cid=%s' % cid,
  70. video_id,
  71. note='Downloading LQ video info'
  72. )
  73. try:
  74. err_info = json.loads(lq_page)
  75. raise ExtractorError(
  76. 'BiliBili said: ' + err_info['error_text'], expected=True)
  77. except ValueError:
  78. pass
  79. lq_doc = compat_etree_fromstring(lq_page)
  80. lq_durls = lq_doc.findall('./durl')
  81. hq_doc = self._download_xml(
  82. 'http://interface.bilibili.com/playurl?appkey=1&cid=%s' % cid,
  83. video_id,
  84. note='Downloading HQ video info',
  85. fatal=False,
  86. )
  87. if hq_doc is not False:
  88. hq_durls = hq_doc.findall('./durl')
  89. assert len(lq_durls) == len(hq_durls)
  90. else:
  91. hq_durls = itertools.repeat(None)
  92. i = 1
  93. for lq_durl, hq_durl in zip(lq_durls, hq_durls):
  94. formats = [{
  95. 'format_id': 'lq',
  96. 'quality': 1,
  97. 'url': lq_durl.find('./url').text,
  98. 'filesize': int_or_none(
  99. lq_durl.find('./size'), get_attr='text'),
  100. }]
  101. if hq_durl is not None:
  102. formats.append({
  103. 'format_id': 'hq',
  104. 'quality': 2,
  105. 'ext': 'flv',
  106. 'url': hq_durl.find('./url').text,
  107. 'filesize': int_or_none(
  108. hq_durl.find('./size'), get_attr='text'),
  109. })
  110. self._sort_formats(formats)
  111. entries.append({
  112. 'id': '%s_part%d' % (video_id, i),
  113. 'title': title,
  114. 'formats': formats,
  115. 'duration': duration,
  116. 'upload_date': upload_date,
  117. 'thumbnail': thumbnail,
  118. })
  119. i += 1
  120. return {
  121. '_type': 'multi_video',
  122. 'entries': entries,
  123. 'id': video_id,
  124. 'title': title
  125. }