import json
import os
import re
import xml.etree.ElementTree

from .common import InfoExtractor
from ..utils import (
    ExtractorError,
    formatSeconds,
)


class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
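    # A URL may point at a whole channel, at a single broadcast
    # ('/b/<videoid>') or at a chapter of a broadcast ('/c/<chapterid>');
    # the named groups below decide which path _real_extract takes.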
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                       (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        info_json = self._download_webpage(url, video_id,
                                           u'Downloading video info JSON',
                                           u'unable to download video info JSON')

        response = json.loads(info_json)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')
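
            # A chapter page does not carry the broadcast id directly, so the
            # HTML is scraped for PP.archive_id and the broadcast XML is then
            # searched for the matching <archive> entry to get the video URL.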
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                                      note=u'Downloading chapter information',
                                                      errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'
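
            # The XML archive entry only yields the raw file URL; title,
            # thumbnail, description and uploader come from Twitch's Kraken
            # video API for the same chapter id.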
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                                       note='Downloading chapter metadata',
                                                       errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #   youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            # video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
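        # Channel archives are paginated: keep requesting pages of `limit`
        # clips until the API returns a short (or empty) page. Single
        # broadcasts take exactly one pass through this loop.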
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
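
# A hypothetical invocation, assuming this extractor is registered with
# youtube-dl (the channel name and broadcast id below are made up for
# illustration):
#
#     youtube-dl http://www.twitch.tv/somechannel/b/123456789
#
# would match the 'videoid' group of _VALID_URL and fetch clip metadata from
# /broadcast/by_archive/123456789.json before the clips are downloaded.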