You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

155 lines
6.3 KiB

  1. import json
  2. import os
  3. import re
  4. import xml.etree.ElementTree
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. ExtractorError,
  8. formatSeconds,
  9. )
  10. class JustinTVIE(InfoExtractor):
  11. """Information extractor for justin.tv and twitch.tv"""
  12. # TODO: One broadcast may be split into multiple videos. The key
  13. # 'broadcast_id' is the same for all parts, and 'broadcast_part'
  14. # starts at 1 and increases. Can we treat all parts as one video?
  15. _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
  16. (?:
  17. (?P<channelid>[^/]+)|
  18. (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
  19. (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
  20. )
  21. /?(?:\#.*)?$
  22. """
  23. _JUSTIN_PAGE_LIMIT = 100
  24. IE_NAME = u'justin.tv'
  25. _TEST = {
  26. u'url': u'http://www.twitch.tv/thegamedevhub/b/296128360',
  27. u'file': u'296128360.flv',
  28. u'md5': u'ecaa8a790c22a40770901460af191c9a',
  29. u'info_dict': {
  30. u"upload_date": u"20110927",
  31. u"uploader_id": 25114803,
  32. u"uploader": u"thegamedevhub",
  33. u"title": u"Beginner Series - Scripting With Python Pt.1"
  34. }
  35. }
  36. def report_download_page(self, channel, offset):
  37. """Report attempt to download a single page of videos."""
  38. self.to_screen(u'%s: Downloading video information from %d to %d' %
  39. (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
  40. # Return count of items, list of *valid* items
  41. def _parse_page(self, url, video_id):
  42. info_json = self._download_webpage(url, video_id,
  43. u'Downloading video info JSON',
  44. u'unable to download video info JSON')
  45. response = json.loads(info_json)
  46. if type(response) != list:
  47. error_text = response.get('error', 'unknown error')
  48. raise ExtractorError(u'Justin.tv API: %s' % error_text)
  49. info = []
  50. for clip in response:
  51. video_url = clip['video_file_url']
  52. if video_url:
  53. video_extension = os.path.splitext(video_url)[1][1:]
  54. video_date = re.sub('-', '', clip['start_time'][:10])
  55. video_uploader_id = clip.get('user_id', clip.get('channel_id'))
  56. video_id = clip['id']
  57. video_title = clip.get('title', video_id)
  58. info.append({
  59. 'id': video_id,
  60. 'url': video_url,
  61. 'title': video_title,
  62. 'uploader': clip.get('channel_name', video_uploader_id),
  63. 'uploader_id': video_uploader_id,
  64. 'upload_date': video_date,
  65. 'ext': video_extension,
  66. })
  67. return (len(response), info)
  68. def _real_extract(self, url):
  69. mobj = re.match(self._VALID_URL, url)
  70. if mobj is None:
  71. raise ExtractorError(u'invalid URL: %s' % url)
  72. api_base = 'http://api.justin.tv'
  73. paged = False
  74. if mobj.group('channelid'):
  75. paged = True
  76. video_id = mobj.group('channelid')
  77. api = api_base + '/channel/archives/%s.json' % video_id
  78. elif mobj.group('chapterid'):
  79. chapter_id = mobj.group('chapterid')
  80. webpage = self._download_webpage(url, chapter_id)
  81. m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
  82. if not m:
  83. raise ExtractorError(u'Cannot find archive of a chapter')
  84. archive_id = m.group(1)
  85. api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
  86. chapter_info_xml = self._download_webpage(api, chapter_id,
  87. note=u'Downloading chapter information',
  88. errnote=u'Chapter information download failed')
  89. doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
  90. for a in doc.findall('.//archive'):
  91. if archive_id == a.find('./id').text:
  92. break
  93. else:
  94. raise ExtractorError(u'Could not find chapter in chapter information')
  95. video_url = a.find('./video_file_url').text
  96. video_ext = video_url.rpartition('.')[2] or u'flv'
  97. chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
  98. chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
  99. note='Downloading chapter metadata',
  100. errnote='Download of chapter metadata failed')
  101. chapter_info = json.loads(chapter_info_json)
  102. bracket_start = int(doc.find('.//bracket_start').text)
  103. bracket_end = int(doc.find('.//bracket_end').text)
  104. # TODO determine start (and probably fix up file)
  105. # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
  106. #video_url += u'?start=' + TODO:start_timestamp
  107. # bracket_start is 13290, but we want 51670615
  108. self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
  109. u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
  110. info = {
  111. 'id': u'c' + chapter_id,
  112. 'url': video_url,
  113. 'ext': video_ext,
  114. 'title': chapter_info['title'],
  115. 'thumbnail': chapter_info['preview'],
  116. 'description': chapter_info['description'],
  117. 'uploader': chapter_info['channel']['display_name'],
  118. 'uploader_id': chapter_info['channel']['name'],
  119. }
  120. return [info]
  121. else:
  122. video_id = mobj.group('videoid')
  123. api = api_base + '/broadcast/by_archive/%s.json' % video_id
  124. self.report_extraction(video_id)
  125. info = []
  126. offset = 0
  127. limit = self._JUSTIN_PAGE_LIMIT
  128. while True:
  129. if paged:
  130. self.report_download_page(video_id, offset)
  131. page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
  132. page_count, page_info = self._parse_page(page_url, video_id)
  133. info.extend(page_info)
  134. if not paged or page_count != limit:
  135. break
  136. offset += limit
  137. return info