You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

222 lines
9.1 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .subtitles import SubtitlesInfoExtractor
  4. from ..utils import ExtractorError
  5. class BBCCoUkIE(SubtitlesInfoExtractor):
  6. IE_NAME = 'bbc.co.uk'
  7. IE_DESC = 'BBC iPlayer'
  8. _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/(?:programmes|iplayer/episode)/(?P<id>[\da-z]{8})'
  9. _TESTS = [
  10. {
  11. 'url': 'http://www.bbc.co.uk/programmes/b039g8p7',
  12. 'info_dict': {
  13. 'id': 'b039d07m',
  14. 'ext': 'flv',
  15. 'title': 'Kaleidoscope: Leonard Cohen',
  16. 'description': 'md5:db4755d7a665ae72343779f7dacb402c',
  17. 'duration': 1740,
  18. },
  19. 'params': {
  20. # rtmp download
  21. 'skip_download': True,
  22. }
  23. },
  24. {
  25. 'url': 'http://www.bbc.co.uk/iplayer/episode/b00yng5w/The_Man_in_Black_Series_3_The_Printed_Name/',
  26. 'info_dict': {
  27. 'id': 'b00yng1d',
  28. 'ext': 'flv',
  29. 'title': 'The Man in Black: Series 3: The Printed Name',
  30. 'description': "Mark Gatiss introduces Nicholas Pierpan's chilling tale of a writer's devilish pact with a mysterious man. Stars Ewan Bailey.",
  31. 'duration': 1800,
  32. },
  33. 'params': {
  34. # rtmp download
  35. 'skip_download': True,
  36. },
  37. 'skip': 'Episode is no longer available on BBC iPlayer Radio',
  38. },
  39. {
  40. 'url': 'http://www.bbc.co.uk/iplayer/episode/b03vhd1f/The_Voice_UK_Series_3_Blind_Auditions_5/',
  41. 'info_dict': {
  42. 'id': 'b00yng1d',
  43. 'ext': 'flv',
  44. 'title': 'The Voice UK: Series 3: Blind Auditions 5',
  45. 'description': "Emma Willis and Marvin Humes present the fifth set of blind auditions in the singing competition, as the coaches continue to build their teams based on voice alone.",
  46. 'duration': 5100,
  47. },
  48. 'params': {
  49. # rtmp download
  50. 'skip_download': True,
  51. },
  52. 'skip': 'Currently BBC iPlayer TV programmes are available to play in the UK only',
  53. }
  54. ]
  55. def _extract_asx_playlist(self, connection, programme_id):
  56. asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist')
  57. return [ref.get('href') for ref in asx.findall('./Entry/ref')]
  58. def _extract_connection(self, connection, programme_id):
  59. formats = []
  60. protocol = connection.get('protocol')
  61. supplier = connection.get('supplier')
  62. if protocol == 'http':
  63. href = connection.get('href')
  64. # ASX playlist
  65. if supplier == 'asx':
  66. for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)):
  67. formats.append({
  68. 'url': ref,
  69. 'format_id': 'ref%s_%s' % (i, supplier),
  70. })
  71. # Direct link
  72. else:
  73. formats.append({
  74. 'url': href,
  75. 'format_id': supplier,
  76. })
  77. elif protocol == 'rtmp':
  78. application = connection.get('application', 'ondemand')
  79. auth_string = connection.get('authString')
  80. identifier = connection.get('identifier')
  81. server = connection.get('server')
  82. formats.append({
  83. 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string),
  84. 'play_path': identifier,
  85. 'app': '%s?%s' % (application, auth_string),
  86. 'page_url': 'http://www.bbc.co.uk',
  87. 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf',
  88. 'rtmp_live': False,
  89. 'ext': 'flv',
  90. 'format_id': supplier,
  91. })
  92. return formats
  93. def _extract_items(self, playlist):
  94. return playlist.findall('./{http://bbc.co.uk/2008/emp/playlist}item')
  95. def _extract_medias(self, media_selection):
  96. return media_selection.findall('./{http://bbc.co.uk/2008/mp/mediaselection}media')
  97. def _extract_connections(self, media):
  98. return media.findall('./{http://bbc.co.uk/2008/mp/mediaselection}connection')
  99. def _extract_video(self, media, programme_id):
  100. formats = []
  101. vbr = int(media.get('bitrate'))
  102. vcodec = media.get('encoding')
  103. service = media.get('service')
  104. width = int(media.get('width'))
  105. height = int(media.get('height'))
  106. file_size = int(media.get('media_file_size'))
  107. for connection in self._extract_connections(media):
  108. conn_formats = self._extract_connection(connection, programme_id)
  109. for format in conn_formats:
  110. format.update({
  111. 'format_id': '%s_%s' % (service, format['format_id']),
  112. 'width': width,
  113. 'height': height,
  114. 'vbr': vbr,
  115. 'vcodec': vcodec,
  116. 'filesize': file_size,
  117. })
  118. formats.extend(conn_formats)
  119. return formats
  120. def _extract_audio(self, media, programme_id):
  121. formats = []
  122. abr = int(media.get('bitrate'))
  123. acodec = media.get('encoding')
  124. service = media.get('service')
  125. for connection in self._extract_connections(media):
  126. conn_formats = self._extract_connection(connection, programme_id)
  127. for format in conn_formats:
  128. format.update({
  129. 'format_id': '%s_%s' % (service, format['format_id']),
  130. 'abr': abr,
  131. 'acodec': acodec,
  132. })
  133. formats.extend(conn_formats)
  134. return formats
  135. def _extract_captions(self, media, programme_id):
  136. subtitles = {}
  137. for connection in self._extract_connections(media):
  138. captions = self._download_xml(connection.get('href'), programme_id, 'Downloading captions')
  139. lang = captions.get('{http://www.w3.org/XML/1998/namespace}lang', 'en')
  140. ps = captions.findall('./{0}body/{0}div/{0}p'.format('{http://www.w3.org/2006/10/ttaf1}'))
  141. srt = ''
  142. for pos, p in enumerate(ps):
  143. srt += '%s\r\n%s --> %s\r\n%s\r\n\r\n' % (str(pos), p.get('begin'), p.get('end'),
  144. p.text.strip() if p.text is not None else '')
  145. subtitles[lang] = srt
  146. return subtitles
  147. def _real_extract(self, url):
  148. mobj = re.match(self._VALID_URL, url)
  149. group_id = mobj.group('id')
  150. webpage = self._download_webpage(url, group_id, 'Downloading video page')
  151. if re.search(r'id="emp-error" class="notinuk">', webpage):
  152. raise ExtractorError('Currently BBC iPlayer TV programmes are available to play in the UK only',
  153. expected=True)
  154. playlist = self._download_xml('http://www.bbc.co.uk/iplayer/playlist/%s' % group_id, group_id,
  155. 'Downloading playlist XML')
  156. no_items = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}noItems')
  157. if no_items is not None:
  158. reason = no_items.get('reason')
  159. if reason == 'preAvailability':
  160. msg = 'Episode %s is not yet available' % group_id
  161. elif reason == 'postAvailability':
  162. msg = 'Episode %s is no longer available' % group_id
  163. else:
  164. msg = 'Episode %s is not available: %s' % (group_id, reason)
  165. raise ExtractorError(msg, expected=True)
  166. formats = []
  167. subtitles = None
  168. for item in self._extract_items(playlist):
  169. kind = item.get('kind')
  170. if kind != 'programme' and kind != 'radioProgramme':
  171. continue
  172. title = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}title').text
  173. description = playlist.find('./{http://bbc.co.uk/2008/emp/playlist}summary').text
  174. programme_id = item.get('identifier')
  175. duration = int(item.get('duration'))
  176. media_selection = self._download_xml(
  177. 'http://open.live.bbc.co.uk/mediaselector/5/select/version/2.0/mediaset/pc/vpid/%s' % programme_id,
  178. programme_id, 'Downloading media selection XML')
  179. for media in self._extract_medias(media_selection):
  180. kind = media.get('kind')
  181. if kind == 'audio':
  182. formats.extend(self._extract_audio(media, programme_id))
  183. elif kind == 'video':
  184. formats.extend(self._extract_video(media, programme_id))
  185. elif kind == 'captions':
  186. subtitles = self._extract_captions(media, programme_id)
  187. if self._downloader.params.get('listsubtitles', False):
  188. self._list_available_subtitles(programme_id, subtitles)
  189. return
  190. self._sort_formats(formats)
  191. return {
  192. 'id': programme_id,
  193. 'title': title,
  194. 'description': description,
  195. 'duration': duration,
  196. 'formats': formats,
  197. 'subtitles': subtitles,
  198. }