You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

99 lines
3.7 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..compat import (
  5. compat_urllib_request,
  6. compat_urlparse,
  7. )
  8. from ..utils import (
  9. ExtractorError,
  10. determine_ext,
  11. int_or_none,
  12. )
  13. class VoiceRepublicIE(InfoExtractor):
  14. _VALID_URL = r'https?://voicerepublic\.com/(?:talks|embed)/(?P<id>[0-9a-z-]+)'
  15. _TESTS = [{
  16. 'url': 'http://voicerepublic.com/talks/watching-the-watchers-building-a-sousveillance-state',
  17. 'md5': '0554a24d1657915aa8e8f84e15dc9353',
  18. 'info_dict': {
  19. 'id': '2296',
  20. 'display_id': 'watching-the-watchers-building-a-sousveillance-state',
  21. 'ext': 'm4a',
  22. 'title': 'Watching the Watchers: Building a Sousveillance State',
  23. 'description': 'md5:715ba964958afa2398df615809cfecb1',
  24. 'thumbnail': 're:^https?://.*\.(?:png|jpg)$',
  25. 'duration': 1800,
  26. 'view_count': int,
  27. }
  28. }, {
  29. 'url': 'http://voicerepublic.com/embed/watching-the-watchers-building-a-sousveillance-state',
  30. 'only_matching': True,
  31. }]
  32. def _real_extract(self, url):
  33. display_id = self._match_id(url)
  34. req = compat_urllib_request.Request(
  35. compat_urlparse.urljoin(url, '/talks/%s' % display_id))
  36. # Older versions of Firefox get redirected to an "upgrade browser" page
  37. req.add_header('User-Agent', 'youtube-dl')
  38. webpage = self._download_webpage(req, display_id)
  39. if '>Queued for processing, please stand by...<' in webpage:
  40. raise ExtractorError(
  41. 'Audio is still queued for processing', expected=True)
  42. config = self._search_regex(
  43. r'(?s)return ({.+?});\s*\n', webpage,
  44. 'data', default=None)
  45. data = self._parse_json(config, display_id, fatal=False) if config else None
  46. if data:
  47. title = data['title']
  48. description = data.get('teaser')
  49. talk_id = data.get('talk_id') or display_id
  50. talk = data['talk']
  51. duration = int_or_none(talk.get('duration'))
  52. formats = [{
  53. 'url': compat_urlparse.urljoin(url, talk_url),
  54. 'format_id': format_id,
  55. 'ext': determine_ext(talk_url) or format_id,
  56. 'vcodec': 'none',
  57. } for format_id, talk_url in talk['links'].items()]
  58. else:
  59. title = self._og_search_title(webpage)
  60. description = self._html_search_regex(
  61. r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>",
  62. webpage, 'description', fatal=False)
  63. talk_id = self._search_regex(
  64. [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"],
  65. webpage, 'talk id', default=None) or display_id
  66. duration = None
  67. player = self._search_regex(
  68. r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player')
  69. formats = [{
  70. 'url': compat_urlparse.urljoin(url, talk_url),
  71. 'format_id': format_id,
  72. 'ext': determine_ext(talk_url) or format_id,
  73. 'vcodec': 'none',
  74. } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)]
  75. self._sort_formats(formats)
  76. thumbnail = self._og_search_thumbnail(webpage)
  77. view_count = int_or_none(self._search_regex(
  78. r"class='play-count[^']*'>\s*(\d+) plays",
  79. webpage, 'play count', fatal=False))
  80. return {
  81. 'id': talk_id,
  82. 'display_id': display_id,
  83. 'title': title,
  84. 'description': description,
  85. 'thumbnail': thumbnail,
  86. 'duration': duration,
  87. 'view_count': view_count,
  88. 'formats': formats,
  89. }