You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

59 lines
1.9 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..utils import unified_strdate
  6. class LibsynIE(InfoExtractor):
  7. _VALID_URL = r'https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)'
  8. _TEST = {
  9. 'url': 'http://html5-player.libsyn.com/embed/episode/id/3377616/',
  10. 'md5': '443360ee1b58007bc3dcf09b41d093bb',
  11. 'info_dict': {
  12. 'id': '3377616',
  13. 'ext': 'mp3',
  14. 'title': "The Daily Show Podcast without Jon Stewart - Episode 12: Bassem Youssef: Egypt's Jon Stewart",
  15. 'description': 'md5:601cb790edd05908957dae8aaa866465',
  16. 'upload_date': '20150220',
  17. },
  18. }
  19. def _real_extract(self, url):
  20. video_id = self._match_id(url)
  21. webpage = self._download_webpage(url, video_id)
  22. formats = [{
  23. 'url': media_url,
  24. } for media_url in set(re.findall('var\s+mediaURL(?:Libsyn)?\s*=\s*"([^"]+)"', webpage))]
  25. podcast_title = self._search_regex(
  26. r'<h2>([^<]+)</h2>', webpage, 'title')
  27. episode_title = self._search_regex(
  28. r'<h3>([^<]+)</h3>', webpage, 'title', default=None)
  29. title = '%s - %s' % (podcast_title, episode_title) if podcast_title else episode_title
  30. description = self._html_search_regex(
  31. r'<div id="info_text_body">(.+?)</div>', webpage,
  32. 'description', fatal=False)
  33. thumbnail = self._search_regex(
  34. r'<img[^>]+class="info-show-icon"[^>]+src="([^"]+)"',
  35. webpage, 'thumbnail', fatal=False)
  36. release_date = unified_strdate(self._search_regex(
  37. r'<div class="release_date">Released: ([^<]+)<', webpage, 'release date', fatal=False))
  38. return {
  39. 'id': video_id,
  40. 'title': title,
  41. 'description': description,
  42. 'thumbnail': thumbnail,
  43. 'upload_date': release_date,
  44. 'formats': formats,
  45. }