You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

81 lines
2.9 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. determine_ext,
  6. int_or_none,
  7. js_to_json,
  8. unescapeHTML,
  9. )
  10. class StitcherIE(InfoExtractor):
  11. _VALID_URL = r'https?://(?:www\.)?stitcher\.com/podcast/(?:[^/]+/)+e/(?:(?P<display_id>[^/#?&]+?)-)?(?P<id>\d+)(?:[/#?&]|$)'
  12. _TESTS = [{
  13. 'url': 'http://www.stitcher.com/podcast/the-talking-machines/e/40789481?autoplay=true',
  14. 'md5': '391dd4e021e6edeb7b8e68fbf2e9e940',
  15. 'info_dict': {
  16. 'id': '40789481',
  17. 'ext': 'mp3',
  18. 'title': 'Machine Learning Mastery and Cancer Clusters',
  19. 'description': 'md5:55163197a44e915a14a1ac3a1de0f2d3',
  20. 'duration': 1604,
  21. 'thumbnail': 're:^https?://.*\.jpg',
  22. },
  23. }, {
  24. 'url': 'http://www.stitcher.com/podcast/panoply/vulture-tv/e/the-rare-hourlong-comedy-plus-40846275?autoplay=true',
  25. 'info_dict': {
  26. 'id': '40846275',
  27. 'display_id': 'the-rare-hourlong-comedy-plus',
  28. 'ext': 'mp3',
  29. 'title': "The CW's 'Crazy Ex-Girlfriend'",
  30. 'description': 'md5:04f1e2f98eb3f5cbb094cea0f9e19b17',
  31. 'duration': 2235,
  32. 'thumbnail': 're:^https?://.*\.jpg',
  33. },
  34. 'params': {
  35. 'skip_download': True,
  36. },
  37. }, {
  38. # escaped title
  39. 'url': 'http://www.stitcher.com/podcast/marketplace-on-stitcher/e/40910226?autoplay=true',
  40. 'only_matching': True,
  41. }, {
  42. 'url': 'http://www.stitcher.com/podcast/panoply/getting-in/e/episode-2a-how-many-extracurriculars-should-i-have-40876278?autoplay=true',
  43. 'only_matching': True,
  44. }]
  45. def _real_extract(self, url):
  46. mobj = re.match(self._VALID_URL, url)
  47. audio_id = mobj.group('id')
  48. display_id = mobj.group('display_id') or audio_id
  49. webpage = self._download_webpage(url, display_id)
  50. episode = self._parse_json(
  51. js_to_json(self._search_regex(
  52. r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')),
  53. display_id)['config']['episode']
  54. title = unescapeHTML(episode['title'])
  55. formats = [{
  56. 'url': episode[episode_key],
  57. 'ext': determine_ext(episode[episode_key]) or 'mp3',
  58. 'vcodec': 'none',
  59. } for episode_key in ('episodeURL',) if episode.get(episode_key)]
  60. description = self._search_regex(
  61. r'Episode Info:\s*</span>([^<]+)<', webpage, 'description', fatal=False)
  62. duration = int_or_none(episode.get('duration'))
  63. thumbnail = episode.get('episodeImage')
  64. return {
  65. 'id': audio_id,
  66. 'display_id': display_id,
  67. 'title': title,
  68. 'description': description,
  69. 'duration': duration,
  70. 'thumbnail': thumbnail,
  71. 'formats': formats,
  72. }