Merge branch 'the-daily-show-podcast' of https://github.com/fstirlitz/youtube-dl into fstirlitz-the-daily-show-podcast

10 years ago · ef249a2cd7
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -84,7 +84,7 @@ from .cnn import (
 )
 from .collegehumor import CollegeHumorIE
 from .collegerama import CollegeRamaIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE
 from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE, TheDailyShowPodcastIE
 from .comcarcoff import ComCarCoffIE
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .condenast import CondeNastIE
@ -251,6 +251,7 @@ from .letv import (
    LetvPlaylistIE
 )
 from .lifenews import LifeNewsIE
 from .libsyn import LibsynIE
 from .liveleak import LiveLeakIE
 from .livestream import (
    LivestreamIE,
--- a/youtube_dl/extractor/comedycentral.py
+++ b/youtube_dl/extractor/comedycentral.py
@ -2,6 +2,7 @@ from __future__ import unicode_literals
 import re
 from .common import InfoExtractor
 from .mtv import MTVServicesInfoExtractor
 from ..compat import (
    compat_str,
@ -272,3 +273,27 @@ class ComedyCentralShowsIE(MTVServicesInfoExtractor):
            'title': show_name + ' ' + title,
            'description': description,
        }
 class TheDailyShowPodcastIE(InfoExtractor):
    """Extractor for podcast pages under thedailyshow.cc.com/podcast/.

    Finds the Libsyn HTML5 player iframe embedded in the page and returns a
    ``url_transparent`` result pointing at that player URL.
    """
    _VALID_URL = r'(?P<scheme>https?:)?//thedailyshow\.cc\.com/podcast/(?P<id>[a-z\-]+)'
    _TESTS = [{
        'url': 'http://thedailyshow.cc.com/podcast/episodetwelve',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Return a ``url_transparent`` result for the embedded Libsyn player."""
        page_id = self._match_id(url)
        page = self._download_webpage(url, page_id)
        embed_url = self._search_regex(
            r'<iframe(?:\s+[^>]+)?\s*src="'
            r'((?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/[0-9]+)',
            page, 'player URL')

        if embed_url.startswith('//'):
            # Protocol-relative embed: reuse the scheme the page URL was
            # given with, falling back to HTTPS when it had none either.
            page_scheme = re.match(self._VALID_URL, url).group('scheme')
            embed_url = (page_scheme or 'https:') + embed_url

        return {
            '_type': 'url_transparent',
            'url': embed_url,
        }
--- a/youtube_dl/extractor/libsyn.py
+++ b/youtube_dl/extractor/libsyn.py
@ -0,0 +1,50 @@
 # encoding: utf-8
 from .common import InfoExtractor
 from ..utils import (
    unified_strdate,
 )
 class LibsynIE(InfoExtractor):
    """Extractor for episodes served by the Libsyn HTML5 embed player.

    Scrapes the embed page for the episode title, description, release date
    and the media URL(s) declared in the page's inline JavaScript.
    """
    _VALID_URL = r'(?:https?:)?//html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+)(?:/.*)?'
    _TESTS = [{
        'url': "http://html5-player.libsyn.com/embed/episode/id/3377616/",
        'info_dict': {
            'id': "3377616",
            'ext': "mp3",
            'title': "Episode 12: Bassem Youssef: Egypt's Jon Stewart",
            'description': "<p>Bassem Youssef joins executive producer Steve Bodow and senior producer Sara Taksler for a conversation about how&nbsp;<em style=\"font-family: Tahoma, Geneva, sans-serif; font-size: 12.8000001907349px;\">The Daily Show</em>&nbsp;inspired Bassem to create&nbsp;<em style=\"font-family: Tahoma, Geneva, sans-serif; font-size: 12.8000001907349px;\">Al-Bernameg</em>, his massively popular (and now banned) Egyptian news satire program. Sara discusses her soon-to-be-released documentary,&nbsp;<em style=\"font-family: Tahoma, Geneva, sans-serif; font-size: 12.8000001907349px;\">Tickling Giants</em>, which chronicles how Bassem and his staff risked their safety every day to tell jokes.</p>",
        },
    }]

    def _real_extract(self, url):
        """Build the info dict for a single Libsyn embed page.

        ``url`` may be protocol-relative (``_VALID_URL`` allows a missing
        scheme); such URLs are fetched over HTTPS.

        The episode title and both media URL variables are required and a
        missing one aborts extraction. Description and release date are
        optional: when absent they are reported as ``None``.
        """
        if url.startswith('//'):
            url = 'https:' + url
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        title = self._search_regex(
            r'<h3>(.*?)</h3>', webpage, 'episode title')
        # Optional metadata must not abort an otherwise successful extraction.
        description = self._search_regex(
            r'<div id="info_text_body">(.*?)</div>', webpage, 'description',
            fatal=False)
        release_date = self._search_regex(
            r'<div class="release_date">Released: (.*?)</div>', webpage,
            'release date', fatal=False)
        upload_date = unified_strdate(release_date) if release_date else None

        # [^"]+ stops at the closing quote, so the match cannot run on into a
        # later '";' that happens to share the same line.
        libsyn_media_url = self._search_regex(
            r'var mediaURLLibsyn = "(https?://[^"]+)";', webpage,
            'first media URL')
        media_url = self._search_regex(
            r'var mediaURL = "(https?://[^"]+)";', webpage,
            'second media URL')

        # The two variables frequently hold the same URL; list it only once.
        media_urls = [libsyn_media_url]
        if media_url != libsyn_media_url:
            media_urls.append(media_url)
        formats = [{'url': candidate} for candidate in media_urls]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'upload_date': upload_date,
            'formats': formats,
        }