[crackle] Add new extractor

9 years ago · 80f772c28a
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -126,6 +126,7 @@ from .comcarcoff import ComCarCoffIE
 from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
 from .condenast import CondeNastIE
 from .cracked import CrackedIE
 from .crackle import CrackleIE
 from .criterion import CriterionIE
 from .crooksandliars import CrooksAndLiarsIE
 from .crunchyroll import (
--- a/youtube_dl/extractor/comcarcoff.py
+++ b/youtube_dl/extractor/comcarcoff.py
@ -2,6 +2,7 @@
 from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..compat import compat_str
 from ..utils import (
    int_or_none,
    parse_duration,
@ -14,14 +15,13 @@ class ComCarCoffIE(InfoExtractor):
    _TESTS = [{
        'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
        'info_dict': {
            'id': 'miranda-sings-happy-thanksgiving-miranda',
            'id': '2494164',
            'ext': 'mp4',
            'upload_date': '20141127',
            'timestamp': 1417107600,
            'duration': 1232,
            'title': 'Happy Thanksgiving Miranda',
            'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',
            'thumbnail': 'http://ccc.crackle.com/images/s5e4_thumb.jpg',
        },
        'params': {
            'skip_download': 'requires ffmpeg',
@ -39,15 +39,14 @@ class ComCarCoffIE(InfoExtractor):
                r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'),
            display_id)['videoData']

        video_id = full_data['activeVideo']['video']
        video_data = full_data.get('videos', {}).get(video_id) or full_data['singleshots'][video_id]
        display_id = full_data['activeVideo']['video']
        video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id]
        video_id = compat_str(video_data['mediaId'])
        thumbnails = [{
            'url': video_data['images']['thumb'],
        }, {
            'url': video_data['images']['poster'],
        }]
        formats = self._extract_m3u8_formats(
            video_data['mediaUrl'], video_id, ext='mp4')

        timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601(
            video_data.get('pubDate'))
@ -55,6 +54,8 @@ class ComCarCoffIE(InfoExtractor):
            video_data.get('duration'))

        return {
            '_type': 'url_transparent',
            'url': 'crackle:%s' % video_id,
            'id': video_id,
            'display_id': display_id,
            'title': video_data['title'],
@ -62,6 +63,7 @@ class ComCarCoffIE(InfoExtractor):
            'timestamp': timestamp,
            'duration': duration,
            'thumbnails': thumbnails,
            'formats': formats,
            'season_number': int_or_none(video_data.get('season')),
            'episode_number': int_or_none(video_data.get('episode')),
            'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
        }
--- a/youtube_dl/extractor/crackle.py
+++ b/youtube_dl/extractor/crackle.py
@ -0,0 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..utils import int_or_none


 class CrackleIE(InfoExtractor):
    """Extractor for Crackle videos.

    Accepts either the internal ``crackle:<id>`` scheme or a crackle.com
    page URL ending in a numeric video id (see ``_VALID_URL``).

    Formats come from two sources: an Uplynk HLS manifest (best effort) and,
    when the metadata provides a media path, direct HTTP files for every
    entry in ``_MEDIA_FILE_SLOTS``.
    """

    _VALID_URL = r'(?:crackle:|https?://(?:www\.)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
    _TEST = {
        'url': 'http://www.crackle.com/the-art-of-more/2496419',
        'info_dict': {
            'id': '2496419',
            'ext': 'mp4',
            'title': 'Heavy Lies the Head',
            'description': 'md5:bb56aa0708fe7b9a4861535f15c3abca',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        }
    }

    # extracted from http://legacyweb-us.crackle.com/flash/QueryReferrer.ashx
    _SUBTITLE_SERVER = 'http://web-us-az.crackle.com'
    _UPLYNK_OWNER_ID = 'e8773f7770a44dbd886eee4fca16a66b'
    _THUMBNAIL_TEMPLATE = 'http://images-us-am.crackle.com/%stnl_1920x1080.jpg?ts=20140107233116?c=635333335057637614'

    # extracted from http://legacyweb-us.crackle.com/flash/ReferrerRedirect.ashx
    # Maps the file-name suffix appended to the media path to its dimensions.
    _MEDIA_FILE_SLOTS = {
        'c544.flv': {
            'width': 544,
            'height': 306,
        },
        '360p.mp4': {
            'width': 640,
            'height': 360,
        },
        '480p.mp4': {
            'width': 852,
            'height': 478,
        },
        '480p_1mbps.mp4': {
            'width': 852,
            'height': 478,
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the video referenced by *url*.

        Downloads the video's XML metadata, collects HLS and direct HTTP
        formats, the thumbnail and any closed-caption tracks, and returns
        them together with the descriptive attributes of the item.
        """
        video_id = self._match_id(url)
        item = self._download_xml(
            'http://legacyweb-us.crackle.com/app/revamp/vidwallcache.aspx?flags=-1&fm=%s' % video_id, video_id).find('i')
        # The title is the only mandatory attribute; a missing one raises.
        title = item.attrib['t']

        thumbnail = None
        subtitles = {}
        # The HLS manifest is optional, so the download is non-fatal.
        # NOTE(review): `or []` guards against a non-list result (e.g. False)
        # on a failed non-fatal download, since formats are appended below --
        # confirm against InfoExtractor._extract_m3u8_formats.
        formats = self._extract_m3u8_formats(
            'http://content.uplynk.com/ext/%s/%s.m3u8' % (self._UPLYNK_OWNER_ID, video_id),
            video_id, 'mp4', fatal=False) or []

        path = item.attrib.get('p')
        if path:
            thumbnail = self._THUMBNAIL_TEMPLATE % path
            http_base_url = 'http://ahttp.crackle.com/' + path
            for mfs_path, mfs_info in self._MEDIA_FILE_SLOTS.items():
                formats.append({
                    'url': http_base_url + mfs_path,
                    'format_id': mfs_path.split('.')[0],
                    'width': mfs_info['width'],
                    'height': mfs_info['height'],
                })
            for cc in item.findall('cc'):
                locale = cc.attrib.get('l')
                v = cc.attrib.get('v')
                if locale and v:
                    # Append instead of reassigning so that several tracks
                    # for the same locale do not overwrite each other.
                    subtitles.setdefault(locale, []).append({
                        'url': '%s/%s%s_%s.xml' % (self._SUBTITLE_SERVER, path, locale, v),
                        'ext': 'ttml',
                    })
        self._sort_formats(formats, ('width', 'height', 'tbr'))

        # The 'r' attribute is parsed as a base-16 integer. It is optional
        # metadata, so a malformed value must not abort the extraction.
        duration = None
        duration_hex = item.attrib.get('r')
        if duration_hex:
            try:
                duration = int(duration_hex, 16)
            except ValueError:
                duration = None

        return {
            'id': video_id,
            'title': title,
            'description': item.attrib.get('d'),
            'duration': duration,
            'series': item.attrib.get('sn'),
            'season_number': int_or_none(item.attrib.get('se')),
            'episode_number': int_or_none(item.attrib.get('ep')),
            'thumbnail': thumbnail,
            'subtitles': subtitles,
            'formats': formats,
        }