Add extractors for video.mit.edu and techtv.mit.edu (closes #1327)

video.mit.edu just embeds the videos from techtv.mit.edu
11 years ago · 67b22dd036
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -50,6 +50,7 @@ from .keek import KeekIE
 from .liveleak import LiveLeakIE
 from .livestream import LivestreamIE
 from .metacafe import MetacafeIE
 from .mit import TechTVMITIE, MITIE
 from .mixcloud import MixcloudIE
 from .mtv import MTVIE
 from .muzu import MuzuTVIE
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -0,0 +1,76 @@
 import re
 import json

 from .common import InfoExtractor
 from ..utils import (
    clean_html,
    get_element_by_id,
 )


 class TechTVMITIE(InfoExtractor):
    """Extractor for videos hosted on techtv.mit.edu.

    Accepts both ``/videos/<id>`` and ``/embeds/<id>`` URLs. The metadata
    (title, description) is read from the video page, while the media
    location, available bitrates and thumbnail are read from the embed page.
    """

    IE_NAME = u'techtv.mit.edu'
    _VALID_URL = r'https?://techtv\.mit\.edu/(videos|embeds)/(?P<id>\d+)'

    _TEST = {
        u'url': u'http://techtv.mit.edu/videos/25418-mit-dna-learning-center-set',
        u'file': u'25418.mp4',
        u'md5': u'1f8cb3e170d41fd74add04d3c9330e5f',
        u'info_dict': {
            u'title': u'MIT DNA Learning Center Set',
            u'description': u'md5:82313335e8a8a3f243351ba55bc1b474',
        },
    }

    def _real_extract(self, url):
        """Return the info dict for the video referenced by *url*.

        Two pages are fetched for the numeric id found in *url*: the regular
        video page and the embed page. The entry with the highest ``bitrate``
        listed on the embed page is the one whose URL is returned.
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # The video page carries the human-readable metadata ...
        video_page_url = 'http://techtv.mit.edu/videos/%s' % video_id
        webpage = self._download_webpage(video_page_url, video_id)
        # ... and the embed page carries the player configuration.
        embed_page_url = 'http://techtv.mit.edu/embeds/%s/' % video_id
        embed_page = self._download_webpage(
            embed_page_url, video_id, note=u'Downloading embed page')

        cdn_root = self._search_regex(
            r'ipadUrl: \'(.+?cloudfront.net/)', embed_page, u'base url')
        bitrates_json = self._search_regex(
            r'bitrates: (\[.+?\])', embed_page, u'video formats')
        # Order ascending by bitrate so the best entry ends up last.
        variants = json.loads(bitrates_json)
        variants.sort(key=lambda variant: variant['bitrate'])

        title = get_element_by_id('edit-title', webpage)
        raw_description = get_element_by_id('edit-description', webpage)
        description = clean_html(raw_description)
        thumbnail = self._search_regex(
            r'playlist:.*?url: \'(.+?)\'', embed_page, u'thumbnail',
            flags=re.DOTALL)

        # The listed url is prefixed with 'mp4:'; drop it before joining
        # with the base url taken from 'ipadUrl'.
        best_variant = variants[-1]
        stream_path = best_variant['url'].replace('mp4:', '')

        info = {
            'id': video_id,
            'title': title,
            'url': cdn_root + stream_path,
            'ext': 'mp4',
            'description': description,
            'thumbnail': thumbnail,
        }
        return info


 class MITIE(TechTVMITIE):
    """Extractor for video.mit.edu watch pages.

    The watch page is only scanned for an ``<iframe>`` source; the actual
    extraction is handed over to the 'TechTVMIT' extractor through a url
    result.
    """

    IE_NAME = u'video.mit.edu'
    _VALID_URL = r'https?://video\.mit\.edu/watch/(?P<title>[^/]+)'

    _TEST = {
        u'url': u'http://video.mit.edu/watch/the-government-is-profiling-you-13222/',
        u'file': u'21783.mp4',
        u'md5': u'7db01d5ccc1895fc5010e9c9e13648da',
        u'info_dict': {
            u'title': u'The Government is Profiling You',
            u'description': u'md5:ad5795fe1e1623b73620dbfd47df9afd',
        },
    }

    def _real_extract(self, url):
        """Return a url result pointing at the iframe embedded in *url*.

        The ``title`` group of the URL is used as the identifier when
        downloading the page and in the progress message.
        """
        slug = re.match(self._VALID_URL, url).group('title')
        watch_page = self._download_webpage(url, slug)

        status = '%s: Extracting %s url' % (slug, TechTVMITIE.IE_NAME)
        self.to_screen(status)

        iframe_src = self._search_regex(
            r'<iframe .*?src="(.+?)"', watch_page, u'embed url')
        return self.url_result(iframe_src, ie='TechTVMIT')