zolfa
/
youtube-dl

from __future__ import unicode_literals
import re
from .common import InfoExtractorfrom ..utils import (    ExtractorError,    orderedSet,    unescapeHTML,)

class StanfordOpenClassroomIE(InfoExtractor):    IE_NAME = 'stanfordoc'    IE_DESC = 'Stanford Open ClassRoom'    _VALID_URL = r'https?://openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'    _TEST = {        'url': 'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',        'md5': '544a9468546059d4e80d76265b0443b8',        'info_dict': {            'id': 'PracticalUnix_intro-environment',            'ext': 'mp4',            'title': 'Intro Environment',        }    }
    def _real_extract(self, url):        mobj = re.match(self._VALID_URL, url)
        if mobj.group('course') and mobj.group('video'):  # A specific video            course = mobj.group('course')            video = mobj.group('video')            info = {                'id': course + '_' + video,                'uploader': None,                'upload_date': None,            }
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'            xmlUrl = baseUrl + video + '.xml'            mdoc = self._download_xml(xmlUrl, info['id'])            try:                info['title'] = mdoc.findall('./title')[0].text                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text            except IndexError:                raise ExtractorError('Invalid metadata XML file')            return info        elif mobj.group('course'):  # A course page            course = mobj.group('course')            info = {                'id': course,                '_type': 'playlist',                'uploader': None,                'upload_date': None,            }
            coursepage = self._download_webpage(                url, info['id'],                note='Downloading course info page',                errnote='Unable to download course info page')
            info['title'] = self._html_search_regex(                r'<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
            info['description'] = self._html_search_regex(                r'(?s)<description>([^<]+)</description>',                coursepage, 'description', fatal=False)
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))            info['entries'] = [self.url_result(                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)            ) for l in links]            return info        else:  # Root page            info = {                'id': 'Stanford OpenClassroom',                '_type': 'playlist',                'uploader': None,                'upload_date': None,            }            info['title'] = info['id']
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'            rootpage = self._download_webpage(rootURL, info['id'],                                              errnote='Unable to download course info page')
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))            info['entries'] = [self.url_result(                'http://openclassroom.stanford.edu/MainFolder/%s' % unescapeHTML(l)            ) for l in links]            return info