|
|
@ -282,6 +282,14 @@ def _simplify_title(title): |
|
|
|
expr = re.compile(ur'[^\w\d_\-]+', flags=re.UNICODE) |
|
|
|
return expr.sub(u'_', title).strip(u'_') |
|
|
|
|
|
|
|
def _orderedSet(iterable): |
|
|
|
""" Remove all duplicates from the input iterable """ |
|
|
|
res = [] |
|
|
|
for el in iterable: |
|
|
|
if el not in res: |
|
|
|
res.append(el) |
|
|
|
return res |
|
|
|
|
|
|
|
class DownloadError(Exception): |
|
|
|
"""Download Error exception. |
|
|
|
|
|
|
@ -711,25 +719,6 @@ class FileDownloader(object): |
|
|
|
return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' |
|
|
|
return None |
|
|
|
|
|
|
|
def process_dict(self, info_dict):
    """Download and handle the extracted information.

    The item is dispatched on info_dict['type']: 'playlist' items are
    passed to _process_playlist and 'legacy-video' items to process_info.
    Any other type raises ValueError.

    For details on the specification of the various types of content,
    refer to the _process_* functions.
    """
    item_type = info_dict['type']

    if item_type == 'playlist':
        self._process_playlist(info_dict)
        return

    if item_type == 'legacy-video':
        self.process_info(info_dict)
        return

    raise ValueError('Invalid item type')
|
|
|
|
|
|
|
def _process_playlist(self, info_dict): |
|
|
|
assert info_dict['type'] == 'playlist' |
|
|
|
assert 'title' in info_dict |
|
|
|
assert 'stitle' in info_dict |
|
|
|
entries = info_dict['list'] |
|
|
|
|
|
|
|
for e in entries: |
|
|
|
self.process_dict(e) |
|
|
|
|
|
|
|
def process_info(self, info_dict): |
|
|
|
"""Process a single dictionary returned by an InfoExtractor.""" |
|
|
|
|
|
|
@ -3766,9 +3755,13 @@ class MixcloudIE(InfoExtractor): |
|
|
|
class StanfordOpenClassroomIE(InfoExtractor): |
|
|
|
"""Information extractor for Stanford's Open ClassRoom""" |
|
|
|
|
|
|
|
_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' |
|
|
|
_VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$' |
|
|
|
IE_NAME = u'stanfordoc' |
|
|
|
|
|
|
|
def report_download_webpage(self, objid):
    """Report that the webpage for objid is being downloaded.

    Sends a '[IE_NAME] objid: Downloading webpage' line to the
    downloader's to_screen.
    """
    message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
    self._downloader.to_screen(message)
|
|
|
|
|
|
|
def report_extraction(self, video_id):
    """Report that information for video_id is being extracted.

    Sends a '[IE_NAME] video_id: Extracting information' line to the
    downloader's to_screen.
    """
    message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
    self._downloader.to_screen(message)
|
|
@ -3792,7 +3785,7 @@ class StanfordOpenClassroomIE(InfoExtractor): |
|
|
|
try: |
|
|
|
metaXml = urllib2.urlopen(xmlUrl).read() |
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err: |
|
|
|
self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % str(err)) |
|
|
|
self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % unicode(err)) |
|
|
|
return |
|
|
|
mdoc = xml.etree.ElementTree.fromstring(metaXml) |
|
|
|
try: |
|
|
@ -3809,13 +3802,74 @@ class StanfordOpenClassroomIE(InfoExtractor): |
|
|
|
self._downloader.process_info(info) |
|
|
|
except UnavailableVideoError, err: |
|
|
|
self._downloader.trouble(u'\nERROR: unable to download video') |
|
|
|
else: |
|
|
|
print('TODO: Not yet implemented') |
|
|
|
1/0 |
|
|
|
elif mobj.group('course'): # A course page |
|
|
|
unescapeHTML = HTMLParser.HTMLParser().unescape |
|
|
|
|
|
|
|
course = mobj.group('course') |
|
|
|
info = { |
|
|
|
'id': _simplify_title(course), |
|
|
|
'type': 'playlist', |
|
|
|
} |
|
|
|
|
|
|
|
self.report_download_webpage(info['id']) |
|
|
|
try: |
|
|
|
coursepage = urllib2.urlopen(url).read() |
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err: |
|
|
|
self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) |
|
|
|
return |
|
|
|
|
|
|
|
m = re.search('<h1>([^<]+)</h1>', coursepage) |
|
|
|
if m: |
|
|
|
info['title'] = unescapeHTML(m.group(1)) |
|
|
|
else: |
|
|
|
info['title'] = info['id'] |
|
|
|
info['stitle'] = _simplify_title(info['title']) |
|
|
|
|
|
|
|
m = re.search('<description>([^<]+)</description>', coursepage) |
|
|
|
if m: |
|
|
|
info['description'] = unescapeHTML(m.group(1)) |
|
|
|
|
|
|
|
links = _orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage)) |
|
|
|
info['list'] = [ |
|
|
|
{ |
|
|
|
'type': 'reference', |
|
|
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage), |
|
|
|
} |
|
|
|
for vpage in links] |
|
|
|
|
|
|
|
for entry in info['list']: |
|
|
|
assert entry['type'] == 'reference' |
|
|
|
self.extract(entry['url']) |
|
|
|
else: # Root page |
|
|
|
unescapeHTML = HTMLParser.HTMLParser().unescape |
|
|
|
|
|
|
|
info = { |
|
|
|
'id': 'Stanford OpenClassroom', |
|
|
|
'type': 'playlist', |
|
|
|
} |
|
|
|
|
|
|
|
self.report_download_webpage(info['id']) |
|
|
|
rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php' |
|
|
|
try: |
|
|
|
rootpage = urllib2.urlopen(rootURL).read() |
|
|
|
except (urllib2.URLError, httplib.HTTPException, socket.error), err: |
|
|
|
self._downloader.trouble(u'ERROR: unable to download course info page: ' + unicode(err)) |
|
|
|
return |
|
|
|
|
|
|
|
info['title'] = info['id'] |
|
|
|
info['stitle'] = _simplify_title(info['title']) |
|
|
|
|
|
|
|
links = _orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage)) |
|
|
|
info['list'] = [ |
|
|
|
{ |
|
|
|
'type': 'reference', |
|
|
|
'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage), |
|
|
|
} |
|
|
|
for cpage in links] |
|
|
|
|
|
|
|
for entry in info['list']: |
|
|
|
assert entry['type'] == 'reference' |
|
|
|
self.extract(entry['url']) |
|
|
|
|
|
|
|
|
|
|
|
class PostProcessor(object): |
|
|
|