You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

105 lines
4.0 KiB

  1. import re
  2. from .common import InfoExtractor
  3. from ..utils import (
  4. ExtractorError,
  5. orderedSet,
  6. unescapeHTML,
  7. )
  8. class StanfordOpenClassroomIE(InfoExtractor):
  9. IE_NAME = u'stanfordoc'
  10. IE_DESC = u'Stanford Open ClassRoom'
  11. _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  12. _TEST = {
  13. u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
  14. u'file': u'PracticalUnix_intro-environment.mp4',
  15. u'md5': u'544a9468546059d4e80d76265b0443b8',
  16. u'info_dict': {
  17. u"title": u"Intro Environment"
  18. }
  19. }
  20. def _real_extract(self, url):
  21. mobj = re.match(self._VALID_URL, url)
  22. if mobj is None:
  23. raise ExtractorError(u'Invalid URL: %s' % url)
  24. if mobj.group('course') and mobj.group('video'): # A specific video
  25. course = mobj.group('course')
  26. video = mobj.group('video')
  27. info = {
  28. 'id': course + '_' + video,
  29. 'uploader': None,
  30. 'upload_date': None,
  31. }
  32. self.report_extraction(info['id'])
  33. baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  34. xmlUrl = baseUrl + video + '.xml'
  35. mdoc = self._download_xml(xmlUrl, info['id'])
  36. try:
  37. info['title'] = mdoc.findall('./title')[0].text
  38. info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  39. except IndexError:
  40. raise ExtractorError(u'Invalid metadata XML file')
  41. info['ext'] = info['url'].rpartition('.')[2]
  42. return [info]
  43. elif mobj.group('course'): # A course page
  44. course = mobj.group('course')
  45. info = {
  46. 'id': course,
  47. 'type': 'playlist',
  48. 'uploader': None,
  49. 'upload_date': None,
  50. }
  51. coursepage = self._download_webpage(url, info['id'],
  52. note='Downloading course info page',
  53. errnote='Unable to download course info page')
  54. info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  55. info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  56. coursepage, u'description', fatal=False)
  57. links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  58. info['list'] = [
  59. {
  60. 'type': 'reference',
  61. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  62. }
  63. for vpage in links]
  64. results = []
  65. for entry in info['list']:
  66. assert entry['type'] == 'reference'
  67. results += self.extract(entry['url'])
  68. return results
  69. else: # Root page
  70. info = {
  71. 'id': 'Stanford OpenClassroom',
  72. 'type': 'playlist',
  73. 'uploader': None,
  74. 'upload_date': None,
  75. }
  76. rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  77. rootpage = self._download_webpage(rootURL, info['id'],
  78. errnote=u'Unable to download course info page')
  79. info['title'] = info['id']
  80. links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
  81. info['list'] = [
  82. {
  83. 'type': 'reference',
  84. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
  85. }
  86. for cpage in links]
  87. results = []
  88. for entry in info['list']:
  89. assert entry['type'] == 'reference'
  90. results += self.extract(entry['url'])
  91. return results