You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

112 lines
4.4 KiB

  1. import re
  2. import socket
  3. import xml.etree.ElementTree
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. compat_http_client,
  7. compat_str,
  8. compat_urllib_error,
  9. compat_urllib_request,
  10. ExtractorError,
  11. orderedSet,
  12. unescapeHTML,
  13. )
  14. class StanfordOpenClassroomIE(InfoExtractor):
  15. """Information extractor for Stanford's Open ClassRoom"""
  16. _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
  17. IE_NAME = u'stanfordoc'
  18. def _real_extract(self, url):
  19. mobj = re.match(self._VALID_URL, url)
  20. if mobj is None:
  21. raise ExtractorError(u'Invalid URL: %s' % url)
  22. if mobj.group('course') and mobj.group('video'): # A specific video
  23. course = mobj.group('course')
  24. video = mobj.group('video')
  25. info = {
  26. 'id': course + '_' + video,
  27. 'uploader': None,
  28. 'upload_date': None,
  29. }
  30. self.report_extraction(info['id'])
  31. baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
  32. xmlUrl = baseUrl + video + '.xml'
  33. try:
  34. metaXml = compat_urllib_request.urlopen(xmlUrl).read()
  35. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  36. raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
  37. mdoc = xml.etree.ElementTree.fromstring(metaXml)
  38. try:
  39. info['title'] = mdoc.findall('./title')[0].text
  40. info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
  41. except IndexError:
  42. raise ExtractorError(u'Invalid metadata XML file')
  43. info['ext'] = info['url'].rpartition('.')[2]
  44. return [info]
  45. elif mobj.group('course'): # A course page
  46. course = mobj.group('course')
  47. info = {
  48. 'id': course,
  49. 'type': 'playlist',
  50. 'uploader': None,
  51. 'upload_date': None,
  52. }
  53. coursepage = self._download_webpage(url, info['id'],
  54. note='Downloading course info page',
  55. errnote='Unable to download course info page')
  56. info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  57. info['description'] = self._html_search_regex('<description>([^<]+)</description>',
  58. coursepage, u'description', fatal=False)
  59. links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
  60. info['list'] = [
  61. {
  62. 'type': 'reference',
  63. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
  64. }
  65. for vpage in links]
  66. results = []
  67. for entry in info['list']:
  68. assert entry['type'] == 'reference'
  69. results += self.extract(entry['url'])
  70. return results
  71. else: # Root page
  72. info = {
  73. 'id': 'Stanford OpenClassroom',
  74. 'type': 'playlist',
  75. 'uploader': None,
  76. 'upload_date': None,
  77. }
  78. self.report_download_webpage(info['id'])
  79. rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
  80. try:
  81. rootpage = compat_urllib_request.urlopen(rootURL).read()
  82. except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
  83. raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
  84. info['title'] = info['id']
  85. links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
  86. info['list'] = [
  87. {
  88. 'type': 'reference',
  89. 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
  90. }
  91. for cpage in links]
  92. results = []
  93. for entry in info['list']:
  94. assert entry['type'] == 'reference'
  95. results += self.extract(entry['url'])
  96. return results