You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

119 lines
4.7 KiB

  1. import re
  2. import socket
  3. import xml.etree.ElementTree
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. compat_http_client,
  7. compat_str,
  8. compat_urllib_error,
  9. compat_urllib_request,
  10. ExtractorError,
  11. orderedSet,
  12. unescapeHTML,
  13. )
  class StanfordOpenClassroomIE(InfoExtractor):
      """Extractor for Stanford OpenClassroom (openclassroom.stanford.edu).

      Three kinds of URL are accepted by ``_VALID_URL``:

      * a video page (``course`` and ``video`` query parameters present),
        which yields a single info dict;
      * a course page (only ``course`` present), which yields the info
        dicts of every video page linked from it;
      * the site root / home page, which yields the info dicts of every
        course page linked from it.
      """

      IE_NAME = u'stanfordoc'
      IE_DESC = u'Stanford Open ClassRoom'
      # The dots of the host name are escaped so they only match a literal '.'.
      _VALID_URL = r'^(?:https?://)?openclassroom\.stanford\.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
      _TEST = {
          u'url': u'http://openclassroom.stanford.edu/MainFolder/VideoPage.php?course=PracticalUnix&video=intro-environment&speed=100',
          u'file': u'PracticalUnix_intro-environment.mp4',
          u'md5': u'544a9468546059d4e80d76265b0443b8',
          u'info_dict': {
              u"title": u"Intro Environment"
          }
      }

      # Common prefix of every page and media URL built by this extractor.
      _MAIN_FOLDER_URL = 'http://openclassroom.stanford.edu/MainFolder/'
      # Relative links to follow on a course page and on the home page.
      # Raw strings keep '\?' and '\.' as regex escapes rather than (invalid)
      # string escapes.
      _VIDEO_LINK_RE = r'<a href="(VideoPage\.php\?[^"]+)">'
      _COURSE_LINK_RE = r'<a href="(CoursePage\.php\?[^"]+)">'

      def _real_extract(self, url):
          """Extract the video(s) reachable from *url*.

          Returns a list of info dicts: one entry for a video page, or the
          concatenated results of every linked page for a course page or
          the home page.

          Raises ExtractorError if *url* does not match ``_VALID_URL`` or if
          a required download or metadata lookup fails.
          """
          mobj = re.match(self._VALID_URL, url)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)

          course = mobj.group('course')
          video = mobj.group('video')

          if course and video:  # A specific video
              return [self._extract_video(course, video)]

          if course:  # A course page
              coursepage = self._download_webpage(
                  url, course,
                  note='Downloading course info page',
                  errnote='Unable to download course info page')
              return self._extract_linked_pages(coursepage, self._VIDEO_LINK_RE)

          # Root page. It is fetched through _download_webpage (like the
          # course page) so that the link regex runs on text: matching a
          # str pattern against the raw bytes returned by urlopen().read()
          # raises TypeError on Python 3.
          rootpage = self._download_webpage(
              self._MAIN_FOLDER_URL + 'HomePage.php', 'Stanford OpenClassroom',
              errnote=u'Unable to download course info page')
          return self._extract_linked_pages(rootpage, self._COURSE_LINK_RE)

      def _extract_video(self, course, video):
          """Build the info dict of one video from its metadata XML file.

          The metadata lives at ``<course>/videos/<video>.xml`` below the
          courses folder and must provide ``<title>`` and ``<videoFile>``
          elements; the media URL is ``<videoFile>`` resolved against the
          same folder.

          Raises ExtractorError if the XML cannot be downloaded or lacks the
          required elements.
          """
          info = {
              'id': course + '_' + video,
              'uploader': None,
              'upload_date': None,
          }
          self.report_extraction(info['id'])

          base_url = self._MAIN_FOLDER_URL + 'courses/' + course + '/videos/'
          xml_url = base_url + video + '.xml'
          try:
              handle = compat_urllib_request.urlopen(xml_url)
              try:
                  meta_xml = handle.read()
              finally:
                  # Always release the connection, even if read() fails.
                  handle.close()
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

          mdoc = xml.etree.ElementTree.fromstring(meta_xml)
          title_el = mdoc.find('./title')
          file_el = mdoc.find('./videoFile')
          # Compare against None explicitly: an Element without children is
          # falsy. An empty <videoFile> is rejected as well, since no media
          # URL can be built from it.
          if title_el is None or file_el is None or not file_el.text:
              raise ExtractorError(u'Invalid metadata XML file')

          info['title'] = title_el.text
          info['url'] = base_url + file_el.text
          info['ext'] = info['url'].rpartition('.')[2]
          return info

      def _extract_linked_pages(self, webpage, link_re):
          """Run the extractor on every distinct page linked from *webpage*.

          *link_re* must capture the link target relative to the MainFolder
          directory. Links are followed in order of first appearance and the
          resulting lists of info dicts are concatenated.
          """
          results = []
          for link in orderedSet(re.findall(link_re, webpage)):
              results.extend(
                  self.extract(self._MAIN_FOLDER_URL + unescapeHTML(link)))
          return results