You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

192 lines
7.1 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..compat import (
  5. compat_xpath,
  6. )
  7. from ..utils import (
  8. int_or_none,
  9. parse_duration,
  10. smuggle_url,
  11. unsmuggle_url,
  12. xpath_text,
  13. )
  class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
      """Helpers shared by the Microsoft Virtual Academy extractors."""

      def _extract_base_url(self, course_id, display_id):
          """Fetch the JSON document published for *course_id* and return it.

          Callers in this file use the returned value as the URL prefix for
          the course's resources. *display_id* is forwarded to
          ``_download_json`` as the identifier of the request.
          """
          api_url = 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id
          return self._download_json(
              api_url, display_id, 'Downloading course base URL')

      def _extract_chapter_and_title(self, title):
          """Split a ``"<digits> | <text>"`` string into ``(number, text)``.

          Returns ``(None, None)`` when *title* is empty, ``(None, title)``
          when the ``<digits> | <text>`` pattern is not found anywhere in it,
          and ``(int, str)`` otherwise.
          """
          if not title:
              return None, None
          match = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
          if match is None:
              return None, title
          return int(match.group('chapter')), match.group('title')
  class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
      """Extractor for a single Microsoft Virtual Academy video."""

      IE_NAME = 'mva'
      IE_DESC = 'Microsoft Virtual Academy videos'

      # Matches either the internal "mva:<course_id>:<id>" form or a public
      # training-course URL carrying the video id in the "l" query parameter.
      _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME

      _TESTS = [{
          'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
          'md5': '7826c44fc31678b12ad8db11f6b5abb9',
          'info_dict': {
              'id': 'gfVXISmEB_6804984382',
              'ext': 'mp4',
              'title': 'Course Introduction',
              'formats': 'mincount:3',
              'subtitles': {
                  'en': [{
                      'ext': 'ttml',
                  }],
              },
          }
      }, {
          'url': 'mva:11788:gfVXISmEB_6804984382',
          'only_matching': True,
      }]

      def _real_extract(self, url):
          """Build the info dict (id, title, formats, subtitles) for one video.

          A ``base_url`` smuggled into *url* is used when present; otherwise
          it is looked up through ``_extract_base_url``. Formats and English
          subtitles are read from the video settings XML.
          """
          url, smuggled_data = unsmuggle_url(url, {})

          match = re.match(self._VALID_URL, url)
          course_id, video_id = match.group('course_id'), match.group('id')

          base_url = smuggled_data.get('base_url')
          if not base_url:
              base_url = self._extract_base_url(course_id, video_id)

          settings_url = '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id)
          settings = self._download_xml(
              settings_url, video_id, 'Downloading video settings XML')

          raw_title = xpath_text(settings, './/Title', 'title', fatal=True)
          # Only the textual part is kept; any chapter number is discarded.
          title = self._extract_chapter_and_title(raw_title)[1]

          formats = []
          for source_group in settings.findall(compat_xpath('.//MediaSources')):
              # Smooth streaming source groups are skipped entirely.
              if source_group.get('videoType') == 'smoothstreaming':
                  continue
              for media_source in source_group.findall(compat_xpath('./MediaSource')):
                  video_url = media_source.text
                  if not (video_url and video_url.startswith('http')):
                      continue

                  video_mode = media_source.get('videoMode')
                  # The height is only known for modes shaped like "720p".
                  height = int_or_none(self._search_regex(
                      r'^(\d+)[pP]$', video_mode or '', 'height', default=None))

                  codec = media_source.get('codec')
                  codec_parts = codec.split(',') if codec else []
                  if len(codec_parts) == 2:
                      # Two entries are read as "<audio>,<video>".
                      acodec, vcodec = codec_parts
                  elif len(codec_parts) == 1:
                      acodec, vcodec = None, codec_parts[0]
                  else:
                      acodec, vcodec = None, None

                  formats.append({
                      'url': video_url,
                      'format_id': video_mode,
                      'height': height,
                      'acodec': acodec,
                      'vcodec': vcodec,
                  })
          self._sort_formats(formats)

          # Every marker resource with a non-empty path is filed under 'en'.
          english_tracks = []
          for marker in settings.findall(compat_xpath('.//MarkerResourceSource')):
              subtitle_path = marker.text
              if subtitle_path:
                  english_tracks.append({
                      'url': '%s/%s' % (base_url, subtitle_path),
                      'ext': marker.get('type'),
                  })
          subtitles = {'en': english_tracks} if english_tracks else {}

          return {
              'id': video_id,
              'title': title,
              'subtitles': subtitles,
              'formats': formats
          }
  class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
      """Extractor for a whole Microsoft Virtual Academy course (a playlist)."""

      IE_NAME = 'mva:course'
      IE_DESC = 'Microsoft Virtual Academy courses'

      # Matches either the internal "mva:course:<id>" form (no display_id) or
      # a public training-course URL of the form ".../<display_id>-<id>".
      _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME

      _TESTS = [{
          'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
          'info_dict': {
              'id': '11788',
              'title': 'Microsoft Azure Fundamentals: Virtual Machines',
          },
          'playlist_count': 36,
      }, {
          # with emphasized chapters
          'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
          'info_dict': {
              'id': '16335',
              'title': 'Developing Windows 10 Games with Construct 2',
          },
          'playlist_count': 10,
      }, {
          'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
          'only_matching': True,
      }, {
          'url': 'mva:course:11788',
          'only_matching': True,
      }]

      @classmethod
      def suitable(cls, url):
          """Return False for URLs MicrosoftVirtualAcademyIE already accepts.

          A single-video URL also satisfies this class's ``_VALID_URL``, so
          the single-video extractor is given precedence.
          """
          if MicrosoftVirtualAcademyIE.suitable(url):
              return False
          return super(MicrosoftVirtualAcademyCourseIE, cls).suitable(url)

      def _real_extract(self, url):
          """Return a playlist with one ``url_transparent`` entry per video.

          The course manifest is walked chapter by chapter; items without an
          identifier or whose ``learningresourcetype`` is not ``'Video'`` are
          skipped. Each entry points at ``mva:<course_id>:<item_id>`` with the
          course base URL smuggled in, so it does not have to be looked up
          again per video.

          Raises ``KeyError``/``IndexError`` (unchanged from before) when the
          manifest lacks the ``manifest``/``organizations``/``item`` skeleton.
          """
          mobj = re.match(self._VALID_URL, url)
          course_id = mobj.group('id')
          # 'mva:course:<id>' URLs have no slug, so the group is None there;
          # fall back to the course id so requests still carry an identifier.
          display_id = mobj.group('display_id') or course_id

          base_url = self._extract_base_url(course_id, display_id)

          manifest = self._download_json(
              '%s/imsmanifestlite.json' % base_url,
              display_id, 'Downloading course manifest JSON')['manifest']

          organization = manifest['organizations']['organization'][0]

          entries = []
          for chapter in organization['item']:
              chapter_number, chapter_title = self._extract_chapter_and_title(
                  chapter.get('title'))
              chapter_id = chapter.get('@identifier')
              # dict.get's default is used only when the key is absent; a JSON
              # null would come back as None, hence the explicit "or".
              for item in chapter.get('item') or []:
                  item_id = item.get('@identifier')
                  if not item_id:
                      continue
                  metadata = (item.get('resource') or {}).get('metadata') or {}
                  if metadata.get('learningresourcetype') != 'Video':
                      continue
                  _, video_title = self._extract_chapter_and_title(item.get('title'))
                  entries.append({
                      '_type': 'url_transparent',
                      'url': smuggle_url(
                          'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
                      'title': video_title,
                      'description': metadata.get('description'),
                      'duration': parse_duration(metadata.get('duration')),
                      'chapter': chapter_title,
                      'chapter_number': chapter_number,
                      'chapter_id': chapter_id,
                  })

          # Prefer the organization title, then the manifest metadata title.
          course_title = (
              organization.get('title')
              or (manifest.get('metadata') or {}).get('title'))

          return self.playlist_result(entries, course_id, course_title)