You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

195 lines
7.3 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..compat import (
  5. compat_xpath,
  6. )
  7. from ..utils import (
  8. int_or_none,
  9. parse_duration,
  10. smuggle_url,
  11. unsmuggle_url,
  12. xpath_text,
  13. )
  14. class MicrosoftVirtualAcademyBaseIE(InfoExtractor):
  15. def _extract_base_url(self, course_id, display_id):
  16. return self._download_json(
  17. 'https://api-mlxprod.microsoft.com/services/products/anonymous/%s' % course_id,
  18. display_id, 'Downloading course base URL')
  19. def _extract_chapter_and_title(self, title):
  20. if not title:
  21. return None, None
  22. m = re.search(r'(?P<chapter>\d+)\s*\|\s*(?P<title>.+)', title)
  23. return (int(m.group('chapter')), m.group('title')) if m else (None, title)
  24. class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE):
  25. IE_NAME = 'mva'
  26. IE_DESC = 'Microsoft Virtual Academy videos'
  27. _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/[^/?#&]+-)(?P<course_id>\d+)(?::|\?l=)(?P<id>[\da-zA-Z]+_\d+)' % IE_NAME
  28. _TESTS = [{
  29. 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788?l=gfVXISmEB_6804984382',
  30. 'md5': '7826c44fc31678b12ad8db11f6b5abb9',
  31. 'info_dict': {
  32. 'id': 'gfVXISmEB_6804984382',
  33. 'ext': 'mp4',
  34. 'title': 'Course Introduction',
  35. 'formats': 'mincount:3',
  36. 'subtitles': {
  37. 'en': [{
  38. 'ext': 'ttml',
  39. }],
  40. },
  41. }
  42. }, {
  43. 'url': 'mva:11788:gfVXISmEB_6804984382',
  44. 'only_matching': True,
  45. }]
  46. def _real_extract(self, url):
  47. url, smuggled_data = unsmuggle_url(url, {})
  48. mobj = re.match(self._VALID_URL, url)
  49. course_id = mobj.group('course_id')
  50. video_id = mobj.group('id')
  51. base_url = smuggled_data.get('base_url') or self._extract_base_url(course_id, video_id)
  52. settings = self._download_xml(
  53. '%s/content/content_%s/videosettings.xml?v=1' % (base_url, video_id),
  54. video_id, 'Downloading video settings XML')
  55. _, title = self._extract_chapter_and_title(xpath_text(
  56. settings, './/Title', 'title', fatal=True))
  57. formats = []
  58. for sources in settings.findall(compat_xpath('.//MediaSources')):
  59. sources_type = sources.get('videoType')
  60. for source in sources.findall(compat_xpath('./MediaSource')):
  61. video_url = source.text
  62. if not video_url or not video_url.startswith('http'):
  63. continue
  64. if sources_type == 'smoothstreaming':
  65. formats.extend(self._extract_ism_formats(
  66. video_url, video_id, 'mss', fatal=False))
  67. continue
  68. video_mode = source.get('videoMode')
  69. height = int_or_none(self._search_regex(
  70. r'^(\d+)[pP]$', video_mode or '', 'height', default=None))
  71. codec = source.get('codec')
  72. acodec, vcodec = [None] * 2
  73. if codec:
  74. codecs = codec.split(',')
  75. if len(codecs) == 2:
  76. acodec, vcodec = codecs
  77. elif len(codecs) == 1:
  78. vcodec = codecs[0]
  79. formats.append({
  80. 'url': video_url,
  81. 'format_id': video_mode,
  82. 'height': height,
  83. 'acodec': acodec,
  84. 'vcodec': vcodec,
  85. })
  86. self._sort_formats(formats)
  87. subtitles = {}
  88. for source in settings.findall(compat_xpath('.//MarkerResourceSource')):
  89. subtitle_url = source.text
  90. if not subtitle_url:
  91. continue
  92. subtitles.setdefault('en', []).append({
  93. 'url': '%s/%s' % (base_url, subtitle_url),
  94. 'ext': source.get('type'),
  95. })
  96. return {
  97. 'id': video_id,
  98. 'title': title,
  99. 'subtitles': subtitles,
  100. 'formats': formats
  101. }
  102. class MicrosoftVirtualAcademyCourseIE(MicrosoftVirtualAcademyBaseIE):
  103. IE_NAME = 'mva:course'
  104. IE_DESC = 'Microsoft Virtual Academy courses'
  105. _VALID_URL = r'(?:%s:|https?://(?:mva\.microsoft|(?:www\.)?microsoftvirtualacademy)\.com/[^/]+/training-courses/(?P<display_id>[^/?#&]+)-)(?P<id>\d+)' % IE_NAME
  106. _TESTS = [{
  107. 'url': 'https://mva.microsoft.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
  108. 'info_dict': {
  109. 'id': '11788',
  110. 'title': 'Microsoft Azure Fundamentals: Virtual Machines',
  111. },
  112. 'playlist_count': 36,
  113. }, {
  114. # with emphasized chapters
  115. 'url': 'https://mva.microsoft.com/en-US/training-courses/developing-windows-10-games-with-construct-2-16335',
  116. 'info_dict': {
  117. 'id': '16335',
  118. 'title': 'Developing Windows 10 Games with Construct 2',
  119. },
  120. 'playlist_count': 10,
  121. }, {
  122. 'url': 'https://www.microsoftvirtualacademy.com/en-US/training-courses/microsoft-azure-fundamentals-virtual-machines-11788',
  123. 'only_matching': True,
  124. }, {
  125. 'url': 'mva:course:11788',
  126. 'only_matching': True,
  127. }]
  128. @classmethod
  129. def suitable(cls, url):
  130. return False if MicrosoftVirtualAcademyIE.suitable(url) else super(
  131. MicrosoftVirtualAcademyCourseIE, cls).suitable(url)
  132. def _real_extract(self, url):
  133. mobj = re.match(self._VALID_URL, url)
  134. course_id = mobj.group('id')
  135. display_id = mobj.group('display_id')
  136. base_url = self._extract_base_url(course_id, display_id)
  137. manifest = self._download_json(
  138. '%s/imsmanifestlite.json' % base_url,
  139. display_id, 'Downloading course manifest JSON')['manifest']
  140. organization = manifest['organizations']['organization'][0]
  141. entries = []
  142. for chapter in organization['item']:
  143. chapter_number, chapter_title = self._extract_chapter_and_title(chapter.get('title'))
  144. chapter_id = chapter.get('@identifier')
  145. for item in chapter.get('item', []):
  146. item_id = item.get('@identifier')
  147. if not item_id:
  148. continue
  149. metadata = item.get('resource', {}).get('metadata') or {}
  150. if metadata.get('learningresourcetype') != 'Video':
  151. continue
  152. _, title = self._extract_chapter_and_title(item.get('title'))
  153. duration = parse_duration(metadata.get('duration'))
  154. description = metadata.get('description')
  155. entries.append({
  156. '_type': 'url_transparent',
  157. 'url': smuggle_url(
  158. 'mva:%s:%s' % (course_id, item_id), {'base_url': base_url}),
  159. 'title': title,
  160. 'description': description,
  161. 'duration': duration,
  162. 'chapter': chapter_title,
  163. 'chapter_number': chapter_number,
  164. 'chapter_id': chapter_id,
  165. })
  166. title = organization.get('title') or manifest.get('metadata', {}).get('title')
  167. return self.playlist_result(entries, course_id, title)