You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

300 lines
12 KiB

10 years ago
10 years ago
10 years ago
10 years ago
11 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. parse_filesize,
  7. qualities,
  8. )
  class Channel9IE(InfoExtractor):
      '''
      Common extractor for channel9.msdn.com.

      The type of provided URL (video or playlist) is determined according to
      the WT.entryid meta tag of the web page HTML rather than the URL itself,
      as it is not always possible to tell from the URL alone.
      '''
      IE_DESC = 'Channel 9'
      IE_NAME = 'channel9'
      # Captures the site-relative path as "contentpath" and an optional
      # trailing "/RSS" segment as "rss"; a query string or fragment may follow.
      _VALID_URL = r'https?://(?:www\.)?channel9\.msdn\.com/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)'
      # NOTE(review): presumably consumed by the project's test runner - confirm.
      # The 're:' thumbnail patterns are raw strings so that the regex escape
      # '\.' is not parsed as an (invalid) string escape sequence.
      _TESTS = [{
          'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002',
          'md5': 'bbd75296ba47916b754e73c3a4bbdf10',
          'info_dict': {
              'id': 'Events/TechEd/Australia/2013/KOS002',
              'ext': 'mp4',
              'title': 'Developer Kick-Off Session: Stuff We Love',
              'description': 'md5:c08d72240b7c87fcecafe2692f80e35f',
              'duration': 4576,
              'thumbnail': r're:http://.*\.jpg',
              'session_code': 'KOS002',
              'session_day': 'Day 1',
              'session_room': 'Arena 1A',
              'session_speakers': ['Ed Blankenship', 'Andrew Coates', 'Brady Gaster', 'Patrick Klug',
                                   'Mads Kristensen'],
          },
      }, {
          'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
          'md5': 'b43ee4529d111bc37ba7ee4f34813e68',
          'info_dict': {
              'id': 'posts/Self-service-BI-with-Power-BI-nuclear-testing',
              'ext': 'mp4',
              'title': 'Self-service BI with Power BI - nuclear testing',
              'description': 'md5:d1e6ecaafa7fb52a2cacdf9599829f5b',
              'duration': 1540,
              'thumbnail': r're:http://.*\.jpg',
              'authors': ['Mike Wilmot'],
          },
      }, {
          # low quality mp4 is best
          'url': 'https://channel9.msdn.com/Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
          'info_dict': {
              'id': 'Events/CPP/CppCon-2015/Ranges-for-the-Standard-Library',
              'ext': 'mp4',
              'title': 'Ranges for the Standard Library',
              'description': 'md5:2e6b4917677af3728c5f6d63784c4c5d',
              'duration': 5646,
              'thumbnail': r're:http://.*\.jpg',
          },
          'params': {
              'skip_download': True,
          },
      }, {
          'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
          'info_dict': {
              'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
              'title': 'Channel 9',
          },
          'playlist_count': 2,
      }, {
          'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
          'only_matching': True,
      }, {
          'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
          'only_matching': True,
      }]
      # Template for the RSS feed of a content path, used when no explicit
      # feed URL is supplied.
      _RSS_URL = 'http://channel9.msdn.com/%s/RSS'
  def _formats_from_html(self, html):
      '''
      Build the list of media formats advertised by download links in html.

      Every download link (an anchor followed by a "usage" span and an
      optional "File size" popup) yields one format dict. The list is handed
      to self._sort_formats() before it is returned.
      '''
      # The inline flag group has to be the very first thing in the pattern:
      # flags placed later are deprecated since Python 3.6 and rejected with
      # re.error from Python 3.11 on.
      FORMAT_REGEX = r'''(?x)
          <a\s+href="(?P<url>[^"]+)">(?P<quality>[^<]+)</a>\s*
          <span\s+class="usage">\((?P<note>[^\)]+)\)</span>\s*
          (?:<div\s+class="popup\s+rounded">\s*
          <h3>File\s+size</h3>\s*(?P<filesize>.*?)\s*
          </div>)?                                                # File size part may be missing
      '''
      # NOTE(review): presumably qualities() ranks names by their position in
      # this tuple - confirm against ..utils.
      quality = qualities((
          'MP3', 'MP4',
          'Low Quality WMV', 'Low Quality MP4',
          'Mid Quality WMV', 'Mid Quality MP4',
          'High Quality WMV', 'High Quality MP4'))
      # re.finditer is consumed lazily; no intermediate list is needed.
      formats = [{
          'url': x.group('url'),
          'format_id': x.group('quality'),
          'format_note': x.group('note'),
          'format': '%s (%s)' % (x.group('quality'), x.group('note')),
          # The filesize group is None when the optional popup is absent.
          'filesize_approx': parse_filesize(x.group('filesize')),
          'quality': quality(x.group('quality')),
          'vcodec': 'none' if x.group('note') == 'Audio only' else None,
      } for x in re.finditer(FORMAT_REGEX, html)]
      self._sort_formats(formats)
      return formats
  def _extract_title(self, html):
      '''
      Return the page title without the trailing " (Channel 9)" suffix.

      The "title" meta tag is looked up first through
      self._html_search_meta(); when that yields None the result of
      self._og_search_title() is used instead.
      '''
      suffix = ' (Channel 9)'
      page_title = self._html_search_meta('title', html, 'title')
      if page_title is None:
          page_title = self._og_search_title(html)
      if page_title is None or not page_title.endswith(suffix):
          return page_title
      # Cut exactly the suffix off the end.
      return page_title[:len(page_title) - len(suffix)]
  109. def _extract_description(self, html):
  110. DESCRIPTION_REGEX = r'''(?sx)
  111. <div\s+class="entry-content">\s*
  112. <div\s+id="entry-body">\s*
  113. (?P<description>.+?)\s*
  114. </div>\s*
  115. </div>
  116. '''
  117. m = re.search(DESCRIPTION_REGEX, html)
  118. if m is not None:
  119. return m.group('description')
  120. return self._html_search_meta('description', html, 'description')
  121. def _extract_duration(self, html):
  122. m = re.search(r'"length": *"(?P<hours>\d{2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})"', html)
  123. return ((int(m.group('hours')) * 60 * 60) + (int(m.group('minutes')) * 60) + int(m.group('seconds'))) if m else None
  124. def _extract_slides(self, html):
  125. m = re.search(r'<a href="(?P<slidesurl>[^"]+)" class="slides">Slides</a>', html)
  126. return m.group('slidesurl') if m is not None else None
  127. def _extract_zip(self, html):
  128. m = re.search(r'<a href="(?P<zipurl>[^"]+)" class="zip">Zip</a>', html)
  129. return m.group('zipurl') if m is not None else None
  130. def _extract_avg_rating(self, html):
  131. m = re.search(r'<p class="avg-rating">Avg Rating: <span>(?P<avgrating>[^<]+)</span></p>', html)
  132. return float(m.group('avgrating')) if m is not None else 0
  133. def _extract_rating_count(self, html):
  134. m = re.search(r'<div class="rating-count">\((?P<ratingcount>[^<]+)\)</div>', html)
  135. return int(self._fix_count(m.group('ratingcount'))) if m is not None else 0
  136. def _extract_view_count(self, html):
  137. m = re.search(r'<li class="views">\s*<span class="count">(?P<viewcount>[^<]+)</span> Views\s*</li>', html)
  138. return int(self._fix_count(m.group('viewcount'))) if m is not None else 0
  139. def _extract_comment_count(self, html):
  140. m = re.search(r'<li class="comments">\s*<a href="#comments">\s*<span class="count">(?P<commentcount>[^<]+)</span> Comments\s*</a>\s*</li>', html)
  141. return int(self._fix_count(m.group('commentcount'))) if m is not None else 0
  142. def _fix_count(self, count):
  143. return int(str(count).replace(',', '')) if count is not None else None
  144. def _extract_authors(self, html):
  145. m = re.search(r'(?s)<li class="author">(.*?)</li>', html)
  146. if m is None:
  147. return None
  148. return re.findall(r'<a href="/Niners/[^"]+">([^<]+)</a>', m.group(1))
  149. def _extract_session_code(self, html):
  150. m = re.search(r'<li class="code">\s*(?P<code>.+?)\s*</li>', html)
  151. return m.group('code') if m is not None else None
  152. def _extract_session_day(self, html):
  153. m = re.search(r'<li class="day">\s*<a href="/Events/[^"]+">(?P<day>[^<]+)</a>\s*</li>', html)
  154. return m.group('day').strip() if m is not None else None
  155. def _extract_session_room(self, html):
  156. m = re.search(r'<li class="room">\s*(?P<room>.+?)\s*</li>', html)
  157. return m.group('room') if m is not None else None
  158. def _extract_session_speakers(self, html):
  159. return re.findall(r'<a href="/Events/Speakers/[^"]+">([^<]+)</a>', html)
  def _extract_content(self, html, content_path):
      '''
      Collect everything downloadable from a content page.

      Returns None (after reporting a warning) when the page offers neither
      media formats nor slides nor a zip archive. Otherwise returns a list
      with up to three info dicts sharing the same metadata: one for the
      slides, one for the zip archive and one for the recording formats, in
      that order.
      '''
      # Look for downloadable content
      formats = self._formats_from_html(html)
      slides = self._extract_slides(html)
      zip_ = self._extract_zip(html)

      # Nothing to download
      if not formats and slides is None and zip_ is None:
          self._downloader.report_warning(
              'None of recording, slides or zip are available for %s' % content_path)
          return None

      # Extract meta
      title = self._extract_title(html)
      shared_meta = {
          '_type': 'video',
          'id': content_path,
          'description': self._extract_description(html),
          'thumbnail': self._og_search_thumbnail(html),
          'duration': self._extract_duration(html),
          'avg_rating': self._extract_avg_rating(html),
          'rating_count': self._extract_rating_count(html),
          'view_count': self._extract_view_count(html),
          'comment_count': self._extract_comment_count(html),
      }

      entries = []
      # Attachments are plain URL entries whose title carries a suffix.
      for title_suffix, attachment_url in (('-Slides', slides), ('-Zip', zip_)):
          if attachment_url is None:
              continue
          entry = dict(shared_meta)
          entry['title'] = title + title_suffix
          entry['url'] = attachment_url
          entries.append(entry)

      if formats:
          entry = dict(shared_meta)
          entry['title'] = title
          entry['formats'] = formats
          entries.append(entry)

      return entries
  def _extract_entry_item(self, html, content_path):
      '''
      Extract the single info dict of an "Entry" page and add its authors.

      Returns None when self._extract_content() found nothing to download.
      Raises ExtractorError when the page yields more than one entry.
      '''
      entries = self._extract_content(html, content_path)
      if entries is None:
          return None

      if len(entries) > 1:
          raise ExtractorError('Got more than one entry')

      entry = entries[0]
      entry.update({'authors': self._extract_authors(html)})
      return entry
  def _extract_session(self, html, content_path):
      '''
      Extract all entries of a "Session" page as a playlist.

      Every entry returned by self._extract_content() is extended with the
      session code, day, room and speakers. Returns None when there is
      nothing to download.
      '''
      entries = self._extract_content(html, content_path)
      if entries is None:
          return None

      code = self._extract_session_code(html)
      day = self._extract_session_day(html)
      room = self._extract_session_room(html)
      speakers = self._extract_session_speakers(html)
      session_meta = {
          'session_code': code,
          'session_day': day,
          'session_room': room,
          'session_speakers': speakers,
      }

      for entry in entries:
          entry.update(session_meta)

      return self.playlist_result(entries)
  def _extract_list(self, video_id, rss_url=None):
      '''
      Build a playlist from the RSS feed of a content path.

      video_id is the content path; it is used as the playlist id and, when
      rss_url is not given, to derive the feed URL from self._RSS_URL.
      Each non-empty ./channel/item/link of the feed becomes one URL entry.
      The playlist title is the text of ./channel/title, or None when the
      feed has no such element.
      '''
      if not rss_url:
          rss_url = self._RSS_URL % video_id
      rss = self._download_xml(rss_url, video_id, 'Downloading RSS')

      # Skip <link> elements without text: they would yield entries whose
      # URL is None.
      entries = [self.url_result(session_url.text, 'Channel9')
                 for session_url in rss.findall('./channel/item/link')
                 if session_url.text]

      # find() returns None for a missing element. Compare with None
      # explicitly: the truth value of an Element depends on its children.
      title_node = rss.find('./channel/title')
      title_text = title_node.text if title_node is not None else None

      return self.playlist_result(entries, video_id, title_text)
  def _real_extract(self, url):
      '''
      Dispatch url to the matching extraction routine.

      URLs ending in /RSS are treated as lists right away. Otherwise the
      web page is downloaded and the part of its WT.entryid meta content
      before the first colon selects the handler: "Entry", "Session" or
      "Event". A page without that meta tag is assumed to be a list.
      Raises ExtractorError for any other WT.entryid value.
      '''
      url_match = re.match(self._VALID_URL, url)
      content_path, is_rss = url_match.group('contentpath', 'rss')

      if is_rss:
          return self._extract_list(content_path, url)

      webpage = self._download_webpage(
          url, content_path, 'Downloading web page')

      page_type = self._search_regex(
          r'<meta[^>]+name=(["\'])WT\.entryid\1[^>]+content=(["\'])(?P<pagetype>[^:]+).+?\2',
          webpage, 'page type', default=None, group='pagetype')

      # No page type at all: assuming list. Event pages are lists as well.
      if not page_type or page_type == 'Event':
          return self._extract_list(content_path)
      # Any 'item'-like page, may contain downloadable content
      if page_type == 'Entry':
          return self._extract_entry_item(webpage, content_path)
      # Event session page, may contain downloadable content
      if page_type == 'Session':
          return self._extract_session(webpage, content_path)

      raise ExtractorError('Unexpected WT.entryid %s' % page_type, expected=True)