You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

121 lines
4.4 KiB

11 years ago
11 years ago
11 years ago
10 years ago
11 years ago
11 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. determine_ext,
  6. mimetype2ext,
  7. parse_duration,
  8. qualities,
  9. url_or_none,
  10. )
  11. class ImdbIE(InfoExtractor):
  12. IE_NAME = 'imdb'
  13. IE_DESC = 'Internet Movie Database trailers'
  14. _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).+?[/-]vi(?P<id>\d+)'
  15. _TESTS = [{
  16. 'url': 'http://www.imdb.com/video/imdb/vi2524815897',
  17. 'info_dict': {
  18. 'id': '2524815897',
  19. 'ext': 'mp4',
  20. 'title': 'No. 2 from Ice Age: Continental Drift (2012)',
  21. 'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
  22. }
  23. }, {
  24. 'url': 'http://www.imdb.com/video/_/vi2524815897',
  25. 'only_matching': True,
  26. }, {
  27. 'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
  28. 'only_matching': True,
  29. }, {
  30. 'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
  31. 'only_matching': True,
  32. }, {
  33. 'url': 'http://www.imdb.com/videoplayer/vi1562949145',
  34. 'only_matching': True,
  35. }, {
  36. 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
  37. 'only_matching': True,
  38. }, {
  39. 'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329',
  40. 'only_matching': True,
  41. }]
  42. def _real_extract(self, url):
  43. video_id = self._match_id(url)
  44. webpage = self._download_webpage(
  45. 'https://www.imdb.com/videoplayer/vi' + video_id, video_id)
  46. video_metadata = self._parse_json(self._search_regex(
  47. r'window\.IMDbReactInitialState\.push\(({.+?})\);', webpage,
  48. 'video metadata'), video_id)['videos']['videoMetadata']['vi' + video_id]
  49. title = self._html_search_meta(
  50. ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
  51. r'<title>(.+?)</title>', webpage, 'title', fatal=False) or video_metadata['title']
  52. quality = qualities(('SD', '480p', '720p', '1080p'))
  53. formats = []
  54. for encoding in video_metadata.get('encodings', []):
  55. if not encoding or not isinstance(encoding, dict):
  56. continue
  57. video_url = url_or_none(encoding.get('videoUrl'))
  58. if not video_url:
  59. continue
  60. ext = mimetype2ext(encoding.get(
  61. 'mimeType')) or determine_ext(video_url)
  62. if ext == 'm3u8':
  63. formats.extend(self._extract_m3u8_formats(
  64. video_url, video_id, 'mp4', entry_protocol='m3u8_native',
  65. m3u8_id='hls', fatal=False))
  66. continue
  67. format_id = encoding.get('definition')
  68. formats.append({
  69. 'format_id': format_id,
  70. 'url': video_url,
  71. 'ext': ext,
  72. 'quality': quality(format_id),
  73. })
  74. self._sort_formats(formats)
  75. return {
  76. 'id': video_id,
  77. 'title': title,
  78. 'formats': formats,
  79. 'description': video_metadata.get('description'),
  80. 'thumbnail': video_metadata.get('slate', {}).get('url'),
  81. 'duration': parse_duration(video_metadata.get('duration')),
  82. }
  83. class ImdbListIE(InfoExtractor):
  84. IE_NAME = 'imdb:list'
  85. IE_DESC = 'Internet Movie Database lists'
  86. _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)'
  87. _TEST = {
  88. 'url': 'https://www.imdb.com/list/ls009921623/',
  89. 'info_dict': {
  90. 'id': '009921623',
  91. 'title': 'The Bourne Legacy',
  92. 'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.',
  93. },
  94. 'playlist_count': 8,
  95. }
  96. def _real_extract(self, url):
  97. list_id = self._match_id(url)
  98. webpage = self._download_webpage(url, list_id)
  99. entries = [
  100. self.url_result('http://www.imdb.com' + m, 'Imdb')
  101. for m in re.findall(r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)]
  102. list_title = self._html_search_regex(
  103. r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>',
  104. webpage, 'list title')
  105. list_description = self._html_search_regex(
  106. r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>',
  107. webpage, 'list description')
  108. return self.playlist_result(entries, list_id, list_title, list_description)