You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

110 lines
4.0 KiB

11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. mimetype2ext,
  6. qualities,
  7. )
  8. class ImdbIE(InfoExtractor):
  9. IE_NAME = 'imdb'
  10. IE_DESC = 'Internet Movie Database trailers'
  11. _VALID_URL = r'https?://(?:www|m)\.imdb\.com/video/[^/]+/vi(?P<id>\d+)'
  12. _TESTS = [{
  13. 'url': 'http://www.imdb.com/video/imdb/vi2524815897',
  14. 'info_dict': {
  15. 'id': '2524815897',
  16. 'ext': 'mp4',
  17. 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
  18. 'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
  19. }
  20. }, {
  21. 'url': 'http://www.imdb.com/video/_/vi2524815897',
  22. 'only_matching': True,
  23. }]
  24. def _real_extract(self, url):
  25. video_id = self._match_id(url)
  26. webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
  27. descr = self._html_search_regex(
  28. r'(?s)<span itemprop="description">(.*?)</span>',
  29. webpage, 'description', fatal=False)
  30. player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
  31. player_page = self._download_webpage(
  32. player_url, video_id, 'Downloading player page')
  33. # the player page contains the info for the default format, we have to
  34. # fetch other pages for the rest of the formats
  35. extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
  36. format_pages = [
  37. self._download_webpage(
  38. f_url, video_id, 'Downloading info for %s format' % f_name)
  39. for f_url, f_name in extra_formats]
  40. format_pages.append(player_page)
  41. quality = qualities(('SD', '480p', '720p', '1080p'))
  42. formats = []
  43. for format_page in format_pages:
  44. json_data = self._search_regex(
  45. r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
  46. format_page, 'json data', flags=re.DOTALL)
  47. info = self._parse_json(json_data, video_id, fatal=False)
  48. if not info:
  49. continue
  50. format_info = info.get('videoPlayerObject', {}).get('video', {})
  51. if not format_info:
  52. continue
  53. video_info_list = format_info.get('videoInfoList')
  54. if not video_info_list or not isinstance(video_info_list, list):
  55. continue
  56. video_info = video_info_list[0]
  57. if not video_info or not isinstance(video_info, dict):
  58. continue
  59. video_url = video_info.get('videoUrl')
  60. if not video_url:
  61. continue
  62. format_id = format_info.get('ffname')
  63. formats.append({
  64. 'format_id': format_id,
  65. 'url': video_url,
  66. 'ext': mimetype2ext(video_info.get('videoMimeType')),
  67. 'quality': quality(format_id),
  68. })
  69. self._sort_formats(formats)
  70. return {
  71. 'id': video_id,
  72. 'title': self._og_search_title(webpage),
  73. 'formats': formats,
  74. 'description': descr,
  75. 'thumbnail': format_info['slate'],
  76. }
  77. class ImdbListIE(InfoExtractor):
  78. IE_NAME = 'imdb:list'
  79. IE_DESC = 'Internet Movie Database lists'
  80. _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
  81. _TEST = {
  82. 'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
  83. 'info_dict': {
  84. 'id': 'JFs9NWw6XI0',
  85. 'title': 'March 23, 2012 Releases',
  86. },
  87. 'playlist_count': 7,
  88. }
  89. def _real_extract(self, url):
  90. list_id = self._match_id(url)
  91. webpage = self._download_webpage(url, list_id)
  92. entries = [
  93. self.url_result('http://www.imdb.com' + m, 'Imdb')
  94. for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)]
  95. list_title = self._html_search_regex(
  96. r'<h1 class="header">(.*?)</h1>', webpage, 'list title')
  97. return self.playlist_result(entries, list_id, list_title)