You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

116 lines
4.3 KiB

11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. mimetype2ext,
  6. qualities,
  7. )
  8. class ImdbIE(InfoExtractor):
  9. IE_NAME = 'imdb'
  10. IE_DESC = 'Internet Movie Database trailers'
  11. _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-)vi(?P<id>\d+)'
  12. _TESTS = [{
  13. 'url': 'http://www.imdb.com/video/imdb/vi2524815897',
  14. 'info_dict': {
  15. 'id': '2524815897',
  16. 'ext': 'mp4',
  17. 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
  18. 'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
  19. }
  20. }, {
  21. 'url': 'http://www.imdb.com/video/_/vi2524815897',
  22. 'only_matching': True,
  23. }, {
  24. 'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
  25. 'only_matching': True,
  26. }, {
  27. 'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
  28. 'only_matching': True,
  29. }]
  30. def _real_extract(self, url):
  31. video_id = self._match_id(url)
  32. webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
  33. descr = self._html_search_regex(
  34. r'(?s)<span itemprop="description">(.*?)</span>',
  35. webpage, 'description', fatal=False)
  36. player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
  37. player_page = self._download_webpage(
  38. player_url, video_id, 'Downloading player page')
  39. # the player page contains the info for the default format, we have to
  40. # fetch other pages for the rest of the formats
  41. extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
  42. format_pages = [
  43. self._download_webpage(
  44. f_url, video_id, 'Downloading info for %s format' % f_name)
  45. for f_url, f_name in extra_formats]
  46. format_pages.append(player_page)
  47. quality = qualities(('SD', '480p', '720p', '1080p'))
  48. formats = []
  49. for format_page in format_pages:
  50. json_data = self._search_regex(
  51. r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
  52. format_page, 'json data', flags=re.DOTALL)
  53. info = self._parse_json(json_data, video_id, fatal=False)
  54. if not info:
  55. continue
  56. format_info = info.get('videoPlayerObject', {}).get('video', {})
  57. if not format_info:
  58. continue
  59. video_info_list = format_info.get('videoInfoList')
  60. if not video_info_list or not isinstance(video_info_list, list):
  61. continue
  62. video_info = video_info_list[0]
  63. if not video_info or not isinstance(video_info, dict):
  64. continue
  65. video_url = video_info.get('videoUrl')
  66. if not video_url:
  67. continue
  68. format_id = format_info.get('ffname')
  69. formats.append({
  70. 'format_id': format_id,
  71. 'url': video_url,
  72. 'ext': mimetype2ext(video_info.get('videoMimeType')),
  73. 'quality': quality(format_id),
  74. })
  75. self._sort_formats(formats)
  76. return {
  77. 'id': video_id,
  78. 'title': self._og_search_title(webpage),
  79. 'formats': formats,
  80. 'description': descr,
  81. 'thumbnail': format_info['slate'],
  82. }
  83. class ImdbListIE(InfoExtractor):
  84. IE_NAME = 'imdb:list'
  85. IE_DESC = 'Internet Movie Database lists'
  86. _VALID_URL = r'https?://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
  87. _TEST = {
  88. 'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
  89. 'info_dict': {
  90. 'id': 'JFs9NWw6XI0',
  91. 'title': 'March 23, 2012 Releases',
  92. },
  93. 'playlist_count': 7,
  94. }
  95. def _real_extract(self, url):
  96. list_id = self._match_id(url)
  97. webpage = self._download_webpage(url, list_id)
  98. entries = [
  99. self.url_result('http://www.imdb.com' + m, 'Imdb')
  100. for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)]
  101. list_title = self._html_search_regex(
  102. r'<h1 class="header">(.*?)</h1>', webpage, 'list title')
  103. return self.playlist_result(entries, list_id, list_title)