You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

147 lines
5.1 KiB

11 years ago
11 years ago
11 years ago
10 years ago
11 years ago
11 years ago
10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import base64
  3. import json
  4. import re
  5. from .common import InfoExtractor
  6. from ..utils import (
  7. determine_ext,
  8. mimetype2ext,
  9. parse_duration,
  10. qualities,
  11. try_get,
  12. url_or_none,
  13. )
  class ImdbIE(InfoExtractor):
      """Extractor for a single IMDb video addressed by its ``vi<digits>`` id.

      Formats are read from the ``VIDEO_PLAYBACK_DATA`` JSON endpoint; the
      title and the remaining metadata are read from the video's web page.
      """

      IE_NAME = 'imdb'
      IE_DESC = 'Internet Movie Database trailers'
      _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title|list).*?[/-]vi(?P<id>\d+)'

      _TESTS = [{
          'url': 'http://www.imdb.com/video/imdb/vi2524815897',
          'info_dict': {
              'id': '2524815897',
              'ext': 'mp4',
              'title': 'No. 2',
              'description': 'md5:87bd0bdc61e351f21f20d2d7441cb4e7',
              'duration': 152,
          }
      }, {
          'url': 'http://www.imdb.com/video/_/vi2524815897',
          'only_matching': True,
      }, {
          'url': 'http://www.imdb.com/title/tt1667889/?ref_=ext_shr_eml_vi#lb-vi2524815897',
          'only_matching': True,
      }, {
          'url': 'http://www.imdb.com/title/tt1667889/#lb-vi2524815897',
          'only_matching': True,
      }, {
          'url': 'http://www.imdb.com/videoplayer/vi1562949145',
          'only_matching': True,
      }, {
          'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
          'only_matching': True,
      }, {
          'url': 'https://www.imdb.com/list/ls009921623/videoplayer/vi260482329',
          'only_matching': True,
      }]

      def _real_extract(self, url):
          """Build the info dict for the video referenced by *url*.

          Returns a dict with ``id``, ``title``, ``alt_title``, ``formats``,
          ``description``, ``thumbnail`` and ``duration``.

          Raises ``KeyError`` if the playback data lacks
          ``videoLegacyEncodings``, or if no title is found in the page and
          the embedded metadata carries no ``videoTitle`` either.
          """
          video_id = self._match_id(url)

          # The endpoint expects a base64-encoded JSON descriptor as ``key``
          # and answers with a list; only its first element is used.
          playback_key = base64.b64encode(json.dumps({
              'type': 'VIDEO_PLAYER',
              'subType': 'FORCE_LEGACY',
              'id': 'vi%s' % video_id,
          }).encode()).decode()
          data = self._download_json(
              'https://www.imdb.com/ve/data/VIDEO_PLAYBACK_DATA', video_id,
              query={'key': playback_key})[0]

          quality = qualities(('SD', '480p', '720p', '1080p'))
          formats = []
          for encoding in data['videoLegacyEncodings']:
              # Skip malformed entries instead of failing the whole video.
              if not encoding or not isinstance(encoding, dict):
                  continue
              video_url = url_or_none(encoding.get('url'))
              if not video_url:
                  continue
              # Prefer the declared MIME type; fall back to the URL extension.
              ext = mimetype2ext(encoding.get(
                  'mimeType')) or determine_ext(video_url)
              if ext == 'm3u8':
                  formats.extend(self._extract_m3u8_formats(
                      video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                      preference=1, m3u8_id='hls', fatal=False))
                  continue
              format_id = encoding.get('definition')
              formats.append({
                  'format_id': format_id,
                  'url': video_url,
                  'ext': ext,
                  'quality': quality(format_id),
              })
          self._sort_formats(formats)

          webpage = self._download_webpage(
              'https://www.imdb.com/video/vi' + video_id, video_id)
          video_metadata = self._parse_json(self._search_regex(
              r'args\.push\(\s*({.+?})\s*\)\s*;', webpage,
              'video metadata'), video_id)

          # VIDEO_INFO is keyed by an id; take the first item of the first key.
          video_info = video_metadata.get('VIDEO_INFO')
          if video_info and isinstance(video_info, dict):
              # try_get yields None when the lookup fails or the value is not
              # a dict; fall back to {} so the .get() calls below cannot
              # raise AttributeError on None.
              info = try_get(
                  video_info, lambda x: x[list(video_info.keys())[0]][0],
                  dict) or {}
          else:
              info = {}

          # Page-level titles take precedence; the embedded metadata title is
          # the last resort (KeyError if it is absent as well).
          title = self._html_search_meta(
              ['og:title', 'twitter:title'], webpage) or self._html_search_regex(
              r'<title>(.+?)</title>', webpage, 'title',
              default=None) or info['videoTitle']

          return {
              'id': video_id,
              'title': title,
              'alt_title': info.get('videoSubTitle'),
              'formats': formats,
              'description': info.get('videoDescription'),
              'thumbnail': url_or_none(try_get(
                  video_metadata, lambda x: x['videoSlate']['source'])),
              'duration': parse_duration(info.get('videoRuntime')),
          }
  class ImdbListIE(InfoExtractor):
      """Extractor for IMDb lists (``/list/ls<9 digits>``) as playlists.

      URLs that point at a single video inside a list
      (``.../videoplayer/vi<digits>``) are excluded by the negative lookahead
      in ``_VALID_URL``.
      """

      IE_NAME = 'imdb:list'
      IE_DESC = 'Internet Movie Database lists'
      _VALID_URL = r'https?://(?:www\.)?imdb\.com/list/ls(?P<id>\d{9})(?!/videoplayer/vi\d+)'

      _TEST = {
          'url': 'https://www.imdb.com/list/ls009921623/',
          'info_dict': {
              'id': '009921623',
              'title': 'The Bourne Legacy',
              'description': 'A list of trailers, clips, and more from The Bourne Legacy, starring Jeremy Renner and Rachel Weisz.',
          },
          'playlist_count': 8,
      }

      def _real_extract(self, url):
          """Return a playlist result for the list referenced by *url*.

          Every ``/list/ls<id>/videoplayer/vi...`` link found in the list
          page becomes one entry delegated to the ``Imdb`` extractor.
          """
          list_id = self._match_id(url)
          webpage = self._download_webpage(url, list_id)

          video_paths = re.findall(
              r'href="(/list/ls%s/videoplayer/vi[^"]+)"' % list_id, webpage)
          entries = [
              self.url_result('http://www.imdb.com' + path, 'Imdb')
              for path in video_paths]

          list_title = self._html_search_regex(
              r'<h1[^>]+class="[^"]*header[^"]*"[^>]*>(.*?)</h1>',
              webpage, 'list title')

          # The description is optional metadata: a list without one must
          # still yield its entries, so this lookup is non-fatal.
          list_description = self._html_search_regex(
              r'<div[^>]+class="[^"]*list-description[^"]*"[^>]*><p>(.*?)</p>',
              webpage, 'list description', fatal=False)

          return self.playlist_result(
              entries, list_id, list_title, list_description)