You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

93 lines
3.3 KiB

11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
11 years ago
11 years ago
10 years ago
10 years ago
11 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. import json
  4. from .common import InfoExtractor
  5. from ..utils import (
  6. qualities,
  7. )
  8. class ImdbIE(InfoExtractor):
  9. IE_NAME = 'imdb'
  10. IE_DESC = 'Internet Movie Database trailers'
  11. _VALID_URL = r'http://(?:www|m)\.imdb\.com/video/imdb/vi(?P<id>\d+)'
  12. _TEST = {
  13. 'url': 'http://www.imdb.com/video/imdb/vi2524815897',
  14. 'info_dict': {
  15. 'id': '2524815897',
  16. 'ext': 'mp4',
  17. 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
  18. 'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
  19. }
  20. }
  21. def _real_extract(self, url):
  22. video_id = self._match_id(url)
  23. webpage = self._download_webpage('http://www.imdb.com/video/imdb/vi%s' % video_id, video_id)
  24. descr = self._html_search_regex(
  25. r'(?s)<span itemprop="description">(.*?)</span>',
  26. webpage, 'description', fatal=False)
  27. player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
  28. player_page = self._download_webpage(
  29. player_url, video_id, 'Downloading player page')
  30. # the player page contains the info for the default format, we have to
  31. # fetch other pages for the rest of the formats
  32. extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
  33. format_pages = [
  34. self._download_webpage(
  35. f_url, video_id, 'Downloading info for %s format' % f_name)
  36. for f_url, f_name in extra_formats]
  37. format_pages.append(player_page)
  38. quality = qualities(['SD', '480p', '720p'])
  39. formats = []
  40. for format_page in format_pages:
  41. json_data = self._search_regex(
  42. r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
  43. format_page, 'json data', flags=re.DOTALL)
  44. info = json.loads(json_data)
  45. format_info = info['videoPlayerObject']['video']
  46. f_id = format_info['ffname']
  47. formats.append({
  48. 'format_id': f_id,
  49. 'url': format_info['videoInfoList'][0]['videoUrl'],
  50. 'quality': quality(f_id),
  51. })
  52. self._sort_formats(formats)
  53. return {
  54. 'id': video_id,
  55. 'title': self._og_search_title(webpage),
  56. 'formats': formats,
  57. 'description': descr,
  58. 'thumbnail': format_info['slate'],
  59. }
  60. class ImdbListIE(InfoExtractor):
  61. IE_NAME = 'imdb:list'
  62. IE_DESC = 'Internet Movie Database lists'
  63. _VALID_URL = r'http://www\.imdb\.com/list/(?P<id>[\da-zA-Z_-]{11})'
  64. _TEST = {
  65. 'url': 'http://www.imdb.com/list/JFs9NWw6XI0',
  66. 'info_dict': {
  67. 'id': 'JFs9NWw6XI0',
  68. 'title': 'March 23, 2012 Releases',
  69. },
  70. 'playlist_count': 7,
  71. }
  72. def _real_extract(self, url):
  73. list_id = self._match_id(url)
  74. webpage = self._download_webpage(url, list_id)
  75. entries = [
  76. self.url_result('http://www.imdb.com' + m, 'Imdb')
  77. for m in re.findall(r'href="(/video/imdb/vi[^"]+)"\s+data-type="playlist"', webpage)]
  78. list_title = self._html_search_regex(
  79. r'<h1 class="header">(.*?)</h1>', webpage, 'list title')
  80. return self.playlist_result(entries, list_id, list_title)