You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

171 lines
6.3 KiB

  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from ..utils import (
  5. ExtractorError,
  6. clean_html,
  7. determine_ext,
  8. int_or_none,
  9. js_to_json,
  10. parse_duration,
  11. )
  12. class SnagFilmsEmbedIE(InfoExtractor):
  13. _VALID_URL = r'https?://(?:(?:www|embed)\.)?snagfilms\.com/embed/player\?.*\bfilmId=(?P<id>[\da-f-]{36})'
  14. _TESTS = [{
  15. 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500',
  16. 'md5': '2924e9215c6eff7a55ed35b72276bd93',
  17. 'info_dict': {
  18. 'id': '74849a00-85a9-11e1-9660-123139220831',
  19. 'ext': 'mp4',
  20. 'title': '#whilewewatch',
  21. }
  22. }, {
  23. 'url': 'http://www.snagfilms.com/embed/player?filmId=0000014c-de2f-d5d6-abcf-ffef58af0017',
  24. 'only_matching': True,
  25. }]
  26. @staticmethod
  27. def _extract_url(webpage):
  28. mobj = re.search(
  29. r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?snagfilms\.com/embed/player.+?)\1',
  30. webpage)
  31. if mobj:
  32. return mobj.group('url')
  33. def _real_extract(self, url):
  34. video_id = self._match_id(url)
  35. webpage = self._download_webpage(url, video_id)
  36. if '>This film is not playable in your area.<' in webpage:
  37. raise ExtractorError(
  38. 'Film %s is not playable in your area.' % video_id, expected=True)
  39. formats = []
  40. for source in self._parse_json(js_to_json(self._search_regex(
  41. r'(?s)sources:\s*(\[.+?\]),', webpage, 'json')), video_id):
  42. file_ = source.get('file')
  43. if not file_:
  44. continue
  45. type_ = source.get('type')
  46. format_id = source.get('label')
  47. ext = determine_ext(file_)
  48. if any(_ == 'm3u8' for _ in (type_, ext)):
  49. formats.extend(self._extract_m3u8_formats(
  50. file_, video_id, 'mp4', m3u8_id='hls'))
  51. else:
  52. bitrate = int_or_none(self._search_regex(
  53. r'(\d+)kbps', file_, 'bitrate', default=None))
  54. height = int_or_none(self._search_regex(
  55. r'^(\d+)[pP]$', format_id, 'height', default=None))
  56. formats.append({
  57. 'url': file_,
  58. 'format_id': format_id,
  59. 'tbr': bitrate,
  60. 'height': height,
  61. })
  62. self._sort_formats(formats)
  63. title = self._search_regex(
  64. [r"title\s*:\s*'([^']+)'", r'<title>([^<]+)</title>'],
  65. webpage, 'title')
  66. return {
  67. 'id': video_id,
  68. 'title': title,
  69. 'formats': formats,
  70. }
  71. class SnagFilmsIE(InfoExtractor):
  72. _VALID_URL = r'https?://(?:www\.)?snagfilms\.com/(?:films/title|show)/(?P<id>[^?#]+)'
  73. _TESTS = [{
  74. 'url': 'http://www.snagfilms.com/films/title/lost_for_life',
  75. 'md5': '19844f897b35af219773fd63bdec2942',
  76. 'info_dict': {
  77. 'id': '0000014c-de2f-d5d6-abcf-ffef58af0017',
  78. 'display_id': 'lost_for_life',
  79. 'ext': 'mp4',
  80. 'title': 'Lost for Life',
  81. 'description': 'md5:fbdacc8bb6b455e464aaf98bc02e1c82',
  82. 'thumbnail': 're:^https?://.*\.jpg',
  83. 'duration': 4489,
  84. 'categories': ['Documentary', 'Crime', 'Award Winning', 'Festivals']
  85. }
  86. }, {
  87. 'url': 'http://www.snagfilms.com/show/the_world_cut_project/india',
  88. 'md5': 'e6292e5b837642bbda82d7f8bf3fbdfd',
  89. 'info_dict': {
  90. 'id': '00000145-d75c-d96e-a9c7-ff5c67b20000',
  91. 'display_id': 'the_world_cut_project/india',
  92. 'ext': 'mp4',
  93. 'title': 'India',
  94. 'description': 'md5:5c168c5a8f4719c146aad2e0dfac6f5f',
  95. 'thumbnail': 're:^https?://.*\.jpg',
  96. 'duration': 979,
  97. 'categories': ['Documentary', 'Sports', 'Politics']
  98. }
  99. }, {
  100. # Film is not playable in your area.
  101. 'url': 'http://www.snagfilms.com/films/title/inside_mecca',
  102. 'only_matching': True,
  103. }, {
  104. # Film is not available.
  105. 'url': 'http://www.snagfilms.com/show/augie_alone/flirting',
  106. 'only_matching': True,
  107. }]
  108. def _real_extract(self, url):
  109. display_id = self._match_id(url)
  110. webpage = self._download_webpage(url, display_id)
  111. if ">Sorry, the Film you're looking for is not available.<" in webpage:
  112. raise ExtractorError(
  113. 'Film %s is not available.' % display_id, expected=True)
  114. film_id = self._search_regex(r'filmId=([\da-f-]{36})"', webpage, 'film id')
  115. snag = self._parse_json(
  116. self._search_regex(
  117. 'Snag\.page\.data\s*=\s*(\[.+?\]);', webpage, 'snag'),
  118. display_id)
  119. for item in snag:
  120. if item.get('data', {}).get('film', {}).get('id') == film_id:
  121. data = item['data']['film']
  122. title = data['title']
  123. description = clean_html(data.get('synopsis'))
  124. thumbnail = data.get('image')
  125. duration = int_or_none(data.get('duration') or data.get('runtime'))
  126. categories = [
  127. category['title'] for category in data.get('categories', [])
  128. if category.get('title')]
  129. break
  130. else:
  131. title = self._search_regex(
  132. r'itemprop="title">([^<]+)<', webpage, 'title')
  133. description = self._html_search_regex(
  134. r'(?s)<div itemprop="description" class="film-synopsis-inner ">(.+?)</div>',
  135. webpage, 'description', default=None) or self._og_search_description(webpage)
  136. thumbnail = self._og_search_thumbnail(webpage)
  137. duration = parse_duration(self._search_regex(
  138. r'<span itemprop="duration" class="film-duration strong">([^<]+)<',
  139. webpage, 'duration', fatal=False))
  140. categories = re.findall(r'<a href="/movies/[^"]+">([^<]+)</a>', webpage)
  141. return {
  142. '_type': 'url_transparent',
  143. 'url': 'http://embed.snagfilms.com/embed/player?filmId=%s' % film_id,
  144. 'id': film_id,
  145. 'display_id': display_id,
  146. 'title': title,
  147. 'description': description,
  148. 'thumbnail': thumbnail,
  149. 'duration': duration,
  150. 'categories': categories,
  151. }