You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

90 lines
3.1 KiB

10 years ago
10 years ago
  1. from __future__ import unicode_literals
  2. import re
  3. from .common import InfoExtractor
  4. from .youtube import YoutubeIE
  5. from ..utils import (
  6. parse_iso8601,
  7. str_to_int,
  8. )
  9. class CrackedIE(InfoExtractor):
  10. _VALID_URL = r'https?://(?:www\.)?cracked\.com/video_(?P<id>\d+)_[\da-z-]+\.html'
  11. _TESTS = [{
  12. 'url': 'http://www.cracked.com/video_19070_if-animal-actors-got-e21-true-hollywood-stories.html',
  13. 'md5': '89b90b9824e3806ca95072c4d78f13f7',
  14. 'info_dict': {
  15. 'id': '19070',
  16. 'ext': 'mp4',
  17. 'title': 'If Animal Actors Got E! True Hollywood Stories',
  18. 'timestamp': 1404954000,
  19. 'upload_date': '20140710',
  20. }
  21. }, {
  22. # youtube embed
  23. 'url': 'http://www.cracked.com/video_19006_4-plot-holes-you-didnt-notice-in-your-favorite-movies.html',
  24. 'md5': 'ccd52866b50bde63a6ef3b35016ba8c7',
  25. 'info_dict': {
  26. 'id': 'EjI00A3rZD0',
  27. 'ext': 'mp4',
  28. 'title': "4 Plot Holes You Didn't Notice in Your Favorite Movies - The Spit Take",
  29. 'description': 'md5:c603708c718b796fe6079e2b3351ffc7',
  30. 'upload_date': '20140725',
  31. 'uploader_id': 'Cracked',
  32. 'uploader': 'Cracked',
  33. }
  34. }]
  35. def _real_extract(self, url):
  36. video_id = self._match_id(url)
  37. webpage = self._download_webpage(url, video_id)
  38. youtube_url = YoutubeIE._extract_url(webpage)
  39. if youtube_url:
  40. return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
  41. video_url = self._html_search_regex(
  42. [r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
  43. webpage, 'video URL')
  44. title = self._search_regex(
  45. [r'property="?og:title"?\s+content="([^"]+)"', r'class="?title"?>([^<]+)'],
  46. webpage, 'title')
  47. description = self._search_regex(
  48. r'name="?(?:og:)?description"?\s+content="([^"]+)"',
  49. webpage, 'description', default=None)
  50. timestamp = self._html_search_regex(
  51. r'"date"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)
  52. if timestamp:
  53. timestamp = parse_iso8601(timestamp[:-6])
  54. view_count = str_to_int(self._html_search_regex(
  55. r'<span\s+class="?views"? id="?viewCounts"?>([\d,\.]+) Views</span>',
  56. webpage, 'view count', fatal=False))
  57. comment_count = str_to_int(self._html_search_regex(
  58. r'<span\s+id="?commentCounts"?>([\d,\.]+)</span>',
  59. webpage, 'comment count', fatal=False))
  60. m = re.search(r'_(?P<width>\d+)X(?P<height>\d+)\.mp4$', video_url)
  61. if m:
  62. width = int(m.group('width'))
  63. height = int(m.group('height'))
  64. else:
  65. width = height = None
  66. return {
  67. 'id': video_id,
  68. 'url': video_url,
  69. 'title': title,
  70. 'description': description,
  71. 'timestamp': timestamp,
  72. 'view_count': view_count,
  73. 'comment_count': comment_count,
  74. 'height': height,
  75. 'width': width,
  76. }