You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

118 lines
4.1 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import re
  4. from .common import InfoExtractor
  5. from ..compat import compat_str
  6. from ..utils import (
  7. determine_ext,
  8. parse_duration,
  9. try_get,
  10. unified_strdate,
  11. )
  12. class MediasetIE(InfoExtractor):
  13. _VALID_URL = r'''(?x)
  14. (?:
  15. mediaset:|
  16. https?://
  17. (?:www\.)?video\.mediaset\.it/
  18. (?:
  19. (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
  20. player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
  21. )
  22. )(?P<id>[0-9]+)
  23. '''
  24. _TESTS = [{
  25. # full episode
  26. 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
  27. 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
  28. 'info_dict': {
  29. 'id': '661824',
  30. 'ext': 'mp4',
  31. 'title': 'Quarta puntata',
  32. 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
  33. 'thumbnail': r're:^https?://.*\.jpg$',
  34. 'duration': 1414,
  35. 'creator': 'mediaset',
  36. 'upload_date': '20161107',
  37. 'series': 'Hello Goodbye',
  38. 'categories': ['reality'],
  39. },
  40. 'expected_warnings': ['is not a supported codec'],
  41. }, {
  42. # clip
  43. 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
  44. 'only_matching': True,
  45. }, {
  46. # iframe simple
  47. 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
  48. 'only_matching': True,
  49. }, {
  50. # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
  51. 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
  52. 'only_matching': True,
  53. }, {
  54. 'url': 'mediaset:661824',
  55. 'only_matching': True,
  56. }]
  57. @staticmethod
  58. def _extract_urls(webpage):
  59. return [
  60. mobj.group('url')
  61. for mobj in re.finditer(
  62. r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
  63. webpage)]
  64. def _real_extract(self, url):
  65. video_id = self._match_id(url)
  66. video_list = self._download_json(
  67. 'http://cdnsel01.mediaset.net/GetCdn.aspx',
  68. video_id, 'Downloading video CDN JSON', query={
  69. 'streamid': video_id,
  70. 'format': 'json',
  71. })['videoList']
  72. formats = []
  73. for format_url in video_list:
  74. if '.ism' in format_url:
  75. formats.extend(self._extract_ism_formats(
  76. format_url, video_id, ism_id='mss', fatal=False))
  77. else:
  78. formats.append({
  79. 'url': format_url,
  80. 'format_id': determine_ext(format_url),
  81. })
  82. self._sort_formats(formats)
  83. mediainfo = self._download_json(
  84. 'http://plr.video.mediaset.it/html/metainfo.sjson',
  85. video_id, 'Downloading video info JSON', query={
  86. 'id': video_id,
  87. })['video']
  88. title = mediainfo['title']
  89. creator = try_get(
  90. mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
  91. category = try_get(
  92. mediainfo, lambda x: x['brand-info']['category'], compat_str)
  93. categories = [category] if category else None
  94. return {
  95. 'id': video_id,
  96. 'title': title,
  97. 'description': mediainfo.get('short-description'),
  98. 'thumbnail': mediainfo.get('thumbnail'),
  99. 'duration': parse_duration(mediainfo.get('duration')),
  100. 'creator': creator,
  101. 'upload_date': unified_strdate(mediainfo.get('production-date')),
  102. 'webpage_url': mediainfo.get('url'),
  103. 'series': mediainfo.get('brand-value'),
  104. 'categories': categories,
  105. 'formats': formats,
  106. }