# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import compat_chr
from ..utils import (
    decode_packed_codes,
    determine_ext,
    ExtractorError,
    int_or_none,
    js_to_json,
    urlencode_postdata,
)
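

# aa_decode below reverses "AAEncode"-style JavaScript obfuscation: each
# character is encoded as an octal (or 'u'-prefixed hexadecimal) number built
# from kaomoji tokens, with '(゚Д゚)[゚ε゚]+' separating the characters.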
# based on openload_decode from 2bfeee69b976fe049761dd3012e30b637ee05a58
def aa_decode(aa_code):
    symbol_table = [
        ('7', '((゚ー゚) + (o^_^o))'),
        ('6', '((o^_^o) +(o^_^o))'),
        ('5', '((゚ー゚) + (゚Θ゚))'),
        ('2', '((o^_^o) - (゚Θ゚))'),
        ('4', '(゚ー゚)'),
        ('3', '(o^_^o)'),
        ('1', '(゚Θ゚)'),
        ('0', '(c^_^o)'),
    ]
    delim = '(゚Д゚)[゚ε゚]+'
    ret = ''
    for aa_char in aa_code.split(delim):
        for val, pat in symbol_table:
            aa_char = aa_char.replace(pat, val)
        aa_char = aa_char.replace('+ ', '')
        m = re.match(r'^\d+', aa_char)
        if m:
            ret += compat_chr(int(m.group(0), 8))
        else:
            m = re.match(r'^u([\da-f]+)', aa_char)
            if m:
                ret += compat_chr(int(m.group(1), 16))
    return ret
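

# Generic extractor for XFileShare-based hosting sites; each _SITES entry maps
# a host-name regex to the human-readable name used in IE_DESC.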
class XFileShareIE(InfoExtractor):
    _SITES = (
        (r'clipwatching\.com', 'ClipWatching'),
        (r'gounlimited\.to', 'GoUnlimited'),
        (r'govid\.me', 'GoVid'),
        (r'holavid\.com', 'HolaVid'),
        (r'streamty\.com', 'Streamty'),
        (r'thevideobee\.to', 'TheVideoBee'),
        (r'uqload\.com', 'Uqload'),
        (r'vidbom\.com', 'VidBom'),
        (r'vidlo\.us', 'vidlo'),
        (r'vidlocker\.xyz', 'VidLocker'),
        (r'vidshare\.tv', 'VidShare'),
        (r'vup\.to', 'VUp'),
        (r'xvideosharing\.com', 'XVideoSharing'),
    )

    IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
    _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
                  % '|'.join(site for site in list(zip(*_SITES))[0]))

    _FILE_NOT_FOUND_REGEXES = (
        r'>(?:404 - )?File Not Found<',
        r'>The file was removed by administrator<',
    )

    _TESTS = [{
        'url': 'http://xvideosharing.com/fq65f94nd2ve',
        'md5': '4181f63957e8fe90ac836fa58dc3c8a6',
        'info_dict': {
            'id': 'fq65f94nd2ve',
            'ext': 'mp4',
            'title': 'sample',
            'thumbnail': r're:http://.*\.jpg',
        },
    }]

    @staticmethod
    def _extract_urls(webpage):
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1'
                % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]),
                webpage)]
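
    # Overall flow: fetch the page (the embed page for govid.me and vidlo.us),
    # wait out any "download1" countdown form, undo packed/AAEncode script
    # obfuscation, then read formats from jwplayer data or fall back to
    # scraping file/src/file_link URLs from the page.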
    def _real_extract(self, url):
        host, video_id = re.match(self._VALID_URL, url).groups()

        url = 'https://%s/' % host + ('embed-%s.html' % video_id if host in ('govid.me', 'vidlo.us') else video_id)
        webpage = self._download_webpage(url, video_id)

        if any(re.search(p, webpage) for p in self._FILE_NOT_FOUND_REGEXES):
            raise ExtractorError('Video %s does not exist' % video_id, expected=True)

        fields = self._hidden_inputs(webpage)

        if fields.get('op') == 'download1':
            countdown = int_or_none(self._search_regex(
                r'<span id="countdown_str">(?:[Ww]ait)?\s*<span id="cxc">(\d+)</span>\s*(?:seconds?)?</span>',
                webpage, 'countdown', default=None))
            if countdown:
                self._sleep(countdown, video_id)

            webpage = self._download_webpage(
                url, video_id, 'Downloading video page',
                data=urlencode_postdata(fields), headers={
                    'Referer': url,
                    'Content-type': 'application/x-www-form-urlencoded',
                })

        title = (self._search_regex(
            (r'style="z-index: [0-9]+;">([^<]+)</span>',
             r'<td nowrap>([^<]+)</td>',
             r'h4-fine[^>]*>([^<]+)<',
             r'>Watch (.+)[ <]',
             r'<h2 class="video-page-head">([^<]+)</h2>',
             r'<h2 style="[^"]*color:#403f3d[^"]*"[^>]*>([^<]+)<',  # streamin.to
             r'title\s*:\s*"([^"]+)"'),  # govid.me
            webpage, 'title', default=None) or self._og_search_title(
            webpage, default=None) or video_id).strip()

        for regex, func in (
                (r'(eval\(function\(p,a,c,k,e,d\){.+)', decode_packed_codes),
                (r'(゚.+)', aa_decode)):
            obf_code = self._search_regex(regex, webpage, 'obfuscated code', default=None)
            if obf_code:
                webpage = webpage.replace(obf_code, func(obf_code))

        formats = []

        jwplayer_data = self._search_regex(
            [
                r'jwplayer\("[^"]+"\)\.load\(\[({.+?})\]\);',
                r'jwplayer\("[^"]+"\)\.setup\(({.+?})\);',
            ], webpage,
            'jwplayer data', default=None)
        if jwplayer_data:
            jwplayer_data = self._parse_json(
                jwplayer_data.replace(r"\'", "'"), video_id, js_to_json)
            if jwplayer_data:
                formats = self._parse_jwplayer_data(
                    jwplayer_data, video_id, False,
                    m3u8_id='hls', mpd_id='dash')['formats']

        if not formats:
            urls = []
            for regex in (
                    r'(?:file|src)\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
                    r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
                    r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
                    r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
                for mobj in re.finditer(regex, webpage):
                    video_url = mobj.group('url')
                    if video_url not in urls:
                        urls.append(video_url)

            sources = self._search_regex(
                r'sources\s*:\s*(\[(?!{)[^\]]+\])', webpage, 'sources', default=None)
            if sources:
                urls.extend(self._parse_json(sources, video_id))

            formats = []
            for video_url in urls:
                if determine_ext(video_url) == 'm3u8':
                    formats.extend(self._extract_m3u8_formats(
                        video_url, video_id, 'mp4',
                        entry_protocol='m3u8_native', m3u8_id='hls',
                        fatal=False))
                else:
                    formats.append({
                        'url': video_url,
                        'format_id': 'sd',
                    })
        self._sort_formats(formats)

        thumbnail = self._search_regex(
            [
                r'<video[^>]+poster="([^"]+)"',
                r'(?:image|poster)\s*:\s*["\'](http[^"\']+)["\'],',
            ], webpage, 'thumbnail', default=None)

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'formats': formats,
        }