You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

601 lines
22 KiB

  1. # coding: utf-8
  2. from __future__ import unicode_literals
  3. import functools
  4. import itertools
  5. import operator
  6. import re
  7. from .common import InfoExtractor
  8. from ..compat import (
  9. compat_HTTPError,
  10. compat_str,
  11. compat_urllib_request,
  12. )
  13. from .openload import PhantomJSwrapper
  14. from ..utils import (
  15. determine_ext,
  16. ExtractorError,
  17. int_or_none,
  18. NO_DEFAULT,
  19. orderedSet,
  20. remove_quotes,
  21. str_to_int,
  22. url_or_none,
  23. )
  24. class PornHubBaseIE(InfoExtractor):
  25. def _download_webpage_handle(self, *args, **kwargs):
  26. def dl(*args, **kwargs):
  27. return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)
  28. webpage, urlh = dl(*args, **kwargs)
  29. if any(re.search(p, webpage) for p in (
  30. r'<body\b[^>]+\bonload=["\']go\(\)',
  31. r'document\.cookie\s*=\s*["\']RNKEY=',
  32. r'document\.location\.reload\(true\)')):
  33. url_or_request = args[0]
  34. url = (url_or_request.get_full_url()
  35. if isinstance(url_or_request, compat_urllib_request.Request)
  36. else url_or_request)
  37. phantom = PhantomJSwrapper(self, required_version='2.0')
  38. phantom.get(url, html=webpage)
  39. webpage, urlh = dl(*args, **kwargs)
  40. return webpage, urlh
  41. class PornHubIE(PornHubBaseIE):
  42. IE_DESC = 'PornHub and Thumbzilla'
  43. _VALID_URL = r'''(?x)
  44. https?://
  45. (?:
  46. (?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
  47. (?:www\.)?thumbzilla\.com/video/
  48. )
  49. (?P<id>[\da-z]+)
  50. '''
  51. _TESTS = [{
  52. 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015',
  53. 'md5': '1e19b41231a02eba417839222ac9d58e',
  54. 'info_dict': {
  55. 'id': '648719015',
  56. 'ext': 'mp4',
  57. 'title': 'Seductive Indian beauty strips down and fingers her pink pussy',
  58. 'uploader': 'Babes',
  59. 'upload_date': '20130628',
  60. 'duration': 361,
  61. 'view_count': int,
  62. 'like_count': int,
  63. 'dislike_count': int,
  64. 'comment_count': int,
  65. 'age_limit': 18,
  66. 'tags': list,
  67. 'categories': list,
  68. },
  69. }, {
  70. # non-ASCII title
  71. 'url': 'http://www.pornhub.com/view_video.php?viewkey=1331683002',
  72. 'info_dict': {
  73. 'id': '1331683002',
  74. 'ext': 'mp4',
  75. 'title': '重庆婷婷女王足交',
  76. 'uploader': 'Unknown',
  77. 'upload_date': '20150213',
  78. 'duration': 1753,
  79. 'view_count': int,
  80. 'like_count': int,
  81. 'dislike_count': int,
  82. 'comment_count': int,
  83. 'age_limit': 18,
  84. 'tags': list,
  85. 'categories': list,
  86. },
  87. 'params': {
  88. 'skip_download': True,
  89. },
  90. }, {
  91. # subtitles
  92. 'url': 'https://www.pornhub.com/view_video.php?viewkey=ph5af5fef7c2aa7',
  93. 'info_dict': {
  94. 'id': 'ph5af5fef7c2aa7',
  95. 'ext': 'mp4',
  96. 'title': 'BFFS - Cute Teen Girls Share Cock On the Floor',
  97. 'uploader': 'BFFs',
  98. 'duration': 622,
  99. 'view_count': int,
  100. 'like_count': int,
  101. 'dislike_count': int,
  102. 'comment_count': int,
  103. 'age_limit': 18,
  104. 'tags': list,
  105. 'categories': list,
  106. 'subtitles': {
  107. 'en': [{
  108. "ext": 'srt'
  109. }]
  110. },
  111. },
  112. 'params': {
  113. 'skip_download': True,
  114. },
  115. }, {
  116. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d',
  117. 'only_matching': True,
  118. }, {
  119. # removed at the request of cam4.com
  120. 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862',
  121. 'only_matching': True,
  122. }, {
  123. # removed at the request of the copyright owner
  124. 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859',
  125. 'only_matching': True,
  126. }, {
  127. # removed by uploader
  128. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111',
  129. 'only_matching': True,
  130. }, {
  131. # private video
  132. 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7',
  133. 'only_matching': True,
  134. }, {
  135. 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
  136. 'only_matching': True,
  137. }, {
  138. 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
  139. 'only_matching': True,
  140. }, {
  141. 'url': 'https://www.pornhub.net/view_video.php?viewkey=203640933',
  142. 'only_matching': True,
  143. }]
  144. @staticmethod
  145. def _extract_urls(webpage):
  146. return re.findall(
  147. r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub\.(?:com|net)/embed/[\da-z]+)',
  148. webpage)
  149. def _extract_count(self, pattern, webpage, name):
  150. return str_to_int(self._search_regex(
  151. pattern, webpage, '%s count' % name, fatal=False))
  152. def _real_extract(self, url):
  153. mobj = re.match(self._VALID_URL, url)
  154. host = mobj.group('host') or 'pornhub.com'
  155. video_id = mobj.group('id')
  156. self._set_cookie(host, 'age_verified', '1')
  157. def dl_webpage(platform):
  158. self._set_cookie(host, 'platform', platform)
  159. return self._download_webpage(
  160. 'https://www.%s/view_video.php?viewkey=%s' % (host, video_id),
  161. video_id, 'Downloading %s webpage' % platform)
  162. webpage = dl_webpage('pc')
  163. error_msg = self._html_search_regex(
  164. r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>',
  165. webpage, 'error message', default=None, group='error')
  166. if error_msg:
  167. error_msg = re.sub(r'\s+', ' ', error_msg)
  168. raise ExtractorError(
  169. 'PornHub said: %s' % error_msg,
  170. expected=True, video_id=video_id)
  171. # video_title from flashvars contains whitespace instead of non-ASCII (see
  172. # http://www.pornhub.com/view_video.php?viewkey=1331683002), not relying
  173. # on that anymore.
  174. title = self._html_search_meta(
  175. 'twitter:title', webpage, default=None) or self._search_regex(
  176. (r'<h1[^>]+class=["\']title["\'][^>]*>(?P<title>[^<]+)',
  177. r'<div[^>]+data-video-title=(["\'])(?P<title>.+?)\1',
  178. r'shareTitle\s*=\s*(["\'])(?P<title>.+?)\1'),
  179. webpage, 'title', group='title')
  180. video_urls = []
  181. video_urls_set = set()
  182. subtitles = {}
  183. flashvars = self._parse_json(
  184. self._search_regex(
  185. r'var\s+flashvars_\d+\s*=\s*({.+?});', webpage, 'flashvars', default='{}'),
  186. video_id)
  187. if flashvars:
  188. subtitle_url = url_or_none(flashvars.get('closedCaptionsFile'))
  189. if subtitle_url:
  190. subtitles.setdefault('en', []).append({
  191. 'url': subtitle_url,
  192. 'ext': 'srt',
  193. })
  194. thumbnail = flashvars.get('image_url')
  195. duration = int_or_none(flashvars.get('video_duration'))
  196. media_definitions = flashvars.get('mediaDefinitions')
  197. if isinstance(media_definitions, list):
  198. for definition in media_definitions:
  199. if not isinstance(definition, dict):
  200. continue
  201. video_url = definition.get('videoUrl')
  202. if not video_url or not isinstance(video_url, compat_str):
  203. continue
  204. if video_url in video_urls_set:
  205. continue
  206. video_urls_set.add(video_url)
  207. video_urls.append(
  208. (video_url, int_or_none(definition.get('quality'))))
  209. else:
  210. thumbnail, duration = [None] * 2
  211. def extract_js_vars(webpage, pattern, default=NO_DEFAULT):
  212. assignments = self._search_regex(
  213. pattern, webpage, 'encoded url', default=default)
  214. if not assignments:
  215. return {}
  216. assignments = assignments.split(';')
  217. js_vars = {}
  218. def parse_js_value(inp):
  219. inp = re.sub(r'/\*(?:(?!\*/).)*?\*/', '', inp)
  220. if '+' in inp:
  221. inps = inp.split('+')
  222. return functools.reduce(
  223. operator.concat, map(parse_js_value, inps))
  224. inp = inp.strip()
  225. if inp in js_vars:
  226. return js_vars[inp]
  227. return remove_quotes(inp)
  228. for assn in assignments:
  229. assn = assn.strip()
  230. if not assn:
  231. continue
  232. assn = re.sub(r'var\s+', '', assn)
  233. vname, value = assn.split('=', 1)
  234. js_vars[vname] = parse_js_value(value)
  235. return js_vars
  236. def add_video_url(video_url):
  237. v_url = url_or_none(video_url)
  238. if not v_url:
  239. return
  240. if v_url in video_urls_set:
  241. return
  242. video_urls.append((v_url, None))
  243. video_urls_set.add(v_url)
  244. if not video_urls:
  245. FORMAT_PREFIXES = ('media', 'quality')
  246. js_vars = extract_js_vars(
  247. webpage, r'(var\s+(?:%s)_.+)' % '|'.join(FORMAT_PREFIXES),
  248. default=None)
  249. if js_vars:
  250. for key, format_url in js_vars.items():
  251. if any(key.startswith(p) for p in FORMAT_PREFIXES):
  252. add_video_url(format_url)
  253. if not video_urls and re.search(
  254. r'<[^>]+\bid=["\']lockedPlayer', webpage):
  255. raise ExtractorError(
  256. 'Video %s is locked' % video_id, expected=True)
  257. if not video_urls:
  258. js_vars = extract_js_vars(
  259. dl_webpage('tv'), r'(var.+?mediastring.+?)</script>')
  260. add_video_url(js_vars['mediastring'])
  261. for mobj in re.finditer(
  262. r'<a[^>]+\bclass=["\']downloadBtn\b[^>]+\bhref=(["\'])(?P<url>(?:(?!\1).)+)\1',
  263. webpage):
  264. video_url = mobj.group('url')
  265. if video_url not in video_urls_set:
  266. video_urls.append((video_url, None))
  267. video_urls_set.add(video_url)
  268. upload_date = None
  269. formats = []
  270. for video_url, height in video_urls:
  271. if not upload_date:
  272. upload_date = self._search_regex(
  273. r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
  274. if upload_date:
  275. upload_date = upload_date.replace('/', '')
  276. ext = determine_ext(video_url)
  277. if ext == 'mpd':
  278. formats.extend(self._extract_mpd_formats(
  279. video_url, video_id, mpd_id='dash', fatal=False))
  280. continue
  281. elif ext == 'm3u8':
  282. formats.extend(self._extract_m3u8_formats(
  283. video_url, video_id, 'mp4', entry_protocol='m3u8_native',
  284. m3u8_id='hls', fatal=False))
  285. continue
  286. tbr = None
  287. mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', video_url)
  288. if mobj:
  289. if not height:
  290. height = int(mobj.group('height'))
  291. tbr = int(mobj.group('tbr'))
  292. formats.append({
  293. 'url': video_url,
  294. 'format_id': '%dp' % height if height else None,
  295. 'height': height,
  296. 'tbr': tbr,
  297. })
  298. self._sort_formats(formats)
  299. video_uploader = self._html_search_regex(
  300. r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
  301. webpage, 'uploader', fatal=False)
  302. view_count = self._extract_count(
  303. r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
  304. like_count = self._extract_count(
  305. r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
  306. dislike_count = self._extract_count(
  307. r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
  308. comment_count = self._extract_count(
  309. r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
  310. def extract_list(meta_key):
  311. div = self._search_regex(
  312. r'(?s)<div[^>]+\bclass=["\'].*?\b%sWrapper[^>]*>(.+?)</div>'
  313. % meta_key, webpage, meta_key, default=None)
  314. if div:
  315. return re.findall(r'<a[^>]+\bhref=[^>]+>([^<]+)', div)
  316. return {
  317. 'id': video_id,
  318. 'uploader': video_uploader,
  319. 'upload_date': upload_date,
  320. 'title': title,
  321. 'thumbnail': thumbnail,
  322. 'duration': duration,
  323. 'view_count': view_count,
  324. 'like_count': like_count,
  325. 'dislike_count': dislike_count,
  326. 'comment_count': comment_count,
  327. 'formats': formats,
  328. 'age_limit': 18,
  329. 'tags': extract_list('tags'),
  330. 'categories': extract_list('categories'),
  331. 'subtitles': subtitles,
  332. }
  333. class PornHubPlaylistBaseIE(PornHubBaseIE):
  334. def _extract_entries(self, webpage, host):
  335. # Only process container div with main playlist content skipping
  336. # drop-down menu that uses similar pattern for videos (see
  337. # https://github.com/ytdl-org/youtube-dl/issues/11594).
  338. container = self._search_regex(
  339. r'(?s)(<div[^>]+class=["\']container.+)', webpage,
  340. 'container', default=webpage)
  341. return [
  342. self.url_result(
  343. 'http://www.%s/%s' % (host, video_url),
  344. PornHubIE.ie_key(), video_title=title)
  345. for video_url, title in orderedSet(re.findall(
  346. r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
  347. container))
  348. ]
  349. def _real_extract(self, url):
  350. mobj = re.match(self._VALID_URL, url)
  351. host = mobj.group('host')
  352. playlist_id = mobj.group('id')
  353. webpage = self._download_webpage(url, playlist_id)
  354. entries = self._extract_entries(webpage, host)
  355. playlist = self._parse_json(
  356. self._search_regex(
  357. r'(?:playlistObject|PLAYLIST_VIEW)\s*=\s*({.+?});', webpage,
  358. 'playlist', default='{}'),
  359. playlist_id, fatal=False)
  360. title = playlist.get('title') or self._search_regex(
  361. r'>Videos\s+in\s+(.+?)\s+[Pp]laylist<', webpage, 'title', fatal=False)
  362. return self.playlist_result(
  363. entries, playlist_id, title, playlist.get('description'))
  364. class PornHubUserIE(PornHubPlaylistBaseIE):
  365. _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?pornhub\.(?:com|net)/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/?#&]+))(?:[?#&]|/(?!videos)|$)'
  366. _TESTS = [{
  367. 'url': 'https://www.pornhub.com/model/zoe_ph',
  368. 'playlist_mincount': 118,
  369. }, {
  370. 'url': 'https://www.pornhub.com/pornstar/liz-vicious',
  371. 'info_dict': {
  372. 'id': 'liz-vicious',
  373. },
  374. 'playlist_mincount': 118,
  375. }, {
  376. 'url': 'https://www.pornhub.com/users/russianveet69',
  377. 'only_matching': True,
  378. }, {
  379. 'url': 'https://www.pornhub.com/channels/povd',
  380. 'only_matching': True,
  381. }, {
  382. 'url': 'https://www.pornhub.com/model/zoe_ph?abc=1',
  383. 'only_matching': True,
  384. }]
  385. def _real_extract(self, url):
  386. mobj = re.match(self._VALID_URL, url)
  387. user_id = mobj.group('id')
  388. return self.url_result(
  389. '%s/videos' % mobj.group('url'), ie=PornHubPagedVideoListIE.ie_key(),
  390. video_id=user_id)
  391. class PornHubPagedPlaylistBaseIE(PornHubPlaylistBaseIE):
  392. @staticmethod
  393. def _has_more(webpage):
  394. return re.search(
  395. r'''(?x)
  396. <li[^>]+\bclass=["\']page_next|
  397. <link[^>]+\brel=["\']next|
  398. <button[^>]+\bid=["\']moreDataBtn
  399. ''', webpage) is not None
  400. def _real_extract(self, url):
  401. mobj = re.match(self._VALID_URL, url)
  402. host = mobj.group('host')
  403. item_id = mobj.group('id')
  404. page = int_or_none(self._search_regex(
  405. r'\bpage=(\d+)', url, 'page', default=None))
  406. entries = []
  407. for page_num in (page, ) if page is not None else itertools.count(1):
  408. try:
  409. webpage = self._download_webpage(
  410. url, item_id, 'Downloading page %d' % page_num,
  411. query={'page': page_num})
  412. except ExtractorError as e:
  413. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
  414. break
  415. raise
  416. page_entries = self._extract_entries(webpage, host)
  417. if not page_entries:
  418. break
  419. entries.extend(page_entries)
  420. if not self._has_more(webpage):
  421. break
  422. return self.playlist_result(orderedSet(entries), item_id)
  423. class PornHubPagedVideoListIE(PornHubPagedPlaylistBaseIE):
  424. _VALID_URL = r'https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?P<id>(?:[^/]+/)*[^/?#&]+)'
  425. _TESTS = [{
  426. 'url': 'https://www.pornhub.com/model/zoe_ph/videos',
  427. 'only_matching': True,
  428. }, {
  429. 'url': 'http://www.pornhub.com/users/rushandlia/videos',
  430. 'only_matching': True,
  431. }, {
  432. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos',
  433. 'info_dict': {
  434. 'id': 'pornstar/jenny-blighe/videos',
  435. },
  436. 'playlist_mincount': 149,
  437. }, {
  438. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos?page=3',
  439. 'info_dict': {
  440. 'id': 'pornstar/jenny-blighe/videos',
  441. },
  442. 'playlist_mincount': 40,
  443. }, {
  444. # default sorting as Top Rated Videos
  445. 'url': 'https://www.pornhub.com/channels/povd/videos',
  446. 'info_dict': {
  447. 'id': 'channels/povd/videos',
  448. },
  449. 'playlist_mincount': 293,
  450. }, {
  451. # Top Rated Videos
  452. 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra',
  453. 'only_matching': True,
  454. }, {
  455. # Most Recent Videos
  456. 'url': 'https://www.pornhub.com/channels/povd/videos?o=da',
  457. 'only_matching': True,
  458. }, {
  459. # Most Viewed Videos
  460. 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi',
  461. 'only_matching': True,
  462. }, {
  463. 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public',
  464. 'only_matching': True,
  465. }, {
  466. # Most Viewed Videos
  467. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=mv',
  468. 'only_matching': True,
  469. }, {
  470. # Top Rated Videos
  471. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=tr',
  472. 'only_matching': True,
  473. }, {
  474. # Longest Videos
  475. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=lg',
  476. 'only_matching': True,
  477. }, {
  478. # Newest Videos
  479. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos?o=cm',
  480. 'only_matching': True,
  481. }, {
  482. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/paid',
  483. 'only_matching': True,
  484. }, {
  485. 'url': 'https://www.pornhub.com/pornstar/liz-vicious/videos/fanonly',
  486. 'only_matching': True,
  487. }, {
  488. 'url': 'https://www.pornhub.com/video',
  489. 'only_matching': True,
  490. }, {
  491. 'url': 'https://www.pornhub.com/video?page=3',
  492. 'only_matching': True,
  493. }, {
  494. 'url': 'https://www.pornhub.com/video/search?search=123',
  495. 'only_matching': True,
  496. }, {
  497. 'url': 'https://www.pornhub.com/categories/teen',
  498. 'only_matching': True,
  499. }, {
  500. 'url': 'https://www.pornhub.com/categories/teen?page=3',
  501. 'only_matching': True,
  502. }, {
  503. 'url': 'https://www.pornhub.com/hd',
  504. 'only_matching': True,
  505. }, {
  506. 'url': 'https://www.pornhub.com/hd?page=3',
  507. 'only_matching': True,
  508. }, {
  509. 'url': 'https://www.pornhub.com/described-video',
  510. 'only_matching': True,
  511. }, {
  512. 'url': 'https://www.pornhub.com/described-video?page=2',
  513. 'only_matching': True,
  514. }, {
  515. 'url': 'https://www.pornhub.com/video/incategories/60fps-1/hd-porn',
  516. 'only_matching': True,
  517. }, {
  518. 'url': 'https://www.pornhub.com/playlist/44121572',
  519. 'info_dict': {
  520. 'id': 'playlist/44121572',
  521. },
  522. 'playlist_mincount': 132,
  523. }, {
  524. 'url': 'https://www.pornhub.com/playlist/4667351',
  525. 'only_matching': True,
  526. }, {
  527. 'url': 'https://de.pornhub.com/playlist/4667351',
  528. 'only_matching': True,
  529. }]
  530. @classmethod
  531. def suitable(cls, url):
  532. return (False
  533. if PornHubIE.suitable(url) or PornHubUserIE.suitable(url) or PornHubUserVideosUploadIE.suitable(url)
  534. else super(PornHubPagedVideoListIE, cls).suitable(url))
  535. class PornHubUserVideosUploadIE(PornHubPagedPlaylistBaseIE):
  536. _VALID_URL = r'(?P<url>https?://(?:[^/]+\.)?(?P<host>pornhub\.(?:com|net))/(?:(?:user|channel)s|model|pornstar)/(?P<id>[^/]+)/videos/upload)'
  537. _TESTS = [{
  538. 'url': 'https://www.pornhub.com/pornstar/jenny-blighe/videos/upload',
  539. 'info_dict': {
  540. 'id': 'jenny-blighe',
  541. },
  542. 'playlist_mincount': 129,
  543. }, {
  544. 'url': 'https://www.pornhub.com/model/zoe_ph/videos/upload',
  545. 'only_matching': True,
  546. }]