Browse Source

[pornhub] Apply scrape detection bypass for all extractors

master-ytdl-org
Sergey M․ 6 years ago
parent
commit
71a1f61700
No known key found for this signature in database GPG Key ID: 2C393E0F18A9236D
1 changed file with 24 additions and 22 deletions
  1. +24
    -22
      youtube_dl/extractor/pornhub.py

+ 24
- 22
youtube_dl/extractor/pornhub.py View File

@ -24,7 +24,29 @@ from ..utils import (
) )
class PornHubIE(InfoExtractor):
class PornHubBaseIE(InfoExtractor):
    """Base extractor whose page downloads retry past scrape detection."""

    def _download_webpage_handle(self, *args, **kwargs):
        """Download a webpage, retrying once if a challenge page is served.

        All arguments are forwarded unchanged to the parent implementation.
        The first positional argument is the URL, or a
        ``compat_urllib_request.Request`` wrapping it.

        Returns the parent's result: normally a ``(webpage, urlh)`` pair.
        A falsy result (youtube-dl's ``InfoExtractor`` returns ``False`` for
        a non-fatal download failure) is passed through untouched.
        """
        def dl(*args, **kwargs):
            return super(PornHubBaseIE, self)._download_webpage_handle(*args, **kwargs)

        ret = dl(*args, **kwargs)

        # A failed non-fatal download is not a (webpage, urlh) pair, so it
        # must not be unpacked; hand it back to the caller as is.
        if not ret:
            return ret

        webpage, urlh = ret

        # These patterns match a page that runs go() on load, sets an RNKEY
        # cookie from script and reloads itself, i.e. not the real content.
        if any(re.search(p, webpage) for p in (
                r'<body\b[^>]+\bonload=["\']go\(\)',
                r'document\.cookie\s*=\s*["\']RNKEY=',
                r'document\.location\.reload\(true\)')):
            url_or_request = args[0]
            url = (url_or_request.get_full_url()
                   if isinstance(url_or_request, compat_urllib_request.Request)
                   else url_or_request)
            # NOTE(review): presumably PhantomJS evaluates the page script so
            # the required cookie is in place for the retry -- confirm
            # against PhantomJSwrapper.get.
            phantom = PhantomJSwrapper(self, required_version='2.0')
            phantom.get(url, html=webpage)
            # Second attempt; its result (pair or falsy) is returned as is.
            return dl(*args, **kwargs)

        return webpage, urlh
class PornHubIE(PornHubBaseIE):
IE_DESC = 'PornHub and Thumbzilla' IE_DESC = 'PornHub and Thumbzilla'
_VALID_URL = r'''(?x) _VALID_URL = r'''(?x)
https?:// https?://
@ -128,26 +150,6 @@ class PornHubIE(InfoExtractor):
'only_matching': True, 'only_matching': True,
}] }]
def _download_webpage_handle(self, *args, **kwargs):
    """Download a webpage, retrying once if a challenge page is served.

    All arguments are forwarded unchanged to the parent implementation.
    The first positional argument is the URL, or a
    ``compat_urllib_request.Request`` wrapping it.

    Returns the parent's result: normally a ``(webpage, urlh)`` pair.
    A falsy result (youtube-dl's ``InfoExtractor`` returns ``False`` for
    a non-fatal download failure) is passed through untouched.
    """
    def dl(*args, **kwargs):
        return super(PornHubIE, self)._download_webpage_handle(*args, **kwargs)

    ret = dl(*args, **kwargs)

    # A failed non-fatal download is not a (webpage, urlh) pair, so it
    # must not be unpacked; hand it back to the caller as is.
    if not ret:
        return ret

    webpage, urlh = ret

    # These patterns match a page that runs go() on load, sets an RNKEY
    # cookie from script and reloads itself, i.e. not the real content.
    if any(re.search(p, webpage) for p in (
            r'<body\b[^>]+\bonload=["\']go\(\)',
            r'document\.cookie\s*=\s*["\']RNKEY=',
            r'document\.location\.reload\(true\)')):
        url_or_request = args[0]
        url = (url_or_request.get_full_url()
               if isinstance(url_or_request, compat_urllib_request.Request)
               else url_or_request)
        # NOTE(review): presumably PhantomJS evaluates the page script so
        # the required cookie is in place for the retry -- confirm
        # against PhantomJSwrapper.get.
        phantom = PhantomJSwrapper(self, required_version='2.0')
        phantom.get(url, html=webpage)
        # Second attempt; its result (pair or falsy) is returned as is.
        return dl(*args, **kwargs)

    return webpage, urlh
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return re.findall( return re.findall(
@ -329,7 +331,7 @@ class PornHubIE(InfoExtractor):
} }
class PornHubPlaylistBaseIE(InfoExtractor):
class PornHubPlaylistBaseIE(PornHubBaseIE):
def _extract_entries(self, webpage, host): def _extract_entries(self, webpage, host):
# Only process container div with main playlist content skipping # Only process container div with main playlist content skipping
# drop-down menu that uses similar pattern for videos (see # drop-down menu that uses similar pattern for videos (see


Loading…
Cancel
Save