|
|
@ -346,6 +346,11 @@ class InfoExtractor(object): |
|
|
|
geo restriction bypass mechanism right away in order to bypass |
|
|
|
geo restriction, of course, if the mechanism is not disabled. (experimental) |
|
|
|
|
|
|
|
_GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted |
|
|
|
IP blocks in CIDR notation for this extractor. One of these IP blocks |
|
|
|
will be used by geo restriction bypass mechanism similarly |
|
|
|
to _GEO_COUNTRIES. (experimental) |
|
|
|
|
|
|
|
NB: both these geo attributes are experimental and may change in the future |
|
|
|
or be completely removed. |
|
|
|
|
|
|
@ -358,6 +363,7 @@ class InfoExtractor(object): |
|
|
|
# Fake client IP reported as X-Forwarded-For; stays None until
# _initialize_geo_bypass picks one.
_x_forwarded_for_ip = None |
|
|
|
# Class-level switch checked before a random IP block or country is taken
# from the extractor's own geo hints.
_GEO_BYPASS = True |
|
|
|
# Countries hint; initialize() hands it to _initialize_geo_bypass under
# the 'countries' key.
_GEO_COUNTRIES = None |
|
|
|
# IP blocks hint in CIDR notation; initialize() hands it to
# _initialize_geo_bypass under the 'ip_blocks' key.
_GEO_IP_BLOCKS = None |
|
|
|
_WORKING = True |
|
|
|
|
|
|
|
def __init__(self, downloader=None): |
|
|
@ -392,12 +398,15 @@ class InfoExtractor(object): |
|
|
|
|
|
|
|
def initialize(self):
    """Initializes an instance (authentication, etc).

    Sets up the geo restriction bypass mechanism from the extractor's
    _GEO_COUNTRIES and _GEO_IP_BLOCKS attributes, then runs
    _real_initialize() the first time only, guarded by self._ready.
    """
    # A single call with a context dict: it already carries the countries
    # under 'countries', so a separate countries-only call is redundant.
    self._initialize_geo_bypass({
        'countries': self._GEO_COUNTRIES,
        'ip_blocks': self._GEO_IP_BLOCKS,
    })
    if not self._ready:
        self._real_initialize()
        self._ready = True
|
|
|
|
|
|
|
def _initialize_geo_bypass(self, countries): |
|
|
|
def _initialize_geo_bypass(self, geo_bypass_context): |
|
|
|
""" |
|
|
|
Initialize geo restriction bypass mechanism. |
|
|
|
|
|
|
@ -408,28 +417,82 @@ class InfoExtractor(object): |
|
|
|
HTTP requests. |
|
|
|
|
|
|
|
This method will be used for initial geo bypass mechanism initialization |
|
|
|
during the instance initialization with _GEO_COUNTRIES. |
|
|
|
during the instance initialization with _GEO_COUNTRIES and |
|
|
|
_GEO_IP_BLOCKS. |
|
|
|
|
|
|
|
You may also manually call it from extractor's code if geo countries |
|
|
|
You may also manually call it from extractor's code if geo bypass |
|
|
|
information is not available beforehand (e.g. obtained during |
|
|
|
extraction) or due to some another reason. |
|
|
|
extraction) or due to some other reason. In this case you should pass |
|
|
|
this information in geo bypass context passed as first argument. It may |
|
|
|
contain the following fields: |
|
|
|
|
|
|
|
countries: List of geo unrestricted countries (similar |
|
|
|
to _GEO_COUNTRIES) |
|
|
|
ip_blocks: List of geo unrestricted IP blocks in CIDR notation |
|
|
|
(similar to _GEO_IP_BLOCKS) |
|
|
|
|
|
|
|
""" |
|
|
|
if not self._x_forwarded_for_ip: |
|
|
|
country_code = self._downloader.params.get('geo_bypass_country', None) |
|
|
|
# If there is no explicit country for geo bypass specified and |
|
|
|
# the extractor is known to be geo restricted let's fake IP |
|
|
|
# as X-Forwarded-For right away. |
|
|
|
if (not country_code and |
|
|
|
self._GEO_BYPASS and |
|
|
|
self._downloader.params.get('geo_bypass', True) and |
|
|
|
countries): |
|
|
|
country_code = random.choice(countries) |
|
|
|
if country_code: |
|
|
|
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) |
|
|
|
|
|
|
|
# Geo bypass mechanism is explicitly disabled by user |
|
|
|
if not self._downloader.params.get('geo_bypass', True): |
|
|
|
return |
|
|
|
|
|
|
|
if not geo_bypass_context: |
|
|
|
geo_bypass_context = {} |
|
|
|
|
|
|
|
# Backward compatibility: previously _initialize_geo_bypass |
|
|
|
# expected a list of countries, some 3rd party code may still use |
|
|
|
# it this way |
|
|
|
if isinstance(geo_bypass_context, (list, tuple)): |
|
|
|
geo_bypass_context = { |
|
|
|
'countries': geo_bypass_context, |
|
|
|
} |
|
|
|
|
|
|
|
# The whole point of geo bypass mechanism is to fake IP |
|
|
|
# as X-Forwarded-For HTTP header based on some IP block or |
|
|
|
# country code. |
|
|
|
|
|
|
|
# Path 1: bypassing based on IP block in CIDR notation |
|
|
|
|
|
|
|
# Explicit IP block specified by user, use it right away |
|
|
|
# regardless of whether extractor is geo bypassable or not |
|
|
|
ip_block = self._downloader.params.get('geo_bypass_ip_block', None) |
|
|
|
|
|
|
|
# Otherwise use random IP block from geo bypass context but only |
|
|
|
# if extractor is known as geo bypassable |
|
|
|
if not ip_block: |
|
|
|
ip_blocks = geo_bypass_context.get('ip_blocks') |
|
|
|
if self._GEO_BYPASS and ip_blocks: |
|
|
|
ip_block = random.choice(ip_blocks) |
|
|
|
|
|
|
|
if ip_block: |
|
|
|
self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) |
|
|
|
if self._downloader.params.get('verbose', False): |
|
|
|
self._downloader.to_screen( |
|
|
|
'[debug] Using fake IP %s as X-Forwarded-For.' |
|
|
|
% self._x_forwarded_for_ip) |
|
|
|
return |
|
|
|
|
|
|
|
# Path 2: bypassing based on country code |
|
|
|
|
|
|
|
# Explicit country code specified by user, use it right away |
|
|
|
# regardless of whether extractor is geo bypassable or not |
|
|
|
country = self._downloader.params.get('geo_bypass_country', None) |
|
|
|
|
|
|
|
# Otherwise use random country code from geo bypass context but |
|
|
|
# only if extractor is known as geo bypassable |
|
|
|
if not country: |
|
|
|
countries = geo_bypass_context.get('countries') |
|
|
|
if self._GEO_BYPASS and countries: |
|
|
|
country = random.choice(countries) |
|
|
|
|
|
|
|
if country: |
|
|
|
self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) |
|
|
|
if self._downloader.params.get('verbose', False): |
|
|
|
self._downloader.to_screen( |
|
|
|
'[debug] Using fake IP %s (%s) as X-Forwarded-For.' |
|
|
|
% (self._x_forwarded_for_ip, country_code.upper())) |
|
|
|
% (self._x_forwarded_for_ip, country.upper())) |
|
|
|
|
|
|
|
def extract(self, url): |
|
|
|
"""Extracts URL information and returns it in list of dicts.""" |
|
|
|