Coverage for youtube_dl.InfoExtractors: 58%

# Matched against the input URL in _real_extract: group 1 captures the query
# parameter name (p, a or list), group 2 the playlist id, and the optional
# group 3 a trailing id that _real_extract downloads as a single video.
_VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'

# Page URL template; _real_extract fills it with
# (access path, parameter name, playlist id, page number).
_TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'

# Regex template formatted with the playlist id; group 1 of every match in a
# downloaded page is collected as a video id.
_VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'

# Paging stops at the first downloaded page that does not contain this text.
_MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"

# Extractor name; not referenced by the code visible in this section.
IE_NAME = u'youtube:playlist'

def __init__(self, downloader=None):
    """Create the extractor, delegating all setup to InfoExtractor.__init__."""
    InfoExtractor.__init__(self, downloader)

def report_download_page(self, playlist_id, pagenum):
    """Print a progress line for the playlist page that is about to be fetched."""
    message = u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum)
    self._downloader.to_screen(message)

def _real_extract(self, url):
    """Download every video of a YouTube playlist.

    Playlist pages are fetched one after another until a page no longer
    contains the "more pages" marker. The collected video ids are then
    trimmed by the 'playliststart'/'playlistend' downloader options and
    each remaining video is handed to the downloader. Errors are reported
    through the downloader's trouble() and end the extraction.
    """
    match = re.match(self._VALID_URL, url)
    if match is None:
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        return

    # The URL names one particular video: download just that one
    single_video = match.group(3)
    if single_video is not None:
        self._downloader.download([single_video])
        return

    # 'p' is the default parameter name for playlists; artist lists need
    # their own parameter name and access path
    if match.group(1) == 'a':
        playlist_prefix, playlist_access = 'a', 'artist'
    else:
        playlist_prefix, playlist_access = 'p', 'view_play_list'
    playlist_id = match.group(2)

    video_pattern = self._VIDEO_INDICATOR_TEMPLATE % playlist_id
    video_ids = []
    pagenum = 0
    while True:
        pagenum += 1
        self.report_download_page(playlist_id, pagenum)
        page_url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(page_url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Collect the ids of this page, keeping first-seen order and
        # dropping repeats within the page
        ids_in_page = []
        for found in re.finditer(video_pattern, page):
            candidate = found.group(1)
            if candidate not in ids_in_page:
                ids_in_page.append(candidate)
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            break

    total = len(video_ids)

    first = self._downloader.params.get('playliststart', 1) - 1
    last = self._downloader.params.get('playlistend', -1)
    video_ids = video_ids[first:] if last == -1 else video_ids[first:last]

    if len(video_ids) == total:
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
    else:
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

    for video_id in video_ids:
        self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])

class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    # Group 1 is the channel id
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    # Filled with (channel id, page number)
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # A page without this text is the last one
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Print a progress line for the channel page that is about to be fetched."""
        message = u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Download every video listed on the pages of a YouTube channel.

        Channel pages are fetched until one lacks the "more pages" marker;
        every collected video is then handed to the downloader. Errors are
        reported through the downloader's trouble() and end the extraction.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return
        channel_id = match.group(1)

        video_ids = []
        pagenum = 0
        while True:
            pagenum += 1
            self.report_download_page(channel_id, pagenum)
            page_url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(page_url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect the ids of this page, keeping first-seen order and
            # dropping repeats within the page
            ids_in_page = []
            for found in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                candidate = found.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])

class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # Group 1 is the user name; the 'ytuser:' prefix is accepted as shorthand
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # Number of results requested per query
    _GDATA_PAGE_SIZE = 50
    # Filled with (user name, page size, 1-based start index)
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    # Group 1 is a video id
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Create the extractor, delegating all setup to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download one page of a user's video ids.

        start_index is the 1-based index of the first requested entry, so
        the last entry of the page is start_index + page size - 1.
        """
        last_index = start_index + self._GDATA_PAGE_SIZE - 1
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, last_index))

    def _real_extract(self, url):
        """Download the uploads of a YouTube user.

        Video ids are queried page by page; the result is trimmed by the
        'playliststart'/'playlistend' downloader options and every
        remaining video is handed to the downloader. Errors are reported
        through the downloader's trouble() and end the extraction.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, keeping first-seen order
            ids_in_page = []
            for found in re.finditer(self._VIDEO_INDICATOR, page):
                if found.group(1) not in ids_in_page:
                    ids_in_page.append(found.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])

class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # Group 1 is the user name; the 'bliptvuser:' prefix is accepted as shorthand
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Number of entries on a full listing page
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Create the extractor, delegating all setup to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Download all videos of a blip.tv user.

        The user page is fetched to find the numeric users id, the episode
        list is then queried page by page, trimmed by the
        'playliststart'/'playlistend' downloader options, and every
        remaining video is handed to the downloader. Errors are reported
        through the downloader's trouble() and end the extraction.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Without the users id there is nothing to query; report it instead
        # of failing with an AttributeError on the missing match
        users_id_match = re.search(r'data-users-id="([^"]+)"', page)
        if users_id_match is None:
            self._downloader.trouble(u'ERROR: unable to extract users id from: %s' % url)
            return
        page_base = page_base % users_id_match.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request(page_base + "&page=" + str(pagenum))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers. Unescape before the duplicate
            # check so the stored (unescaped) values are what is compared.
            ids_in_page = []
            for found in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(found.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])

class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
    IE_NAME = u'DepositFiles'

    def __init__(self, downloader=None):
        """Create the extractor, delegating all setup to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Return a one-element list describing the file behind a depositfiles URL.

        The file page is requested in the English locale with the
        'Free download' form submitted, and the real file URL and title
        are scraped from the response. On failure the problem is reported
        through the downloader's trouble() and None is returned.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed.
        # The POST body is encoded because urllib on Python 3 only accepts bytes.
        free_download_indication = {'gateway_result': '1'}
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once at the I/O boundary so every regex below works on text
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1)

        # The id comes from the caller's URL, which may still be a byte string
        if not isinstance(file_id, compat_str):
            file_id = file_id.decode('utf-8')

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]

class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _WORKING = False
    # The named group ID is the numeric video id
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Listed in order of preference (the first available one is the default)
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
        'video': 'mp4',
        'highqual': 'mp4',
        'lowqual': 'mp4',
    }
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        """Create the extractor, delegating all setup to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page.

        Returns a dict holding whichever of 'title', 'description', 'owner'
        and 'thumbnail' could be found, plus 'video_urls', a dict mapping
        each found format name to its URL.
        """
        # General data. The values sit inside Javascript calls of the form
        # ("key", "value"), so the parentheses are matched literally.
        data = {
            'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail': r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        }
        video_info = {}
        for piece, pattern in data.items():
            mobj = re.search(pattern, video_webpage)
            if mobj is not None:
                # NOTE(review): .decode() here assumes the page is a byte string - confirm for Python 3
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Video urls
        video_urls = {}
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

        return video_info

    def _real_initialize(self):
        """Log in when credentials come from the downloader options or .netrc.

        Does nothing without a downloader or without credentials; login
        problems are only reported as warnings.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # Getting the login form back means the login was rejected
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a list of info dicts, one per selected format of the video.

        On failure the problem is reported through the downloader's
        trouble() and None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        try:
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # uploader
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
            return
        video_uploader = video_info['owner']

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # thumbnail image
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        else:
            video_thumbnail = video_info['thumbnail']

        # upload date
        upload_date = None
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                try:
                    upload_date = time.strftime('%Y%m%d', timetuple[0:9])
                except (ValueError, TypeError, OverflowError):
                    # Best effort only: an unusable date leaves upload_date as None
                    pass

        # description
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']
        if not url_map:
            # Without this guard the format loop below would hit an unbound
            # video_url_list
            self._downloader.trouble(u'ERROR: unable to extract video URLs')
            return

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
        else:
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if not existing_formats:
            self._downloader.trouble(u'ERROR: no known formats available for video')
            return
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])]  # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])]  # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats]  # All formats
        else:
            # Specific format
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
                return
            video_url_list = [(req_format, url_map[req_format])]  # Specific format

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'mp4')

            results.append({
                'id': video_id.decode('utf-8'),
                'url': video_real_url.decode('utf-8'),
                'uploader': video_uploader.decode('utf-8'),
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension.decode('utf-8'),
                'format': u'NA' if format_param is None else format_param.decode('utf-8'),
                'thumbnail': video_thumbnail.decode('utf-8'),
                'description': video_description.decode('utf-8'),
            })
        return results

class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Group 1 is the filename extension of a media URL
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL turned out to serve the video file itself."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Return a one-element list describing the video behind a blip.tv URL.

        The URL is requested with JSON skin parameters. A response with a
        video Content-Type is treated as a direct download and its open
        handle is passed along as 'urlhandle'; otherwise the JSON document
        is parsed. On failure the problem is reported through the
        downloader's trouble() and None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append to an existing query string, or start a new one
        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Only byte strings need decoding; text strings have no
                # .decode() on Python 3
                if not isinstance(title, compat_str):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
            return
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in a 'Post' entry or given directly
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl']
                }
            except (ValueError, KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        # NOTE(review): this overwrites the shared std_headers User-Agent,
        # presumably so the media download is accepted - confirm the intent
        std_headers['User-Agent'] = 'iTunes/10.6.1'
        return [info]

class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # Group 1 is the numeric video id
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Create the extractor, delegating all setup to InfoExtractor.__init__."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Return a one-element list describing the video behind a myvideo.de URL.

        The media URL is derived from the thumbnail link of the watch page.
        On failure the problem is reported through the downloader's
        trouble() and None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail link carries the base path under which the .flv is served
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]

class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for full episodes (the part after full-episodes/ may be empty)
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Verbose pattern: it must always be matched with re.VERBOSE.
    # NOTE(review): the opening line with the 'shortname' group was missing
    # and has been reconstructed from the group names _real_extract reads -
    # confirm the list of abbreviations.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""
    IE_NAME = u'comedycentral'

    # Bitrates, highest first
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report download of a media configuration."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report download of the show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report resolution of the player URL."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print each given format with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Return a list of info dicts, one per media item of the episode or clip.

        Abbreviated URLs are expanded to the show's full-episodes page; a
        full-episodes URL without an episode is resolved through the
        server's redirect. On failure the problem is reported through the
        downloader's trouble() and None is returned. With the
        'listformats' option the formats of the first item are printed and
        None is returned.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No episode in the URL means "download the newest one"
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The request was redirected; the final URL names the episode
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if not mMovieParams:
            # The Colbert Report embeds the information without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if not altMovieParams:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        try:
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            # Resolved but not used below ('player_url' is returned as None)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))
            return

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, url) pairs in document order
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if not turls:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # Default to the last rendition (presumably the highest bitrate -
            # depends on the feed's ordering)
            video_format, video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    video_format, video_url = f, v
                    break

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': video_format,
                'thumbnail': None,
                'description': officialTitle,
                'player_url': None  # playerUrl
            }

            results.append(info)

        return results

class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    The video page is scraped for its description, thumbnail and player
    URL.  The player URL carries a ``config=`` parameter pointing at a
    JavaScript configuration whose ``playlist`` holds the media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Every failure is reported through the downloader's trouble()
        method, after which None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Use the charset announced in the header, UTF-8 otherwise.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        # Description and thumbnail are only metadata: when the page lacks
        # them the fields are set to None instead of crashing on a failed
        # regex match.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL is mandatory: it is the only pointer to the
        # configuration that contains the media URL.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.trouble(u'ERROR: unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The media URL is read from the second playlist entry.
        # NOTE(review): presumably the first entry is an intro/advert clip
        # -- confirm against a live configuration.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.trouble(u'ERROR: unable to extract video URL from configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]

class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Two XML documents are fetched: the "moogaloop" metadata for the
        video id taken from the URL, then the f4m manifest it references.
        Failures are reported through trouble() and yield None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            # From here on video_id is the manifest's <id>; info['id'] keeps
            # the numeric id taken from the page URL.
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # IndexError: a required element is missing.
            # KeyError: the <media> element has no 'url' attribute.
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        # The fragment URL lives on the manifest's host.
        # NOTE(review): the manifest id is used without its last two
        # characters; the reason is not visible here -- confirm.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]

class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Build the info dict for the video behind *url*.

        Returns a one-element list on success.  Each failure is reported
        through trouble() and makes the method return None.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # The page is always fetched from the canonical address built
        # from the numeric id, not from the URL given by the caller.
        page_request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        try:
            raw_page = compat_urllib_request.urlopen(page_request).read()
            webpage = raw_page.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)

        # Media URL: percent-encoded value of the flv_url parameter
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: text of the <title> element in front of the site suffix
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the complete matched address is used, not the group
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]

class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    The track page URL is resolved to the track's metadata through the
    api.soundcloud.com ``resolve.json`` endpoint.  The track id found
    there is used to query the ``streams`` endpoint, whose
    'http_mp3_128_url' entry is returned as the media URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream information."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Download and parsing failures are reported through trouble() and
        make the method return None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title
        slug_title = mobj.group(2)
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # A malformed or unexpected API answer is reported instead of
        # surfacing as an unhandled exception.
        try:
            info = json.loads(info_json)
            video_id = info['id']
        except (ValueError, KeyError, TypeError) as err:
            self._downloader.trouble(u'ERROR: unable to parse track information: %s' % compat_str(err))
            return

        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        try:
            streams = json.loads(stream_json)
            return [{
                'id': info['id'],
                'url': streams['http_mp3_128_url'],
                'uploader': info['user']['username'],
                # Passed through exactly as delivered by the API.
                'upload_date': info['created_at'],
                'title': info['title'],
                'ext': u'mp3',
                'description': info['description'],
            }]
        except (ValueError, KeyError, TypeError) as err:
            self._downloader.trouble(u'ERROR: unable to extract stream information: %s' % compat_str(err))
            return

class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
    IE_NAME = u'infoq'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The page embeds the stream path as a base64 encoded,
        percent-quoted ``jsclassref`` value.  Failures are reported
        through trouble() and yield None.
        """
        # Function-scope import: keeps this block self-contained.
        import base64

        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        try:
            webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return
        # Decode once at the I/O boundary so the str patterns below work on
        # Python 2 and 3 alike (on Python 3 a str pattern cannot be applied
        # to bytes).
        webpage = webpage_bytes.decode('utf-8', 'replace')

        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        try:
            # The 'base64' string codec exists on Python 2 only; the base64
            # module works on both.  It signals bad input with TypeError
            # (Python 2) or a ValueError subclass (Python 3).
            real_id = base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8')
        except (TypeError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(real_id)

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Split at the LAST dot so names containing several dots work too.
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            self._downloader.trouble(u'ERROR: unable to extract video id and extension')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]

class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping from bitrate to a list of URLs
        or, when there is no bitrate information, the list of URLs itself.
        If *bitrate* is None, 'best' or unknown, max() of the available
        bitrates is used.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest
            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None if none can be opened"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead link, try the next candidate
                continue
        return None

    def _print_formats(self, formats):
        """Print one line per format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Failures are reported through trouble() and yield None; so does
        the 'listformats' mode after printing the formats.
        """
        # NOTE(review): the .decode('utf-8') calls below require Python 2
        # byte strings (str has no decode() on Python 3).  The extractor
        # is flagged _WORKING = False, so they are left untouched.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        # Start from "nothing found" so that an empty format map or a set
        # of dead links is detected below instead of crashing.
        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats.keys():
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        if file_url is None:
            self._downloader.trouble(u'ERROR: unable to find a working download URL')
            return

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (u'NA' if format_param is None else format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]

class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Announce that the webpage for *objid* is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract a single video, a whole course or the whole site.

        The URL's ``course`` and ``video`` parameters select the mode:
        both present means one video, only ``course`` means every video
        linked from the course page, and neither means every course linked
        from the home page.  Course and site mode recurse via extract().
        Failures are reported through trouble() and yield None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        course = mobj.group('course')
        video = mobj.group('video')

        if course and video:
            # --- A specific video ---
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
                return

            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                title_nodes = mdoc.findall('./title')
                info['title'] = title_nodes[0].text
                file_nodes = mdoc.findall('./videoFile')
                info['url'] = baseUrl + file_nodes[0].text
            except IndexError:
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
                return

            # Whatever follows the last dot of the media URL
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]

        if course:
            # --- A course page ---
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            try:
                coursepage = compat_urllib_request.urlopen(url).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
                return

            heading = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(heading.group(1)) if heading else info['id']

            summary = re.search('<description>([^<]+)</description>', coursepage)
            if summary:
                info['description'] = unescapeHTML(summary.group(1))

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = []
            for vpage in links:
                info['list'].append({
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                })

            # Hand every referenced video page back to this extractor.
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results.extend(self.extract(entry['url']))
            return results

        # --- Root page ---
        info = {
            'id': 'Stanford OpenClassroom',
            'type': 'playlist',
            'uploader': None,
            'upload_date': None,
        }

        self.report_download_webpage(info['id'])
        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        try:
            rootpage = compat_urllib_request.urlopen(rootURL).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
            return

        info['title'] = info['id']

        links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
        info['list'] = []
        for cpage in links:
            info['list'].append({
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
            })

        # Hand every referenced course page back to this extractor.
        results = []
        for entry in info['list']:
            assert entry['type'] == 'reference'
            results.extend(self.extract(entry['url']))
        return results

class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        Song name, performer, URI and content id are scraped from the
        video page and used to request the "mediaGen" XML, whose last
        <rendition> is selected.  Failures are reported through trouble()
        and yield None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')
        self.report_webpage(video_id)

        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        # NOTE(review): the page is searched as raw bytes and the matched
        # groups are decoded as ISO-8859-1; this relies on Python 2 byte
        # strings.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract song name')
            return
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # Report an empty document instead of failing on renditions[-1].
            self._downloader.trouble(u'ERROR: unable to extract video renditions')
            return

        # For now, always pick the highest quality.
        # NOTE(review): this takes the LAST rendition; that it is the best
        # one presumably relies on the document's ordering -- confirm.
        rendition = renditions[-1]

        try:
            _, _, ext = rendition.attrib['type'].partition('/')
            video_format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            # find() returns None for a missing <src>; the resulting
            # AttributeError is handled like a missing attribute.
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': video_format,
        }

        return [info]

class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com

    A video is delivered as a list of segments; one info dict is
    returned per segment.
    """

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
    IE_NAME = u'Youku'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)

    def _gen_sid(self):
        """Return a session id string.

        It is the current time in milliseconds followed by two random
        four-digit numbers.
        """
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)

        return "%d%d%d" % (nowTime, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the file id alphabet, shuffled deterministically by *seed*.

        The result is a list of single characters.  Each step advances a
        linear congruential sequence and moves the character it selects
        from the remaining alphabet to the end of the result.
        """
        mixed = []
        # The backslash is part of the alphabet; it is written escaped so
        # that the literal contains no invalid escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # Every character is unique, so popping by index is the same
            # as removing by value.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode *fileId*: '*'-separated indices into the shuffled alphabet."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return a list with one info dict per segment of the video.

        Failures are reported through trouble() and yield None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            video_format = self._downloader.params.get('format', None)
            supported_format = config['data'][0]['streamfileids'].keys()

            if video_format is None or video_format == 'best':
                if 'hd2' in supported_format:
                    video_format = 'hd2'
                else:
                    video_format = 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                # Any other requested format falls back to 'flv'.
                video_format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][video_format]
            keys = [s['k'] for s in config['data'][0]['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError: 'data' is an empty list.
            # TypeError: the payload is not the expected dict/list nesting.
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 (0-based) of the file id are replaced by the
        # segment number, written as two upper-case hex digits.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info

class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Announce that the video webpage is being downloaded."""
        message = u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Build the info dict for the video behind *url*.

        Returns a one-element list on success.  Each failure is reported
        through trouble() and makes the method return None.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page
        try:
            raw_page = compat_urllib_request.urlopen(url).read()
            webpage = raw_page.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # Media URL (percent-encoded on the page)
        found = re.search(self.VIDEO_URL_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(found.group(1))

        # Title
        found = re.search(self.VIDEO_TITLE_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = found.group(1)

        # Thumbnail (used exactly as it appears on the page)
        found = re.search(self.VIDEO_THUMB_RE, webpage)
        if found is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = found.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }
        return [info]

class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for *url*.

        The post page provides date, uploader and title, plus a link to
        the video page; the video page lists the media links.  Failures
        are reported through trouble() and yield None.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                parsed_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d")
            except ValueError:
                # The date is optional metadata: an unexpected timestamp
                # format leaves it unset instead of aborting.
                pass
            else:
                upload_date = parsed_date.strftime('%Y%m%d')
                self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
            self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
            self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video page URL')
            # Stop here: without a match there is no video page to fetch.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes from the video page.  Every
        # match is a (number, url) pair of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            self._downloader.trouble(u'ERROR: unable to extract video links')
            return

        # Pick the link with the greatest captured number.  The comparison
        # must be numeric: as strings '720' sorts after '1080'.  Ties are
        # decided by the URL so the choice stays deterministic.
        # NOTE(review): the captured number is treated as the resolution,
        # which is what the earlier comments called it -- confirm.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))

        # Only the url is needed; the number has no use anymore
        video_url = best_link[1]

        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]

class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos"""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def report_extraction(self, video_id):
        """Announce that information is being extracted."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Build the info dict for the video behind *url*.

        The media URL is derived from the path of *url*; the page itself
        only supplies title, date and description.  Returns a one-element
        list, or None after reporting a failure through trouble().
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # The id is the URL path after ".../video", minus a trailing
        # "/index.html".
        video_id = url_match.group(1)
        index_suffix = '/index.html'
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        self.report_extraction(video_id)
        try:
            page_handle = compat_urllib_request.urlopen(url)
            raw_page = page_handle.read()
            webpage = raw_page.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _find_prop(rexp, default=None):
            """Return the unescaped first group of *rexp* in the page, else *default*."""
            found = re.search(rexp, webpage)
            if not found:
                return default
            return unescapeHTML(found.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        # Without an og:title the last path component serves as the title.
        raw_title = _find_prop(r'<meta property="og:title" content="(.*?)"', shortened_video_id)
        title = raw_title.replace('NBA.com: ', '')

        # NOTE(review): the key is 'uploader_date' while the other
        # extractors in this file use 'upload_date' -- kept as is, confirm
        # which one is intended.
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'uploader_date': _find_prop(r'<b>Date:</b> (.*?)</div>'),
            'description': _find_prop(r'<div class="description">(.*?)</h1>'),
        }
        return [info]

class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""

    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Verbose-mode pattern: group 1 is the channel name, optional group 2 is
    # the id that follows "/b/" for a single archived broadcast.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    # Number of entries requested per API page.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one JSON page from *url* and convert it to info dicts.

        Returns a tuple (count, items): count is the number of entries in
        the JSON response and items is the list of info dictionaries for
        the *valid* entries, i.e. those with a non-empty 'video_file_url'.

        On any failure (download error, invalid JSON, or a response that is
        not a list) the problem is reported to the downloader and (0, [])
        is returned, so callers can always unpack the result.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
            return (0, [])

        try:
            response = json.loads(webpage)
        except ValueError as err:
            self._downloader.trouble(u'ERROR: unable to parse video info JSON: %s' % compat_str(err))
            return (0, [])

        # The loop below indexes every entry by key, so anything other than
        # a list of clip objects cannot be processed.
        if not isinstance(response, list):
            error_text = u'unknown error'
            if isinstance(response, dict):
                error_text = response.get('error', error_text)
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
            return (0, [])

        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                # Entry without a downloadable file: counted, but not returned.
                continue
            video_extension = os.path.splitext(video_url)[1][1:]
            # Keep the first ten characters of 'created_on' and drop the dashes.
            video_date = clip['created_on'][:10].replace('-', '')
            info.append({
                'id': clip['id'],
                'url': video_url,
                'title': clip['title'],
                'uploader': clip.get('user_id', clip.get('channel_id')),
                'upload_date': video_date,
                'ext': video_extension,
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the list of info dictionaries for a channel or single clip URL.

        A URL that only names a channel is fetched page by page from the
        channel archive endpoint; a URL with a "/b/<id>" part is fetched
        with a single request to the clip endpoint. Returns None (after
        reporting the problem) when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        # lastindex is 1 when only the channel group matched, 2 when the
        # "/b/<id>" group matched as well.
        video_id = mobj.group(mobj.lastindex)
        paged = False
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/clip/show/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A page with a count other than the limit (including a failed
            # page, which reports 0) ends the paging.
            if not paged or page_count != limit:
                break
            offset += limit
        return info

Coverage for youtube_dl.InfoExtractors : 58%

2213 statements 1290 run 923 missing 0 excluded