|
|
@ -15,6 +15,7 @@ __authors__ = ( |
|
|
|
'Kevin Ngo', |
|
|
|
'Ori Avtalion', |
|
|
|
'shizeeg', |
|
|
|
'Filippo Valsorda', |
|
|
|
) |
|
|
|
|
|
|
|
__license__ = 'Public Domain' |
|
|
@ -66,11 +67,6 @@ try: |
|
|
|
except ImportError: |
|
|
|
from cgi import parse_qs |
|
|
|
|
|
|
|
try: |
|
|
|
import lxml.etree |
|
|
|
except ImportError: |
|
|
|
pass # Handled below |
|
|
|
|
|
|
|
try: |
|
|
|
import xml.etree.ElementTree |
|
|
|
except ImportError: # Python<2.5: Not officially supported, but let it slip |
|
|
@ -197,6 +193,69 @@ except ImportError: # Python <2.6, use trivialjson (https://github.com/phihag/tr |
|
|
|
raise ValueError('Extra data at end of input (index ' + str(i) + ' of ' + repr(s) + ': ' + repr(s[i:]) + ')') |
|
|
|
return res |
|
|
|
|
|
|
|
|
|
|
|
class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Feed a document with loads(), then call get_result() to obtain the raw
    markup found between the opening tag carrying the requested id and its
    matching closing tag.
    """

    def __init__(self, id):
        """Prepare a parser that looks for the element whose id equals *id*."""
        self.id = id
        # Becomes [tag, start_pos, end_pos] once the element has been fully
        # seen; the positions are the tuples returned by getpos().
        self.result = None
        # True while the parser is inside the requested element.
        self.started = False
        # Per-tag-name count of currently open tags inside the element.
        self.depth = {}
        self.html = None
        # True between the opening tag and the first event that follows it.
        self.watch_startpos = False
        HTMLParser.HTMLParser.__init__(self)

    def loads(self, html):
        """Parse *html* completely, keeping it for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            # A nested tag may be the first thing after the opening tag.
            self.find_startpos(None)
        if 'id' in attrs and attrs['id'] == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            if tag not in self.depth:
                self.depth[tag] = 0
            self.depth[tag] += 1

    def handle_endtag(self, tag):
        if not self.started:
            return
        # The closing tag can directly follow the opening tag (empty element).
        # Record the start position here as well, otherwise the result would
        # be left with only two entries and be reported as "not found".
        self.find_startpos(None)
        if tag in self.depth:
            self.depth[tag] -= 1
        if self.depth[self.result[0]] == 0:
            self.started = False
            self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())

    # Any event following the opening tag marks where the content begins.
    handle_entityref = handle_charref = handle_data = handle_comment = \
        handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped markup inside the requested element.

        Returns None when the element was not found or was never closed.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        # Line numbers from getpos() are 1-based, columns are 0-based.
        lines = self.html.split('\n')[start_line - 1:end_line]
        lines[0] = lines[0][start_col:]
        if len(lines) == 1:
            # Start and end share a line that has just been shortened by
            # start_col characters, so shift the end column accordingly.
            end_col -= start_col
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()
|
|
|
|
|
|
|
def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    # A fresh parser per call: IDParser keeps per-document state.
    id_parser = IDParser(id)
    id_parser.loads(html)
    content = id_parser.get_result()
    return content
|
|
|
|
|
|
|
|
|
|
|
def preferredencoding(): |
|
|
|
"""Get preferred encoding. |
|
|
|
|
|
|
@ -241,6 +300,18 @@ def htmlentity_transform(matchobj): |
|
|
|
return (u'&%s;' % entity) |
|
|
|
|
|
|
|
|
|
|
|
def clean_html(html): |
|
|
|
"""Clean an HTML snippet into a readable string""" |
|
|
|
# Newline vs <br /> |
|
|
|
html = html.replace('\n', ' ') |
|
|
|
html = re.sub('\s*<\s*br\s*/?\s*>\s*', '\n', html) |
|
|
|
# Strip html tags |
|
|
|
html = re.sub('<.*?>', '', html) |
|
|
|
# Replace html entities |
|
|
|
html = re.sub(ur'(?u)&(.+?);', htmlentity_transform, html) |
|
|
|
return html |
|
|
|
|
|
|
|
|
|
|
|
def sanitize_title(utitle): |
|
|
|
"""Sanitizes a video title so it could be used as part of a filename.""" |
|
|
|
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle) |
|
|
@ -1419,18 +1490,9 @@ class YoutubeIE(InfoExtractor): |
|
|
|
pass |
|
|
|
|
|
|
|
# description |
|
|
|
try: |
|
|
|
lxml.etree |
|
|
|
except NameError: |
|
|
|
video_description = u'No description available.' |
|
|
|
mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage) |
|
|
|
if mobj is not None: |
|
|
|
video_description = mobj.group(1).decode('utf-8') |
|
|
|
else: |
|
|
|
html_parser = lxml.etree.HTMLParser(encoding='utf-8') |
|
|
|
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser) |
|
|
|
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()')) |
|
|
|
# TODO use another parser |
|
|
|
video_description = get_element_by_id("eow-description", video_webpage) |
|
|
|
if video_description: video_description = clean_html(video_description.decode('utf8')) |
|
|
|
else: video_description = '' |
|
|
|
|
|
|
|
# closed captions |
|
|
|
video_subtitles = None |
|
|
@ -2164,18 +2226,9 @@ class VimeoIE(InfoExtractor): |
|
|
|
video_thumbnail = config["video"]["thumbnail"] |
|
|
|
|
|
|
|
# Extract video description |
|
|
|
try: |
|
|
|
lxml.etree |
|
|
|
except NameError: |
|
|
|
video_description = u'No description available.' |
|
|
|
mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE) |
|
|
|
if mobj is not None: |
|
|
|
video_description = mobj.group(1) |
|
|
|
else: |
|
|
|
html_parser = lxml.etree.HTMLParser() |
|
|
|
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser) |
|
|
|
video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip() |
|
|
|
# TODO use another parser |
|
|
|
video_description = get_element_by_id("description", webpage) |
|
|
|
if video_description: video_description = clean_html(video_description.decode('utf8')) |
|
|
|
else: video_description = '' |
|
|
|
|
|
|
|
# Extract upload date |
|
|
|
video_upload_date = u'NA' |
|
|
@ -3342,8 +3395,6 @@ class EscapistIE(InfoExtractor): |
|
|
|
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
htmlParser = HTMLParser.HTMLParser() |
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
if mobj is None: |
|
|
|
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) |
|
|
@ -3359,11 +3410,11 @@ class EscapistIE(InfoExtractor): |
|
|
|
return |
|
|
|
|
|
|
|
descMatch = re.search('<meta name="description" content="([^"]*)"', webPage) |
|
|
|
description = htmlParser.unescape(descMatch.group(1)) |
|
|
|
description = unescapeHTML(descMatch.group(1)) |
|
|
|
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage) |
|
|
|
imgUrl = htmlParser.unescape(imgMatch.group(1)) |
|
|
|
imgUrl = unescapeHTML(imgMatch.group(1)) |
|
|
|
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage) |
|
|
|
playerUrl = htmlParser.unescape(playerUrlMatch.group(1)) |
|
|
|
playerUrl = unescapeHTML(playerUrlMatch.group(1)) |
|
|
|
configUrlMatch = re.search('config=(.*)$', playerUrl) |
|
|
|
configUrl = urllib2.unquote(configUrlMatch.group(1)) |
|
|
|
|
|
|
@ -3422,8 +3473,6 @@ class CollegeHumorIE(InfoExtractor): |
|
|
|
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
htmlParser = HTMLParser.HTMLParser() |
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
if mobj is None: |
|
|
|
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) |
|
|
@ -3494,8 +3543,6 @@ class XVideosIE(InfoExtractor): |
|
|
|
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
htmlParser = HTMLParser.HTMLParser() |
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
if mobj is None: |
|
|
|
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) |
|
|
@ -3584,8 +3631,6 @@ class SoundcloudIE(InfoExtractor): |
|
|
|
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
htmlParser = HTMLParser.HTMLParser() |
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
if mobj is None: |
|
|
|
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) |
|
|
@ -3673,8 +3718,6 @@ class InfoQIE(InfoExtractor): |
|
|
|
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)) |
|
|
|
|
|
|
|
def _real_extract(self, url): |
|
|
|
htmlParser = HTMLParser.HTMLParser() |
|
|
|
|
|
|
|
mobj = re.match(self._VALID_URL, url) |
|
|
|
if mobj is None: |
|
|
|
self._downloader.trouble(u'ERROR: invalid URL: %s' % url) |
|
|
@ -3908,8 +3951,6 @@ class StanfordOpenClassroomIE(InfoExtractor): |
|
|
|
except UnavailableVideoError, err: |
|
|
|
self._downloader.trouble(u'\nERROR: unable to download video') |
|
|
|
elif mobj.group('course'): # A course page |
|
|
|
unescapeHTML = HTMLParser.HTMLParser().unescape |
|
|
|
|
|
|
|
course = mobj.group('course') |
|
|
|
info = { |
|
|
|
'id': _simplify_title(course), |
|
|
@ -3946,8 +3987,6 @@ class StanfordOpenClassroomIE(InfoExtractor): |
|
|
|
assert entry['type'] == 'reference' |
|
|
|
self.extract(entry['url']) |
|
|
|
else: # Root page |
|
|
|
unescapeHTML = HTMLParser.HTMLParser().unescape |
|
|
|
|
|
|
|
info = { |
|
|
|
'id': 'Stanford OpenClassroom', |
|
|
|
'type': 'playlist', |
|
|
|