From f3098c4d8abed9644af5e1b461c2cf5042113653 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Thu, 15 Sep 2011 10:43:49 +0200 Subject: [PATCH] --list-extractors (Closes #161) --- youtube-dl | 106 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 71 insertions(+), 35 deletions(-) diff --git a/youtube-dl b/youtube-dl index cd8e57b06..dbcf1c9fb 100755 --- a/youtube-dl +++ b/youtube-dl @@ -1086,6 +1086,7 @@ class YoutubeIE(InfoExtractor): '43': 'webm', '45': 'webm', } + IE_NAME = u'youtube' def report_lang(self): """Report attempt to set language.""" @@ -1359,6 +1360,7 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' _youtube_ie = None + IE_NAME = u'metacafe' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1497,6 +1499,7 @@ class DailymotionIE(InfoExtractor): """Information Extractor for Dailymotion""" _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^_/]+)_([^/]+)' + IE_NAME = u'dailymotion' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1587,6 +1590,7 @@ class GoogleIE(InfoExtractor): """Information extractor for video.google.com.""" _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*' + IE_NAME = u'video.google' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1693,6 +1697,7 @@ class PhotobucketIE(InfoExtractor): """Information extractor for photobucket.com.""" _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' + IE_NAME = u'photobucket' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1774,6 +1779,7 @@ class YahooIE(InfoExtractor): # _VPAGE_URL matches only the extractable '/watch/' URLs _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?' _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?' + IE_NAME = u'video.yahoo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -1926,6 +1932,7 @@ class VimeoIE(InfoExtractor): # _VALID_URL matches Vimeo URLs _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:groups/[^/]+/)?(?:videos?/)?([0-9]+)' + IE_NAME = u'vimeo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2036,7 +2043,8 @@ class VimeoIE(InfoExtractor): class GenericIE(InfoExtractor): """Generic last-resort information extractor.""" - _VALID_URL = '.*' + _VALID_URL = r'.*' + IE_NAME = u'generic' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2140,6 +2148,7 @@ class YoutubeSearchIE(InfoExtractor): _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None _max_youtube_results = 1000 + IE_NAME = u'youtube:search' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2228,6 +2237,7 @@ class GoogleSearchIE(InfoExtractor): _MORE_PAGES_INDICATOR = r'Next' _google_ie = None _max_google_results = 1000 + IE_NAME = u'video.google:search' def __init__(self, google_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2316,6 +2326,7 @@ class YahooSearchIE(InfoExtractor): _MORE_PAGES_INDICATOR = r'\s*Next' _yahoo_ie = None _max_yahoo_results = 1000 + IE_NAME = u'video.yahoo:search' def __init__(self, yahoo_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2404,6 +2415,7 @@ class YoutubePlaylistIE(InfoExtractor): _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _MORE_PAGES_INDICATOR = r'(?m)>\s*Next\s*' _youtube_ie = None + IE_NAME = u'youtube:playlist' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2478,6 +2490,7 @@ class YoutubeUserIE(InfoExtractor): _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d' _VIDEO_INDICATOR = r'/watch\?v=(.+?)&' _youtube_ie = None + IE_NAME = u'youtube:user' def __init__(self, youtube_ie, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2560,6 +2573,7 @@ class DepositFilesIE(InfoExtractor): """Information extractor for depositfiles.com""" _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles.com/(?:../(?#locale))?files/(.+)' + IE_NAME = u'DepositFiles' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2643,6 +2657,7 @@ class FacebookIE(InfoExtractor): 'highqual': 'mp4', 'lowqual': 'mp4', } + IE_NAME = u'facebook' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2852,6 +2867,7 @@ class BlipTVIE(InfoExtractor): _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$' _URL_EXT = r'^.*\.([a-z0-9]+)$' + IE_NAME = u'blip.tv' def report_extraction(self, file_id): """Report information extraction.""" @@ -2923,6 +2939,7 @@ class MyVideoIE(InfoExtractor): """Information Extractor for myvideo.de.""" _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*' + IE_NAME = u'myvideo' def __init__(self, downloader=None): InfoExtractor.__init__(self, downloader) @@ -2994,7 +3011,8 @@ class MyVideoIE(InfoExtractor): class ComedyCentralIE(InfoExtractor): """Information extractor for The Daily Show and Colbert Report """ - _VALID_URL = r'^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)(?Pthedailyshow|colbertnation)\.com/full-episodes/(?P.*)$' + _VALID_URL = r'^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport))|(https?://)?(www\.)?(?Pthedailyshow|colbertnation)\.com/full-episodes/(?P.*)$' + IE_NAME = u'comedycentral' def report_extraction(self, episode_id): self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id) @@ -3135,6 +3153,7 @@ class EscapistIE(InfoExtractor): """Information extractor for The Escapist """ _VALID_URL = r'^(https?://)?(www\.)escapistmagazine.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?].*$' + IE_NAME = u'escapist' def report_extraction(self, showName): self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName) @@ -3446,6 +3465,9 @@ def parseOpts(): general.add_option('--dump-user-agent', action='store_true', dest='dump_user_agent', help='display the current browser identification', default=False) + general.add_option('--list-extractors', + action='store_true', dest='list_extractors', + help='List all supported extractors and the URLs they would handle', default=False) selection.add_option('--playlist-start', dest='playliststart', metavar='NUMBER', help='playlist video to start at (default is 1)', default=1) @@ -3542,6 +3564,36 @@ def parseOpts(): return parser, opts, args +def gen_extractors(): + """ Return a list of an instance of every supported extractor. + The order does matter; the first extractor matched is the one handling the URL. + """ + youtube_ie = YoutubeIE() + google_ie = GoogleIE() + yahoo_ie = YahooIE() + return [ + youtube_ie, + MetacafeIE(youtube_ie), + DailymotionIE(), + YoutubePlaylistIE(youtube_ie), + YoutubeUserIE(youtube_ie), + YoutubeSearchIE(youtube_ie), + google_ie, + GoogleSearchIE(google_ie), + PhotobucketIE(), + yahoo_ie, + YahooSearchIE(yahoo_ie), + DepositFilesIE(), + FacebookIE(), + BlipTVIE(), + VimeoIE(), + MyVideoIE(), + ComedyCentralIE(), + EscapistIE(), + + GenericIE() + ] + def main(): parser, opts, args = parseOpts() @@ -3561,12 +3613,6 @@ def main(): print std_headers['User-Agent'] sys.exit(0) - # General configuration - cookie_processor = urllib2.HTTPCookieProcessor(jar) - opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()) - urllib2.install_opener(opener) - socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) - # Batch file verification batchurls = [] if opts.batchfile is not None: @@ -3582,6 +3628,23 @@ def main(): sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args + # General configuration + cookie_processor = urllib2.HTTPCookieProcessor(jar) + opener = urllib2.build_opener(urllib2.ProxyHandler(), cookie_processor, YoutubeDLHandler()) + urllib2.install_opener(opener) + socket.setdefaulttimeout(300) # 5 minutes should be enough (famous last words) + + extractors = gen_extractors() + + if opts.list_extractors: + for ie in extractors: + print(ie.IE_NAME) + matchedUrls = filter(lambda url: ie.suitable(url), all_urls) + all_urls = filter(lambda url: url not in matchedUrls, all_urls) + for mu in matchedUrls: + print(u' ' + mu) + sys.exit(0) + # Conflicting, missing and erroneous options if opts.usenetrc and (opts.username is not None or opts.password is not None): parser.error(u'using .netrc conflicts with giving username/password') @@ -3619,33 +3682,6 @@ def main(): if opts.audioformat not in ['best', 'aac', 'mp3']: parser.error(u'invalid audio format specified') - # Information extractors - youtube_ie = YoutubeIE() - google_ie = GoogleIE() - yahoo_ie = YahooIE() - extractors = [ # Order does matter - youtube_ie, - MetacafeIE(youtube_ie), - DailymotionIE(), - YoutubePlaylistIE(youtube_ie), - YoutubeUserIE(youtube_ie), - YoutubeSearchIE(youtube_ie), - google_ie, - GoogleSearchIE(google_ie), - PhotobucketIE(), - yahoo_ie, - YahooSearchIE(yahoo_ie), - DepositFilesIE(), - FacebookIE(), - BlipTVIE(), - VimeoIE(), - MyVideoIE(), - ComedyCentralIE(), - EscapistIE(), - - GenericIE() - ] - # File downloader fd = FileDownloader({ 'usenetrc': opts.usenetrc,