Improve subtitles support

For each language the extractor builds a list of the available formats, sorted by preference (as is done for video formats); YoutubeDL then selects one of them using the '--sub-format' option, which now accepts a list of format preferences (for example 'ass/srt/best'). For each format the 'url' field can be set so that the contents are only downloaded if needed; if the contents need to be processed (as in crunchyroll), the 'data' field can be used instead. The reasons for this change are: * We weren't checking that the format given with '--sub-format' was available, and checking it in each extractor would be repetitive. * It makes it easy to support giving a format preference. * The subtitles were automatically downloaded in the extractor, but I think that if you use, for example, the '--dump-json' option you want to finish as fast as possible. Currently only the ted extractor has been updated, but the old system still works.
10 years ago · a504ced097
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@ -27,15 +27,23 @@ class BaseTestSubtitles(unittest.TestCase):
    def setUp(self):
        self.DL = FakeYDL()
        self.ie = self.IE(self.DL)
        self.ie = self.IE()
        self.DL.add_info_extractor(self.ie)
    def getInfoDict(self):
        info_dict = self.ie.extract(self.url)
        info_dict = self.DL.extract_info(self.url, download=False)
        return info_dict
    def getSubtitles(self):
        info_dict = self.getInfoDict()
        return info_dict['subtitles']
        subtitles = info_dict['subtitles']
        if not subtitles:
            return subtitles
        for sub_info in subtitles.values():
            if sub_info.get('data') is None:
                uf = self.DL.urlopen(sub_info['url'])
                sub_info['data'] = uf.read().decode('utf-8')
        return dict((l, sub_info['data']) for l, sub_info in subtitles.items())
 class TestYoutubeSubtitles(BaseTestSubtitles):
@ -176,7 +184,7 @@ class TestTedSubtitles(BaseTestSubtitles):
    def test_no_writesubtitles(self):
        subtitles = self.getSubtitles()
        self.assertEqual(subtitles, None)
        self.assertFalse(subtitles)
    def test_subtitles(self):
        self.DL.params['writesubtitles'] = True
@ -196,18 +204,10 @@ class TestTedSubtitles(BaseTestSubtitles):
        self.assertTrue(len(subtitles.keys()) >= 28)
    def test_list_subtitles(self):
        # With 'listsubtitles' enabled, extraction is expected to yield no
        # info dict at all (the result is compared against None).
        downloader = self.DL
        downloader.expect_warning('Automatic Captions not supported by this server')
        downloader.params['listsubtitles'] = True
        self.assertEqual(self.getInfoDict(), None)
    def test_automatic_captions(self):
        # Requesting automatic captions for 'en' is expected to produce an
        # empty subtitles mapping.
        downloader = self.DL
        downloader.expect_warning('Automatic Captions not supported by this server')
        options = downloader.params
        options['writeautomaticsub'] = True
        options['subtitleslang'] = ['en']
        found = self.getSubtitles()
        self.assertTrue(len(found.keys()) == 0)
    def test_multiple_langs(self):
        self.DL.params['writesubtitles'] = True
        langs = ['es', 'fr', 'de']
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@ -154,7 +154,7 @@ class YoutubeDL(object):
    allsubtitles:      Downloads all the subtitles of the video
                       (requires writesubtitles or writeautomaticsub)
    listsubtitles:     Lists all available subtitles for the video
    subtitlesformat:   Subtitle format [srt/sbv/vtt] (default=srt)
    subtitlesformat:   The format code for subtitles
    subtitleslangs:    List of languages of the subtitles to download
    keepvideo:         Keep the video file after post-processing
    daterange:         A DateRange object, download only if the upload_date is in the range.
@ -1019,6 +1019,11 @@ class YoutubeDL(object):
                info_dict['timestamp'])
            info_dict['upload_date'] = upload_date.strftime('%Y%m%d')
        if self.params.get('listsubtitles', False):
            self.list_subtitles(info_dict['id'], info_dict.get('subtitles'))
            return
        info_dict['subtitles'] = self.process_subtitles(info_dict['id'], info_dict.get('subtitles'))
        # These extractors handle format selection themselves
        if info_dict['extractor'] in ['Youku']:
            if download:
@ -1147,6 +1152,53 @@ class YoutubeDL(object):
        info_dict.update(formats_to_download[-1])
        return info_dict
    def process_subtitles(self, video_id, available_subs):
        """Select the requested subtitles and their format.

        available_subs maps each language code either to a list of format
        dicts (each carrying an 'ext' entry), ordered from lower to higher
        preference, or -- for extractors still using the old system -- to a
        string holding the subtitle contents.

        The languages come from the 'allsubtitles' / 'subtitleslangs' params,
        falling back to 'en' and then to the first available language. The
        format comes from the 'subtitlesformat' param, a '/'-separated
        preference list such as 'ass/srt/best'.

        Returns a dict {language: selected_format_dict}. When available_subs
        is empty or None it is returned unchanged. Languages that are not
        available, or that have an empty format list, are skipped with a
        warning.
        """
        if not available_subs:
            return available_subs

        if self.params.get('allsubtitles', False):
            requested_langs = available_subs.keys()
        elif self.params.get('subtitleslangs', False):
            requested_langs = self.params.get('subtitleslangs')
        elif 'en' in available_subs:
            requested_langs = ['en']
        else:
            requested_langs = [list(available_subs.keys())[0]]

        formats_query = self.params.get('subtitlesformat', 'best')
        formats_preference = formats_query.split('/') if formats_query else []

        subs = {}
        for lang in requested_langs:
            formats = available_subs.get(lang)
            if formats is None:
                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
                continue
            if isinstance(formats, compat_str):
                # TODO: convert all IE with subtitles support to the new format
                # and remove this
                subs[lang] = {
                    # An empty format query leaves no preference to read the
                    # extension from; fall back to the option's default value
                    # instead of raising IndexError.
                    'ext': formats_preference[0] if formats_preference else 'best',
                    'data': formats,
                }
                continue
            if not formats:
                # An empty format list offers nothing to choose from and
                # formats[-1] below would raise IndexError.
                self.report_warning('%s subtitles not available for %s' % (lang, video_id))
                continue
            for ext in formats_preference:
                if ext == 'best':
                    # The list is ordered worst to best, so the last is best.
                    selected = formats[-1]
                    break
                matches = [fmt for fmt in formats if fmt['ext'] == ext]
                if matches:
                    selected = matches[-1]
                    break
            else:
                # No preference matched (or none was given): use the best
                # available format and tell the user about it.
                selected = formats[-1]
                self.report_warning(
                    'No subtitle format found matching "%s" for language %s, '
                    'using %s' % (formats_query, lang, selected['ext']))
            subs[lang] = selected
        return subs
    def process_info(self, info_dict):
        """Process a single resolved IE result."""
@ -1253,11 +1305,18 @@ class YoutubeDL(object):
            # subtitles download errors are already managed as troubles in relevant IE
            # that way it will silently go on when used with unsupporting IE
            subtitles = info_dict['subtitles']
            sub_format = self.params.get('subtitlesformat', 'srt')
            for sub_lang in subtitles.keys():
                sub = subtitles[sub_lang]
                if sub is None:
                    continue
            for sub_lang, sub_info in subtitles.items():
                sub_format = sub_info['ext']
                if sub_info.get('data') is not None:
                    sub_data = sub_info['data']
                else:
                    try:
                        uf = self.urlopen(sub_info['url'])
                        sub_data = uf.read().decode('utf-8')
                    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                        self.report_warning('Unable to download subtitle for "%s": %s' %
                                            (sub_lang, compat_str(err)))
                        continue
                try:
                    sub_filename = subtitles_filename(filename, sub_lang, sub_format)
                    if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(sub_filename)):
@ -1265,7 +1324,7 @@ class YoutubeDL(object):
                    else:
                        self.to_screen('[info] Writing video subtitles to: ' + sub_filename)
                        with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile:
                            subfile.write(sub)
                            subfile.write(sub_data)
                except (OSError, IOError):
                    self.report_error('Cannot write subtitles file ' + sub_filename)
                    return
@ -1586,6 +1645,18 @@ class YoutubeDL(object):
            ['ID', 'width', 'height', 'URL'],
            [[t['id'], t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]))
    def list_subtitles(self, video_id, subtitles):
        """Print the available subtitle formats of a video, per language.

        subtitles maps each language to its list of format dicts; the 'ext'
        values are printed in reverse list order. When subtitles is empty or
        None a short "no subtitles" notice is printed instead.
        """
        if subtitles:
            rows = []
            for lang, formats in subtitles.items():
                extensions = [f['ext'] for f in reversed(formats)]
                rows.append('%-12s%s' % (lang, ', '.join(extensions)))
            message = 'Available subtitles for %s:\n%s\n%s' % (
                video_id, 'Language    formats', '\n'.join(rows))
        else:
            message = '%s has no subtitles' % video_id
        self.to_screen(message)
    def urlopen(self, req):
        """ Start an HTTP download """
--- a/youtube_dl/init.py
+++ b/youtube_dl/init.py
@ -226,7 +226,6 @@ def _real_main(argv=None):
    if opts.embedsubtitles:
        postprocessors.append({
            'key': 'FFmpegEmbedSubtitle',
            'subtitlesformat': opts.subtitlesformat,
        })
    if opts.xattrs:
        postprocessors.append({'key': 'XAttrMetadata'})
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -151,8 +151,14 @@ class InfoExtractor(object):
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location where the video was filmed.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    subtitles:      The available subtitles as a dictionary in the format
                    {language: subformats}. "subformats" is a list sorted from
                    lower to higher preference, each element is a dictionary
                    with the "ext" entry and one of:
                        * "data": The subtitles file contents
                        * "url": A url pointing to the subtitles file
                    Note: YoutubeDL.extract_info will get the requested
                    format and replace the "subformats" list with it.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
@ -993,6 +999,16 @@ class InfoExtractor(object):
            any_restricted = any_restricted or is_restricted
        return not any_restricted
    def extract_subtitles(self, *args, **kwargs):
        """Return the subtitles from _get_subtitles, if they were requested.

        _get_subtitles is only called (with the same arguments) when the
        downloader's 'writesubtitles' or 'listsubtitles' param is set;
        otherwise an empty dict is returned.
        """
        params = self._downloader.params
        listing = params.get('listsubtitles')
        writing = params.get('writesubtitles', False)
        if not (writing or listing):
            return {}
        found = {}
        found.update(self._get_subtitles(*args, **kwargs))
        return found
    def _get_subtitles(self, *args, **kwargs):
        raise NotImplementedError("This method must be implemented by subclasses")
 class SearchInfoExtractor(InfoExtractor):
    """
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@ -3,14 +3,14 @@ from __future__ import unicode_literals
 import json
 import re
 from .subtitles import SubtitlesInfoExtractor
 from .common import InfoExtractor
 from ..compat import (
    compat_str,
 )
 class TEDIE(SubtitlesInfoExtractor):
 class TEDIE(InfoExtractor):
    _VALID_URL = r'''(?x)
        (?P<proto>https?://)
        (?P<type>www|embed(?:-ssl)?)(?P<urlmain>\.ted\.com/
@ -165,9 +165,6 @@ class TEDIE(SubtitlesInfoExtractor):
        video_id = compat_str(talk_info['id'])
        # subtitles
        video_subtitles = self.extract_subtitles(video_id, talk_info)
        if self._downloader.params.get('listsubtitles', False):
            self._list_available_subtitles(video_id, talk_info)
            return
        thumbnail = talk_info['thumb']
        if not thumbnail.startswith('http'):
@ -183,13 +180,18 @@ class TEDIE(SubtitlesInfoExtractor):
            'duration': talk_info.get('duration'),
        }
    def _get_available_subtitles(self, video_id, talk_info):
    def _get_subtitles(self, video_id, talk_info):
        languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
        if languages:
            sub_lang_list = {}
            for l in languages:
                url = 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/srt' % (video_id, l)
                sub_lang_list[l] = url
                sub_lang_list[l] = [
                    {
                        'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
                        'ext': ext,
                    }
                    for ext in ['ted', 'srt']
                ]
            return sub_lang_list
        else:
            self._downloader.report_warning('video doesn\'t have subtitles')
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@ -387,8 +387,8 @@ def parseOpts(overrideArguments=None):
        help='lists all available subtitles for the video')
    subtitles.add_option(
        '--sub-format',
        action='store', dest='subtitlesformat', metavar='FORMAT', default='srt',
        help='subtitle format (default=srt) ([sbv/vtt] youtube only)')
        action='store', dest='subtitlesformat', metavar='FORMAT', default='best',
        help='subtitle format, accepts formats preference, for example: "ass/srt/best"')
    subtitles.add_option(
        '--sub-lang', '--sub-langs', '--srt-lang',
        action='callback', dest='subtitleslangs', metavar='LANGS', type='str',
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@ -453,10 +453,6 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
        'zu': 'zul',
    }
    def __init__(self, downloader=None, subtitlesformat='srt'):
        """Set up the post-processor and remember the subtitle format.

        downloader is passed on to the parent class initialiser.
        subtitlesformat is stored as self._subformat, which this class uses
        as the extension when building the subtitle filenames to embed.
        """
        super(FFmpegEmbedSubtitlePP, self).__init__(downloader)
        self._subformat = subtitlesformat
    @classmethod
    def _conver_lang_code(cls, code):
        """Convert language code from ISO 639-1 to ISO 639-2/T"""
@ -472,7 +468,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
        sub_langs = [key for key in information['subtitles']]
        filename = information['filepath']
        input_files = [filename] + [subtitles_filename(filename, lang, self._subformat) for lang in sub_langs]
        input_files = [filename] + [subtitles_filename(filename, lang, sub_info['ext']) for lang, sub_info in information['subtitles'].items()]
        opts = [
            '-map', '0',