Merge pull request #6428 from dstftw/improve-generic-smil-support

Improve generic SMIL support
10 years ago · d5d7bdaeb5
--- a/test/helper.py
+++ b/test/helper.py
@ -133,8 +133,8 @@ def expect_info_dict(self, got_dict, expected_dict):
            elif isinstance(expected, compat_str) and expected.startswith('mincount:'):
                got = got_dict.get(info_field)
                self.assertTrue(
                    isinstance(got, list),
                    'Expected field %s to be a list, but it is of type %s' % (
                    isinstance(got, (list, dict)),
                    'Expected field %s to be a list or a dict, but it is of type %s' % (
                        info_field, type(got).__name__))
                expected_num = int(expected.partition(':')[2])
                assertGreaterEqual(
--- a/test/test_download.py
+++ b/test/test_download.py
@ -136,7 +136,9 @@ def generator(test_case):
                    # We're not using .download here since that is just a shim
                    # for outside error handling, and returns the exit code
                    # instead of the result dict.
                    res_dict = ydl.extract_info(test_case['url'])
                    res_dict = ydl.extract_info(
                        test_case['url'],
                        force_generic_extractor=params.get('force_generic_extractor', False))
                except (DownloadError, ExtractorError) as err:
                    # Check if the exception is not a network related one
                    if not err.exc_info[0] in (compat_urllib_error.URLError, socket.timeout, UnavailableVideoError, compat_http_client.BadStatusLine) or (err.exc_info[0] == compat_HTTPError and err.exc_info[1].code == 503):
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@ -18,6 +18,7 @@ from ..compat import (
    compat_HTTPError,
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse,
    compat_urllib_parse_urlparse,
    compat_urllib_request,
    compat_urlparse,
@ -37,6 +38,7 @@ from ..utils import (
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
    url_basename,
 )


@ -978,69 +980,167 @@ class InfoExtractor(object):
        self._sort_formats(formats)
        return formats

    # TODO: improve extraction
    def _extract_smil_formats(self, smil_url, video_id, fatal=True):
        smil = self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)
    @staticmethod
    def _xpath_ns(path, namespace=None):
        if not namespace:
            return path
        out = []
        for c in path.split('/'):
            if not c or c == '.':
                out.append(c)
            else:
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)

    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        if smil is False:
            assert not fatal
            return []

        base = smil.find('./head/meta').get('base')
        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)

    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        if smil is False:
            return {}
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)

    def _download_smil(self, smil_url, video_id, fatal=True):
        return self._download_xml(
            smil_url, video_id, 'Downloading SMIL file',
            'Unable to download SMIL file', fatal=fatal)

    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Build an info dict from an already parsed SMIL document.

        Formats and subtitles come from _parse_smil_formats and
        _parse_smil_subtitles.  The resulting id is always derived from the
        basename of smil_url (without extension); the video_id argument is
        only passed on to the format parser.  Title and description are
        taken from the first matching <meta> elements, with the title
        falling back to the id.
        """
        ns = self._parse_smil_namespace(smil)

        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=ns, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=ns)

        basename = url_basename(smil_url)
        video_id = os.path.splitext(basename)[0]

        title = description = None
        for meta in smil.findall(self._xpath_ns('./head/meta', ns)):
            attrs = meta.attrib
            name, content = attrs.get('name'), attrs.get('content')
            if not (name and content):
                continue
            # Only the first non-empty value of each kind is kept.
            if name == 'title':
                title = title or content
            elif name in ('description', 'abstract'):
                description = description or content

        info = {
            'id': video_id,
            'title': title or video_id,
            'description': description,
            'formats': formats,
            'subtitles': subtitles,
        }
        return info

    def _parse_smil_namespace(self, smil):
        return self._search_regex(
            r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)

    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
        base = smil_url
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
            if b:
                base = b
                break

        formats = []
        rtmp_count = 0
        if smil.findall('./body/seq/video'):
            video = smil.findall('./body/seq/video')[0]
            fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
            formats.extend(fmts)
        else:
            for video in smil.findall('./body/switch/video'):
                fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
                formats.extend(fmts)
        http_count = 0

        videos = smil.findall(self._xpath_ns('.//video', namespace))
        for video in videos:
            src = video.get('src')
            if not src:
                continue

            bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
            filesize = int_or_none(video.get('size') or video.get('fileSize'))
            width = int_or_none(video.get('width'))
            height = int_or_none(video.get('height'))
            proto = video.get('proto')
            ext = video.get('ext')
            src_ext = determine_ext(src)
            streamer = video.get('streamer') or base

            if proto == 'rtmp' or streamer.startswith('rtmp'):
                rtmp_count += 1
                formats.append({
                    'url': streamer,
                    'play_path': src,
                    'ext': 'flv',
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)

            if proto == 'm3u8' or src_ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls'))
                continue

            if src_ext == 'f4m':
                f4m_url = src_url
                if not f4m_params:
                    f4m_params = {
                        'hdcore': '3.2.0',
                        'plugin': 'flowplayer-3.2.0.1',
                    }
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse.urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
                continue

            if src_url.startswith('http'):
                http_count += 1
                formats.append({
                    'url': src_url,
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'tbr': bitrate,
                    'filesize': filesize,
                    'width': width,
                    'height': height,
                })
                continue

        self._sort_formats(formats)

        return formats

    def _parse_smil_video(self, video, video_id, base, rtmp_count):
        src = video.get('src')
        if not src:
            return [], rtmp_count
        bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
        width = int_or_none(video.get('width'))
        height = int_or_none(video.get('height'))
        proto = video.get('proto')
        if not proto:
            if base:
                if base.startswith('rtmp'):
                    proto = 'rtmp'
                elif base.startswith('http'):
                    proto = 'http'
        ext = video.get('ext')
        if proto == 'm3u8':
            return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
        elif proto == 'rtmp':
            rtmp_count += 1
            streamer = video.get('streamer') or base
            return ([{
                'url': streamer,
                'play_path': src,
                'ext': 'flv',
                'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                'tbr': bitrate,
                'width': width,
                'height': height,
            }], rtmp_count)
        elif proto.startswith('http'):
            return ([{
                'url': base + src,
                'ext': ext or 'flv',
                'tbr': bitrate,
                'width': width,
                'height': height,
            }], rtmp_count)
    def _parse_smil_subtitles(self, smil, namespace=None):
        subtitles = {}
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src:
                continue
            ext = textstream.get('ext') or determine_ext(src)
            if not ext:
                type_ = textstream.get('type')
                if type_ == 'text/srt':
                    ext = 'srt'
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
            subtitles.setdefault(lang, []).append({
                'url': src,
                'ext': ext,
            })
        return subtitles

    def _live_title(self, name):
        """ Generate the title for a live video """
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@ -130,6 +130,74 @@ class GenericIE(InfoExtractor):
                'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
            }
        },
        # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
        {
            'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
            'info_dict': {
                'id': 'smil',
                'ext': 'mp4',
                'title': 'Automatics, robotics and biocybernetics',
                'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
                'formats': 'mincount:16',
                'subtitles': 'mincount:1',
            },
            'params': {
                'force_generic_extractor': True,
                'skip_download': True,
            },
        },
        # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
        {
            'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
            'info_dict': {
                'id': 'hds',
                'ext': 'flv',
                'title': 'hds',
                'formats': 'mincount:1',
            },
            'params': {
                'skip_download': True,
            },
        },
        # SMIL from https://www.restudy.dk/video/play/id/1637
        {
            'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
            'info_dict': {
                'id': 'video_1637',
                'ext': 'flv',
                'title': 'video_1637',
                'formats': 'mincount:3',
            },
            'params': {
                'skip_download': True,
            },
        },
        # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
        {
            'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
            'info_dict': {
                'id': 'smil-service',
                'ext': 'flv',
                'title': 'smil-service',
                'formats': 'mincount:1',
            },
            'params': {
                'skip_download': True,
            },
        },
        # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
        {
            'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
            'info_dict': {
                'id': '4719370',
                'ext': 'mp4',
                'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
                'formats': 'mincount:3',
            },
            'params': {
                'skip_download': True,
            },
        },
        # google redirect
        {
            'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@ -1123,11 +1191,13 @@ class GenericIE(InfoExtractor):

        self.report_extraction(video_id)

        # Is it an RSS feed?
        # Is it an RSS feed or a SMIL file?
        try:
            doc = parse_xml(webpage)
            if doc.tag == 'rss':
                return self._extract_rss(url, video_id, doc)
            elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
                return self._parse_smil(doc, url, video_id)
        except compat_xml_parse_error:
            pass

--- a/youtube_dl/extractor/videolecturesnet.py
+++ b/youtube_dl/extractor/videolecturesnet.py
@ -12,7 +12,7 @@ from ..utils import (


 class VideoLecturesNetIE(InfoExtractor):
    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)/'
    _VALID_URL = r'http://(?:www\.)?videolectures\.net/(?P<id>[^/#?]+)(?:/?[#?].*)?$'
    IE_NAME = 'videolectures.net'

    _TEST = {