[mdr] Modernize and include kika.de

9 years ago · 2b1b2d83ca
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@ -274,7 +274,6 @@ from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
 from .kika import KikaIE
 from .keek import KeekIE
 from .kontrtube import KontrTubeIE
 from .krasview import KrasViewIE
--- a/youtube_dl/extractor/kika.py
+++ b/youtube_dl/extractor/kika.py
@ -1,101 +0,0 @@
 # coding: utf-8
 from __future__ import unicode_literals

 from .common import InfoExtractor
 from ..utils import ExtractorError


 class KikaIE(InfoExtractor):
    # Extractor for kika.de video/broadcast pages.
    #
    # The page embeds a reference to a ``videoNNN-avCustom.xml`` document;
    # that XML carries the title, optional broadcast description, duration
    # and the list of downloadable assets.
    _VALID_URL = r'https?://(?:www\.)?kika\.de/(?:[a-z-]+/)*(?:video|(?:einzel)?sendung)(?P<id>\d+).*'

    _TESTS = [
        {
            'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
            'md5': '4930515e36b06c111213e80d1e4aad0e',
            'info_dict': {
                'id': '19636',
                'ext': 'mp4',
                'title': 'Baumhaus vom 30. Oktober 2015',
                'description': None,
            },
        },
        {
            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
            'info_dict': {
                'id': '8182',
                'ext': 'mp4',
                'title': 'Beutolomäus und der geheime Weihnachtswunsch',
                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
            },
        },
        {
            'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
            'md5': '4930515e36b06c111213e80d1e4aad0e',
            'info_dict': {
                'id': '19636',
                'ext': 'mp4',
                'title': 'Baumhaus vom 30. Oktober 2015',
                'description': None,
            },
        },
        {
            'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
            'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
            'info_dict': {
                'id': '8182',
                'ext': 'mp4',
                'title': 'Beutolomäus und der geheime Weihnachtswunsch',
                'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
            },
        },
    ]

    @staticmethod
    def _kika_text(elem, tag):
        """Return the text of child ``tag`` of ``elem``; None if either is absent."""
        if elem is None:
            return None
        child = elem.find(tag)
        return None if child is None else child.text

    @classmethod
    def _kika_int(cls, elem, tag):
        """Return child ``tag`` of ``elem`` as an int; None if absent or not numeric."""
        try:
            return int(cls._kika_text(elem, tag))
        except (TypeError, ValueError):
            return None

    @staticmethod
    def _kika_duration(value):
        """Convert an ``mm:ss`` string to seconds; None if missing or malformed.

        The minutes part is not capped at 59 (e.g. ``78:42``), so it is
        simply multiplied by 60.
        """
        try:
            parts = value.split(':')
            return int(parts[0]) * 60 + int(parts[1])
        except (AttributeError, IndexError, ValueError):
            return None

    def _real_extract(self, url):
        """Extract metadata and progressive-download formats for ``url``.

        Returns an info dict with ``id``, ``title``, ``description``,
        ``formats`` and ``duration``; ``webpage_url`` is added when the XML
        provides one.

        Raises ExtractorError when the page references no video XML (the
        video is not online) or when the XML has no title.  Every other
        metadata field is optional and yields None / is omitted instead of
        aborting the extraction.
        """
        # broadcast_id may be the same as the video_id
        broadcast_id = self._match_id(url)
        webpage = self._download_webpage(url, broadcast_id)

        xml_re = r'sectionArticle[ "](?:(?!sectionA[ "])(?:.|\n))*?dataURL:\'(?:/[a-z-]+?)*?/video(\d+)-avCustom\.xml'
        video_id = self._search_regex(xml_re, webpage, "xml_url", default=None)
        if not video_id:
            err_msg = 'Video %s is not available online' % broadcast_id
            raise ExtractorError(err_msg, expected=True)

        xml_url = 'http://www.kika.de/video%s-avCustom.xml' % (video_id)
        xml_tree = self._download_xml(xml_url, video_id)

        # The title is the only mandatory metadata field.
        title = self._kika_text(xml_tree, 'title')
        if not title:
            raise ExtractorError(
                'Unable to extract title of video %s' % video_id)

        # Not available for all videos.
        description = self._kika_text(
            xml_tree.find('broadcast'), 'broadcastDescription')

        # duration string format is mm:ss (even if it is >= 1 hour, e.g. 78:42)
        duration = self._kika_duration(self._kika_text(xml_tree, 'duration'))

        formats = []
        assets = xml_tree.find('assets')
        for elem in (assets if assets is not None else []):
            video_url = self._kika_text(elem, 'progressiveDownloadUrl')
            if not video_url:
                # Nothing to download for this asset.
                continue
            media_type = self._kika_text(elem, 'mediaType')
            fmt = {
                'url': video_url,
                'ext': media_type.lower() if media_type else None,
                'format': self._kika_text(elem, 'profileName'),
                'width': self._kika_int(elem, 'frameWidth'),
                'height': self._kika_int(elem, 'frameHeight'),
                'abr': self._kika_int(elem, 'bitrateAudio'),
                'vbr': self._kika_int(elem, 'bitrateVideo'),
                'filesize': self._kika_int(elem, 'fileSize'),
            }
            # Leave unknown fields out entirely so downstream defaults
            # (e.g. deriving the extension from the URL) can apply.
            formats.append(
                dict((key, value) for key, value in fmt.items() if value is not None))
        self._sort_formats(formats)

        info = {
            'id': video_id,
            'title': title,
            'description': description,
            'formats': formats,
            'duration': duration,
        }
        webpage_url = self._kika_text(xml_tree, 'htmlUrl')
        if webpage_url:
            info['webpage_url'] = webpage_url
        return info
--- a/youtube_dl/extractor/mdr.py
+++ b/youtube_dl/extractor/mdr.py
@ -1,64 +1,154 @@
 # coding: utf-8
 from __future__ import unicode_literals

 import re

 from .common import InfoExtractor
 from ..compat import compat_urlparse
 from ..utils import (
    determine_ext,
    int_or_none,
    parse_duration,
    parse_iso8601,
    xpath_text,
 )


 class MDRIE(InfoExtractor):
    # NOTE(review): this span comes from a rendered diff of mdr.py, so removed
    # and added lines appear interleaved (e.g. two `_VALID_URL` assignments,
    # `_TEST = {` directly followed by `_TESTS = [{`, and two `doc =`
    # downloads in `_real_extract`). Read the older/newer lines as
    # alternatives, not as one runnable class.

    # Older pattern: mdr.de only, with named groups `domain`, `type`, `video_id`.
    _VALID_URL = r'^(?P<domain>https?://(?:www\.)?mdr\.de)/(?:.*)/(?P<type>video|audio)(?P<video_id>[^/_]+)(?:_|\.html)'
    IE_DESC = 'MDR.DE and KiKA'
    # Newer pattern: mdr.de and kika.de, with a single numeric `id` group.
    _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+(?P<id>\d+)(?:_.+?)?\.html'

    # No tests, MDR regularly deletes its videos
    _TEST = {
    _TESTS = [{
        # MDR regularly deletes its videos
        'url': 'http://www.mdr.de/fakt/video189002.html',
        'only_matching': True,
    }
    }, {
        'url': 'http://www.kika.de/baumhaus/videos/video19636.html',
        'md5': '4930515e36b06c111213e80d1e4aad0e',
        'info_dict': {
            'id': '19636',
            'ext': 'mp4',
            'title': 'Baumhaus vom 30. Oktober 2015',
            'duration': 134,
            'uploader': 'KIKA',
        },
    }, {
        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html',
        'md5': '5fe9c4dd7d71e3b238f04b8fdd588357',
        'info_dict': {
            'id': '8182',
            'ext': 'mp4',
            'title': 'Beutolomäus und der geheime Weihnachtswunsch',
            'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd',
            'timestamp': 1419047100,
            'upload_date': '20141220',
            'duration': 4628,
            'uploader': 'KIKA',
        },
    }, {
        'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html',
        'only_matching': True,
    }, {
        'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/einzelsendung2534.html',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        # Older variant: reads the `video_id`/`domain` groups, which only the
        # first `_VALID_URL` above defines.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('video_id')
        domain = m.group('domain')
        # Newer variant: presumably takes the `id` group of `_VALID_URL`
        # via the base-class helper -- confirm against InfoExtractor.
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        # Newer variant: find the site-relative path of the `-avCustom.xml`
        # metadata document; accepts either quote style around the value.
        data_url = self._search_regex(
            r'dataURL\s*:\s*(["\'])(?P<url>/.+/(?:video|audio)[0-9]+-avCustom\.xml)\1',
            webpage, 'data url', group='url')

        # Older variant: title scraped from the first <h1>/<h2>, XML path
        # taken from a single-quoted `dataURL:'...'`.
        title = self._html_search_regex(r'<h[12]>(.*?)</h[12]>', html, 'title')
        xmlurl = self._search_regex(
            r'dataURL:\'(/(?:.+)/(?:video|audio)[0-9]+-avCustom.xml)', html, 'XML URL')
        # Newer variant: resolve the relative path against the page URL.
        doc = self._download_xml(
            compat_urlparse.urljoin(url, data_url), video_id)

        # Prefer ./title; fall back to the broadcast name when it is absent.
        title = (xpath_text(doc, './title', 'title', default=None) or
                 xpath_text(doc, './broadcast/broadcastName', 'title'))

        # Older variant: XML URL built by concatenating the matched domain.
        doc = self._download_xml(domain + xmlurl, video_id)
        formats = []
        # Older format loop: progressive downloads only; every numeric node
        # is converted with int() directly.
        for a in doc.findall('./assets/asset'):
            url_el = a.find('./progressiveDownloadUrl')
            if url_el is None:
                continue
            abr = int(a.find('bitrateAudio').text) // 1000
            media_type = a.find('mediaType').text
            format = {
                'abr': abr,
                'filesize': int(a.find('fileSize').text),
                'url': url_el.text,
            }

            vbr_el = a.find('bitrateVideo')
            if vbr_el is None:
                # No video bitrate node: audio-only asset.
                format.update({
                    'vcodec': 'none',
                    'format_id': '%s-%d' % (media_type, abr),
                })
            else:
                vbr = int(vbr_el.text) // 1000
                format.update({
                    'vbr': vbr,
                    'width': int(a.find('frameWidth').text),
                    'height': int(a.find('frameHeight').text),
                    'format_id': '%s-%d' % (media_type, vbr),
                })
            formats.append(format)
        # Newer format loop: each asset may expose up to three URL kinds;
        # `processed_urls` ensures a URL seen before is not handled twice.
        processed_urls = []
        for asset in doc.findall('./assets/asset'):
            for source in (
                    'progressiveDownload',
                    'dynamicHttpStreamingRedirector',
                    'adaptiveHttpStreamingRedirector'):
                url_el = asset.find('./%sUrl' % source)
                if url_el is None:
                    continue

                video_url = url_el.text
                if video_url in processed_urls:
                    continue

                processed_urls.append(video_url)

                # NOTE(review): the second argument 1000 is presumably a scale
                # divisor (the older loop used `// 1000` on the same nodes) --
                # confirm against int_or_none.
                vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
                abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)

                url_formats = []

                # Choose the extraction path from the URL's extension.
                ext = determine_ext(url_el.text)
                if ext == 'm3u8':
                    url_formats = self._extract_m3u8_formats(
                        video_url, video_id, 'mp4', entry_protocol='m3u8_native',
                        preference=0, m3u8_id='HLS', fatal=False)
                elif ext == 'f4m':
                    url_formats = self._extract_f4m_formats(
                        video_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', video_id,
                        preference=0, f4m_id='HDS', fatal=False)
                else:
                    # Any other extension is handled as a single direct format.
                    media_type = xpath_text(asset, './mediaType', 'media type', default='MP4')
                    vbr = int_or_none(xpath_text(asset, './bitrateVideo', 'vbr'), 1000)
                    abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000)
                    filesize = int_or_none(xpath_text(asset, './fileSize', 'file size'))

                    f = {
                        'url': video_url,
                        'format_id': '%s-%d' % (media_type, vbr or abr),
                        'filesize': filesize,
                        'abr': abr,
                        'preference': 1,
                    }

                    # Dimensions are only looked up when a video bitrate exists.
                    if vbr:
                        width = int_or_none(xpath_text(asset, './frameWidth', 'width'))
                        height = int_or_none(xpath_text(asset, './frameHeight', 'height'))
                        f.update({
                            'vbr': vbr,
                            'width': width,
                            'height': height,
                        })

                    url_formats.append(f)

                # No video bitrate: mark every format from this URL as
                # audio-only and move any `tbr` value into `abr`.
                if not vbr:
                    for f in url_formats:
                        abr = f.get('tbr') or abr
                        if 'tbr' in f:
                            del f['tbr']
                        f.update({
                            'abr': abr,
                            'vcodec': 'none',
                        })

                if url_formats:
                    formats.extend(url_formats)
        self._sort_formats(formats)

        description = xpath_text(doc, './broadcast/broadcastDescription', 'description')
        # Use broadcastDate when present, otherwise broadcastStartDate.
        timestamp = parse_iso8601(
            xpath_text(doc, './broadcast/broadcastDate', 'timestamp', default=None) or
            xpath_text(doc, './broadcast/broadcastStartDate', 'timestamp', default=None))
        duration = parse_duration(xpath_text(doc, './duration', 'duration'))
        # The ./rights node is reported as the uploader.
        uploader = xpath_text(doc, './rights', 'uploader')

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'timestamp': timestamp,
            'duration': duration,
            'uploader': uploader,
            'formats': formats,
        }