[closertotruth] Update and improve (Closes #8680)

9 years ago · cb23192bc4
--- a/youtube_dl/extractor/closertotruth.py
+++ b/youtube_dl/extractor/closertotruth.py
@@ -1,69 +1,92 @@
 # coding: utf-8
 from __future__ import unicode_literals

 import re

 from .common import InfoExtractor


 class CloserToTruthIE(InfoExtractor):
    """Extractor for closertotruth.com pages.

    The page is scraped for a Kaltura partner id and one or more Kaltura
    entry ids; the media itself is delegated to the Kaltura extractor via
    ``url_transparent`` results. Pages exposing a ``select-version``
    drop-down are returned as a playlist, other pages as a single entry.
    """

    # The last path component (without query string or fragment) is captured
    # as the id and is used as the display id.
    _VALID_URL = r'https?://(?:www\.)?closertotruth\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://closertotruth.com/series/solutions-the-mind-body-problem#video-3688',
        'info_dict': {
            'id': '0_zof1ktre',
            'display_id': 'solutions-the-mind-body-problem',
            'ext': 'mov',
            'title': 'Solutions to the Mind-Body Problem?',
            'upload_date': '20140221',
            'timestamp': 1392956007,
            'uploader_id': 'CTTXML',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/episodes/how-do-brains-work',
        'info_dict': {
            'id': '0_iuxai6g6',
            'display_id': 'how-do-brains-work',
            'ext': 'mov',
            'title': 'How do Brains Work?',
            'upload_date': '20140221',
            'timestamp': 1392956024,
            'uploader_id': 'CTTXML',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://closertotruth.com/interviews/1725',
        'info_dict': {
            'id': '1725',
            'title': 'AyaFr-002',
        },
        'playlist_mincount': 2,
    }]

    def _real_extract(self, url):
        """Return a playlist or a single ``url_transparent`` Kaltura entry.

        :param url: page URL matching ``_VALID_URL``.
        :returns: the result of ``self.playlist_result(...)`` when the page
            has a ``select-version`` drop-down with at least one usable
            option, otherwise a ``url_transparent`` info dict pointing at
            ``kaltura:<partner_id>:<entry_id>``.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The partner id is read from the src of a <script> tag, spelled
        # either ".../partner_id/<n>" or ".../p/<n>".
        partner_id = self._search_regex(
            r'<script[^>]+src=["\'].*?\b(?:partner_id|p)/(\d+)',
            webpage, 'kaltura partner_id')

        # Page title with the trailing " | <anything>" part removed.
        title = self._search_regex(
            r'<title>(.+?)\s*\|\s*.+?</title>', webpage, 'video title')

        # Optional drop-down listing several versions of the video.
        select = self._search_regex(
            r'(?s)<select[^>]+id="select-version"[^>]*>(.+?)</select>',
            webpage, 'select version', default=None)
        if select:
            entry_ids = set()
            entries = []
            # NOTE(review): options are matched against the whole page, not
            # only the captured <select> body -- confirm this is intended.
            for mobj in re.finditer(
                    r'<option[^>]+value=(["\'])(?P<id>[0-9a-z_]+)(?:#.+?)?\1[^>]*>(?P<title>[^<]+)',
                    webpage):
                entry_id = mobj.group('id')
                # The same entry id may appear in several options; keep only
                # its first occurrence.
                if entry_id in entry_ids:
                    continue
                entry_ids.add(entry_id)
                entries.append({
                    '_type': 'url_transparent',
                    'url': 'kaltura:%s:%s' % (partner_id, entry_id),
                    'ie_key': 'Kaltura',
                    'title': mobj.group('title'),
                })
            if entries:
                return self.playlist_result(entries, display_id, title)

        # Single-video page (or a drop-down without usable options): take the
        # entry id from the "embed-kaltura" anchor's data-kaltura attribute.
        entry_id = self._search_regex(
            r'<a[^>]+id=(["\'])embed-kaltura\1[^>]+data-kaltura=(["\'])(?P<id>[0-9a-z_]+)\2',
            webpage, 'kaltura entry_id', group='id')

        return {
            '_type': 'url_transparent',
            'display_id': display_id,
            'url': 'kaltura:%s:%s' % (partner_id, entry_id),
            'ie_key': 'Kaltura',
            'title': title,
        }
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -140,6 +140,7 @@ from .cliprs import ClipRsIE
 from .clipfish import ClipfishIE
 from .cliphunter import CliphunterIE
 from .clipsyndicate import ClipsyndicateIE
 from .closertotruth import CloserToTruthIE
 from .cloudy import CloudyIE
 from .clubic import ClubicIE
 from .clyp import ClypIE