@@ -18,6 +18,7 @@ from ..utils import (
     xpath_element,
     xpath_text,
     int_or_none,
+    merge_dicts,
     parse_duration,
     smuggle_url,
     ExtractorError,
@@ -129,64 +130,65 @@ class ITVIE(InfoExtractor):
         resp_env = self._download_xml(
             params['data-playlist-url'], video_id,
-            headers=headers, data=etree.tostring(req_env))
-        playlist = xpath_element(resp_env, './/Playlist')
-        if playlist is None:
-            fault_code = xpath_text(resp_env, './/faultcode')
-            fault_string = xpath_text(resp_env, './/faultstring')
-            if fault_code == 'InvalidGeoRegion':
-                self.raise_geo_restricted(
-                    msg=fault_string, countries=self._GEO_COUNTRIES)
-            elif fault_code not in (
-                    'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
-                raise ExtractorError(
-                    '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
-            info.update({
-                'title': self._og_search_title(webpage),
-                'episode_title': params.get('data-video-episode'),
-                'series': params.get('data-video-title'),
-            })
-        else:
-            title = xpath_text(playlist, 'EpisodeTitle', default=None)
-            info.update({
-                'title': title,
-                'episode_title': title,
-                'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
-                'series': xpath_text(playlist, 'ProgrammeTitle'),
-                'duration': parse_duration(xpath_text(playlist, 'Duration')),
-            })
-            video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
-            media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
-            rtmp_url = media_files.attrib['base']
+            headers=headers, data=etree.tostring(req_env), fatal=False)
+        if resp_env:
+            playlist = xpath_element(resp_env, './/Playlist')
+            if playlist is None:
+                fault_code = xpath_text(resp_env, './/faultcode')
+                fault_string = xpath_text(resp_env, './/faultstring')
+                if fault_code == 'InvalidGeoRegion':
+                    self.raise_geo_restricted(
+                        msg=fault_string, countries=self._GEO_COUNTRIES)
+                elif fault_code not in (
+                        'InvalidEntity', 'InvalidVodcrid', 'ContentUnavailable'):
+                    raise ExtractorError(
+                        '%s said: %s' % (self.IE_NAME, fault_string), expected=True)
+                info.update({
+                    'title': self._og_search_title(webpage),
+                    'episode_title': params.get('data-video-episode'),
+                    'series': params.get('data-video-title'),
+                })
+            else:
+                title = xpath_text(playlist, 'EpisodeTitle', default=None)
+                info.update({
+                    'title': title,
+                    'episode_title': title,
+                    'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
+                    'series': xpath_text(playlist, 'ProgrammeTitle'),
+                    'duration': parse_duration(xpath_text(playlist, 'Duration')),
+                })
+                video_element = xpath_element(playlist, 'VideoEntries/Video', fatal=True)
+                media_files = xpath_element(video_element, 'MediaFiles', fatal=True)
+                rtmp_url = media_files.attrib['base']
 
-        for media_file in media_files.findall('MediaFile'):
-            play_path = xpath_text(media_file, 'URL')
-            if not play_path:
-                continue
-            tbr = int_or_none(media_file.get('bitrate'), 1000)
-            f = {
-                'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
-                'play_path': play_path,
-                # Providing this swfVfy allows to avoid truncated downloads
-                'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
-                'page_url': url,
-                'tbr': tbr,
-                'ext': 'flv',
-            }
-            app = self._search_regex(
-                'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
-            if app:
-                f.update({
-                    'url': rtmp_url.split('?', 1)[0],
-                    'app': app,
-                })
-            else:
-                f['url'] = rtmp_url
-            formats.append(f)
+                for media_file in media_files.findall('MediaFile'):
+                    play_path = xpath_text(media_file, 'URL')
+                    if not play_path:
+                        continue
+                    tbr = int_or_none(media_file.get('bitrate'), 1000)
+                    f = {
+                        'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
+                        'play_path': play_path,
+                        # Providing this swfVfy allows to avoid truncated downloads
+                        'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+                        'page_url': url,
+                        'tbr': tbr,
+                        'ext': 'flv',
+                    }
+                    app = self._search_regex(
+                        'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+                    if app:
+                        f.update({
+                            'url': rtmp_url.split('?', 1)[0],
+                            'app': app,
+                        })
+                    else:
+                        f['url'] = rtmp_url
+                    formats.append(f)
 
-        for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
-            if caption_url.text:
-                extract_subtitle(caption_url.text)
+                for caption_url in video_element.findall('ClosedCaptioningURIs/URL'):
+                    if caption_url.text:
+                        extract_subtitle(caption_url.text)
 
         ios_playlist_url = params.get('data-video-playlist') or params.get('data-video-id')
         hmac = params.get('data-video-hmac')
@@ -261,7 +263,17 @@ class ITVIE(InfoExtractor):
             'formats': formats,
             'subtitles': subtitles,
         })
-        return info
+
+        webpage_info = self._search_json_ld(webpage, video_id, default={})
+        if not webpage_info.get('title'):
+            webpage_info['title'] = self._html_search_regex(
+                r'(?s)<h\d+[^>]+\bclass=["\'][^>]*episode-title["\'][^>]*>([^<]+)<',
+                webpage, 'title', default=None) or self._og_search_title(
+                webpage, default=None) or self._html_search_meta(
+                'twitter:title', webpage, 'title',
+                default=None) or webpage_info['episode']
+
+        return merge_dicts(info, webpage_info)
 
 
 class ITVBTCCIE(InfoExtractor):