Browse Source

[sportbox] Improve extraction, add support for matchtv.ru and fix video id (closes #17978)

master-ytdl-org
Sergey M․ 6 years ago
parent
commit
476cf548e1
No known key found for this signature in database GPG Key ID: 2C393E0F18A9236D
3 changed files with 37 additions and 26 deletions
  1. +1
    -1
      youtube_dl/extractor/extractors.py
  2. +3
    -3
      youtube_dl/extractor/generic.py
  3. +33
    -22
      youtube_dl/extractor/sportbox.py

+ 1
- 1
youtube_dl/extractor/extractors.py View File

@ -1043,7 +1043,7 @@ from .spike import (
) )
from .stitcher import StitcherIE from .stitcher import StitcherIE
from .sport5 import Sport5IE from .sport5 import Sport5IE
from .sportbox import SportBoxEmbedIE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE from .sportdeutschland import SportDeutschlandIE
from .springboardplatform import SpringboardPlatformIE from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE from .sprout import SproutIE


+ 3
- 3
youtube_dl/extractor/generic.py View File

@ -47,7 +47,7 @@ from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
from .tvc import TVCIE from .tvc import TVCIE
from .sportbox import SportBoxEmbedIE
from .sportbox import SportBoxIE
from .smotri import SmotriIE from .smotri import SmotriIE
from .myvi import MyviIE from .myvi import MyviIE
from .condenast import CondeNastIE from .condenast import CondeNastIE
@ -2636,9 +2636,9 @@ class GenericIE(InfoExtractor):
return self.url_result(tvc_url, 'TVC') return self.url_result(tvc_url, 'TVC')
# Look for embedded SportBox player # Look for embedded SportBox player
sportbox_urls = SportBoxEmbedIE._extract_urls(webpage)
sportbox_urls = SportBoxIE._extract_urls(webpage)
if sportbox_urls: if sportbox_urls:
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie='SportBoxEmbed')
return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())
# Look for embedded XHamster player # Look for embedded XHamster player
xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) xhamster_urls = XHamsterEmbedIE._extract_urls(webpage)


+ 33
- 22
youtube_dl/extractor/sportbox.py View File

@ -8,20 +8,24 @@ from ..utils import (
determine_ext, determine_ext,
int_or_none, int_or_none,
js_to_json, js_to_json,
merge_dicts,
) )
class SportBoxEmbedIE(InfoExtractor):
_VALID_URL = r'https?://news\.sportbox\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
class SportBoxIE(InfoExtractor):
_VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)'
_TESTS = [{ _TESTS = [{
'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'url': 'http://news.sportbox.ru/vdl/player/ci/211355',
'info_dict': { 'info_dict': {
'id': '211355',
'id': '109158',
'ext': 'mp4', 'ext': 'mp4',
'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»', 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
'description': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
'thumbnail': r're:^https?://.*\.jpg$', 'thumbnail': r're:^https?://.*\.jpg$',
'duration': 292, 'duration': 292,
'view_count': int, 'view_count': int,
'timestamp': 1426237001,
'upload_date': '20150313',
}, },
'params': { 'params': {
# m3u8 download # m3u8 download
@ -33,12 +37,18 @@ class SportBoxEmbedIE(InfoExtractor):
}, { }, {
'url': 'https://news.sportbox.ru/vdl/player/media/193095', 'url': 'https://news.sportbox.ru/vdl/player/media/193095',
'only_matching': True, 'only_matching': True,
}, {
'url': 'https://news.sportbox.ru/vdl/player/media/109158',
'only_matching': True,
}, {
'url': 'https://matchtv.ru/vdl/player/media/109158',
'only_matching': True,
}] }]
@staticmethod @staticmethod
def _extract_urls(webpage): def _extract_urls(webpage):
return re.findall( return re.findall(
r'<iframe[^>]+src="(https?://news\.sportbox\.ru/vdl/player[^"]+)"',
r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"',
webpage) webpage)
def _real_extract(self, url): def _real_extract(self, url):
@ -46,22 +56,14 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id) webpage = self._download_webpage(url, video_id)
wjplayer_data = self._parse_json(
self._search_regex(
r'(?s)var\s+playerOptions\s*=\s*({.+?});', webpage, 'wjplayer settings'),
video_id, transform_source=js_to_json)
wjplayer_data['sources'] = self._parse_json(
sources = self._parse_json(
self._search_regex( self._search_regex(
r'(?s)playerOptions\.sources\s*=\s*(\[.+?\]);', webpage, 'wjplayer sources'),
r'(?s)playerOptions\.sources(?:WithRes)?\s*=\s*(\[.+?\])\s*;\s*\n',
webpage, 'sources'),
video_id, transform_source=js_to_json) video_id, transform_source=js_to_json)
title = self._html_search_meta(
['og:title', 'twitter:title'], webpage) or self._html_search_regex(
r'<title>(.+?)</title>', webpage, 'title', fatal=False) or video_id
formats = [] formats = []
for source in wjplayer_data['sources']:
for source in sources:
src = source.get('src') src = source.get('src')
if not src: if not src:
continue continue
@ -75,14 +77,23 @@ class SportBoxEmbedIE(InfoExtractor):
}) })
self._sort_formats(formats) self._sort_formats(formats)
player = self._parse_json(
self._search_regex(
r'(?s)playerOptions\s*=\s*({.+?})\s*;\s*\n', webpage,
'player options', default='{}'),
video_id, transform_source=js_to_json)
media_id = player['mediaId']
info = self._search_json_ld(webpage, media_id, default={})
view_count = int_or_none(self._search_regex( view_count = int_or_none(self._search_regex(
r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None)) r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
return {
'id': video_id,
'title': title,
'thumbnail': wjplayer_data.get('poster'),
'duration': int_or_none(wjplayer_data.get('duration')),
return merge_dicts(info, {
'id': media_id,
'title': self._og_search_title(webpage, default=None) or media_id,
'thumbnail': player.get('poster'),
'duration': int_or_none(player.get('duration')),
'view_count': view_count, 'view_count': view_count,
'formats': formats, 'formats': formats,
}
})

Loading…
Cancel
Save