Browse Source

Merge branch 'brightcove_in_page_embed' of https://github.com/remitamine/youtube-dl into remitamine-brightcove_in_page_embed

totalwebcasting
Sergey M․ 9 years ago
parent
commit
a2973eb597
4 changed files with 128 additions and 2 deletions
  1. +4
    -1
      youtube_dl/extractor/__init__.py
  2. +95
    -0
      youtube_dl/extractor/brightcove.py
  3. +20
    -1
      youtube_dl/extractor/generic.py
  4. +9
    -0
      youtube_dl/utils.py

+ 4
- 1
youtube_dl/extractor/__init__.py View File

@ -60,7 +60,10 @@ from .bloomberg import BloombergIE
from .bpb import BpbIE from .bpb import BpbIE
from .br import BRIE from .br import BRIE
from .breakcom import BreakIE from .breakcom import BreakIE
from .brightcove import BrightcoveIE
from .brightcove import (
BrightcoveIE,
BrightcoveInPageEmbedIE,
)
from .buzzfeed import BuzzFeedIE from .buzzfeed import BuzzFeedIE
from .byutv import BYUtvIE from .byutv import BYUtvIE
from .c56 import C56IE from .c56 import C56IE


+ 95
- 0
youtube_dl/extractor/brightcove.py View File

@ -22,6 +22,10 @@ from ..utils import (
fix_xml_ampersands, fix_xml_ampersands,
unescapeHTML, unescapeHTML,
unsmuggle_url, unsmuggle_url,
js_to_json,
int_or_none,
parse_iso8601,
extract_attributes,
) )
@ -346,3 +350,94 @@ class BrightcoveIE(InfoExtractor):
if 'url' not in info and not info.get('formats'): if 'url' not in info and not info.get('formats'):
raise ExtractorError('Unable to extract video url for %s' % info['id']) raise ExtractorError('Unable to extract video url for %s' % info['id'])
return info return info
class BrightcoveInPageEmbedIE(InfoExtractor):
    """Extractor for Brightcove in-page embed players (players.brightcove.net).

    The player's ``index.min.js`` is fetched to obtain the policy key, which
    is then sent in the ``Accept`` header of a request to the Brightcove
    playback API to retrieve the video metadata and sources.
    """

    # The literal '.' and '?' of "index.html?" are escaped so that they match
    # only those characters (unescaped, '?' would make the 'l' optional).
    _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/([a-z0-9-]+)_([a-z]+)/index\.html\?.*videoId=(?P<video_id>\d+)'
    _TEST = {
        'url': 'http://players.brightcove.net/929656772001/e41d32dc-ec74-459e-a845-6c69f7b724ea_default/index.html?videoId=4463358922001',
        'md5': 'c8100925723840d4b0d243f7025703be',
        'info_dict': {
            'id': '4463358922001',
            'ext': 'mp4',
            'title': 'Meet the man behind Popcorn Time',
            'description': 'md5:eac376a4fe366edc70279bfb681aea16',
            'timestamp': 1441391203,
            'upload_date': '20150904',
            'duration': 165768,
            'uploader_id': '929656772001',
        }
    }

    @staticmethod
    def _extract_url(webpage):
        """Build a player URL from the first <video> element in webpage.

        Returns the players.brightcove.net URL when the element's opening tag
        carries all of data-account, data-player, data-embed and
        data-video-id; returns None otherwise.
        """
        video_element = re.search(
            r'(?s)<video([^>]*)>.*?</(?:video|audio)>', webpage)
        if not video_element:
            return None
        # Only the opening tag's attributes (group 1) are parsed, so data-*
        # attributes of elements nested inside <video> cannot be mistaken for
        # the player's own.
        video_attributes = extract_attributes(
            video_element.group(1),
            r'(?s)\s*data-(account|video-id|playlist-id|policy-key|player|embed)\s*=\s*["\']([^"\']+)["\']')
        account_id = video_attributes.get('account')
        player_id = video_attributes.get('player')
        embed = video_attributes.get('embed')
        video_id = video_attributes.get('video-id')
        if account_id and player_id and embed and video_id:
            return 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
                account_id, player_id, embed, video_id)
        return None

    def _real_extract(self, url):
        """Return the info dict for the video referenced by a player URL."""
        # Groups come back in pattern order: account, player, embed, video.
        account_id, player_id, embed, video_id = re.match(
            self._VALID_URL, url).groups()

        # The player script holds the catalog() call containing the policy key.
        webpage = self._download_webpage(
            'http://players.brightcove.net/%s/%s_%s/index.min.js'
            % (account_id, player_id, embed),
            video_id)
        catalog = self._parse_json(
            js_to_json(
                self._search_regex(
                    r'catalog\(({[^}]+})\);',
                    webpage,
                    'catalog'
                )
            ),
            video_id
        )
        policy_key = catalog['policyKey']

        # The policy key is passed to the playback API via the Accept header.
        req = compat_urllib_request.Request(
            'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id),
            headers={'Accept': 'application/json;pk=%s' % policy_key})
        json_data = self._download_json(req, video_id)

        title = json_data['name']
        description = json_data.get('description')
        thumbnail = json_data.get('thumbnail')
        timestamp = parse_iso8601(json_data.get('published_at'))
        # NOTE(review): the expected test value (165768) suggests the API
        # reports the duration in milliseconds - confirm the unit consumers
        # of 'duration' expect.
        duration = int_or_none(json_data.get('duration'))

        formats = []
        # 'sources' may be missing from the response; treat that as no sources
        # rather than failing on iteration over None.
        for source in json_data.get('sources') or []:
            if source.get('type') == 'application/x-mpegURL':
                m3u8_url = source.get('src')
                # Skip HLS entries that carry no manifest URL.
                if m3u8_url:
                    formats.extend(
                        self._extract_m3u8_formats(m3u8_url, video_id))
                continue

            src = source.get('src') or source.get('streaming_src')
            if not src:
                continue
            container = source.get('container')
            fmt = {
                'url': src,
                # NOTE(review): avg_bitrate is passed through unscaled -
                # confirm it is in the unit 'tbr' expects.
                'tbr': source.get('avg_bitrate'),
                'width': int_or_none(source.get('width')),
                'height': int_or_none(source.get('height')),
                'filesize': source.get('size'),
                'container': container,
                'vcodec': source.get('codec'),
            }
            # Only derive 'ext' when a container is reported; calling lower()
            # on a missing container would raise AttributeError.
            if container:
                fmt['ext'] = container.lower()
            formats.append(fmt)
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnail': thumbnail,
            'timestamp': timestamp,
            'duration': duration,
            'formats': formats,
            'uploader_id': account_id,
        }

+ 20
- 1
youtube_dl/extractor/generic.py View File

@ -30,7 +30,10 @@ from ..utils import (
url_basename, url_basename,
xpath_text, xpath_text,
) )
from .brightcove import BrightcoveIE
from .brightcove import (
BrightcoveIE,
BrightcoveInPageEmbedIE,
)
from .nbc import NBCSportsVPlayerIE from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE from .ooyala import OoyalaIE
from .rutv import RUTVIE from .rutv import RUTVIE
@ -1031,6 +1034,17 @@ class GenericIE(InfoExtractor):
'ext': 'mp4', 'ext': 'mp4',
'title': 'cinemasnob', 'title': 'cinemasnob',
}, },
},
# BrightcoveInPageEmbed embed
{
'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/',
'info_dict': {
'id': '4238694884001',
'ext': 'flv',
'title': 'Tabletop: Dread, Last Thoughts',
'description': 'Tabletop: Dread, Last Thoughts',
'duration': 51690,
},
} }
] ]
@ -1307,6 +1321,11 @@ class GenericIE(InfoExtractor):
'entries': entries, 'entries': entries,
} }
# Look for Brightcove In Page Embed:
brightcove_in_page_embed_url = BrightcoveInPageEmbedIE._extract_url(webpage)
if brightcove_in_page_embed_url:
return self.url_result(brightcove_in_page_embed_url, 'BrightcoveInPageEmbed')
# Look for embedded rtl.nl player # Look for embedded rtl.nl player
matches = re.findall( matches = re.findall(
r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',


+ 9
- 0
youtube_dl/utils.py View File

@ -259,6 +259,15 @@ def get_element_by_attribute(attribute, value, html):
return unescapeHTML(res) return unescapeHTML(res)
def extract_attributes(attributes_str, attributes_regex=r'(?s)\s*([^\s=]+)\s*=\s*["\']([^"\']+)["\']'):
    """Collect attribute name/value pairs found in a string into a dict.

    attributes_regex is expected to capture the attribute name and its value
    as two groups. Every non-overlapping match in attributes_str contributes
    one entry; when a name occurs more than once, the last occurrence wins.
    An empty dict is returned when nothing matches.
    """
    # dict() consumes the (name, value) tuples produced by findall in match
    # order, so later duplicates overwrite earlier ones.
    return dict(re.findall(attributes_regex, attributes_str))
def clean_html(html): def clean_html(html):
"""Clean an HTML snippet into a readable string""" """Clean an HTML snippet into a readable string"""


Loading…
Cancel
Save