@ -1,3 +1,4 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
@ -10,7 +11,8 @@ import hashlib
from .common import InfoExtractor
from ..compat import (
compat_str ,
compat_parse_qs ,
compat_urllib_parse_urlparse ,
)
from ..utils import (
determine_ext ,
@ -18,12 +20,68 @@ from ..utils import (
xpath_with_ns ,
unsmuggle_url ,
int_or_none ,
url_basename ,
float_or_none ,
)
# Namespace URI used for the `smil:` prefix in every XPath of this module
default_ns = 'http://www.w3.org/2005/SMIL21/Language'


def _x(p):
    """Return XPath *p* with its `smil:` prefixes resolved via xpath_with_ns."""
    return xpath_with_ns(p, {'smil': default_ns})
class ThePlatformIE ( InfoExtractor ) :
class ThePlatformBaseIE(InfoExtractor):
    """Helpers shared by the thePlatform extractors: SMIL formats and preview metadata."""

    def _extract_theplatform_smil_formats(self, smil_url, video_id, note='Downloading SMIL data'):
        """Download the SMIL document at *smil_url* and return its sorted formats.

        Raises ExtractorError (expected) when the document contains a
        <ref> node titled 'Geographic Restriction' or 'Expired'.
        """
        meta = self._download_xml(smil_url, video_id, note=note)

        for ref in meta.findall(_x('.//smil:ref')):
            title = ref.attrib.get('title')
            if title in ('Geographic Restriction', 'Expired'):
                # Fall back to the title so a missing 'abstract' attribute
                # still produces a readable error instead of a KeyError
                raise ExtractorError(
                    ref.attrib.get('abstract') or title, expected=True)

        formats = self._parse_smil_formats(
            meta, smil_url, video_id, namespace=default_ns,
            # the parameters are from syfy.com, other sites may use others,
            # they also work for nbc.com
            f4m_params={'g': 'UXWGVKRWHFSP', 'hdcore': '3.0.3'},
            transform_rtmp_url=lambda streamer, src: (streamer, 'mp4:' + src))

        for fmt in formats:
            # URLs whose extension is detected as 'once' are reported as mp4
            if determine_ext(fmt['url']) == 'once':
                fmt['ext'] = 'mp4'

        self._sort_formats(formats)

        return formats

    def get_metadata(self, path, video_id):
        """Fetch the preview metadata for *path* and return a partial info dict.

        The result carries 'title', 'subtitles', 'description', 'thumbnail'
        and 'duration'; only 'title' is required to be present in the
        downloaded JSON.
        """
        info = self._download_json(
            'http://link.theplatform.com/s/%s?format=preview' % path, video_id)

        subtitles = {}
        captions = info.get('captions')
        if isinstance(captions, list):
            for caption in captions:
                src = caption.get('src')
                if not src:
                    # A caption without a URL cannot be downloaded
                    continue
                # Append rather than assign so that several captions in the
                # same language are all kept
                subtitles.setdefault(caption.get('lang', 'en'), []).append({
                    'ext': 'srt' if caption.get('type') == 'text/srt' else 'ttml',
                    'url': src,
                })

        return {
            'title': info['title'],
            'subtitles': subtitles,
            'description': info.get('description'),
            'thumbnail': info.get('defaultThumbnailUrl'),
            'duration': int_or_none(info.get('duration'), 1000),
        }
class ThePlatformIE ( ThePlatformBaseIE ) :
_VALID_URL = r ''' (?x)
( ? : https ? : / / ( ? : link | player ) \. theplatform \. com / [ sp ] / ( ? P < provider_id > [ ^ / ] + ) /
( ? : ( ? P < media > ( ? : [ ^ / ] + / ) + select / media / ) | ( ? P < config > ( ? : [ ^ / \? ] + / ( ? : swf | config ) | onsite ) / select / ) ) ?
@ -67,6 +125,20 @@ class ThePlatformIE(InfoExtractor):
} , {
' url ' : ' http://player.theplatform.com/p/NnzsPC/widget/select/media/4Y0TlYUr_ZT7 ' ,
' only_matching ' : True ,
} , {
' url ' : ' http://player.theplatform.com/p/2E2eJC/nbcNewsOffsite?guid=tdy_or_siri_150701 ' ,
' md5 ' : ' 734f3790fb5fc4903da391beeebc4836 ' ,
' info_dict ' : {
' id ' : ' tdy_or_siri_150701 ' ,
' ext ' : ' mp4 ' ,
' title ' : ' iPhone Siri’s sassy response to a math question has people talking ' ,
' description ' : ' md5:a565d1deadd5086f3331d57298ec6333 ' ,
' duration ' : 83.0 ,
' thumbnail ' : ' re:^https?://.* \ .jpg$ ' ,
' timestamp ' : 1435752600 ,
' upload_date ' : ' 20150701 ' ,
' categories ' : [ ' Today/Shows/Orange Room ' , ' Today/Sections/Money ' , ' Today/Topics/Tech ' , " Today/Topics/Editor ' s picks " ] ,
} ,
} ]
@staticmethod
@ -101,6 +173,24 @@ class ThePlatformIE(InfoExtractor):
path + = ' /media '
path + = ' / ' + video_id
qs_dict = compat_parse_qs ( compat_urllib_parse_urlparse ( url ) . query )
if ' guid ' in qs_dict :
webpage = self . _download_webpage ( url , video_id )
scripts = re . findall ( r ' <script[^>]+src= " ([^ " ]+) " ' , webpage )
feed_id = None
# The feed id is usually found in the last script.
# There seems to be no pattern in the file name of the relevant script,
# so try the scripts one by one, starting from the last
for script in reversed ( scripts ) :
feed_script = self . _download_webpage ( script , video_id , ' Downloading feed script ' )
feed_id = self . _search_regex ( r ' defaultFeedId \ s*: \ s* " ([^ " ]+) " ' , feed_script , ' default feed id ' , default = None )
if feed_id is not None :
break
if feed_id is None :
raise ExtractorError ( ' Unable to find feed id ' )
return self . url_result ( ' http://feed.theplatform.com/f/ %s / %s ?byGuid= %s ' % (
provider_id , feed_id , qs_dict [ ' guid ' ] [ 0 ] ) )
if smuggled_data . get ( ' force_smil_url ' , False ) :
smil_url = url
elif mobj . group ( ' config ' ) :
@ -120,95 +210,78 @@ class ThePlatformIE(InfoExtractor):
if sig :
smil_url = self . _sign_url ( smil_url , sig [ ' key ' ] , sig [ ' secret ' ] )
meta = self . _download_xml ( smil_url , video_id )
try :
error_msg = next (
n . attrib [ ' abstract ' ]
for n in meta . findall ( _x ( ' .//smil:ref ' ) )
if n . attrib . get ( ' title ' ) == ' Geographic Restriction ' or n . attrib . get ( ' title ' ) == ' Expired ' )
except StopIteration :
pass
else :
raise ExtractorError ( error_msg , expected = True )
formats = self . _extract_theplatform_smil_formats ( smil_url , video_id )
info_url = ' http://link.theplatform.com/s/ %s ?format=preview ' % path
info_json = self . _download_webpage ( info_url , video_id )
info = json . loads ( info_json )
ret = self . get_metadata ( path , video_id )
ret . update ( {
' id ' : video_id ,
' formats ' : formats ,
} )
subtitles = { }
captions = info . get ( ' captions ' )
if isinstance ( captions , list ) :
for caption in captions :
lang , src , mime = caption . get ( ' lang ' , ' en ' ) , caption . get ( ' src ' ) , caption . get ( ' type ' )
subtitles [ lang ] = [ {
' ext ' : ' srt ' if mime == ' text/srt ' else ' ttml ' ,
' url ' : src ,
} ]
return ret
head = meta . find ( _x ( ' smil:head ' ) )
body = meta . find ( _x ( ' smil:body ' ) )
f4m_node = body . find ( _x ( ' smil:seq//smil:video ' ) )
if f4m_node is None :
f4m_node = body . find ( _x ( ' smil:seq/smil:video ' ) )
if f4m_node is not None and ' .f4m ' in f4m_node . attrib [ ' src ' ] :
f4m_url = f4m_node . attrib [ ' src ' ]
if ' manifest.f4m? ' not in f4m_url :
f4m_url + = ' ? '
# the parameters are from syfy.com, other sites may use others,
# they also work for nbc.com
f4m_url + = ' &g=UXWGVKRWHFSP&hdcore=3.0.3 '
formats = self . _extract_f4m_formats ( f4m_url , video_id )
else :
formats = [ ]
switch = body . find ( _x ( ' smil:switch ' ) )
if switch is None :
switch = body . find ( _x ( ' smil:par//smil:switch ' ) )
if switch is None :
switch = body . find ( _x ( ' smil:par/smil:switch ' ) )
if switch is None :
switch = body . find ( _x ( ' smil:par ' ) )
if switch is not None :
base_url = head . find ( _x ( ' smil:meta ' ) ) . attrib [ ' base ' ]
for f in switch . findall ( _x ( ' smil:video ' ) ) :
attr = f . attrib
width = int_or_none ( attr . get ( ' width ' ) )
height = int_or_none ( attr . get ( ' height ' ) )
vbr = int_or_none ( attr . get ( ' system-bitrate ' ) , 1000 )
format_id = ' %d x %d _ %d k ' % ( width , height , vbr )
formats . append ( {
' format_id ' : format_id ,
' url ' : base_url ,
' play_path ' : ' mp4: ' + attr [ ' src ' ] ,
' ext ' : ' flv ' ,
' width ' : width ,
' height ' : height ,
' vbr ' : vbr ,
} )
else :
switch = body . find ( _x ( ' smil:seq//smil:switch ' ) )
if switch is None :
switch = body . find ( _x ( ' smil:seq/smil:switch ' ) )
for f in switch . findall ( _x ( ' smil:video ' ) ) :
attr = f . attrib
vbr = int_or_none ( attr . get ( ' system-bitrate ' ) , 1000 )
ext = determine_ext ( attr [ ' src ' ] )
if ext == ' once ' :
ext = ' mp4 '
formats . append ( {
' format_id ' : compat_str ( vbr ) ,
' url ' : attr [ ' src ' ] ,
' vbr ' : vbr ,
' ext ' : ext ,
} )
self . _sort_formats ( formats )
class ThePlatformFeedIE(ThePlatformBaseIE):
    """Extractor for feed.theplatform.com URLs that select one entry by GUID."""

    _URL_TEMPLATE = '%s//feed.theplatform.com/f/%s/%s?form=json&byGuid=%s'
    _VALID_URL = r'https?://feed\.theplatform\.com/f/(?P<provider_id>[^/]+)/(?P<feed_id>[^?/]+)\?(?:[^&]+&)*byGuid=(?P<id>[a-zA-Z0-9_]+)'
    _TEST = {
        # From http://player.theplatform.com/p/7wvmTC/MSNBCEmbeddedOffSite?guid=n_hardball_5biden_140207
        'url': 'http://feed.theplatform.com/f/7wvmTC/msnbc_video-p-test?form=json&pretty=true&range=-40&byGuid=n_hardball_5biden_140207',
        'md5': '22d2b84f058d3586efcd99e57d59d314',
        'info_dict': {
            'id': 'n_hardball_5biden_140207',
            'ext': 'mp4',
            'title': 'The Biden factor: will Joe run in 2016?',
            'description': 'Could Vice President Joe Biden be preparing a 2016 campaign? Mark Halperin and Sam Stein weigh in.',
            'thumbnail': r're:^https?://.*\.jpg$',
            'upload_date': '20140208',
            'timestamp': 1391824260,
            'duration': 467.0,
            'categories': ['MSNBC/Issues/Democrats', 'MSNBC/Issues/Elections/Election 2016'],
        },
    }

    def _real_extract(self, url):
        """Resolve the feed entry for the GUID in *url* and return its info dict.

        Raises ExtractorError when the feed has no entry for the GUID or
        the entry lists no media content.
        """
        mobj = re.match(self._VALID_URL, url)

        video_id = mobj.group('id')
        provider_id = mobj.group('provider_id')
        feed_id = mobj.group('feed_id')

        real_url = self._URL_TEMPLATE % (
            self.http_scheme(), provider_id, feed_id, video_id)
        feed = self._download_json(real_url, video_id)

        entries = feed.get('entries')
        if not entries:
            raise ExtractorError(
                'No feed entry found for GUID %s' % video_id, expected=True)
        entry = entries[0]

        formats = []
        first_video_id = None
        duration = None
        for item in entry['media$content']:
            media_url = item['plfile$url']
            # Only start a new query string when the media URL has none yet
            smil_url = '%s%sformat=SMIL&Tracking=true&Embedded=true&formats=MPEG4,F4M' % (
                media_url, '&' if '?' in media_url else '?')
            cur_video_id = url_basename(smil_url)
            if first_video_id is None:
                # The first media item supplies the metadata path and duration
                first_video_id = cur_video_id
                duration = float_or_none(item.get('plfile$duration'))
            formats.extend(self._extract_theplatform_smil_formats(
                smil_url, video_id, 'Downloading SMIL data for %s' % cur_video_id))

        if first_video_id is None:
            # Without a media item there is no id to request metadata for
            raise ExtractorError('Unable to find media content for %s' % video_id)

        self._sort_formats(formats)

        thumbnails = [{
            'url': thumbnail['plfile$url'],
            'width': int_or_none(thumbnail.get('plfile$width')),
            'height': int_or_none(thumbnail.get('plfile$height')),
        } for thumbnail in entry.get('media$thumbnails', [])]

        timestamp = int_or_none(entry.get('media$availableDate'), scale=1000)
        categories = [
            category['media$name']
            for category in entry.get('media$categories', [])]

        ret = self.get_metadata(
            '%s/%s' % (provider_id, first_video_id), video_id)
        ret.update({
            'id': video_id,
            'formats': formats,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'categories': categories,
        })
        if duration is not None:
            # Keep the duration from get_metadata when the feed gives none
            ret['duration'] = duration

        return ret