@ -245,6 +245,10 @@ class InfoExtractor(object):
specified in the URL .
end_time : Time in seconds where the reproduction should end , as
specified in the URL .
chapters : A list of dictionaries , with the following entries :
* " start_time " - The start time of the chapter in seconds
* " end_time " - The end time of the chapter in seconds
* " title " ( optional , string )
The following fields should only be used when the video belongs to some logical
chapter or section :
@ -976,6 +980,23 @@ class InfoExtractor(object):
return info
if isinstance ( json_ld , dict ) :
json_ld = [ json_ld ]
# Nested helper: merge the metadata of one JSON-LD VideoObject entry `e`
# (a dict) into `info`, a name bound outside this function. Returns nothing;
# the only effect is the in-place `info . update ( ... )` call.
def extract_video_object ( e ) :
# Internal invariant: callers are expected to check the entry type first.
# NOTE(review): an assert is stripped under `python -O`, so this is not a
# runtime validation of untrusted input.
assert e [ ' @type ' ] == ' VideoObject '
# Every value is read with `e . get`, so an absent key yields None, which is
# then handed to the conversion helper (unescapeHTML, parse_duration, ...).
# Those helpers are defined elsewhere -- presumably they tolerate None;
# TODO confirm.
info . update ( {
' url ' : e . get ( ' contentUrl ' ) ,
' title ' : unescapeHTML ( e . get ( ' name ' ) ) ,
' description ' : unescapeHTML ( e . get ( ' description ' ) ) ,
# Two spellings of the thumbnail key are accepted; the first truthy one wins.
' thumbnail ' : e . get ( ' thumbnailUrl ' ) or e . get ( ' thumbnailURL ' ) ,
' duration ' : parse_duration ( e . get ( ' duration ' ) ) ,
' timestamp ' : unified_timestamp ( e . get ( ' uploadDate ' ) ) ,
' filesize ' : float_or_none ( e . get ( ' contentSize ' ) ) ,
' tbr ' : int_or_none ( e . get ( ' bitrate ' ) ) ,
' width ' : int_or_none ( e . get ( ' width ' ) ) ,
' height ' : int_or_none ( e . get ( ' height ' ) ) ,
' view_count ' : int_or_none ( e . get ( ' interactionCount ' ) ) ,
} )
for e in json_ld :
if e . get ( ' @context ' ) == ' http://schema.org ' :
item_type = e . get ( ' @type ' )
@ -1000,18 +1021,11 @@ class InfoExtractor(object):
' description ' : unescapeHTML ( e . get ( ' articleBody ' ) ) ,
} )
elif item_type == ' VideoObject ' :
info . update ( {
' url ' : e . get ( ' contentUrl ' ) ,
' title ' : unescapeHTML ( e . get ( ' name ' ) ) ,
' description ' : unescapeHTML ( e . get ( ' description ' ) ) ,
' thumbnail ' : e . get ( ' thumbnailUrl ' ) or e . get ( ' thumbnailURL ' ) ,
' duration ' : parse_duration ( e . get ( ' duration ' ) ) ,
' timestamp ' : unified_timestamp ( e . get ( ' uploadDate ' ) ) ,
' filesize ' : float_or_none ( e . get ( ' contentSize ' ) ) ,
' tbr ' : int_or_none ( e . get ( ' bitrate ' ) ) ,
' width ' : int_or_none ( e . get ( ' width ' ) ) ,
' height ' : int_or_none ( e . get ( ' height ' ) ) ,
} )
extract_video_object ( e )
elif item_type == ' WebPage ' :
video = e . get ( ' video ' )
if isinstance ( video , dict ) and video . get ( ' @type ' ) == ' VideoObject ' :
extract_video_object ( video )
break
return dict ( ( k , v ) for k , v in info . items ( ) if v is not None )
@ -1303,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol = ' m3u8 ' , preference = None ,
m3u8_id = None , note = None , errnote = None ,
fatal = True , live = False ) :
res = self . _download_webpage_handle (
m3u8_url , video_id ,
note = note or ' Downloading m3u8 information ' ,
errnote = errnote or ' Failed to download m3u8 information ' ,
fatal = fatal )
if res is False :
return [ ]
m3u8_doc , urlh = res
m3u8_url = urlh . geturl ( )
return self . _parse_m3u8_formats (
m3u8_doc , m3u8_url , ext = ext , entry_protocol = entry_protocol ,
preference = preference , m3u8_id = m3u8_id , live = live )
def _parse_m3u8_formats ( self , m3u8_doc , m3u8_url , ext = None ,
entry_protocol = ' m3u8 ' , preference = None ,
m3u8_id = None , live = False ) :
if ' #EXT-X-FAXS-CM: ' in m3u8_doc : # Adobe Flash Access
return [ ]
formats = [ self . _m3u8_meta_format ( m3u8_url , ext , preference , m3u8_id ) ]
formats = [ ]
format_url = lambda u : (
u
if re . match ( r ' ^https?:// ' , u )
else compat_urlparse . urljoin ( m3u8_url , u ) )
# We should try extracting formats only from master playlists [1], i.e.
# playlists that describe available qualities. On the other hand media
# playlists [2] should be returned as is since they contain just the media
# without qualities renditions.
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
# 2. https://github.com/rg3/youtube-dl/issues/12211
# We should try extracting formats only from master playlists [1, 4.3.4],
# i.e. playlists that describe available qualities. On the other hand
# media playlists [1, 4.3.3] should be returned as is since they contain
# just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
# playlist based on particular tags availability. As of [1, 2] master
# playlist tags MUST NOT appear in a media playlist and vice versa.
# As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
# and MUST NOT appear in master playlist thus we can clearly detect media
# playlist with this criterion.
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
# 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
# playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
# master playlist tags MUST NOT appear in a media playlist and vice versa.
# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
# media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion.
if ' #EXT-X-TARGETDURATION ' in m3u8_doc : # media playlist, return as is
return [ {
' url ' : m3u8_url ,
@ -1345,52 +1369,72 @@ class InfoExtractor(object):
' protocol ' : entry_protocol ,
' preference ' : preference ,
} ]
audio_in_video_stream = { }
last_info = { }
last_media = { }
groups = { }
last_stream_inf = { }
# Nested helper: process one EXT-X-MEDIA line of the playlist.
# It parses the tag attributes, records the rendition in `groups` (keyed by
# its GROUP-ID) and, for VIDEO/AUDIO renditions that carry a URI, appends a
# format dict to `formats`. `groups`, `formats`, `format_url`, `m3u8_url`,
# `ext`, `entry_protocol` and `preference` are names bound outside this
# function. Returns None in every path.
def extract_media ( x_media_line ) :
media = parse_m3u8_attributes ( x_media_line )
# As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
media_type , group_id , name = media . get ( ' TYPE ' ) , media . get ( ' GROUP-ID ' ) , media . get ( ' NAME ' )
# Tags lacking any of the three required attributes are ignored entirely:
# they are neither registered in `groups` nor turned into a format.
if not ( media_type and group_id and name ) :
return
# Every well-formed rendition is registered, whatever its TYPE, so that
# later lookups by group id can find it.
groups . setdefault ( group_id , [ ] ) . append ( media )
# Only VIDEO and AUDIO renditions can become formats of their own.
if media_type not in ( ' VIDEO ' , ' AUDIO ' ) :
return
media_url = media . get ( ' URI ' )
# A rendition without URI produces no format here; it stays available
# through `groups` only.
if media_url :
format_id = [ ]
# NOTE(review): group_id and name are already known to be truthy after the
# guard above, so the `if v` filter never drops anything here.
for v in ( group_id , name ) :
if v :
format_id . append ( v )
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : format_url ( media_url ) ,
' manifest_url ' : m3u8_url ,
' language ' : media . get ( ' LANGUAGE ' ) ,
' ext ' : ext ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
}
# An AUDIO rendition is flagged as having no video codec.
if media_type == ' AUDIO ' :
f [ ' vcodec ' ] = ' none '
# NOTE(review): indentation is lost in this view; `f` is only bound in the
# URI branch, so this append presumably belongs to that branch -- confirm.
formats . append ( f )
# Nested helper: choose a display name for the variant stream described by
# `last_stream_inf` (the attributes of the most recent EXT-X-STREAM-INF tag),
# using `groups` as a fallback source. Both names are bound outside this
# function. Returns a name, or None when neither a NAME nor a VIDEO group
# reference is available.
def build_stream_name ( ) :
# Although the specification does not mention a NAME attribute for the
# EXT-X-STREAM-INF tag, it is still sometimes present (see [1]
# or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
# 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
stream_name = last_stream_inf . get ( ' NAME ' )
if stream_name :
return stream_name
# If there is no NAME in EXT-X-STREAM-INF, it is obtained
# from the corresponding rendition group
stream_group_id = last_stream_inf . get ( ' VIDEO ' )
# No VIDEO group reference either: bare return, i.e. None.
if not stream_group_id :
return
stream_group = groups . get ( stream_group_id )
# The referenced group was never registered (or is empty): fall back to
# the group id itself.
if not stream_group :
return stream_group_id
# Only the first rendition registered for the group is consulted; its NAME
# is used, or the group id when that rendition's NAME is falsy.
rendition = stream_group [ 0 ]
return rendition . get ( ' NAME ' ) or stream_group_id
for line in m3u8_doc . splitlines ( ) :
if line . startswith ( ' #EXT-X-STREAM-INF: ' ) :
last_info = parse_m3u8_attributes ( line )
last_stream_ inf = parse_m3u8_attributes ( line )
elif line . startswith ( ' #EXT-X-MEDIA: ' ) :
media = parse_m3u8_attributes ( line )
media_type = media . get ( ' TYPE ' )
if media_type in ( ' VIDEO ' , ' AUDIO ' ) :
group_id = media . get ( ' GROUP-ID ' )
media_url = media . get ( ' URI ' )
if media_url :
format_id = [ ]
for v in ( group_id , media . get ( ' NAME ' ) ) :
if v :
format_id . append ( v )
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : format_url ( media_url ) ,
' language ' : media . get ( ' LANGUAGE ' ) ,
' ext ' : ext ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
}
if media_type == ' AUDIO ' :
f [ ' vcodec ' ] = ' none '
if group_id and not audio_in_video_stream . get ( group_id ) :
audio_in_video_stream [ group_id ] = False
formats . append ( f )
else :
# When there is no URI in EXT-X-MEDIA let this tag's
# data be used by regular URI lines below
last_media = media
if media_type == ' AUDIO ' and group_id :
audio_in_video_stream [ group_id ] = True
extract_media ( line )
elif line . startswith ( ' # ' ) or not line . strip ( ) :
continue
else :
tbr = int_or_none ( last_info . get ( ' AVERAGE-BANDWIDTH ' ) or last_info . get ( ' BANDWIDTH ' ) , scale = 1000 )
tbr = float_or_none (
last_stream_inf . get ( ' AVERAGE-BANDWIDTH ' ) or
last_stream_inf . get ( ' BANDWIDTH ' ) , scale = 1000 )
format_id = [ ]
if m3u8_id :
format_id . append ( m3u8_id )
# Despite specification does not mention NAME attribute for
# EXT-X-STREAM-INF it still sometimes may be present
stream_name = last_info . get ( ' NAME ' ) or last_media . get ( ' NAME ' )
stream_name = build_stream_name ( )
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@ -1400,14 +1444,14 @@ class InfoExtractor(object):
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : manifest_url ,
' manifest_url ' : manifest _url ,
' manifest_url ' : m3u8 _url ,
' tbr ' : tbr ,
' ext ' : ext ,
' fps ' : float_or_none ( last_info . get ( ' FRAME-RATE ' ) ) ,
' fps ' : float_or_none ( last_stream_ inf . get ( ' FRAME-RATE ' ) ) ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
}
resolution = last_info . get ( ' RESOLUTION ' )
resolution = last_stream_ inf . get ( ' RESOLUTION ' )
if resolution :
mobj = re . search ( r ' (?P<width> \ d+)[xX](?P<height> \ d+) ' , resolution )
if mobj :
@ -1423,13 +1467,26 @@ class InfoExtractor(object):
' vbr ' : vbr ,
' abr ' : abr ,
} )
f . update ( parse_codecs ( last_info . get ( ' CODECS ' ) ) )
if audio_in_video_stream . get ( last_info . get ( ' AUDIO ' ) ) is False and f [ ' vcodec ' ] != ' none ' :
# TODO: update acodec for audio only formats with the same GROUP-ID
f [ ' acodec ' ] = ' none '
codecs = parse_codecs ( last_stream_inf . get ( ' CODECS ' ) )
f . update ( codecs )
audio_group_id = last_stream_inf . get ( ' AUDIO ' )
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected, for example, [2]
# contains EXT-X-STREAM-INF tag which references AUDIO
# rendition group but does not have CODECS and despite
# referencing an audio group, it represents
# a complete (with audio and video) format. So, for such cases
# we will ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f . get ( ' vcodec ' ) != ' none ' :
audio_group = groups . get ( audio_group_id )
if audio_group and audio_group [ 0 ] . get ( ' URI ' ) :
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f [ ' acodec ' ] = ' none '
formats . append ( f )
last_info = { }
last_media = { }
last_stream_inf = { }
return formats
@staticmethod
@ -1803,7 +1860,7 @@ class InfoExtractor(object):
' ext ' : mimetype2ext ( mime_type ) ,
' width ' : int_or_none ( representation_attrib . get ( ' width ' ) ) ,
' height ' : int_or_none ( representation_attrib . get ( ' height ' ) ) ,
' tbr ' : in t_or_none( bandwidth , 1000 ) ,
' tbr ' : floa t_or_none( bandwidth , 1000 ) ,
' asr ' : int_or_none ( representation_attrib . get ( ' audioSamplingRate ' ) ) ,
' fps ' : int_or_none ( representation_attrib . get ( ' frameRate ' ) ) ,
' language ' : lang if lang not in ( ' mul ' , ' und ' , ' zxx ' , ' mis ' ) else None ,
@ -2182,7 +2239,7 @@ class InfoExtractor(object):
def _find_jwplayer_data ( self , webpage , video_id = None , transform_source = js_to_json ) :
mobj = re . search (
r ' jwplayer \ ((?P<quote>[ \' " ])[^ \' " ]+(?P=quote) \ ) \ .setup \ s* \ ((?P<options>[^)]+) \ ) ' ,
r ' (?s) jwplayer\ ((?P<quote>[ \' " ])[^ \' " ]+(?P=quote) \ )(?!</script>).*? \ .setup \ s* \ ((?P<options>[^)]+) \ ) ' ,
webpage )
if mobj :
try :
@ -2258,11 +2315,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats ( self , jwplayer_sources_data , video_id = None ,
m3u8_id = None , mpd_id = None , rtmp_params = None , base_url = None ) :
urls = [ ]
formats = [ ]
for source in jwplayer_sources_data :
source_url = self . _proto_relative_url ( source [ ' file ' ] )
source_url = self . _proto_relative_url ( source . get ( ' file ' ) )
if not source_url :
continue
if base_url :
source_url = compat_urlparse . urljoin ( base_url , source_url )
if source_url in urls :
continue
urls . append ( source_url )
source_type = source . get ( ' type ' ) or ' '
ext = mimetype2ext ( source_type ) or determine_ext ( source_url )
if source_type == ' hls ' or ext == ' m3u8 ' :