@ -245,6 +245,10 @@ class InfoExtractor(object):
specified in the URL .
specified in the URL .
end_time : Time in seconds where the reproduction should end , as
end_time : Time in seconds where the reproduction should end , as
specified in the URL .
specified in the URL .
chapters : A list of dictionaries , with the following entries :
* " start_time " - The start time of the chapter in seconds
* " end_time " - The end time of the chapter in seconds
* " title " ( optional , string )
The following fields should only be used when the video belongs to some logical
The following fields should only be used when the video belongs to some logical
chapter or section :
chapter or section :
@ -976,6 +980,23 @@ class InfoExtractor(object):
return info
return info
if isinstance ( json_ld , dict ) :
if isinstance ( json_ld , dict ) :
json_ld = [ json_ld ]
json_ld = [ json_ld ]
# Nested helper: copy the metadata of a schema.org VideoObject dict ``e``
# into the ``info`` dict of the enclosing scope (mutates ``info`` in place,
# returns nothing).
def extract_video_object ( e ) :
# Invariant check only: both call sites visible in this view test the
# '@type' value before calling. NOTE(review): an assert is stripped when
# Python runs with -O, so it must not be relied on for input validation.
assert e [ ' @type ' ] == ' VideoObject '
info . update ( {
' url ' : e . get ( ' contentUrl ' ) ,
' title ' : unescapeHTML ( e . get ( ' name ' ) ) ,
' description ' : unescapeHTML ( e . get ( ' description ' ) ) ,
# Both spellings of the thumbnail key are accepted.
' thumbnail ' : e . get ( ' thumbnailUrl ' ) or e . get ( ' thumbnailURL ' ) ,
' duration ' : parse_duration ( e . get ( ' duration ' ) ) ,
' timestamp ' : unified_timestamp ( e . get ( ' uploadDate ' ) ) ,
' filesize ' : float_or_none ( e . get ( ' contentSize ' ) ) ,
' tbr ' : int_or_none ( e . get ( ' bitrate ' ) ) ,
' width ' : int_or_none ( e . get ( ' width ' ) ) ,
' height ' : int_or_none ( e . get ( ' height ' ) ) ,
' view_count ' : int_or_none ( e . get ( ' interactionCount ' ) ) ,
} )
for e in json_ld :
for e in json_ld :
if e . get ( ' @context ' ) == ' http://schema.org ' :
if e . get ( ' @context ' ) == ' http://schema.org ' :
item_type = e . get ( ' @type ' )
item_type = e . get ( ' @type ' )
@ -1000,18 +1021,11 @@ class InfoExtractor(object):
' description ' : unescapeHTML ( e . get ( ' articleBody ' ) ) ,
' description ' : unescapeHTML ( e . get ( ' articleBody ' ) ) ,
} )
} )
elif item_type == ' VideoObject ' :
elif item_type == ' VideoObject ' :
info . update ( {
' url ' : e . get ( ' contentUrl ' ) ,
' title ' : unescapeHTML ( e . get ( ' name ' ) ) ,
' description ' : unescapeHTML ( e . get ( ' description ' ) ) ,
' thumbnail ' : e . get ( ' thumbnailUrl ' ) or e . get ( ' thumbnailURL ' ) ,
' duration ' : parse_duration ( e . get ( ' duration ' ) ) ,
' timestamp ' : unified_timestamp ( e . get ( ' uploadDate ' ) ) ,
' filesize ' : float_or_none ( e . get ( ' contentSize ' ) ) ,
' tbr ' : int_or_none ( e . get ( ' bitrate ' ) ) ,
' width ' : int_or_none ( e . get ( ' width ' ) ) ,
' height ' : int_or_none ( e . get ( ' height ' ) ) ,
} )
extract_video_object ( e )
elif item_type == ' WebPage ' :
video = e . get ( ' video ' )
if isinstance ( video , dict ) and video . get ( ' @type ' ) == ' VideoObject ' :
extract_video_object ( video )
break
break
return dict ( ( k , v ) for k , v in info . items ( ) if v is not None )
return dict ( ( k , v ) for k , v in info . items ( ) if v is not None )
@ -1303,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol = ' m3u8 ' , preference = None ,
entry_protocol = ' m3u8 ' , preference = None ,
m3u8_id = None , note = None , errnote = None ,
m3u8_id = None , note = None , errnote = None ,
fatal = True , live = False ) :
fatal = True , live = False ) :
res = self . _download_webpage_handle (
res = self . _download_webpage_handle (
m3u8_url , video_id ,
m3u8_url , video_id ,
note = note or ' Downloading m3u8 information ' ,
note = note or ' Downloading m3u8 information ' ,
errnote = errnote or ' Failed to download m3u8 information ' ,
errnote = errnote or ' Failed to download m3u8 information ' ,
fatal = fatal )
fatal = fatal )
if res is False :
if res is False :
return [ ]
return [ ]
m3u8_doc , urlh = res
m3u8_doc , urlh = res
m3u8_url = urlh . geturl ( )
m3u8_url = urlh . geturl ( )
return self . _parse_m3u8_formats (
m3u8_doc , m3u8_url , ext = ext , entry_protocol = entry_protocol ,
preference = preference , m3u8_id = m3u8_id , live = live )
def _parse_m3u8_formats ( self , m3u8_doc , m3u8_url , ext = None ,
entry_protocol = ' m3u8 ' , preference = None ,
m3u8_id = None , live = False ) :
if ' #EXT-X-FAXS-CM: ' in m3u8_doc : # Adobe Flash Access
if ' #EXT-X-FAXS-CM: ' in m3u8_doc : # Adobe Flash Access
return [ ]
return [ ]
formats = [ self . _m3u8_meta_format ( m3u8_url , ext , preference , m3u8_id ) ]
formats = [ ]
format_url = lambda u : (
format_url = lambda u : (
u
u
if re . match ( r ' ^https?:// ' , u )
if re . match ( r ' ^https?:// ' , u )
else compat_urlparse . urljoin ( m3u8_url , u ) )
else compat_urlparse . urljoin ( m3u8_url , u ) )
# We should try extracting formats only from master playlists [1], i.e.
# playlists that describe available qualities. On the other hand media
# playlists [2] should be returned as is since they contain just the media
# without qualities renditions.
# References:
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
# 2. https://github.com/rg3/youtube-dl/issues/12211
# We should try extracting formats only from master playlists [1, 4.3.4],
# i.e. playlists that describe available qualities. On the other hand
# media playlists [1, 4.3.3] should be returned as is since they contain
# just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
# Fortunately, master playlist can be easily distinguished from media
# playlist based on particular tags availability. As of [1, 2] master
# playlist tags MUST NOT appear in a media playlist and vice versa.
# As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
# and MUST NOT appear in master playlist thus we can clearly detect media
# playlist with this criterion.
# 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
# 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
# 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
# playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
# master playlist tags MUST NOT appear in a media playlist and vice versa.
# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
# media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion.
if ' #EXT-X-TARGETDURATION ' in m3u8_doc : # media playlist, return as is
if ' #EXT-X-TARGETDURATION ' in m3u8_doc : # media playlist, return as is
return [ {
return [ {
' url ' : m3u8_url ,
' url ' : m3u8_url ,
@ -1345,52 +1369,72 @@ class InfoExtractor(object):
' protocol ' : entry_protocol ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
' preference ' : preference ,
} ]
} ]
audio_in_video_stream = { }
last_info = { }
last_media = { }
groups = { }
last_stream_inf = { }
# Nested helper: handle one #EXT-X-MEDIA line of the playlist.
# The parsed attributes are recorded in ``groups`` under their GROUP-ID and,
# for VIDEO/AUDIO renditions that carry a URI, a format dict is built and
# appended to ``formats``. Lines missing TYPE, GROUP-ID or NAME are ignored.
# ``groups``, ``formats``, ``format_url``, ``m3u8_url``, ``ext``,
# ``entry_protocol`` and ``preference`` come from the enclosing scope.
def extract_media ( x_media_line ) :
media = parse_m3u8_attributes ( x_media_line )
# As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
media_type , group_id , name = media . get ( ' TYPE ' ) , media . get ( ' GROUP-ID ' ) , media . get ( ' NAME ' )
if not ( media_type and group_id and name ) :
return
# Every well-formed rendition is remembered per group, regardless of TYPE.
groups . setdefault ( group_id , [ ] ) . append ( media )
# Only VIDEO and AUDIO renditions can become formats.
if media_type not in ( ' VIDEO ' , ' AUDIO ' ) :
return
media_url = media . get ( ' URI ' )
if media_url :
format_id = [ ]
# NOTE(review): group_id and name are both non-empty here because of the
# guard above, so the ``if v`` filter never drops anything.
for v in ( group_id , name ) :
if v :
format_id . append ( v )
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : format_url ( media_url ) ,
' manifest_url ' : m3u8_url ,
' language ' : media . get ( ' LANGUAGE ' ) ,
' ext ' : ext ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
}
# An AUDIO rendition is marked as carrying no video codec.
if media_type == ' AUDIO ' :
f [ ' vcodec ' ] = ' none '
formats . append ( f )
# Nested helper: choose a display name for the stream described by the most
# recently parsed #EXT-X-STREAM-INF attributes (``last_stream_inf``, taken
# from the enclosing scope). Returns the name, or None when the tag has
# neither a NAME nor a VIDEO group reference.
def build_stream_name ( ) :
# Although the specification does not mention a NAME attribute for the
# EXT-X-STREAM-INF tag, it is sometimes present anyway (see [1]
# or the vidio test in TestInfoExtractor.test_parse_m3u8_formats)
# 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
stream_name = last_stream_inf . get ( ' NAME ' )
if stream_name :
return stream_name
# If there is no NAME in EXT-X-STREAM-INF, it is obtained
# from the corresponding rendition group
stream_group_id = last_stream_inf . get ( ' VIDEO ' )
if not stream_group_id :
return
stream_group = groups . get ( stream_group_id )
# Unknown or empty group: fall back to the group id itself.
if not stream_group :
return stream_group_id
# Use the first rendition recorded for the group; fall back to the group
# id when that rendition has no NAME.
rendition = stream_group [ 0 ]
return rendition . get ( ' NAME ' ) or stream_group_id
for line in m3u8_doc . splitlines ( ) :
for line in m3u8_doc . splitlines ( ) :
if line . startswith ( ' #EXT-X-STREAM-INF: ' ) :
if line . startswith ( ' #EXT-X-STREAM-INF: ' ) :
last_info = parse_m3u8_attributes ( line )
last_stream_ inf = parse_m3u8_attributes ( line )
elif line . startswith ( ' #EXT-X-MEDIA: ' ) :
elif line . startswith ( ' #EXT-X-MEDIA: ' ) :
media = parse_m3u8_attributes ( line )
media_type = media . get ( ' TYPE ' )
if media_type in ( ' VIDEO ' , ' AUDIO ' ) :
group_id = media . get ( ' GROUP-ID ' )
media_url = media . get ( ' URI ' )
if media_url :
format_id = [ ]
for v in ( group_id , media . get ( ' NAME ' ) ) :
if v :
format_id . append ( v )
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : format_url ( media_url ) ,
' language ' : media . get ( ' LANGUAGE ' ) ,
' ext ' : ext ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
}
if media_type == ' AUDIO ' :
f [ ' vcodec ' ] = ' none '
if group_id and not audio_in_video_stream . get ( group_id ) :
audio_in_video_stream [ group_id ] = False
formats . append ( f )
else :
# When there is no URI in EXT-X-MEDIA let this tag's
# data be used by regular URI lines below
last_media = media
if media_type == ' AUDIO ' and group_id :
audio_in_video_stream [ group_id ] = True
extract_media ( line )
elif line . startswith ( ' # ' ) or not line . strip ( ) :
elif line . startswith ( ' # ' ) or not line . strip ( ) :
continue
continue
else :
else :
tbr = int_or_none ( last_info . get ( ' AVERAGE-BANDWIDTH ' ) or last_info . get ( ' BANDWIDTH ' ) , scale = 1000 )
tbr = float_or_none (
last_stream_inf . get ( ' AVERAGE-BANDWIDTH ' ) or
last_stream_inf . get ( ' BANDWIDTH ' ) , scale = 1000 )
format_id = [ ]
format_id = [ ]
if m3u8_id :
if m3u8_id :
format_id . append ( m3u8_id )
format_id . append ( m3u8_id )
# Despite specification does not mention NAME attribute for
# EXT-X-STREAM-INF it still sometimes may be present
stream_name = last_info . get ( ' NAME ' ) or last_media . get ( ' NAME ' )
stream_name = build_stream_name ( )
# Bandwidth of live streams may differ over time thus making
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id unpredictable. So it's better to keep provided
# format_id intact.
# format_id intact.
@ -1400,14 +1444,14 @@ class InfoExtractor(object):
f = {
f = {
' format_id ' : ' - ' . join ( format_id ) ,
' format_id ' : ' - ' . join ( format_id ) ,
' url ' : manifest_url ,
' url ' : manifest_url ,
' manifest_url ' : manifest _url ,
' manifest_url ' : m3u8 _url ,
' tbr ' : tbr ,
' tbr ' : tbr ,
' ext ' : ext ,
' ext ' : ext ,
' fps ' : float_or_none ( last_info . get ( ' FRAME-RATE ' ) ) ,
' fps ' : float_or_none ( last_stream_ inf . get ( ' FRAME-RATE ' ) ) ,
' protocol ' : entry_protocol ,
' protocol ' : entry_protocol ,
' preference ' : preference ,
' preference ' : preference ,
}
}
resolution = last_info . get ( ' RESOLUTION ' )
resolution = last_stream_ inf . get ( ' RESOLUTION ' )
if resolution :
if resolution :
mobj = re . search ( r ' (?P<width> \ d+)[xX](?P<height> \ d+) ' , resolution )
mobj = re . search ( r ' (?P<width> \ d+)[xX](?P<height> \ d+) ' , resolution )
if mobj :
if mobj :
@ -1423,13 +1467,26 @@ class InfoExtractor(object):
' vbr ' : vbr ,
' vbr ' : vbr ,
' abr ' : abr ,
' abr ' : abr ,
} )
} )
f . update ( parse_codecs ( last_info . get ( ' CODECS ' ) ) )
if audio_in_video_stream . get ( last_info . get ( ' AUDIO ' ) ) is False and f [ ' vcodec ' ] != ' none ' :
# TODO: update acodec for audio only formats with the same GROUP-ID
f [ ' acodec ' ] = ' none '
codecs = parse_codecs ( last_stream_inf . get ( ' CODECS ' ) )
f . update ( codecs )
audio_group_id = last_stream_inf . get ( ' AUDIO ' )
# As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
# references a rendition group MUST have a CODECS attribute.
# However, this is not always respected, for example, [2]
# contains an EXT-X-STREAM-INF tag which references an AUDIO
# rendition group but does not have CODECS and, despite
# referencing an audio group, it represents
# a complete (with audio and video) format. So, for such cases
# we will ignore references to rendition groups and treat them
# as complete formats.
if audio_group_id and codecs and f . get ( ' vcodec ' ) != ' none ' :
audio_group = groups . get ( audio_group_id )
if audio_group and audio_group [ 0 ] . get ( ' URI ' ) :
# TODO: update acodec for audio only formats with
# the same GROUP-ID
f [ ' acodec ' ] = ' none '
formats . append ( f )
formats . append ( f )
last_info = { }
last_media = { }
last_stream_inf = { }
return formats
return formats
@staticmethod
@staticmethod
@ -1803,7 +1860,7 @@ class InfoExtractor(object):
' ext ' : mimetype2ext ( mime_type ) ,
' ext ' : mimetype2ext ( mime_type ) ,
' width ' : int_or_none ( representation_attrib . get ( ' width ' ) ) ,
' width ' : int_or_none ( representation_attrib . get ( ' width ' ) ) ,
' height ' : int_or_none ( representation_attrib . get ( ' height ' ) ) ,
' height ' : int_or_none ( representation_attrib . get ( ' height ' ) ) ,
' tbr ' : in t_or_none( bandwidth , 1000 ) ,
' tbr ' : floa t_or_none( bandwidth , 1000 ) ,
' asr ' : int_or_none ( representation_attrib . get ( ' audioSamplingRate ' ) ) ,
' asr ' : int_or_none ( representation_attrib . get ( ' audioSamplingRate ' ) ) ,
' fps ' : int_or_none ( representation_attrib . get ( ' frameRate ' ) ) ,
' fps ' : int_or_none ( representation_attrib . get ( ' frameRate ' ) ) ,
' language ' : lang if lang not in ( ' mul ' , ' und ' , ' zxx ' , ' mis ' ) else None ,
' language ' : lang if lang not in ( ' mul ' , ' und ' , ' zxx ' , ' mis ' ) else None ,
@ -2182,7 +2239,7 @@ class InfoExtractor(object):
def _find_jwplayer_data ( self , webpage , video_id = None , transform_source = js_to_json ) :
def _find_jwplayer_data ( self , webpage , video_id = None , transform_source = js_to_json ) :
mobj = re . search (
mobj = re . search (
r ' jwplayer \ ((?P<quote>[ \' " ])[^ \' " ]+(?P=quote) \ ) \ .setup \ s* \ ((?P<options>[^)]+) \ ) ' ,
r ' (?s) jwplayer\ ((?P<quote>[ \' " ])[^ \' " ]+(?P=quote) \ )(?!</script>).*? \ .setup \ s* \ ((?P<options>[^)]+) \ ) ' ,
webpage )
webpage )
if mobj :
if mobj :
try :
try :
@ -2258,11 +2315,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats ( self , jwplayer_sources_data , video_id = None ,
def _parse_jwplayer_formats ( self , jwplayer_sources_data , video_id = None ,
m3u8_id = None , mpd_id = None , rtmp_params = None , base_url = None ) :
m3u8_id = None , mpd_id = None , rtmp_params = None , base_url = None ) :
urls = [ ]
formats = [ ]
formats = [ ]
for source in jwplayer_sources_data :
for source in jwplayer_sources_data :
source_url = self . _proto_relative_url ( source [ ' file ' ] )
source_url = self . _proto_relative_url ( source . get ( ' file ' ) )
if not source_url :
continue
if base_url :
if base_url :
source_url = compat_urlparse . urljoin ( base_url , source_url )
source_url = compat_urlparse . urljoin ( base_url , source_url )
if source_url in urls :
continue
urls . append ( source_url )
source_type = source . get ( ' type ' ) or ' '
source_type = source . get ( ' type ' ) or ' '
ext = mimetype2ext ( source_type ) or determine_ext ( source_url )
ext = mimetype2ext ( source_type ) or determine_ext ( source_url )
if source_type == ' hls ' or ext == ' m3u8 ' :
if source_type == ' hls ' or ext == ' m3u8 ' :