Browse Source

[utils] Support TTML without default namespace

In a strict sense such TTML is invalid, but Yahoo uses it.
totalwebcasting
Yen Chi Hsuan 10 years ago
parent
commit
1b0427e6c4
2 changed files with 21 additions and 3 deletions
  1. +15
    -0
      test/test_utils.py
  2. +6
    -3
      youtube_dl/utils.py

+ 15
- 0
test/test_utils.py View File

@@ -621,6 +621,21 @@ Line
'''
self.assertEqual(dfxp2srt(dfxp_data), srt_data)
dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
<tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
<body>
<div xml:lang="en">
<p begin="0" end="1">The first line</p>
</div>
</body>
</tt>'''
srt_data = '''1
00:00:00,000 --> 00:00:01,000
The first line
'''
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
if __name__ == '__main__':
unittest.main()

+ 6
- 3
youtube_dl/utils.py View File

@@ -1848,9 +1848,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text)
for child in node:
if child.tag == _x('ttml:br'):
if child.tag in (_x('ttml:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
elif child.tag == _x('ttml:span'):
elif child.tag in (_x('ttml:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1859,7 +1859,10 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = []
paras = dfxp.findall(_x('.//ttml:p'))
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib['begin'])


Loading…
Cancel
Save