Browse Source

[utils] Use bytes-like objects in dfxp2srt

This fixes handling of non-UTF8 TTML subtitles

Closes #14191
master-ytdl-org
Yen Chi Hsuan 7 years ago
parent
commit
3869028ffb
4 changed files with 41 additions and 11 deletions
  1. +6
    -0
      ChangeLog
  2. +23
    -3
      test/test_utils.py
  3. +1
    -1
      youtube_dl/postprocessor/ffmpeg.py
  4. +11
    -7
      youtube_dl/utils.py

+ 6
- 0
ChangeLog View File

@ -1,3 +1,9 @@
version <unreleased>
Core
* [utils] Fix handling raw TTML subtitles (#14191)
version 2017.09.15 version 2017.09.15
Core Core


+ 23
- 3
test/test_utils.py View File

@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
<p begin="3" dur="-1">Ignored, three</p> <p begin="3" dur="-1">Ignored, three</p>
</div> </div>
</body> </body>
</tt>'''
</tt>'''.encode('utf-8')
srt_data = '''1 srt_data = '''1
00:00:00,000 --> 00:00:01,000 00:00:00,000 --> 00:00:01,000
The following line contains Chinese characters and special symbols The following line contains Chinese characters and special symbols
@ -1089,7 +1089,7 @@ Line
<p begin="0" end="1">The first line</p> <p begin="0" end="1">The first line</p>
</div> </div>
</body> </body>
</tt>'''
</tt>'''.encode('utf-8')
srt_data = '''1 srt_data = '''1
00:00:00,000 --> 00:00:01,000 00:00:00,000 --> 00:00:01,000
The first line The first line
@ -1115,7 +1115,7 @@ The first line
<p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p> <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
</div> </div>
</body> </body>
</tt>'''
</tt>'''.encode('utf-8')
srt_data = '''1 srt_data = '''1
00:00:02,080 --> 00:00:05,839 00:00:02,080 --> 00:00:05,839
<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font> <font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
@ -1138,6 +1138,26 @@ part 3</font></u>
''' '''
self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data) self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?>
<tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
<body>
<div xml:lang="en">
<p begin="0" end="1">Line 1</p>
<p begin="1" end="2"></p>
</div>
</body>
</tt>'''.encode('utf-16')
srt_data = '''1
00:00:00,000 --> 00:00:01,000
Line 1
2
00:00:01,000 --> 00:00:02,000
'''
self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data)
def test_cli_option(self): def test_cli_option(self):
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128']) self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), []) self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])


+ 1
- 1
youtube_dl/postprocessor/ffmpeg.py View File

@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
dfxp_file = old_file dfxp_file = old_file
srt_file = subtitles_filename(filename, lang, 'srt') srt_file = subtitles_filename(filename, lang, 'srt')
with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
with open(dfxp_file, 'rb') as f:
srt_data = dfxp2srt(f.read()) srt_data = dfxp2srt(f.read())
with io.open(srt_file, 'wt', encoding='utf-8') as f: with io.open(srt_file, 'wt', encoding='utf-8') as f:


+ 11
- 7
youtube_dl/utils.py View File

@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data): def dfxp2srt(dfxp_data):
'''
@param dfxp_data A bytes-like object containing DFXP data
@returns A unicode object containing converted SRT data
'''
LEGACY_NAMESPACES = ( LEGACY_NAMESPACES = (
('http://www.w3.org/ns/ttml', [
'http://www.w3.org/2004/11/ttaf1',
'http://www.w3.org/2006/04/ttaf1',
'http://www.w3.org/2006/10/ttaf1',
(b'http://www.w3.org/ns/ttml', [
b'http://www.w3.org/2004/11/ttaf1',
b'http://www.w3.org/2006/04/ttaf1',
b'http://www.w3.org/2006/10/ttaf1',
]), ]),
('http://www.w3.org/ns/ttml#styling', [
'http://www.w3.org/ns/ttml#style',
(b'http://www.w3.org/ns/ttml#styling', [
b'http://www.w3.org/ns/ttml#style',
]), ]),
) )
@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
for ns in v: for ns in v:
dfxp_data = dfxp_data.replace(ns, k) dfxp_data = dfxp_data.replace(ns, k)
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
dfxp = compat_etree_fromstring(dfxp_data)
out = [] out = []
paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p') paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')


Loading…
Cancel
Save