summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Yen Chi Hsuan <[email protected]> 2015-05-19 00:45:01 +0800
committer: Yen Chi Hsuan <[email protected]> 2015-05-19 00:45:01 +0800
commit: 1b0427e6c433c0b6db5e210db6e3173e19e702ed (patch)
tree: f40cf7435bbae35bdb1419742d5149c03cf94b9d
parent: 2aa64b89b3ac8f387d4c0c27ce7de64bc0ff68de (diff)
download: youtube-dl-1b0427e6c433c0b6db5e210db6e3173e19e702ed.tar.gz
youtube-dl-1b0427e6c433c0b6db5e210db6e3173e19e702ed.zip
[utils] Support TTML without default namespace
Strictly speaking, such TTML is invalid, but Yahoo uses it.
-rw-r--r-- test/test_utils.py 15
-rw-r--r-- youtube_dl/utils.py 9
2 files changed, 21 insertions, 3 deletions
diff --git a/test/test_utils.py b/test/test_utils.py
index b40107037..e13e11b59 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -621,6 +621,21 @@ Line
'''
self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+ dfxp_data_no_default_namespace = '''<?xml version="1.0" encoding="UTF-8"?>
+ <tt xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+ <body>
+ <div xml:lang="en">
+ <p begin="0" end="1">The first line</p>
+ </div>
+ </body>
+ </tt>'''
+ srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The first line
+
+'''
+ self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
+
if __name__ == '__main__':
unittest.main()
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index ed9ed9ed6..507f07383 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -1848,9 +1848,9 @@ def dfxp2srt(dfxp_data):
out = str_or_empty(node.text)
for child in node:
- if child.tag == _x('ttml:br'):
+ if child.tag in (_x('ttml:br'), 'br'):
out += '\n' + str_or_empty(child.tail)
- elif child.tag == _x('ttml:span'):
+ elif child.tag in (_x('ttml:span'), 'span'):
out += str_or_empty(parse_node(child))
else:
out += str_or_empty(xml.etree.ElementTree.tostring(child))
@@ -1859,7 +1859,10 @@ def dfxp2srt(dfxp_data):
dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p'))
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
+
+ if not paras:
+ raise ValueError('Invalid dfxp/TTML subtitle')
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib['begin'])