summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: remitamine <[email protected]> 2016-02-02 18:36:26 +0100
committer: remitamine <[email protected]> 2016-02-02 18:36:26 +0100
commit: 4e0cff2a50f4c297fc25dae01c460596d8f5badb (patch)
tree: 7a3ed19f8182e6454c8d10ea63a88bd70eb6fd89
parent: 0436157b95ddd6d7c415549ba48b7c0305b2c0df (diff)
parent: 2b14cb566fde3e5482ce9a63b2be7103cec939e0 (diff)
download: youtube-dl-4e0cff2a50f4c297fc25dae01c460596d8f5badb.tar.gz
download: youtube-dl-4e0cff2a50f4c297fc25dae01c460596d8f5badb.zip
Merge pull request #8348 from remitamine/dfxp2srt-text
[utils] fix dfxp2srt text extraction(fixes #8055)
-rw-r--r-- youtube_dl/utils.py 29
1 files changed, 18 insertions, 11 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index c63b61598..18dbe28bb 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
})
- def parse_node(node):
- str_or_empty = functools.partial(str_or_none, default='')
+ class TTMLPElementParser:
+ out = ''
- out = str_or_empty(node.text)
+ def start(self, tag, attrib):
+ if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+ self.out += '\n'
- for child in node:
- if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- out += '\n' + str_or_empty(child.tail)
- elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
- out += str_or_empty(parse_node(child))
- else:
- out += str_or_empty(xml.etree.ElementTree.tostring(child))
+ def end(self, tag):
+ pass
- return out
+ def data(self, data):
+ self.out += data
+
+ def close(self):
+ return self.out.strip()
+
+ def parse_node(node):
+ target = TTMLPElementParser()
+ parser = xml.etree.ElementTree.XMLParser(target=target)
+ parser.feed(xml.etree.ElementTree.tostring(node))
+ return parser.close()
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []