diff options
author | Sergey M․ <[email protected]> | 2015-09-16 22:27:03 +0600 |
---|---|---|
committer | Sergey M․ <[email protected]> | 2015-09-16 22:27:03 +0600 |
commit | f84ce1ebaf930f5fc422e4fab77fa9c7c86a999a (patch) | |
tree | 8567106d0198f510b4d1ee0a59175af4dd86da64 | |
parent | 12bc2429444b9f89d1b7f4f631c25cc3af2a1d7c (diff) | |
download | youtube-dl-f84ce1ebaf930f5fc422e4fab77fa9c7c86a999a.tar.gz youtube-dl-f84ce1ebaf930f5fc422e4fab77fa9c7c86a999a.zip |
[mitele] Fix extraction (Closes #6414)
-rw-r--r-- | youtube_dl/extractor/mitele.py | 105 |
1 files changed, 58 insertions, 47 deletions
diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 852d72266..9e8a8fc72 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -1,17 +1,11 @@ from __future__ import unicode_literals -import json - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urllib_parse_unquote, - compat_urlparse, -) +from ..compat import compat_urllib_parse from ..utils import ( + encode_dict, get_element_by_attribute, - parse_duration, - strip_jsonp, + int_or_none, ) @@ -21,54 +15,71 @@ class MiTeleIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', + 'md5': 'ace7635b2a0b286aaa37d3ff192d2a8a', 'info_dict': { - 'id': '0fce117d', - 'ext': 'mp4', - 'title': 'Programa 144 - Tor, la web invisible', - 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', + 'ext': 'flv', + 'title': 'Tor, la web invisible', + 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', + 'thumbnail': 're:(?i)^https?://.*\.jpg$', 'duration': 2913, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) - domain = embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') + config_url = self._search_regex( + r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse.urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' + token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4') + config = self._download_json( + config_url, display_id, 'Downloading config JSON') + + mmc = self._download_json( + config['services']['mmc'], display_id, 'Downloading mmc JSON') + + formats = [] + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + bas = location.get('bas') + loc = location.get('loc') + ogn = location.get('ogn') + if None in (gat, bas, loc, ogn): + continue + token_data = { + 'bas': bas, + 'icd': loc, + 'ogn': ogn, + 'sta': '0', + } + media = self._download_json( + '%s/?%s' % (gat, compat_urllib_parse.urlencode(encode_dict(token_data)).encode('utf-8')), + display_id, 'Downloading %s JSON' % location['loc']) + file_ = media.get('file') + if not file_: + continue + formats.extend(self._extract_f4m_formats( + file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + display_id, f4m_id=loc)) + + title = self._search_regex( + r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', webpage, 'title') + + video_id = self._search_regex( + r'data-media-id\s*=\s*"([^"]+)"', webpage, + 'data media id', default=None) or display_id + thumbnail = config.get('poster', {}).get('imageUrl') + duration = int_or_none(mmc.get('duration')) return { - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, - 'formats': formats, + 'id': video_id, + 'display_id': display_id, + 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, } |