diff options
author | Filippo Valsorda <[email protected]> | 2018-01-07 15:03:28 +0100 |
---|---|---|
committer | GitHub <[email protected]> | 2018-01-07 15:03:28 +0100 |
commit | 97bc05116e15df3cf1b7b7216c60fad7ee0acc9f (patch) | |
tree | 453059413184d22a44241d251a82196c5d2461eb /youtube_dl/extractor/common.py | |
parent | 7608a91ee7b40c74a35c30b44731606382702592 (diff) | |
parent | 0a5b1295b7c1aa6395b65ee137087c540b37b32b (diff) | |
download | youtube-dl-totalwebcasting.tar.gz youtube-dl-totalwebcasting.zip |
Merge branch 'master' into totalwebcastingtotalwebcasting
Diffstat (limited to 'youtube_dl/extractor/common.py')
-rw-r--r-- | youtube_dl/extractor/common.py | 827 |
1 files changed, 626 insertions, 201 deletions
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6fa7c334e..5b6a09c0b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -6,6 +7,7 @@ import hashlib import json import netrc import os +import random import re import socket import sys @@ -25,8 +27,12 @@ from ..compat import ( compat_urllib_parse_urlencode, compat_urllib_request, compat_urlparse, + compat_xml_parse_error, +) +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, ) -from ..downloader.f4m import remove_encrypted_media from ..utils import ( NO_DEFAULT, age_restricted, @@ -35,31 +41,35 @@ from ..utils import ( clean_html, compiled_regex_type, determine_ext, + determine_protocol, error_to_compat_str, ExtractorError, + extract_attributes, fix_xml_ampersands, float_or_none, + GeoRestrictedError, + GeoUtils, int_or_none, + js_to_json, + mimetype2ext, + orderedSet, + parse_codecs, + parse_duration, parse_iso8601, + parse_m3u8_attributes, RegexNotFoundError, - sanitize_filename, sanitized_Request, + sanitize_filename, unescapeHTML, unified_strdate, unified_timestamp, + update_Request, + update_url_query, + urljoin, url_basename, xpath_element, xpath_text, xpath_with_ns, - determine_protocol, - parse_duration, - mimetype2ext, - update_Request, - update_url_query, - parse_m3u8_attributes, - extract_attributes, - parse_codecs, - urljoin, ) @@ -121,9 +131,19 @@ class InfoExtractor(object): download, lower-case. "http", "https", "rtsp", "rtmp", "rtmpe", "m3u8", "m3u8_native" or "http_dash_segments". - * fragments A list of fragments of the fragmented media, - with the following entries: - * "url" (mandatory) - fragment's URL + * fragment_base_url + Base URL for fragments. Each fragment's path + value (if present) will be relative to + this URL. + * fragments A list of fragments of a fragmented media. + Each fragment entry must contain either an url + or a path. If an url is present it should be + considered by a client. Otherwise both path and + fragment_base_url must be present. Here is + the list of all potential fields: + * "url" - fragment's URL + * "path" - fragment's path relative to + fragment_base_url * "duration" (optional, int or float) * "filesize" (optional, int) * preference Order number of this format. If this field is @@ -229,6 +249,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -277,8 +301,9 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title", "description" and "id" attributes - with the same semantics as videos (see above). + Additionally, playlists can have "id", "title", "description", "uploader", + "uploader_id", "uploader_url" attributes with the same semantics as videos + (see above). _type "multi_video" indicates that there are multiple videos that @@ -309,17 +334,34 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. + _GEO_BYPASS attribute may be set to False in order to disable + geo restriction bypass mechanisms for a particular extractor. + Though it won't disable explicit geo restriction bypass based on + country code provided with geo_bypass_country. (experimental) + + _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted + countries for this extractor. One of these countries will be used by + geo restriction bypass mechanism right away in order to bypass + geo restriction, of course, if the mechanism is not disabled. (experimental) + + NB: both these geo attributes are experimental and may change in future + or be completely removed. + Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ _ready = False _downloader = None + _x_forwarded_for_ip = None + _GEO_BYPASS = True + _GEO_COUNTRIES = None _WORKING = True def __init__(self, downloader=None): """Constructor. Receives an optional downloader.""" self._ready = False + self._x_forwarded_for_ip = None self.set_downloader(downloader) @classmethod @@ -339,7 +381,7 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) m = cls._VALID_URL_RE.match(url) assert m - return m.group('id') + return compat_str(m.group('id')) @classmethod def working(cls): @@ -348,15 +390,59 @@ class InfoExtractor(object): def initialize(self): """Initializes an instance (authentication, etc).""" + self._initialize_geo_bypass(self._GEO_COUNTRIES) if not self._ready: self._real_initialize() self._ready = True + def _initialize_geo_bypass(self, countries): + """ + Initialize geo restriction bypass mechanism. + + This method is used to initialize geo bypass mechanism based on faking + X-Forwarded-For HTTP header. A random country from provided country list + is selected and a random IP belonging to this country is generated. This + IP will be passed as X-Forwarded-For HTTP header in all subsequent + HTTP requests. + + This method will be used for initial geo bypass mechanism initialization + during the instance initialization with _GEO_COUNTRIES. + + You may also manually call it from extractor's code if geo countries + information is not available beforehand (e.g. obtained during + extraction) or due to some another reason. + """ + if not self._x_forwarded_for_ip: + country_code = self._downloader.params.get('geo_bypass_country', None) + # If there is no explicit country for geo bypass specified and + # the extractor is known to be geo restricted let's fake IP + # as X-Forwarded-For right away. + if (not country_code and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + countries): + country_code = random.choice(countries) + if country_code: + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._downloader.params.get('verbose', False): + self._downloader.to_screen( + '[debug] Using fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + def extract(self, url): """Extracts URL information and returns it in list of dicts.""" try: - self.initialize() - return self._real_extract(url) + for _ in range(2): + try: + self.initialize() + ie_result = self._real_extract(url) + if self._x_forwarded_for_ip: + ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip + return ie_result + except GeoRestrictedError as e: + if self.__maybe_fake_ip_and_retry(e.countries): + continue + raise except ExtractorError: raise except compat_http_client.IncompleteRead as e: @@ -364,6 +450,21 @@ class InfoExtractor(object): except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e) + def __maybe_fake_ip_and_retry(self, countries): + if (not self._downloader.params.get('geo_bypass_country', None) and + self._GEO_BYPASS and + self._downloader.params.get('geo_bypass', True) and + not self._x_forwarded_for_ip and + countries): + country_code = random.choice(countries) + self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code) + if self._x_forwarded_for_ip: + self.report_warning( + 'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' + % (self._x_forwarded_for_ip, country_code.upper())) + return True + return False + def set_downloader(self, downloader): """Sets the downloader for this IE.""" self._downloader = downloader @@ -394,6 +495,16 @@ class InfoExtractor(object): self.to_screen('%s' % (note,)) else: self.to_screen('%s: %s' % (video_id, note)) + + # Some sites check X-Forwarded-For HTTP header in order to figure out + # the origin of the client behind proxy. This allows bypassing geo + # restriction by faking this header's value to IP that belongs to some + # geo unrestricted country. We will do so once we encounter any + # geo restriction error. + if self._x_forwarded_for_ip: + if 'X-Forwarded-For' not in headers: + headers['X-Forwarded-For'] = self._x_forwarded_for_ip + if isinstance(url_or_request, compat_urllib_request.Request): url_or_request = update_Request( url_or_request, data=data, headers=headers, query=query) @@ -447,6 +558,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked</title>' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'<iframe src="([^"]+)"', content, + 'Websense information URL', default=None) + if blocked_iframe: + msg += ' Visit %s for more details' % blocked_iframe + raise ExtractorError(msg, expected=True) + if '<title>The URL you requested has been blocked</title>' in first_block: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'</h1><p>(.*?)</p>', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) + if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and + 'blocklist.rkn.gov.ru' in content): + raise ExtractorError( + 'Access to this webpage has been blocked by decision of the Russian government. ' + 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', + expected=True) + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() @@ -455,19 +594,11 @@ class InfoExtractor(object): if not encoding: encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - self.to_screen('Dumping request to ' + url) + self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self._downloader.params.get('write_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - basen = '%s_%s' % (video_id, url) + basen = '%s_%s' % (video_id, urlh.geturl()) if len(basen) > 240: h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() basen = basen[:240 - len(h)] + h @@ -488,25 +619,7 @@ class InfoExtractor(object): except LookupError: content = webpage_bytes.decode('utf-8', 'replace') - if ('<title>Access to this site is blocked</title>' in content and - 'Websense' in content[:512]): - msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' - blocked_iframe = self._html_search_regex( - r'<iframe src="([^"]+)"', content, - 'Websense information URL', default=None) - if blocked_iframe: - msg += ' Visit %s for more details' % blocked_iframe - raise ExtractorError(msg, expected=True) - if '<title>The URL you requested has been blocked</title>' in content[:512]: - msg = ( - 'Access to this webpage has been blocked by Indian censorship. ' - 'Use a VPN or proxy server (with --proxy) to route around it.') - block_msg = self._html_search_regex( - r'</h1><p>(.*?)</p>', - content, 'block message', default=None) - if block_msg: - msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') - raise ExtractorError(msg, expected=True) + self.__check_blocked(content) return content @@ -531,15 +644,29 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): + transform_source=None, fatal=True, encoding=None, + data=None, headers={}, query={}): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding, data=data, headers=headers, query=query) if xml_string is False: return xml_string + return self._parse_xml( + xml_string, video_id, transform_source=transform_source, + fatal=fatal) + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): if transform_source: xml_string = transform_source(xml_string) - return compat_etree_fromstring(xml_string.encode('utf-8')) + try: + return compat_etree_fromstring(xml_string.encode('utf-8')) + except compat_xml_parse_error as ve: + errmsg = '%s: Failed to parse XML ' % video_id + if fatal: + raise ExtractorError(errmsg, cause=ve) + else: + self.report_warning(errmsg + str(ve)) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -598,10 +725,8 @@ class InfoExtractor(object): expected=True) @staticmethod - def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): - raise ExtractorError( - '%s. You might want to use --proxy to workaround.' % msg, - expected=True) + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None): + raise GeoRestrictedError(msg, countries=countries) # Methods for following #608 @staticmethod @@ -617,6 +742,13 @@ class InfoExtractor(object): video_info['title'] = video_title return video_info + def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): + urls = orderedSet( + self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) + for m in matches) + return self.playlist_result( + urls, playlist_id=playlist_id, playlist_title=playlist_title) + @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" @@ -820,7 +952,8 @@ class InfoExtractor(object): def _family_friendly_search(self, html): # See http://schema.org/VideoObject - family_friendly = self._html_search_meta('isFamilyFriendly', html) + family_friendly = self._html_search_meta( + 'isFamilyFriendly', html, default=None) if not family_friendly: return None @@ -860,22 +993,39 @@ class InfoExtractor(object): return info if isinstance(json_ld, dict): json_ld = [json_ld] + + def extract_video_object(e): + assert e['@type'] == 'VideoObject' + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + 'view_count': int_or_none(e.get('interactionCount')), + }) + for e in json_ld: if e.get('@context') == 'http://schema.org': item_type = e.get('@type') if expected_type is not None and expected_type != item_type: return info - if item_type == 'TVEpisode': + if item_type in ('TVEpisode', 'Episode'): info.update({ 'episode': unescapeHTML(e.get('name')), 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) elif item_type == 'Article': info.update({ @@ -884,18 +1034,11 @@ class InfoExtractor(object): 'description': unescapeHTML(e.get('articleBody')), }) elif item_type == 'VideoObject': - info.update({ - 'url': e.get('contentUrl'), - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('uploadDate')), - 'filesize': float_or_none(e.get('contentSize')), - 'tbr': int_or_none(e.get('bitrate')), - 'width': int_or_none(e.get('width')), - 'height': int_or_none(e.get('height')), - }) + extract_video_object(e) + continue + video = e.get('video') + if isinstance(video, dict) and video.get('@type') == 'VideoObject': + extract_video_object(video) break return dict((k, v) for k, v in info.items() if v is not None) @@ -1015,13 +1158,13 @@ class InfoExtractor(object): unique_formats.append(f) formats[:] = unique_formats - def _is_valid_url(self, url, video_id, item='video'): + def _is_valid_url(self, url, video_id, item='video', headers={}): url = self._proto_relative_url(url, scheme='http:') # For now assume non HTTP(S) URLs always valid if not (url.startswith('http://') or url.startswith('https://')): return True try: - self._request_webpage(url, video_id, 'Checking %s URL' % item) + self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True except ExtractorError as e: if isinstance(e.cause, compat_urllib_error.URLError): @@ -1093,11 +1236,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1129,7 +1269,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested @@ -1164,6 +1304,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, @@ -1187,37 +1328,53 @@ class InfoExtractor(object): entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False): - res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) + if res is False: return [] + m3u8_doc, urlh = res m3u8_url = urlh.geturl() - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + return self._parse_m3u8_formats( + m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, + preference=preference, m3u8_id=m3u8_id, live=live) + + def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, + entry_protocol='m3u8', preference=None, + m3u8_id=None, live=False): + if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access + return [] + + if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc): # Apple FairPlay + return [] + + formats = [] format_url = lambda u: ( u if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - # We should try extracting formats only from master playlists [1], i.e. - # playlists that describe available qualities. On the other hand media - # playlists [2] should be returned as is since they contain just the media - # without qualities renditions. + # References: + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 + # 2. https://github.com/rg3/youtube-dl/issues/12211 + + # We should try extracting formats only from master playlists [1, 4.3.4], + # i.e. playlists that describe available qualities. On the other hand + # media playlists [1, 4.3.3] should be returned as is since they contain + # just the media without qualities renditions. # Fortunately, master playlist can be easily distinguished from media - # playlist based on particular tags availability. As of [1, 2] master - # playlist tags MUST NOT appear in a media playist and vice versa. - # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist - # and MUST NOT appear in master playlist thus we can clearly detect media - # playlist with this criterion. - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 - # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] + # master playlist tags MUST NOT appear in a media playist and vice versa. + # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every + # media playlist and MUST NOT appear in master playlist thus we can + # clearly detect media playlist with this criterion. + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, @@ -1226,52 +1383,72 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] - audio_in_video_stream = {} - last_info = {} - last_media = {} + + groups = {} + last_stream_inf = {} + + def extract_media(x_media_line): + media = parse_m3u8_attributes(x_media_line) + # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED + media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') + if not (media_type and group_id and name): + return + groups.setdefault(group_id, []).append(media) + if media_type not in ('VIDEO', 'AUDIO'): + return + media_url = media.get('URI') + if media_url: + format_id = [] + for v in (m3u8_id, group_id, name): + if v: + format_id.append(v) + f = { + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'manifest_url': m3u8_url, + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + formats.append(f) + + def build_stream_name(): + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] + # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) + # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 + stream_name = last_stream_inf.get('NAME') + if stream_name: + return stream_name + # If there is no NAME in EXT-X-STREAM-INF it will be obtained + # from corresponding rendition group + stream_group_id = last_stream_inf.get('VIDEO') + if not stream_group_id: + return + stream_group = groups.get(stream_group_id) + if not stream_group: + return stream_group_id + rendition = stream_group[0] + return rendition.get('NAME') or stream_group_id + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = parse_m3u8_attributes(line) + last_stream_inf = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - media = parse_m3u8_attributes(line) - media_type = media.get('TYPE') - if media_type in ('VIDEO', 'AUDIO'): - group_id = media.get('GROUP-ID') - media_url = media.get('URI') - if media_url: - format_id = [] - for v in (group_id, media.get('NAME')): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' - if group_id and not audio_in_video_stream.get(group_id): - audio_in_video_stream[group_id] = False - formats.append(f) - else: - # When there is no URI in EXT-X-MEDIA let this tag's - # data be used by regular URI lines below - last_media = media - if media_type == 'AUDIO' and group_id: - audio_in_video_stream[group_id] = True + extract_media(line) elif line.startswith('#') or not line.strip(): continue else: - tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) + tbr = float_or_none( + last_stream_inf.get('AVERAGE-BANDWIDTH') or + last_stream_inf.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media.get('NAME') + stream_name = build_stream_name() # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. @@ -1281,14 +1458,14 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': manifest_url, - 'manifest_url': manifest_url, + 'manifest_url': m3u8_url, 'tbr': tbr, 'ext': ext, - 'fps': float_or_none(last_info.get('FRAME-RATE')), + 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } - resolution = last_info.get('RESOLUTION') + resolution = last_stream_inf.get('RESOLUTION') if resolution: mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) if mobj: @@ -1304,13 +1481,26 @@ class InfoExtractor(object): 'vbr': vbr, 'abr': abr, }) - f.update(parse_codecs(last_info.get('CODECS'))) - if audio_in_video_stream.get(last_info.get('AUDIO')) is False: - # TODO: update acodec for for audio only formats with the same GROUP-ID - f['acodec'] = 'none' + codecs = parse_codecs(last_stream_inf.get('CODECS')) + f.update(codecs) + audio_group_id = last_stream_inf.get('AUDIO') + # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which + # references a rendition group MUST have a CODECS attribute. + # However, this is not always respected, for example, [2] + # contains EXT-X-STREAM-INF tag which references AUDIO + # rendition group but does not have CODECS and despite + # referencing audio group an audio group, it represents + # a complete (with audio and video) format. So, for such cases + # we will ignore references to rendition groups and treat them + # as complete formats. + if audio_group_id and codecs and f.get('vcodec') != 'none': + audio_group = groups.get(audio_group_id) + if audio_group and audio_group[0].get('URI'): + # TODO: update acodec for audio only formats with + # the same GROUP-ID + f['acodec'] = 'none' formats.append(f) - last_info = {} - last_media = {} + last_stream_inf = {} return formats @staticmethod @@ -1609,7 +1799,7 @@ class InfoExtractor(object): ms_info['timescale'] = int(timescale) segment_duration = source.get('duration') if segment_duration: - ms_info['segment_duration'] = int(segment_duration) + ms_info['segment_duration'] = float(segment_duration) def extract_Initialization(source): initialization = source.find(_add_ns('Initialization')) @@ -1627,12 +1817,12 @@ class InfoExtractor(object): segment_template = element.find(_add_ns('SegmentTemplate')) if segment_template is not None: extract_common(segment_template) - media_template = segment_template.get('media') - if media_template: - ms_info['media_template'] = media_template + media = segment_template.get('media') + if media: + ms_info['media'] = media initialization = segment_template.get('initialization') if initialization: - ms_info['initialization_url'] = initialization + ms_info['initialization'] = initialization else: extract_Initialization(segment_template) return ms_info @@ -1660,7 +1850,7 @@ class InfoExtractor(object): if content_type == 'text': # TODO implement WebVTT downloading pass - elif content_type == 'video' or content_type == 'audio': + elif content_type in ('video', 'audio'): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) @@ -1676,6 +1866,7 @@ class InfoExtractor(object): lang = representation_attrib.get('lang') url_el = representation.find(_add_ns('BaseURL')) filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) + bandwidth = int_or_none(representation_attrib.get('bandwidth')) f = { 'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 'url': base_url, @@ -1683,34 +1874,58 @@ class InfoExtractor(object): 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), - 'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), + 'tbr': float_or_none(bandwidth, 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, 'filesize': filesize, + 'container': mimetype2ext(mime_type) + '_dash', } f.update(parse_codecs(representation_attrib.get('codecs'))) representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) - if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: - media_template = representation_ms_info['media_template'] - media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) - media_template.replace('$$', '$') + def prepare_template(template_name, identifiers): + t = representation_ms_info[template_name] + t = t.replace('$RepresentationID$', representation_id) + t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) + t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) + t.replace('$$', '$') + return t + + # @initialization is a regular template like @media one + # so it should be handled just the same way (see + # https://github.com/rg3/youtube-dl/issues/11605) + if 'initialization' in representation_ms_info: + initialization_template = prepare_template( + 'initialization', + # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and + # $Time$ shall not be included for @initialization thus + # only $Bandwidth$ remains + ('Bandwidth', )) + representation_ms_info['initialization_url'] = initialization_template % { + 'Bandwidth': bandwidth, + } + + def location_key(location): + return 'url' if re.match(r'^https?://', location) else 'path' + + if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info: + + media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time')) + media_location_key = location_key(media_template) # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ # can't be used at the same time if '%(Number' in media_template and 's' not in representation_ms_info: segment_duration = None - if 'total_number' not in representation_ms_info and 'segment_duration': + if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info: segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale']) representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) representation_ms_info['fragments'] = [{ - 'url': media_template % { + media_location_key: media_template % { 'Number': segment_number, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, }, 'duration': segment_duration, } for segment_number in range( @@ -1728,11 +1943,11 @@ class InfoExtractor(object): def add_segment_url(): segment_url = media_template % { 'Time': segment_time, - 'Bandwidth': int_or_none(representation_attrib.get('bandwidth')), + 'Bandwidth': bandwidth, 'Number': segment_number, } representation_ms_info['fragments'].append({ - 'url': segment_url, + media_location_key: segment_url, 'duration': float_or_none(segment_d, representation_ms_info['timescale']), }) @@ -1751,40 +1966,56 @@ class InfoExtractor(object): # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] - s_num = 0 - for segment_url in representation_ms_info['segment_urls']: - s = representation_ms_info['s'][s_num] + segment_index = 0 + timescale = representation_ms_info['timescale'] + for s in representation_ms_info['s']: + duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): + segment_uri = representation_ms_info['segment_urls'][segment_index] fragments.append({ - 'url': segment_url, - 'duration': float_or_none(s['d'], representation_ms_info['timescale']), + location_key(segment_uri): segment_uri, + 'duration': duration, }) + segment_index += 1 + representation_ms_info['fragments'] = fragments + elif 'segment_urls' in representation_ms_info: + # Segment URLs with no SegmentTimeline + # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # https://github.com/rg3/youtube-dl/pull/14844 + fragments = [] + segment_duration = float_or_none( + representation_ms_info['segment_duration'], + representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None + for segment_url in representation_ms_info['segment_urls']: + fragment = { + location_key(segment_url): segment_url, + } + if segment_duration: + fragment['duration'] = segment_duration + fragments.append(fragment) representation_ms_info['fragments'] = fragments # NB: MPD manifest may contain direct URLs to unfragmented media. # No fragments key is present in this case. if 'fragments' in representation_ms_info: f.update({ + 'fragment_base_url': base_url, 'fragments': [], 'protocol': 'http_dash_segments', }) if 'initialization_url' in representation_ms_info: - initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) + initialization_url = representation_ms_info['initialization_url'] if not f.get('url'): f['url'] = initialization_url - f['fragments'].append({'url': initialization_url}) + f['fragments'].append({location_key(initialization_url): initialization_url}) f['fragments'].extend(representation_ms_info['fragments']) - for fragment in f['fragments']: - fragment['url'] = urljoin(base_url, fragment['url']) - try: - existing_format = next( - fo for fo in formats - if fo['format_id'] == representation_id) - except StopIteration: - full_info = formats_dict.get(representation_id, {}).copy() - full_info.update(f) - formats.append(full_info) - else: - existing_format.update(f) + # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation + # is not necessarily unique within a Period thus formats with + # the same `format_id` are quite possible. There are numerous examples + # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111, + # https://github.com/rg3/youtube-dl/issues/13919) + full_info = formats_dict.get(representation_id, {}).copy() + full_info.update(f) + formats.append(full_info) else: self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats @@ -1803,6 +2034,12 @@ class InfoExtractor(object): compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. [MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] @@ -1818,14 +2055,17 @@ class InfoExtractor(object): stream_timescale = int_or_none(stream.get('TimeScale')) or timescale stream_name = stream.get('Name') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') + fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None) # TODO: add support for WVC1 and WMAP if fourcc not in ('H264', 'AVC1', 'AACL'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 - width = int_or_none(track.get('MaxWidth')) - height = int_or_none(track.get('MaxHeight')) + # [1] does not mention Width and Height attributes. However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -1888,7 +2128,7 @@ class InfoExtractor(object): }) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -1903,18 +2143,19 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type): + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) - ext = determine_ext(full_url) + ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, + preference=preference, fatal=False) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( - full_url, video_id, mpd_id=mpd_id) + full_url, video_id, mpd_id=mpd_id, fatal=False) else: is_plain_url = True formats = [{ @@ -1924,10 +2165,18 @@ class InfoExtractor(object): return is_plain_url, formats entries = [] + # amp-video and amp-audio are very similar to their HTML5 counterparts + # so we wll include them right here (see + # https://www.ampproject.org/docs/reference/components/amp-video) media_tags = [(media_tag, media_type, '') for media_tag, media_type - in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)] - media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage)) + in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)] + media_tags.extend(re.findall( + # We only allow video|audio followed by a whitespace or '>'. + # Allowing more characters may end up in significant slow down (see + # https://github.com/rg3/youtube-dl/issues/11979, example URL: + # http://www.porntrex.com/maps/videositemap.xml). + r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage)) for media_tag, media_type, media_content in media_tags: media_info = { 'formats': [], @@ -1945,9 +2194,15 @@ class InfoExtractor(object): src = source_attributes.get('src') if not src: continue - is_plain_url, formats = _media_formats(src, media_type) + f = parse_content_type(source_attributes.get('type')) + is_plain_url, formats = _media_formats(src, media_type, f) if is_plain_url: - f = parse_content_type(source_attributes.get('type')) + # res attribute is not standard but seen several times + # in the wild + f.update({ + 'height': int_or_none(source_attributes.get('res')), + 'format_id': source_attributes.get('label'), + }) f.update(formats[0]) media_info['formats'].append(f) else: @@ -1967,10 +2222,13 @@ class InfoExtractor(object): entries.append(media_info) return entries - def _extract_akamai_formats(self, manifest_url, video_id): + def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + hds_host = hosts.get('hds') + if hds_host: + f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) if 'hdcore=' not in f4m_url: f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign f4m_formats = self._extract_f4m_formats( @@ -1978,33 +2236,45 @@ class InfoExtractor(object): for entry in f4m_formats: entry.update({'extra_param_to_segment_url': hdcore_sign}) formats.extend(f4m_formats) - m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8') + hls_host = hosts.get('hls') + if hls_host: + m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url) formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') - http_base_url = 'http' + url_base + url_base = self._search_regex( + r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') + http_base_url = '%s:%s' % ('http', url_base) formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url + if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', + manifest_url('playlist.m3u8'), video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) if 'f4m' not in skip_protocols: formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', + manifest_url('manifest.f4m'), video_id, f4m_id='hds', fatal=False)) if 'dash' not in skip_protocols: formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', + manifest_url('manifest.mpd'), video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( - http_base_url + '/jwplayer.smil', + manifest_url('jwplayer.smil'), video_id, fatal=False) for rtmp_format in rtmp_formats: rtsp_format = rtmp_format.copy() @@ -2021,12 +2291,165 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': protocol + url_base, + 'url': '%s:%s' % (protocol, url_base), 'format_id': protocol, 'protocol': protocol, }) return formats + def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): + mobj = re.search( + r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', + webpage) + if mobj: + try: + jwplayer_data = self._parse_json(mobj.group('options'), + video_id=video_id, + transform_source=transform_source) + except ExtractorError: + pass + else: + if isinstance(jwplayer_data, dict): + return jwplayer_data + + def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): + jwplayer_data = self._find_jwplayer_data( + webpage, video_id, transform_source=js_to_json) + return self._parse_jwplayer_data( + jwplayer_data, video_id, *args, **kwargs) + + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + # JWPlayer backward compatibility: flattened playlists + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 + if 'playlist' not in jwplayer_data: + jwplayer_data = {'playlist': [jwplayer_data]} + + entries = [] + + # JWPlayer backward compatibility: single playlist item + # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10 + if not isinstance(jwplayer_data['playlist'], list): + jwplayer_data['playlist'] = [jwplayer_data['playlist']] + + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = self._parse_jwplayer_formats( + video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, + mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + track_url = urljoin(base_url, track.get('file')) + if not track_url: + continue + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track_url) + }) + + entry = { + 'id': this_video_id, + 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), + 'subtitles': subtitles, + } + # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32 + if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']): + entry.update({ + '_type': 'url_transparent', + 'url': formats[0]['url'], + }) + else: + self._sort_formats(formats) + entry['formats'] = formats + entries.append(entry) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) + + def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + urls = [] + formats = [] + for source in jwplayer_sources_data: + if not isinstance(source, dict): + continue + source_url = self._proto_relative_url(source.get('file')) + if not source_url: + continue + if base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + if source_url in urls: + continue + urls.append(source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, video_id, mpd_id=mpd_id, fatal=False)) + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + source_url, video_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ( + 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like "1080p", "720p SD", or 1080. + height = int_or_none(self._search_regex( + r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'tbr': int_or_none(source.get('bitrate')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -2055,10 +2478,12 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res - def _set_cookie(self, domain, name, value, expire_time=None): + def _set_cookie(self, domain, name, value, expire_time=None, port=None, + path='/', secure=False, discard=False, rest={}, **kwargs): cookie = compat_cookiejar.Cookie( - 0, name, value, None, None, domain, None, - None, '/', True, False, expire_time, '', None, None, None) + 0, name, value, port, port is not None, domain, True, + domain.startswith('.'), path, True, secure, expire_time, + discard, None, None, rest) self._downloader.cookiejar.set_cookie(cookie) def _get_cookies(self, url): |