aboutsummaryrefslogtreecommitdiffhomepage
path: root/youtube_dl/utils.py
diff options
context:
space:
mode:
authordirkf <[email protected]>2023-06-07 14:51:50 +0100
committerdirkf <[email protected]>2023-07-18 10:50:46 +0100
commit46fde7caeeab13a6277aab22a0e8a29e10c30cc3 (patch)
treef4744405e643df24ef573852d55e1635092ffa0c /youtube_dl/utils.py
parent648dc5304cb2476592ff142988b8c62675011fcc (diff)
downloadyoutube-dl-46fde7caeeab13a6277aab22a0e8a29e10c30cc3.tar.gz
youtube-dl-46fde7caeeab13a6277aab22a0e8a29e10c30cc3.zip
[core] Update redirect handling from yt-dlp
* Thx coletdjnz: https://github.com/yt-dlp/yt-dlp/pull/7094 * add test that redirected `POST` loses its `Content-Type`
Diffstat (limited to 'youtube_dl/utils.py')
-rw-r--r--youtube_dl/utils.py74
1 files changed, 44 insertions, 30 deletions
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index dbdbe5f59..58c710b08 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -41,7 +41,6 @@ import zlib
from .compat import (
compat_HTMLParseError,
compat_HTMLParser,
- compat_HTTPError,
compat_basestring,
compat_casefold,
compat_chr,
@@ -64,6 +63,7 @@ from .compat import (
compat_struct_pack,
compat_struct_unpack,
compat_urllib_error,
+ compat_urllib_HTTPError,
compat_urllib_parse,
compat_urllib_parse_parse_qs as compat_parse_qs,
compat_urllib_parse_urlencode,
@@ -2614,7 +2614,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
Part of this code was copied from:
- http://techknack.net/python-urllib2-handlers/
+ http://techknack.net/python-urllib2-handlers/, archived at
+ https://web.archive.org/web/20130527205558/http://techknack.net/python-urllib2-handlers/
Andrew Rowls, the author of that code, agreed to release it to the
public domain.
@@ -2672,7 +2673,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
req._Request__original = req._Request__original.partition('#')[0]
req._Request__r_type = req._Request__r_type.partition('#')[0]
- return req
+ # Use the totally undocumented AbstractHTTPHandler per
+ # https://github.com/yt-dlp/yt-dlp/pull/4158
+ return compat_urllib_request.AbstractHTTPHandler.do_request_(self, req)
def http_response(self, req, resp):
old_resp = resp
@@ -2683,7 +2686,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
try:
uncompressed = io.BytesIO(gz.read())
except IOError as original_ioerror:
- # There may be junk add the end of the file
+ # There may be junk at the end of the file
# See http://stackoverflow.com/q/4928560/35070 for details
for i in range(1, 1024):
try:
@@ -2710,9 +2713,8 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
if location:
# As of RFC 2616 default charset is iso-8859-1 that is respected by python 3
if sys.version_info >= (3, 0):
- location = location.encode('iso-8859-1').decode('utf-8')
- else:
- location = location.decode('utf-8')
+ location = location.encode('iso-8859-1')
+ location = location.decode('utf-8')
location_escaped = escape_url(location)
if location != location_escaped:
del resp.headers['Location']
@@ -2940,17 +2942,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
The code is based on HTTPRedirectHandler implementation from CPython [1].
- This redirect handler solves two issues:
- - ensures redirect URL is always unicode under python 2
- - introduces support for experimental HTTP response status code
- 308 Permanent Redirect [2] used by some sites [3]
+ This redirect handler fixes and improves the logic to better align with RFC7261
+ and what browsers tend to do [2][3]
1. https://github.com/python/cpython/blob/master/Lib/urllib/request.py
- 2. https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/308
- 3. https://github.com/ytdl-org/youtube-dl/issues/28768
+ 2. https://datatracker.ietf.org/doc/html/rfc7231
+ 3. https://github.com/python/cpython/issues/91306
"""
- http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
+ # Supply possibly missing alias
+ http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302
def redirect_request(self, req, fp, code, msg, headers, newurl):
"""Return a Request or None in response to a redirect.
@@ -2962,19 +2963,16 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
else should try to handle this url. Return None if you can't
but another Handler might.
"""
- m = req.get_method()
- if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD")
- or code in (301, 302, 303) and m == "POST")):
- raise compat_HTTPError(req.full_url, code, msg, headers, fp)
- # Strictly (according to RFC 2616), 301 or 302 in response to
- # a POST MUST NOT cause a redirection without confirmation
- # from the user (of urllib.request, in this case). In practice,
- # essentially all clients do redirect in this case, so we do
- # the same.
+ if code not in (301, 302, 303, 307, 308):
+ raise compat_urllib_HTTPError(req.full_url, code, msg, headers, fp)
+
+ new_method = req.get_method()
+ new_data = req.data
+ remove_headers = []
# On python 2 urlh.geturl() may sometimes return redirect URL
- # as byte string instead of unicode. This workaround allows
- # to force it always return unicode.
+ # as a byte string instead of unicode. This workaround forces
+ # it to return unicode.
if sys.version_info[0] < 3:
newurl = compat_str(newurl)
@@ -2983,13 +2981,29 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
# but it is kept for compatibility with other callers.
newurl = newurl.replace(' ', '%20')
- CONTENT_HEADERS = ("content-length", "content-type")
+ # A 303 must either use GET or HEAD for subsequent request
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4
+ if code == 303 and req.get_method() != 'HEAD':
+ new_method = 'GET'
+ # 301 and 302 redirects are commonly turned into a GET from a POST
+ # for subsequent requests by browsers, so we'll do the same.
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3
+ elif code in (301, 302) and req.get_method() == 'POST':
+ new_method = 'GET'
+
+ # only remove payload if method changed (e.g. POST to GET)
+ if new_method != req.get_method():
+ new_data = None
+ remove_headers.extend(['Content-Length', 'Content-Type'])
+
# NB: don't use dict comprehension for python 2.6 compatibility
- newheaders = dict((k, v) for k, v in req.headers.items()
- if k.lower() not in CONTENT_HEADERS)
+ new_headers = dict((k, v) for k, v in req.header_items()
+ if k.lower() not in remove_headers)
+
return compat_urllib_request.Request(
- newurl, headers=newheaders, origin_req_host=req.origin_req_host,
- unverifiable=True)
+ newurl, headers=new_headers, origin_req_host=req.origin_req_host,
+ unverifiable=True, method=new_method, data=new_data)
def extract_timezone(date_str):