diff options
author | ewang <[email protected]> | 2024-03-28 15:19:14 -0700 |
---|---|---|
committer | Edward Wang <[email protected]> | 2024-04-12 15:30:28 -0700 |
commit | 3f682ea37f2d373dca5e6218c9a621e2c7da67b6 (patch) | |
tree | 865880a48c49fee7a12ff2770948fc96fc03a0be /pingora-core | |
parent | acee67f87020ef41267fe475bcc5cbed44782a06 (diff) | |
download | pingora-3f682ea37f2d373dca5e6218c9a621e2c7da67b6.tar.gz pingora-3f682ea37f2d373dca5e6218c9a621e2c7da67b6.zip |
Add cache_not_modified_filter, handle etag/last-modified
This filter allows customizing the `ETag`/`Last-Modified` check to see
if the proxy cache can return 304 Not Modified instead of the full
response.
Also flesh out the `ETag`/`Last-Modified` handling in the base
`not_modified_filter` with `If-None-Match` and `If-Modified-Since`.
Diffstat (limited to 'pingora-core')
-rw-r--r-- | pingora-core/Cargo.toml | 1 | ||||
-rw-r--r-- | pingora-core/src/protocols/http/conditional_filter.rs | 322 | ||||
-rw-r--r-- | pingora-core/src/protocols/http/mod.rs | 1 |
3 files changed, 324 insertions, 0 deletions
diff --git a/pingora-core/Cargo.toml b/pingora-core/Cargo.toml index a885e6b..936a7b9 100644 --- a/pingora-core/Cargo.toml +++ b/pingora-core/Cargo.toml @@ -65,6 +65,7 @@ brotli = "3" openssl-probe = "0.1" tokio-test = "0.4" zstd = "0" +httpdate = "1" [dev-dependencies] matches = "0.1" diff --git a/pingora-core/src/protocols/http/conditional_filter.rs b/pingora-core/src/protocols/http/conditional_filter.rs new file mode 100644 index 0000000..271f175 --- /dev/null +++ b/pingora-core/src/protocols/http/conditional_filter.rs @@ -0,0 +1,322 @@ +// Copyright 2024 Cloudflare, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +//! Conditional filter (not modified) utilities + +use http::{header::*, StatusCode}; +use httpdate::{parse_http_date, HttpDate}; +use pingora_error::{ErrorType::InvalidHTTPHeader, OrErr, Result}; +use pingora_http::{RequestHeader, ResponseHeader}; + +/// Evaluates conditional headers according to the [RFC](https://datatracker.ietf.org/doc/html/rfc9111#name-handling-a-received-validat). +/// +/// Returns true if the request should receive 304 Not Modified. +pub fn not_modified_filter(req: &RequestHeader, resp: &ResponseHeader) -> bool { + // https://datatracker.ietf.org/doc/html/rfc9110#name-304-not-modified + // 304 can only validate 200 + if resp.status != StatusCode::OK { + return false; + } + + // Evulation of conditional headers, based on RFC: + // https://datatracker.ietf.org/doc/html/rfc9111#name-handling-a-received-validat + + // TODO: If-Match and If-Unmodified-Since, and returning 412 Precondition Failed + // Note that this function is currently used only for proxy cache, + // and the current RFCs have some conflicting opinions as to whether + // If-Match and If-Unmodified-Since can be used. https://github.com/httpwg/http-core/issues/1111 + + // Conditional request precedence: + // https://datatracker.ietf.org/doc/html/rfc9110#name-precedence-of-preconditions + // If-None-Match should be handled before If-Modified-Since. + // XXX: In nginx, IMS is actually checked first, which may cause compatibility issues + // for certain origins/clients. + + if req.headers.contains_key(IF_NONE_MATCH) { + if let Some(etag) = resp.headers.get(ETAG) { + for inm in req.headers.get_all(IF_NONE_MATCH) { + if weak_validate_etag(inm.as_bytes(), etag.as_bytes()) { + return true; + } + } + } + // https://datatracker.ietf.org/doc/html/rfc9110#field.if-modified-since + // "MUST ignore If-Modified-Since if the request contains an If-None-Match header" + return false; + } + + // GET/HEAD only https://datatracker.ietf.org/doc/html/rfc9110#field.if-modified-since + if matches!(req.method, http::Method::GET | http::Method::HEAD) { + if let Ok(Some(if_modified_since)) = req_header_as_http_date(req, &IF_MODIFIED_SINCE) { + if let Ok(Some(last_modified)) = resp_header_as_http_date(resp, &LAST_MODIFIED) { + if if_modified_since >= last_modified { + return true; + } + } + } + } + false +} + +// Trim ASCII whitespace bytes from the start of the slice. +// This is pretty much copied from the nightly API. +// TODO: use `trim_ascii_start` when it stabilizes https://doc.rust-lang.org/std/primitive.slice.html#method.trim_ascii_start +fn trim_ascii_start(mut bytes: &[u8]) -> &[u8] { + while let [first, rest @ ..] = bytes { + if first.is_ascii_whitespace() { + bytes = rest; + } else { + break; + } + } + bytes +} + +/// Search for an ETag matching `target_etag` from the input header, using +/// [weak comparison](https://datatracker.ietf.org/doc/html/rfc9110#section-8.8.3.2). +/// Multiple ETags can exist in the header as a comma-separated list. +/// +/// Returns true if a matching ETag exists. +pub fn weak_validate_etag(input_etag_header: &[u8], target_etag: &[u8]) -> bool { + // ETag comparison: https://datatracker.ietf.org/doc/html/rfc9110#section-8.8.3.2 + fn strip_weak_prefix(etag: &[u8]) -> &[u8] { + // Weak ETags are prefaced with `W/` + etag.strip_prefix(b"W/").unwrap_or(etag) + } + // https://datatracker.ietf.org/doc/html/rfc9110#section-13.1.2 unsafe method only + if input_etag_header == b"*" { + return true; + } + + // The RFC defines ETags here: https://datatracker.ietf.org/doc/html/rfc9110#section-8.8.3 + // The RFC requires ETags to be wrapped in double quotes, though some legacy origins or clients + // don't adhere to this. + // Unfortunately by allowing non-quoted etags, parsing becomes a little more complicated. + // + // This implementation uses nginx's algorithm for parsing ETags, which can handle both quoted + // and non-quoted ETags. It essentially does a substring comparison at each comma divider, + // searching for an exact match of the ETag (optional double quotes included) followed by + // either EOF or another comma. + // + // Clients and upstreams should still ideally adhere to quoted ETags to disambiguate + // situations where commas are contained within the ETag (allowed by the RFC). + // XXX: This nginx algorithm will handle matching against ETags with commas correctly, but only + // if the target ETag is a quoted RFC-compliant ETag. + // + // For example, consider an if-none-match header: `"xyzzy,xyz,x,y", "xyzzy"`. + // If the target ETag is double quoted as mandated by the RFC like `"xyz,x"`, this algorithm + // will correctly report no matching ETag. + // But if the target ETag is not double quoted like `xyz,x`, it will "incorrectly" match + // against the substring after the first comma inside the first quoted ETag. + + // Search for the target at each comma delimiter + let target_etag = strip_weak_prefix(target_etag); + let mut remaining = strip_weak_prefix(input_etag_header); + while let Some(search_slice) = remaining.get(0..target_etag.len()) { + if search_slice == target_etag { + remaining = &remaining[target_etag.len()..]; + // check if there's any content after the matched substring + // skip any whitespace + remaining = trim_ascii_start(remaining); + if matches!(remaining.first(), None | Some(b',')) { + // we are either at the end of the header, or at a comma delimiter + // which means this is a match + return true; + } + } + // find the next delimiter (ignore any remaining part of the non-matching etag) + let Some(next_delimiter_pos) = remaining.iter().position(|&b| b == b',') else { + break; + }; + remaining = &remaining[next_delimiter_pos..]; + // find the next etag slice to compare + // ignore extraneous delimiters and whitespace + let Some(next_etag_pos) = remaining + .iter() + .position(|&b| !b.is_ascii_whitespace() && b != b',') + else { + break; + }; + remaining = &remaining[next_etag_pos..]; + + remaining = strip_weak_prefix(remaining); + } + // remaining length < target etag length + false +} + +/// Utility function to parse an HTTP request header as an [HTTP-date](https://datatracker.ietf.org/doc/html/rfc9110#name-date-time-formats). +pub fn req_header_as_http_date<H>(req: &RequestHeader, header_name: H) -> Result<Option<HttpDate>> +where + H: AsHeaderName, +{ + let Some(header_value) = req.headers.get(header_name) else { + return Ok(None); + }; + Ok(Some(parse_bytes_as_http_date(header_value.as_bytes())?)) +} + +/// Utility function to parse an HTTP response header as an [HTTP-date](https://datatracker.ietf.org/doc/html/rfc9110#name-date-time-formats). +pub fn resp_header_as_http_date<H>( + resp: &ResponseHeader, + header_name: H, +) -> Result<Option<HttpDate>> +where + H: AsHeaderName, +{ + let Some(header_value) = resp.headers.get(header_name) else { + return Ok(None); + }; + Ok(Some(parse_bytes_as_http_date(header_value.as_bytes())?)) +} + +fn parse_bytes_as_http_date(bytes: &[u8]) -> Result<HttpDate> { + let input_time = std::str::from_utf8(bytes).explain_err(InvalidHTTPHeader, |_| { + "HTTP date has unsupported characters (bytes outside of UTF-8)" + })?; + Ok(parse_http_date(input_time) + .or_err(InvalidHTTPHeader, "Invalid HTTP date")? + .into()) +} + +/// Utility function to convert the input response header to a 304 Not Modified response. +pub fn to_304(resp: &mut ResponseHeader) { + // https://datatracker.ietf.org/doc/html/rfc9110#name-304-not-modified + // XXX: https://datatracker.ietf.org/doc/html/rfc9110#name-content-length + // "A server may send content-length in 304", but no common web server does it + // So we drop both content-length and content-type for consistency/less surprise + resp.set_status(StatusCode::NOT_MODIFIED).unwrap(); + resp.remove_header(&CONTENT_LENGTH); + resp.remove_header(&CONTENT_TYPE); + // https://datatracker.ietf.org/doc/html/rfc9110#section-15.4.5-4 + // "SHOULD NOT generate representation metadata other than the above listed fields + // unless said metadata exists for the purpose of guiding cache updates" + // Remove some more representation metadata headers + resp.remove_header(&TRANSFER_ENCODING); + // note that the following are also stripped by nginx + resp.remove_header(&CONTENT_ENCODING); + resp.remove_header(&ACCEPT_RANGES); +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_if_modified_since() { + fn build_req(if_modified_since: &[u8]) -> RequestHeader { + let mut req = RequestHeader::build("GET", b"/", None).unwrap(); + req.insert_header("If-Modified-Since", if_modified_since) + .unwrap(); + req + } + + fn build_resp(last_modified: &[u8]) -> ResponseHeader { + let mut resp = ResponseHeader::build(200, None).unwrap(); + resp.insert_header("Last-Modified", last_modified).unwrap(); + resp + } + + // same date + let last_modified = b"Fri, 26 Mar 2010 00:05:00 GMT"; + let req = build_req(b"Fri, 26 Mar 2010 00:05:00 GMT"); + let resp = build_resp(last_modified); + assert!(not_modified_filter(&req, &resp)); + + // before + let req = build_req(b"Fri, 26 Mar 2010 00:03:00 GMT"); + let resp = build_resp(last_modified); + assert!(!not_modified_filter(&req, &resp)); + + // after + let req = build_req(b"Sun, 28 Mar 2010 01:07:00 GMT"); + let resp = build_resp(last_modified); + assert!(not_modified_filter(&req, &resp)); + } + + #[test] + fn test_weak_validate_etag() { + let target_weak_etag = br#"W/"xyzzy""#; + let target_etag = br#""xyzzy""#; + assert!(weak_validate_etag(b"*", target_weak_etag)); + assert!(weak_validate_etag(b"*", target_etag)); + + assert!(weak_validate_etag(target_etag, target_etag)); + assert!(weak_validate_etag(target_etag, target_weak_etag)); + assert!(weak_validate_etag(target_weak_etag, target_etag)); + assert!(weak_validate_etag(target_weak_etag, target_weak_etag)); + + let mismatch_weak_etag = br#"W/"abc""#; + let mismatch_etag = br#""abc""#; + assert!(!weak_validate_etag(mismatch_etag, target_etag)); + assert!(!weak_validate_etag(mismatch_etag, target_weak_etag)); + assert!(!weak_validate_etag(mismatch_weak_etag, target_etag)); + assert!(!weak_validate_etag(mismatch_weak_etag, target_weak_etag)); + + let multiple_etags = br#"a, "xyzzy","r2d2xxxx", "c3piozzzz",zzzfoo"#; + assert!(weak_validate_etag(multiple_etags, target_etag)); + assert!(weak_validate_etag(multiple_etags, target_weak_etag)); + + let multiple_mismatch_etags = br#"foobar", "r2d2xxxx", "c3piozzzz",zzzfoo"#; + assert!(!weak_validate_etag(multiple_mismatch_etags, target_etag)); + assert!(!weak_validate_etag( + multiple_mismatch_etags, + target_weak_etag + )); + + let multiple_mismatch_etags = + br#"foobar", "r2d2xxxxyzzy", "c3piozzzz",zzzfoo, "xyzzy,xyzzy""#; + assert!(!weak_validate_etag(multiple_mismatch_etags, target_etag)); + assert!(!weak_validate_etag( + multiple_mismatch_etags, + target_weak_etag + )); + + let target_comma_etag = br#"",,,""#; + let multiple_mismatch_etags = br#",", ",,,,", ,,,,,,,,",,",",,,,,,""#; + assert!(!weak_validate_etag( + multiple_mismatch_etags, + target_comma_etag + )); + let multiple_etags = br#",", ",,,,", ,,,,,,,,",,,",",,,,,,""#; + assert!(weak_validate_etag(multiple_etags, target_comma_etag)); + } + + #[test] + fn test_weak_validate_etag_unquoted() { + // legacy unquoted etag + let target_unquoted = b"xyzzy"; + assert!(weak_validate_etag(b"*", target_unquoted)); + + let strong_etag = br#""xyzzy""#; + assert!(!weak_validate_etag(strong_etag, target_unquoted)); + assert!(!weak_validate_etag(target_unquoted, strong_etag)); + + let multiple_etags = br#"a, "r2d2xxxx", "c3piozzzz", xyzzy"#; + assert!(weak_validate_etag(multiple_etags, target_unquoted)); + + let multiple_mismatch_etags = + br#"foobar", "r2d2xxxxyzzy", "c3piozzzz",zzzfoo, "xyzzy,xyzzy""#; + assert!(!weak_validate_etag( + multiple_mismatch_etags, + target_unquoted + )); + + // in certain edge cases where commas are used alongside quoted ETags, + // the test can fail if target is unquoted (the last ETag is intended to be one ETag) + let multiple_mismatch_etags = + br#"foobar", "r2d2xxxxyzzy", "c3piozzzz",zzzfoo, "xyzzy,xyzzy,xy""#; + assert!(weak_validate_etag(multiple_mismatch_etags, target_unquoted)); + } +} diff --git a/pingora-core/src/protocols/http/mod.rs b/pingora-core/src/protocols/http/mod.rs index d5e8ee9..b3f0ddb 100644 --- a/pingora-core/src/protocols/http/mod.rs +++ b/pingora-core/src/protocols/http/mod.rs @@ -17,6 +17,7 @@ mod body_buffer; pub mod client; pub mod compression; +pub mod conditional_filter; pub(crate) mod date; pub mod error_resp; pub mod server; |