diff options
author | Andrew Hauck <[email protected]> | 2024-08-20 10:44:20 -0700 |
---|---|---|
committer | Edward Wang <[email protected]> | 2024-09-06 13:43:53 -0700 |
commit | d1d7a87b761eeb4f71fcaa3f7c4ae8e32f1d93c8 (patch) | |
tree | 1f71e461b7b922cc19e69eadff6059ae8d2d4394 | |
parent | acffb8aaf2a76e3ab8a4db698ed2f151cfb64566 (diff) | |
download | pingora-d1d7a87b761eeb4f71fcaa3f7c4ae8e32f1d93c8.tar.gz pingora-d1d7a87b761eeb4f71fcaa3f7c4ae8e32f1d93c8.zip |
Add support for binding to local port ranges and retrying on EADDRNOTAVAIL
-rw-r--r-- | .bleep | 2 | ||||
-rw-r--r-- | pingora-core/src/connectors/l4.rs | 206 | ||||
-rw-r--r-- | pingora-core/src/connectors/mod.rs | 6 | ||||
-rw-r--r-- | pingora-core/src/protocols/l4/ext.rs | 98 | ||||
-rw-r--r-- | pingora-core/src/upstreams/peer.rs | 12 |
5 files changed, 286 insertions, 38 deletions
@@ -1 +1 @@ -28f94f2a402bbf66341bdac8fa670caf5b7311e9
\ No newline at end of file +90c70086397a4708a4dadfed6e6915ce6dc33481
\ No newline at end of file diff --git a/pingora-core/src/connectors/l4.rs b/pingora-core/src/connectors/l4.rs index 0162034..58bd209 100644 --- a/pingora-core/src/connectors/l4.rs +++ b/pingora-core/src/connectors/l4.rs @@ -33,8 +33,59 @@ pub trait Connect: std::fmt::Debug { async fn connect(&self, addr: &SocketAddr) -> Result<Stream>; } +/// Settings for binding on connect +#[derive(Clone, Debug, Default)] +pub struct BindTo { + // local ip address + pub addr: Option<InetSocketAddr>, + // port range + port_range: Option<(u16, u16)>, + // whether we fallback and try again on bind errors when a port range is set + fallback: bool, +} + +impl BindTo { + /// Sets the port range we will bind to where the first item in the tuple is the lower bound + /// and the second item is the upper bound. + /// + /// Note this bind option is only supported on Linux since 6.3, this is a no-op on other systems. + /// To reset the range, pass a `None` or `Some((0,0))`, more information can be found [here](https://man7.org/linux/man-pages/man7/ip.7.html) + pub fn set_port_range(&mut self, range: Option<(u16, u16)>) -> Result<()> { + if range.is_none() && self.port_range.is_none() { + // nothing to do + return Ok(()); + } + + match range { + // 0,0 is valid for resets + None | Some((0, 0)) => self.port_range = Some((0, 0)), + // set the port range if valid + Some((low, high)) if low > 0 && low < high => { + self.port_range = Some((low, high)); + } + _ => return Error::e_explain(SocketError, "invalid port range: {range}"), + } + Ok(()) + } + + /// Set whether we fallback on no address available if a port range is set + pub fn set_fallback(&mut self, fallback: bool) { + self.fallback = fallback + } + + /// Configured bind port range + pub fn port_range(&self) -> Option<(u16, u16)> { + self.port_range + } + + /// Whether we attempt to fallback on no address available + pub fn will_fallback(&self) -> bool { + self.fallback && self.port_range.is_some() + } +} + /// Establish a connection (l4) to the given peer using its settings and an optional bind address. -pub(crate) async fn connect<P>(peer: &P, bind_to: Option<InetSocketAddr>) -> Result<Stream> +pub(crate) async fn connect<P>(peer: &P, bind_to: Option<BindTo>) -> Result<Stream> where P: Peer + Send + Sync, { @@ -142,12 +193,8 @@ pub(crate) fn bind_to_random<P: Peer>( peer: &P, v4_list: &[InetSocketAddr], v6_list: &[InetSocketAddr], -) -> Option<InetSocketAddr> { - let selected = peer.get_peer_options().and_then(|o| o.bind_to); - if selected.is_some() { - return selected; - } - +) -> Option<BindTo> { + // helper function for randomly picking address fn bind_to_ips(ips: &[InetSocketAddr]) -> Option<InetSocketAddr> { match ips.len() { 0 => None, @@ -159,13 +206,31 @@ pub(crate) fn bind_to_random<P: Peer>( } } - match peer.address() { + let mut bind_to = peer.get_peer_options().and_then(|o| o.bind_to.clone()); + if bind_to.as_ref().map(|b| b.addr).is_some() { + // already have a bind address selected + return bind_to; + } + + let addr = match peer.address() { SocketAddr::Inet(sockaddr) => match sockaddr { InetSocketAddr::V4(_) => bind_to_ips(v4_list), InetSocketAddr::V6(_) => bind_to_ips(v6_list), }, SocketAddr::Unix(_) => None, + }; + + if addr.is_some() { + if let Some(bind_to) = bind_to.as_mut() { + bind_to.addr = addr; + } else { + bind_to = Some(BindTo { + addr, + ..Default::default() + }); + } } + bind_to } use crate::protocols::raw_connect; @@ -238,16 +303,25 @@ mod tests { #[tokio::test] async fn test_conn_error_addr_not_avail() { let peer = HttpPeer::new("127.0.0.1:121".to_string(), false, "".to_string()); - let new_session = connect(&peer, Some("192.0.2.2:0".parse().unwrap())).await; + let addr = "192.0.2.2:0".parse().ok(); + let bind_to = BindTo { + addr, + ..Default::default() + }; + let new_session = connect(&peer, Some(bind_to)).await; assert_eq!(new_session.unwrap_err().etype(), &InternalError) } #[tokio::test] async fn test_conn_error_other() { let peer = HttpPeer::new("240.0.0.1:80".to_string(), false, "".to_string()); // non localhost - + let addr = "127.0.0.1:0".parse().ok(); // create an error: cannot send from src addr: localhost to dst addr: a public IP - let new_session = connect(&peer, Some("127.0.0.1:0".parse().unwrap())).await; + let bind_to = BindTo { + addr, + ..Default::default() + }; + let new_session = connect(&peer, Some(bind_to)).await; let error = new_session.unwrap_err(); // XXX: some system will allow the socket to bind and connect without error, only to timeout assert!(error.etype() == &ConnectError || error.etype() == &ConnectTimedout) @@ -371,4 +445,114 @@ mod tests { assert_eq!(err.etype(), &ConnectionClosed); assert!(!err.retry()); } + + #[cfg(target_os = "linux")] + #[tokio::test(flavor = "multi_thread")] + async fn test_bind_to_port_range_on_connect() { + fn get_ip_local_port_range() -> (u16, u16) { + let path = "/proc/sys/net/ipv4/ip_local_port_range"; + let file = std::fs::read_to_string(path).unwrap(); + let mut parts = file.split_whitespace(); + ( + parts.next().unwrap().parse().unwrap(), + parts.next().unwrap().parse().unwrap(), + ) + } + + // one-off mock server + async fn mock_inet_connect_server() { + use tokio::net::TcpListener; + let listener = TcpListener::bind("127.0.0.1:10020").await.unwrap(); + if let Ok((mut stream, _addr)) = listener.accept().await { + stream.write_all(b"HTTP/1.1 200 OK\r\n\r\n").await.unwrap(); + // wait a bit so that the client can read + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + } + } + + fn in_port_range(session: Stream, lower: u16, upper: u16) -> bool { + let digest = session.get_socket_digest(); + let local_addr = digest + .as_ref() + .and_then(|s| s.local_addr()) + .unwrap() + .as_inet() + .unwrap(); + + // assert range + local_addr.port() >= lower && local_addr.port() <= upper + } + + tokio::spawn(async { + mock_inet_connect_server().await; + }); + // wait for the server to start + tokio::time::sleep(std::time::Duration::from_millis(100)).await; + + // need to read /proc/sys/net/ipv4/ip_local_port_range for this test to work + // IP_LOCAL_PORT_RANGE clamp only works on ports in /proc/sys/net/ipv4/ip_local_port_range + let (low, _) = get_ip_local_port_range(); + let high = low + 1; + + let peer = HttpPeer::new("127.0.0.1:10020".to_string(), false, "".to_string()); + let mut bind_to = BindTo { + addr: "127.0.0.1:0".parse().ok(), + ..Default::default() + }; + bind_to.set_port_range(Some((low, high))).unwrap(); + + let session1 = connect(&peer, Some(bind_to.clone())).await.unwrap(); + assert!(in_port_range(session1, low, high)); + + // execute more connect() + let session2 = connect(&peer, Some(bind_to.clone())).await.unwrap(); + assert!(in_port_range(session2, low, high)); + let session3 = connect(&peer, Some(bind_to.clone())).await.unwrap(); + assert!(in_port_range(session3, low, high)); + + // disabled fallback, should be AddrNotAvailable error + let err = connect(&peer, Some(bind_to.clone())).await.unwrap_err(); + assert_eq!(err.etype(), &InternalError); + + // enable fallback, assert not in port range but successful + bind_to.set_fallback(true); + let session4 = connect(&peer, Some(bind_to.clone())).await.unwrap(); + assert!(!in_port_range(session4, low, high)); + + // works without bind IP, shift up to use new ports + let low = low + 2; + let high = low + 1; + let mut bind_to = BindTo::default(); + bind_to.set_port_range(Some((low, high))).unwrap(); + let session5 = connect(&peer, Some(bind_to.clone())).await.unwrap(); + assert!(in_port_range(session5, low, high)); + } + + #[test] + fn test_bind_to_port_ranges() { + let addr = "127.0.0.1:0".parse().ok(); + let mut bind_to = BindTo { + addr, + ..Default::default() + }; + + // None because the previous value was None + bind_to.set_port_range(None).unwrap(); + assert!(bind_to.port_range.is_none()); + + // zeroes are handled + bind_to.set_port_range(Some((0, 0))).unwrap(); + assert_eq!(bind_to.port_range, Some((0, 0))); + + // zeroes because the previous value was Some + bind_to.set_port_range(None).unwrap(); + assert_eq!(bind_to.port_range, Some((0, 0))); + + // low > high is error + assert!(bind_to.set_port_range(Some((2000, 1000))).is_err()); + + // low < high success + bind_to.set_port_range(Some((1000, 2000))).unwrap(); + assert_eq!(bind_to.port_range, Some((1000, 2000))); + } } diff --git a/pingora-core/src/connectors/mod.rs b/pingora-core/src/connectors/mod.rs index 2d4584c..cbe299f 100644 --- a/pingora-core/src/connectors/mod.rs +++ b/pingora-core/src/connectors/mod.rs @@ -24,8 +24,8 @@ use crate::server::configuration::ServerConf; use crate::tls::ssl::SslConnector; use crate::upstreams::peer::{Peer, ALPN}; -use l4::connect as l4_connect; pub use l4::Connect as L4Connect; +use l4::{connect as l4_connect, BindTo}; use log::{debug, error, warn}; use offload::OffloadRuntime; use parking_lot::RwLock; @@ -273,7 +273,7 @@ impl TransportConnector { // connection timeout if there is one async fn do_connect<P: Peer + Send + Sync>( peer: &P, - bind_to: Option<SocketAddr>, + bind_to: Option<BindTo>, alpn_override: Option<ALPN>, tls_ctx: &SslConnector, ) -> Result<Stream> { @@ -296,7 +296,7 @@ async fn do_connect<P: Peer + Send + Sync>( // Perform the actual L4 and tls connection steps with no timeout async fn do_connect_inner<P: Peer + Send + Sync>( peer: &P, - bind_to: Option<SocketAddr>, + bind_to: Option<BindTo>, alpn_override: Option<ALPN>, tls_ctx: &SslConnector, ) -> Result<Stream> { diff --git a/pingora-core/src/protocols/l4/ext.rs b/pingora-core/src/protocols/l4/ext.rs index 5123bdf..4b65f84 100644 --- a/pingora-core/src/protocols/l4/ext.rs +++ b/pingora-core/src/protocols/l4/ext.rs @@ -27,6 +27,8 @@ use std::os::unix::io::{AsRawFd, RawFd}; use std::time::Duration; use tokio::net::{TcpSocket, TcpStream, UnixStream}; +use crate::connectors::l4::BindTo; + /// The (copy of) the kernel struct tcp_info returns #[repr(C)] #[derive(Copy, Clone, Debug)] @@ -160,9 +162,12 @@ fn cvt_linux_error(t: i32) -> io::Result<i32> { #[cfg(target_os = "linux")] fn ip_bind_addr_no_port(fd: RawFd, val: bool) -> io::Result<()> { - const IP_BIND_ADDRESS_NO_PORT: i32 = 24; - - set_opt(fd, libc::IPPROTO_IP, IP_BIND_ADDRESS_NO_PORT, val as c_int) + set_opt( + fd, + libc::IPPROTO_IP, + libc::IP_BIND_ADDRESS_NO_PORT, + val as c_int, + ) } #[cfg(not(target_os = "linux"))] @@ -170,6 +175,26 @@ fn ip_bind_addr_no_port(_fd: RawFd, _val: bool) -> io::Result<()> { Ok(()) } +/// IP_LOCAL_PORT_RANGE is only supported on Linux 6.3 and higher, +/// ip_local_port_range() is a no-op on unsupported versions. +/// See the [man page](https://man7.org/linux/man-pages/man7/ip.7.html) for more details. +#[cfg(target_os = "linux")] +fn ip_local_port_range(fd: RawFd, low: u16, high: u16) -> io::Result<()> { + const IP_LOCAL_PORT_RANGE: i32 = 51; + let range: u32 = (low as u32) | ((high as u32) << 16); + + let result = set_opt(fd, libc::IPPROTO_IP, IP_LOCAL_PORT_RANGE, range as c_int); + match result { + Err(e) if e.raw_os_error() != Some(libc::ENOPROTOOPT) => Err(e), + _ => Ok(()), // no error or ENOPROTOOPT + } +} + +#[cfg(not(target_os = "linux"))] +fn ip_local_port_range(_fd: RawFd, _low: u16, _high: u16) -> io::Result<()> { + Ok(()) +} + #[cfg(target_os = "linux")] fn set_so_keepalive(fd: RawFd, val: bool) -> io::Result<()> { set_opt(fd, libc::SOL_SOCKET, libc::SO_KEEPALIVE, val as c_int) @@ -310,14 +335,42 @@ pub fn get_socket_cookie(_fd: RawFd) -> io::Result<u64> { Ok(0) // SO_COOKIE is a Linux concept } -/// connect() to the given address while optionally binding to the specific source address. +/// connect() to the given address while optionally binding to the specific source address and port range. /// /// The `set_socket` callback can be used to tune the socket before `connect()` is called. /// +/// If a [`BindTo`] is set with a port range and fallback setting enabled this function will retry +/// on EADDRNOTAVAIL ignoring the port range. +/// /// `IP_BIND_ADDRESS_NO_PORT` is used. -pub(crate) async fn connect_with<F: FnOnce(&TcpSocket) -> Result<()>>( +/// `IP_LOCAL_PORT_RANGE` is used if a port range is set on [`BindTo`]. +pub(crate) async fn connect_with<F: FnOnce(&TcpSocket) -> Result<()> + Clone>( addr: &SocketAddr, - bind_to: Option<&SocketAddr>, + bind_to: Option<&BindTo>, + set_socket: F, +) -> Result<TcpStream> { + if bind_to.as_ref().map_or(false, |b| b.will_fallback()) { + // if we see an EADDRNOTAVAIL error clear the port range and try again + let connect_result = inner_connect_with(addr, bind_to, set_socket.clone()).await; + if let Err(e) = connect_result.as_ref() { + if matches!(e.etype(), BindError) { + let mut new_bind_to = BindTo::default(); + new_bind_to.addr = bind_to.as_ref().and_then(|b| b.addr); + // reset the port range + new_bind_to.set_port_range(None).unwrap(); + return inner_connect_with(addr, Some(&new_bind_to), set_socket).await; + } + } + connect_result + } else { + // not retryable + inner_connect_with(addr, bind_to, set_socket).await + } +} + +async fn inner_connect_with<F: FnOnce(&TcpSocket) -> Result<()>>( + addr: &SocketAddr, + bind_to: Option<&BindTo>, set_socket: F, ) -> Result<TcpStream> { let socket = if addr.is_ipv4() { @@ -328,14 +381,23 @@ pub(crate) async fn connect_with<F: FnOnce(&TcpSocket) -> Result<()>>( .or_err(SocketError, "failed to create socket")?; if cfg!(target_os = "linux") { - ip_bind_addr_no_port(socket.as_raw_fd(), true) - .or_err(SocketError, "failed to set socket opts")?; - - if let Some(baddr) = bind_to { - socket - .bind(*baddr) - .or_err_with(BindError, || format!("failed to bind to socket {}", *baddr))?; - }; + ip_bind_addr_no_port(socket.as_raw_fd(), true).or_err( + SocketError, + "failed to set socket opts IP_BIND_ADDRESS_NO_PORT", + )?; + + if let Some(bind_to) = bind_to { + if let Some((low, high)) = bind_to.port_range() { + ip_local_port_range(socket.as_raw_fd(), low, high) + .or_err(SocketError, "failed to set socket opts IP_LOCAL_PORT_RANGE")?; + } + + if let Some(baddr) = bind_to.addr { + socket + .bind(baddr) + .or_err_with(BindError, || format!("failed to bind to socket {}", baddr))?; + } + } } // TODO: add support for bind on other platforms @@ -349,8 +411,9 @@ pub(crate) async fn connect_with<F: FnOnce(&TcpSocket) -> Result<()>>( /// connect() to the given address while optionally binding to the specific source address. /// -/// `IP_BIND_ADDRESS_NO_PORT` is used. -pub async fn connect(addr: &SocketAddr, bind_to: Option<&SocketAddr>) -> Result<TcpStream> { +/// `IP_BIND_ADDRESS_NO_PORT` is used +/// `IP_LOCAL_PORT_RANGE` is used if a port range is set on [`BindTo`]. +pub async fn connect(addr: &SocketAddr, bind_to: Option<&BindTo>) -> Result<TcpStream> { connect_with(addr, bind_to, |_| Ok(())).await } @@ -365,7 +428,8 @@ fn wrap_os_connect_error(e: std::io::Error, context: String) -> Box<Error> { match e.kind() { ErrorKind::ConnectionRefused => Error::because(ConnectRefused, context, e), ErrorKind::TimedOut => Error::because(ConnectTimedout, context, e), - ErrorKind::PermissionDenied | ErrorKind::AddrInUse | ErrorKind::AddrNotAvailable => { + ErrorKind::AddrNotAvailable => Error::because(BindError, context, e), + ErrorKind::PermissionDenied | ErrorKind::AddrInUse => { Error::because(InternalError, context, e) } _ => match e.raw_os_error() { diff --git a/pingora-core/src/upstreams/peer.rs b/pingora-core/src/upstreams/peer.rs index fad8471..832791d 100644 --- a/pingora-core/src/upstreams/peer.rs +++ b/pingora-core/src/upstreams/peer.rs @@ -29,7 +29,7 @@ use std::path::{Path, PathBuf}; use std::sync::Arc; use std::time::Duration; -use crate::connectors::L4Connect; +use crate::connectors::{l4::BindTo, L4Connect}; use crate::protocols::l4::socket::SocketAddr; use crate::protocols::ConnFdReusable; use crate::protocols::TcpKeepalive; @@ -110,8 +110,8 @@ pub trait Peer: Display + Clone { None => None, } } - /// Which local source address this connection should be bind to. - fn bind_to(&self) -> Option<&InetSocketAddr> { + /// Information about the local source address this connection should be bound to. + fn bind_to(&self) -> Option<&BindTo> { match self.get_peer_options() { Some(opt) => opt.bind_to.as_ref(), None => None, @@ -243,7 +243,7 @@ impl Peer for BasicPeer { !self.sni.is_empty() } - fn bind_to(&self) -> Option<&InetSocketAddr> { + fn bind_to(&self) -> Option<&BindTo> { None } @@ -294,7 +294,7 @@ impl Scheme { /// See [`Peer`] for the meaning of the fields #[derive(Clone, Debug)] pub struct PeerOptions { - pub bind_to: Option<InetSocketAddr>, + pub bind_to: Option<BindTo>, pub connection_timeout: Option<Duration>, pub total_connection_timeout: Option<Duration>, pub read_timeout: Option<Duration>, @@ -365,7 +365,7 @@ impl PeerOptions { impl Display for PeerOptions { fn fmt(&self, f: &mut Formatter<'_>) -> FmtResult { - if let Some(b) = self.bind_to { + if let Some(b) = self.bind_to.as_ref() { write!(f, "bind_to: {:?},", b)?; } if let Some(t) = self.connection_timeout { |