diff --git a/aya-obj/include/linux_wrapper.h b/aya-obj/include/linux_wrapper.h index 9123cf028..a57434ed5 100644 --- a/aya-obj/include/linux_wrapper.h +++ b/aya-obj/include/linux_wrapper.h @@ -1,10 +1,13 @@ #include #include #include +#include #include +#include #include #include #include #include #include +#include #include diff --git a/test/integration-test/src/lib.rs b/test/integration-test/src/lib.rs index 54f07e6cc..8b3a8c3c7 100644 --- a/test/integration-test/src/lib.rs +++ b/test/integration-test/src/lib.rs @@ -74,6 +74,8 @@ bpf_file!( STACK_TRACE_LSM => "stack_trace_lsm", ); +#[cfg(test)] +mod netlink; #[cfg(test)] mod tests; #[cfg(test)] diff --git a/test/integration-test/src/netlink.rs b/test/integration-test/src/netlink.rs new file mode 100644 index 000000000..99364cec8 --- /dev/null +++ b/test/integration-test/src/netlink.rs @@ -0,0 +1,542 @@ +//! Minimal netlink helpers for integration-test network setup. +//! +//! These live here (rather than in the `aya` crate) to avoid adding test-only +//! public API surface to the production library. + +use std::{ + collections::HashMap, + ffi::CStr, + io, mem, + net::Ipv4Addr, + os::fd::{AsRawFd as _, FromRawFd as _, OwnedFd, RawFd}, + ptr, slice, +}; + +use aya_obj::generated::{NLMSG_ALIGNTO, ifinfomsg}; +use libc::{ + AF_INET, AF_NETLINK, AF_UNSPEC, IFA_ADDRESS, IFA_LOCAL, IFF_UP, IFLA_ADDRESS, IFLA_IFNAME, + IFLA_INFO_DATA, IFLA_INFO_KIND, IFLA_LINKINFO, IFLA_NET_NS_FD, NDA_DST, NDA_LLADDR, + NETLINK_ROUTE, NLA_ALIGNTO, NLA_F_NESTED, NLA_TYPE_MASK, NLM_F_ACK, NLM_F_CREATE, NLM_F_EXCL, + NLM_F_REQUEST, NLMSG_DONE, NLMSG_ERROR, NTF_SELF, NUD_PERMANENT, RTM_GETLINK, RTM_NEWADDR, + RTM_NEWLINK, RTM_NEWNEIGH, RTM_SETLINK, SOCK_RAW, SOL_NETLINK, nlattr, nlmsgerr, nlmsghdr, +}; + +const NLA_HDR_LEN: usize = align_to(size_of::(), NLA_ALIGNTO as usize); +/// `VETH_INFO_PEER` from `linux/veth.h`; not exported by libc. +/// TODO: import from `aya_obj::generated` once codegen is re-run with `linux/veth.h`. +const VETH_INFO_PEER: u16 = 1; + +/// `struct ifaddrmsg` from `linux/if_addr.h`. +/// TODO: import from `aya_obj::generated` once codegen is re-run. +#[derive(Copy, Clone)] +#[repr(C)] +struct Ifaddrmsg { + family: u8, + prefixlen: u8, + flags: u8, + scope: u8, + index: u32, +} + +/// `struct ndmsg` from `linux/neighbour.h`. +/// TODO: import from `aya_obj::generated` once codegen is re-run. +#[derive(Copy, Clone)] +#[repr(C)] +struct Ndmsg { + family: u8, + pad1: u8, + pad2: u16, + ifindex: i32, + state: u16, + flags: u8, + r#type: u8, +} + +/// Netlink request for link operations (set up, set namespace, etc.). +/// +/// Largest payload is `set_link_ns` carrying `IFLA_NET_NS_FD`: NLA header (4) + +/// i32 (4) = 8 bytes. 64 leaves room for future attributes. +#[derive(Copy, Clone)] +#[repr(C)] +struct LinkRequest { + header: nlmsghdr, + if_info: ifinfomsg, + attrs: [u8; 64], +} + +/// Netlink request for veth pair creation. +/// +/// Nested attrs: `IFLA_IFNAME` + `IFLA_LINKINFO` { `IFLA_INFO_KIND`("veth") + +/// `IFLA_INFO_DATA` { `VETH_INFO_PEER` { `ifinfomsg` + `IFLA_IFNAME` } } } totals ~64 +/// bytes with short names, up to ~88 with `IFNAMSIZ`-length names. 128 is plenty. +#[derive(Copy, Clone)] +#[repr(C)] +struct VethRequest { + header: nlmsghdr, + if_info: ifinfomsg, + attrs: [u8; 128], +} + +/// Netlink request for address assignment. +/// +/// `IFA_LOCAL` (8) + `IFA_ADDRESS` (8) = 16 bytes. 24 provides headroom. +#[derive(Copy, Clone)] +#[repr(C)] +struct AddrRequest { + header: nlmsghdr, + addr_info: Ifaddrmsg, + attrs: [u8; 24], +} + +/// Netlink request for neighbor (ARP) entries. +/// +/// `NDA_DST` (8) + `NDA_LLADDR` (4 hdr + 6 MAC padded to 8 = 12) = 20 bytes. +#[derive(Copy, Clone)] +#[repr(C)] +struct NeighRequest { + header: nlmsghdr, + neigh_info: Ndmsg, + attrs: [u8; 24], +} + +const fn align_to(v: usize, align: usize) -> usize { + v.next_multiple_of(align) +} + +/// Reinterpret a `#[repr(C)]` struct as a byte slice. +/// +/// # Safety +/// +/// `T` must be `#[repr(C)]` with no padding that could leak uninitialised bytes +/// into the returned slice. All request structs in this module satisfy this +/// because they are either fully initialised or their padding bytes are +/// explicitly zeroed via `mem::zeroed()`. +unsafe fn bytes_of(val: &T) -> &[u8] { + unsafe { slice::from_raw_parts(ptr::from_ref(val).cast(), size_of::()) } +} + +/// Return a mutable slice over the attribute area of a request struct. +/// +/// # Safety +/// +/// `msg_len` must be the offset of the attribute area within `T`. +unsafe fn request_attributes(req: &mut T, msg_len: usize) -> &mut [u8] { + let req: *mut u8 = ptr::from_mut(req).cast(); + let attrs_addr = unsafe { req.add(msg_len) }; + let align_offset = attrs_addr.align_offset(NLMSG_ALIGNTO as usize); + let attrs_addr = unsafe { attrs_addr.add(align_offset) }; + let len = size_of::() - msg_len - align_offset; + unsafe { slice::from_raw_parts_mut(attrs_addr, len) } +} + +fn write_attr_bytes( + buf: &mut [u8], + offset: usize, + attr_type: u16, + value: &[u8], +) -> Result { + let attr = nlattr { + nla_type: attr_type, + nla_len: (NLA_HDR_LEN + value.len()) as u16, + }; + write_attr_header(buf, offset, attr)?; + let value_len = write_bytes(buf, offset + NLA_HDR_LEN, value)?; + Ok(NLA_HDR_LEN + value_len) +} + +fn write_attr_header(buf: &mut [u8], offset: usize, attr: nlattr) -> Result { + let attr = unsafe { bytes_of(&attr) }; + write_bytes(buf, offset, attr)?; + Ok(NLA_HDR_LEN) +} + +fn write_bytes(buf: &mut [u8], offset: usize, value: &[u8]) -> Result { + let align_len = align_to(value.len(), NLA_ALIGNTO as usize); + if offset + align_len > buf.len() { + return Err(io::Error::other( + "no space left in netlink attribute buffer", + )); + } + buf[offset..offset + value.len()].copy_from_slice(value); + Ok(align_len) +} + +struct NetlinkSocket { + sock: OwnedFd, +} + +impl NetlinkSocket { + fn open() -> io::Result { + let sock = unsafe { libc::socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE) }; + if sock < 0 { + return Err(io::Error::last_os_error()); + } + let sock = unsafe { OwnedFd::from_raw_fd(sock) }; + + let enable = 1i32; + unsafe { + if libc::setsockopt( + sock.as_raw_fd(), + SOL_NETLINK, + libc::NETLINK_EXT_ACK, + ptr::from_ref(&enable).cast(), + size_of_val(&enable) as u32, + ) < 0 + { + return Err(io::Error::last_os_error()); + } + if libc::setsockopt( + sock.as_raw_fd(), + SOL_NETLINK, + libc::NETLINK_CAP_ACK, + ptr::from_ref(&enable).cast(), + size_of_val(&enable) as u32, + ) < 0 + { + return Err(io::Error::last_os_error()); + } + } + Ok(Self { sock }) + } + + /// Send a request and collect the response messages. + fn execute(&self, msg: &[u8]) -> io::Result> { + if unsafe { libc::send(self.sock.as_raw_fd(), msg.as_ptr().cast(), msg.len(), 0) } < 0 { + return Err(io::Error::last_os_error()); + } + self.recv() + } + + fn recv(&self) -> io::Result> { + let mut buf = [0u8; 4096]; + let mut messages = Vec::new(); + let mut multipart = true; + 'out: while multipart { + multipart = false; + let len = + unsafe { libc::recv(self.sock.as_raw_fd(), buf.as_mut_ptr().cast(), buf.len(), 0) }; + if len < 0 { + return Err(io::Error::last_os_error()); + } + if len == 0 { + break; + } + let len = len as usize; + let mut offset = 0; + while offset < len { + let message = NetlinkMessage::read(&buf[offset..])?; + offset += align_to(message.header.nlmsg_len as usize, NLMSG_ALIGNTO as usize); + multipart = (message.header.nlmsg_flags & (libc::NLM_F_MULTI as u16)) != 0; + match i32::from(message.header.nlmsg_type) { + NLMSG_ERROR => { + let err = message.error.unwrap(); + if err.error == 0 { + // ACK + continue; + } + return Err(io::Error::from_raw_os_error(-err.error)); + } + NLMSG_DONE => break 'out, + _ => messages.push(message), + } + } + } + Ok(messages) + } +} + +struct NetlinkMessage { + header: nlmsghdr, + data: Vec, + error: Option, +} + +impl NetlinkMessage { + fn read(buf: &[u8]) -> io::Result { + if size_of::() > buf.len() { + return Err(io::Error::other("buffer smaller than nlmsghdr")); + } + let header: nlmsghdr = unsafe { ptr::read_unaligned(buf.as_ptr().cast()) }; + let msg_len = header.nlmsg_len as usize; + if msg_len < size_of::() || msg_len > buf.len() { + return Err(io::Error::other("invalid nlmsg_len")); + } + let data_offset = align_to(size_of::(), NLMSG_ALIGNTO as usize); + if data_offset >= buf.len() { + return Err(io::Error::other("need more data")); + } + let (rest, error) = if header.nlmsg_type == NLMSG_ERROR as u16 { + if data_offset + size_of::() > buf.len() { + return Err(io::Error::other( + "NLMSG_ERROR but not enough space for nlmsgerr", + )); + } + ( + &buf[data_offset + size_of::()..msg_len], + Some(unsafe { ptr::read_unaligned(buf[data_offset..].as_ptr().cast()) }), + ) + } else { + (&buf[data_offset..msg_len], None) + }; + Ok(Self { + header, + data: rest.to_vec(), + error, + }) + } +} + +fn parse_attrs(buf: &[u8]) -> HashMap { + let mut attrs = HashMap::new(); + let mut offset = 0; + while offset < buf.len() { + let remaining = &buf[offset..]; + if NLA_HDR_LEN > remaining.len() { + break; + } + let attr: nlattr = unsafe { ptr::read_unaligned(remaining.as_ptr().cast()) }; + let len = attr.nla_len as usize; + if len < NLA_HDR_LEN { + break; + } + let align_len = align_to(len, NLA_ALIGNTO as usize); + if align_len > remaining.len() { + break; + } + attrs.insert( + attr.nla_type & (NLA_TYPE_MASK as u16), + &remaining[NLA_HDR_LEN..len], + ); + offset += align_len; + } + attrs +} + +/// Set a network interface up (`IFF_UP`). +pub(crate) fn set_link_up(if_index: i32) -> io::Result<()> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: (NLM_F_REQUEST | NLM_F_ACK) as u16, + nlmsg_type: RTM_SETLINK, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.if_info.ifi_family = AF_UNSPEC as u8; + req.if_info.ifi_index = if_index; + req.if_info.ifi_flags = IFF_UP as u32; + req.if_info.ifi_change = IFF_UP as u32; + + sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })?; + Ok(()) +} + +/// Create a veth pair. +pub(crate) fn create_veth_pair(if_name: &CStr, peer_name: &CStr) -> io::Result<()> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: (NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL) as u16, + nlmsg_type: RTM_NEWLINK, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.if_info.ifi_family = AF_UNSPEC as u8; + + let attrs_buf = unsafe { request_attributes(&mut req, nlmsg_len) }; + let attr_len = write_veth_attrs(attrs_buf, if_name, peer_name)?; + req.header.nlmsg_len += align_to(attr_len, NLA_ALIGNTO as usize) as u32; + + sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })?; + Ok(()) +} + +fn write_veth_attrs(buf: &mut [u8], if_name: &CStr, peer_name: &CStr) -> Result { + let mut offset = 0usize; + + // IFLA_IFNAME for the first interface. + offset += write_attr_bytes(buf, offset, IFLA_IFNAME, if_name.to_bytes_with_nul())?; + + // IFLA_LINKINFO (nested): contains IFLA_INFO_KIND + IFLA_INFO_DATA. + let linkinfo_start = offset; + offset += NLA_HDR_LEN; // reserve; back-filled below + + // IFLA_INFO_KIND = "veth" + offset += write_attr_bytes(buf, offset, IFLA_INFO_KIND, c"veth".to_bytes_with_nul())?; + + // IFLA_INFO_DATA (nested): contains VETH_INFO_PEER. + let info_data_start = offset; + offset += NLA_HDR_LEN; + + // VETH_INFO_PEER (nested): contains an Ifinfomsg + IFLA_IFNAME. + let peer_start = offset; + offset += NLA_HDR_LEN; + + // Embedded Ifinfomsg for the peer (all zeros from mem::zeroed). + let ifinfo_size = align_to(size_of::(), NLA_ALIGNTO as usize); + if offset + ifinfo_size > buf.len() { + return Err(io::Error::other( + "no space left in netlink attribute buffer", + )); + } + offset += ifinfo_size; + + // IFLA_IFNAME for the peer. + offset += write_attr_bytes(buf, offset, IFLA_IFNAME, peer_name.to_bytes_with_nul())?; + + // Back-fill headers. + write_attr_header( + buf, + peer_start, + nlattr { + nla_type: (NLA_F_NESTED as u16) | VETH_INFO_PEER, + nla_len: (offset - peer_start) as u16, + }, + )?; + write_attr_header( + buf, + info_data_start, + nlattr { + nla_type: (NLA_F_NESTED as u16) | IFLA_INFO_DATA, + nla_len: (offset - info_data_start) as u16, + }, + )?; + write_attr_header( + buf, + linkinfo_start, + nlattr { + nla_type: (NLA_F_NESTED as u16) | IFLA_LINKINFO, + nla_len: (offset - linkinfo_start) as u16, + }, + )?; + + Ok(offset) +} + +/// Move a network interface to another network namespace. +pub(crate) fn set_link_ns(if_index: i32, netns_fd: RawFd) -> io::Result<()> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: (NLM_F_REQUEST | NLM_F_ACK) as u16, + nlmsg_type: RTM_SETLINK, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.if_info.ifi_family = AF_UNSPEC as u8; + req.if_info.ifi_index = if_index; + + let attrs_buf = unsafe { request_attributes(&mut req, nlmsg_len) }; + let attr_len = write_attr_bytes(attrs_buf, 0, IFLA_NET_NS_FD, unsafe { bytes_of(&netns_fd) })?; + req.header.nlmsg_len += align_to(attr_len, NLA_ALIGNTO as usize) as u32; + + sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })?; + Ok(()) +} + +/// Add an IPv4 address to a network interface. +pub(crate) fn add_addr_v4(if_index: i32, addr: Ipv4Addr, prefix_len: u8) -> io::Result<()> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: (NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL) as u16, + nlmsg_type: RTM_NEWADDR, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.addr_info = Ifaddrmsg { + family: AF_INET as u8, + prefixlen: prefix_len, + flags: 0, + scope: 0, + index: if_index as u32, + }; + + let attrs_buf = unsafe { request_attributes(&mut req, nlmsg_len) }; + let addr_bytes = addr.octets(); + let mut offset = 0; + offset += write_attr_bytes(attrs_buf, offset, IFA_LOCAL, &addr_bytes)?; + offset += write_attr_bytes(attrs_buf, offset, IFA_ADDRESS, &addr_bytes)?; + req.header.nlmsg_len += align_to(offset, NLA_ALIGNTO as usize) as u32; + + sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })?; + Ok(()) +} + +/// Read the MAC address (`IFLA_ADDRESS`) of a network interface. +pub(crate) fn get_link_mac(if_index: i32) -> io::Result<[u8; 6]> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: NLM_F_REQUEST as u16, + nlmsg_type: RTM_GETLINK, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.if_info.ifi_family = AF_UNSPEC as u8; + req.if_info.ifi_index = if_index; + + for msg in sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })? { + if msg.header.nlmsg_type != RTM_NEWLINK { + continue; + } + let attrs = parse_attrs(&msg.data[size_of::()..]); + if let Some(data) = attrs.get(&IFLA_ADDRESS) { + if let Ok(mac) = <[u8; 6]>::try_from(*data) { + return Ok(mac); + } + } + } + + Err(io::Error::other( + "IFLA_ADDRESS not found in RTM_GETLINK response", + )) +} + +/// Add a static neighbor (ARP) entry. +pub(crate) fn add_neigh_v4(if_index: i32, dst_addr: Ipv4Addr, lladdr: [u8; 6]) -> io::Result<()> { + let sock = NetlinkSocket::open()?; + let mut req = unsafe { mem::zeroed::() }; + + let nlmsg_len = size_of::() + size_of::(); + req.header = nlmsghdr { + nlmsg_len: nlmsg_len as u32, + nlmsg_flags: (NLM_F_REQUEST | NLM_F_ACK | NLM_F_CREATE | NLM_F_EXCL) as u16, + nlmsg_type: RTM_NEWNEIGH, + nlmsg_pid: 0, + nlmsg_seq: 1, + }; + req.neigh_info = Ndmsg { + family: AF_INET as u8, + pad1: 0, + pad2: 0, + ifindex: if_index, + state: NUD_PERMANENT, + flags: NTF_SELF, + r#type: 0, + }; + + let attrs_buf = unsafe { request_attributes(&mut req, nlmsg_len) }; + let mut offset = 0; + offset += write_attr_bytes(attrs_buf, offset, NDA_DST, &dst_addr.octets())?; + offset += write_attr_bytes(attrs_buf, offset, NDA_LLADDR, &lladdr)?; + req.header.nlmsg_len += align_to(offset, NLA_ALIGNTO as usize) as u32; + + sock.execute(unsafe { &bytes_of(&req)[..req.header.nlmsg_len as usize] })?; + Ok(()) +} diff --git a/test/integration-test/src/tests/load.rs b/test/integration-test/src/tests/load.rs index d42eee9b7..e82d0f23d 100644 --- a/test/integration-test/src/tests/load.rs +++ b/test/integration-test/src/tests/load.rs @@ -24,11 +24,15 @@ use aya::{ use aya_obj::programs::XdpAttachType; use test_case::test_case; +use crate::utils::NetNsGuard; + const MAX_RETRIES: usize = 100; pub(crate) const RETRY_DURATION: Duration = Duration::from_millis(10); #[test_log::test] fn long_name() { + let _netns = NetNsGuard::new(); + let mut bpf = Ebpf::load(crate::NAME_TEST).unwrap(); let name_prog: &mut Xdp = bpf .program_mut("ihaveaverylongname") @@ -278,6 +282,8 @@ impl_unload_program_ops!(FlowDissector, FlowDissectorLinkId, FlowDissectorLink); #[test_log::test] fn unload_xdp() { + let _netns = NetNsGuard::new(); + type P = Xdp; let program_name = "pass"; @@ -410,6 +416,8 @@ fn basic_flow_dissector() { #[test_log::test] fn pin_link() { + let _netns = NetNsGuard::new(); + type P = Xdp; let program_name = "pass"; @@ -423,7 +431,7 @@ fn pin_link() { assert_loaded(program_name); let fd_link: FdLink = link.try_into().unwrap(); - let pinned = fd_link.pin("/sys/fs/bpf/aya-xdp-test-lo").unwrap(); + let pinned = fd_link.pin("/sys/fs/bpf/aya-xdp-test-veth0").unwrap(); // because of the pin, the program is still attached prog.unload().unwrap(); @@ -447,11 +455,10 @@ fn pin_tcx_link() { return; } - use crate::utils::NetNsGuard; let _netns = NetNsGuard::new(); let program_name = "tcx_next"; - let pin_path = "/sys/fs/bpf/aya-tcx-test-lo"; + let pin_path = "/sys/fs/bpf/aya-tcx-test-veth0"; let mut bpf = Ebpf::load(crate::TCX).unwrap(); let prog: &mut SchedClassifier = bpf.program_mut(program_name).unwrap().try_into().unwrap(); prog.load().unwrap(); @@ -516,12 +523,14 @@ impl_pin_program_ops!(UProbe); #[test_log::test] fn pin_lifecycle() { + let _netns = NetNsGuard::new(); + type P = Xdp; let program_name = "pass"; let attach = |prog: &mut P| prog.attach("lo", XdpFlags::default()).unwrap(); let program_pin = "/sys/fs/bpf/aya-xdp-test-prog"; - let link_pin = "/sys/fs/bpf/aya-xdp-test-lo"; + let link_pin = "/sys/fs/bpf/aya-xdp-test-veth0"; let from_pin = |program_pin: &str| P::from_pin(program_pin, XdpAttachType::Interface).unwrap(); let kernel_version = KernelVersion::current().unwrap(); diff --git a/test/integration-test/src/tests/xdp.rs b/test/integration-test/src/tests/xdp.rs index 192538b91..bbdceb64b 100644 --- a/test/integration-test/src/tests/xdp.rs +++ b/test/integration-test/src/tests/xdp.rs @@ -4,13 +4,35 @@ use assert_matches::assert_matches; use aya::{ Ebpf, maps::{Array, CpuMap, XskMap}, - programs::{ProgramError, Xdp, XdpError, XdpFlags, xdp::XdpLinkId}, + programs::{ProgramError, Xdp, XdpError, XdpFlags}, util::KernelVersion, }; use object::{Object as _, ObjectSection as _, ObjectSymbol as _, SymbolSection}; use xdpilone::{BufIdx, IfInfo, Socket, SocketConfig, Umem, UmemConfig}; -use crate::utils::NetNsGuard; +use crate::utils::{NetNsGuard, PeerNsGuard}; + +// Sanity-check the veth + PeerNsGuard plumbing without any BPF programs. +#[test_log::test] +fn veth_connectivity() { + let (_netns, peer) = NetNsGuard::with_peer(); + + const PAYLOAD: &[u8] = b"veth ok"; + + let sock = UdpSocket::bind((PeerNsGuard::IFACE_IP, 0)).unwrap(); + let addr = sock.local_addr().unwrap(); + sock.set_read_timeout(Some(Duration::from_secs(10))) + .unwrap(); + + peer.run(|| { + let sock = UdpSocket::bind((PeerNsGuard::PEER_IP, 0)).unwrap(); + sock.send_to(PAYLOAD, addr).unwrap(); + }); + + let mut buf = [0u8; PAYLOAD.len() + 1]; + let n = sock.recv(&mut buf).unwrap(); + assert_eq!(&buf[..n], PAYLOAD); +} #[test_log::test] #[expect( @@ -18,7 +40,7 @@ use crate::utils::NetNsGuard; reason = "packet headers are encoded in network byte order" )] fn af_xdp() { - let _netns = NetNsGuard::new(); + let (_netns, peer) = NetNsGuard::with_peer(); let mut bpf = Ebpf::load(crate::REDIRECT).unwrap(); let mut socks: XskMap<_> = bpf.take_map("SOCKS").unwrap().try_into().unwrap(); @@ -29,7 +51,8 @@ fn af_xdp() { .try_into() .unwrap(); xdp.load().unwrap(); - xdp.attach("lo", XdpFlags::default()).unwrap(); + xdp.attach(PeerNsGuard::IFACE_NAME, XdpFlags::default()) + .unwrap(); const SIZE: usize = 2 * 4096; @@ -47,7 +70,7 @@ fn af_xdp() { }; let mut iface = IfInfo::invalid(); - iface.from_name(c"lo").unwrap(); + iface.from_name(c"veth0").unwrap(); let sock = match Socket::with_shared(&iface, &umem) { Ok(sock) => sock, Err(err) => { @@ -81,9 +104,13 @@ fn af_xdp() { writer.insert_once(frame1.offset); writer.commit(); - let sock = UdpSocket::bind("127.0.0.1:0").unwrap(); - let port = sock.local_addr().unwrap().port(); - sock.send_to(b"hello AF_XDP", "127.0.0.1:1777").unwrap(); + let dst = (PeerNsGuard::IFACE_IP, 1777u16); + let port = peer.run(|| { + let sock = UdpSocket::bind((PeerNsGuard::PEER_IP, 0)).unwrap(); + let port = sock.local_addr().unwrap().port(); + sock.send_to(b"hello AF_XDP", dst).unwrap(); + port + }); assert_eq!(rx.available(), 1); let desc = rx.receive(1).read().unwrap(); @@ -97,20 +124,26 @@ fn af_xdp() { assert_eq!(ip[9], 17); // UDP let (udp, payload) = buf.split_at(8); let ports = &udp[..4]; - let (src, dst) = ports.split_at(2); + let (src, dst_port) = ports.split_at(2); assert_eq!(src, port.to_be_bytes().as_slice()); // Source - assert_eq!(dst, 1777u16.to_be_bytes().as_slice()); // Dest + assert_eq!(dst_port, 1777u16.to_be_bytes().as_slice()); // Dest assert_eq!(payload, b"hello AF_XDP"); assert_eq!(rx.available(), 1); // Removes socket from map, no more packets will be redirected. socks.unset(0).unwrap(); assert_eq!(rx.available(), 1); - sock.send_to(b"hello AF_XDP", "127.0.0.1:1777").unwrap(); + peer.run(|| { + let sock = UdpSocket::bind((PeerNsGuard::PEER_IP, 0)).unwrap(); + sock.send_to(b"hello AF_XDP", dst).unwrap(); + }); assert_eq!(rx.available(), 1); // Adds socket to map again, packets will be redirected again. socks.set(0, rx.as_raw_fd(), 0).unwrap(); - sock.send_to(b"hello AF_XDP", "127.0.0.1:1777").unwrap(); + peer.run(|| { + let sock = UdpSocket::bind((PeerNsGuard::PEER_IP, 0)).unwrap(); + sock.send_to(b"hello AF_XDP", dst).unwrap(); + }); assert_eq!(rx.available(), 2); } @@ -182,34 +215,35 @@ fn cpumap_chain() { }; cpus.set(0, 2048, Some(xdp_chain_fd), 0).unwrap(); - // Load the main program + // Load the main program and attach to loopback (generic/SKB-mode XDP). + // We use loopback instead of veth because cpumap chaining does not + // reliably deliver packets through veth on arm64. let xdp: &mut Xdp = bpf.program_mut("redirect_cpu").unwrap().try_into().unwrap(); xdp.load().unwrap(); - let result = xdp.attach("lo", XdpFlags::default()); + // Generic devices did not support cpumap XDP programs until 5.15. - // // See https://github.com/torvalds/linux/commit/11941f8a85362f612df61f4aaab0e41b64d2111d. + let result = xdp.attach("lo", XdpFlags::default()); if KernelVersion::current().unwrap() < KernelVersion::new(5, 15, 0) { assert_matches!(result, Err(ProgramError::XdpError(XdpError::NetlinkError(err))) => { assert_eq!(err.raw_os_error(), Some(libc::EINVAL)) }); - eprintln!( - "skipping test - cpumap attachment not supported on generic (loopback) interface" - ); + eprintln!("skipping test - cpumap on generic XDP requires kernel >= 5.15"); return; } - let _unused: XdpLinkId = result.unwrap(); + result.unwrap(); const PAYLOAD: &str = "hello cpumap"; + // Send a UDP packet to ourselves over loopback so it hits our XDP program. let sock = UdpSocket::bind("127.0.0.1:0").unwrap(); let addr = sock.local_addr().unwrap(); sock.set_read_timeout(Some(Duration::from_secs(60))) .unwrap(); sock.send_to(PAYLOAD.as_bytes(), addr).unwrap(); - // Read back the packet to ensure it went through the entire network stack, including our two - // probes. + // Read back the packet to ensure it traversed the full XDP pipeline: + // redirect_cpu -> cpumap -> redirect_cpu_chain -> network stack. let mut buf = [0u8; PAYLOAD.len() + 1]; let n = sock.recv(&mut buf).unwrap(); diff --git a/test/integration-test/src/utils.rs b/test/integration-test/src/utils.rs index 04aa50413..d8a104ac0 100644 --- a/test/integration-test/src/utils.rs +++ b/test/integration-test/src/utils.rs @@ -3,18 +3,21 @@ use std::{ borrow::Cow, cell::OnceCell, - ffi::CString, + ffi::CStr, fs, io::{self, Write as _}, + net::Ipv4Addr, + os::fd::AsRawFd as _, path::Path, process, sync::atomic::{AtomicU64, Ordering}, }; use anyhow::{Context as _, Result}; -use aya::netlink_set_link_up; use libc::if_nametoindex; +use crate::netlink; + const CGROUP_ROOT: &str = "/sys/fs/cgroup"; const CGROUP_PROCS: &str = "cgroup.procs"; @@ -179,21 +182,27 @@ impl NetNsGuard { let ns = Self { name, old_ns }; // By default, the loopback in a new netns is down. Set it up. - let lo = CString::new("lo").unwrap(); unsafe { - let idx = if_nametoindex(lo.as_ptr()); - assert!( - idx != 0, + let idx = if_nametoindex(c"lo".as_ptr()); + assert_ne!( + idx, + 0, "interface `lo` not found in netns {}: {}", ns.name, io::Error::last_os_error() ); - netlink_set_link_up(idx as i32) + netlink::set_link_up(idx as i32) .unwrap_or_else(|e| panic!("failed to set `lo` up in netns {}: {e}", ns.name)); } ns } + + pub(crate) fn with_peer() -> (Self, PeerNsGuard) { + let netns = Self::new(); + let peer = PeerNsGuard::new(&netns); + (netns, peer) + } } impl Drop for NetNsGuard { @@ -231,6 +240,270 @@ impl Drop for NetNsGuard { } } +/// Run a closure inside a network namespace identified by a file handle. +fn run_in_netns(ns_file: &fs::File, f: F) -> R +where + F: FnOnce() -> R + Send, + R: Send, +{ + std::thread::scope(|s| { + s.spawn(|| { + nix::sched::setns(ns_file, nix::sched::CloneFlags::CLONE_NEWNET) + .expect("setns to target netns"); + f() + }) + .join() + .unwrap() + }) +} + +/// A second network namespace connected to the test namespace via the veth pair. +/// +/// Creates a topology where `veth0` (10.0.0.1) lives in the test namespace and +/// `veth1` (10.0.0.2) is moved into the peer namespace. Static ARP entries are +/// installed on both sides so that no ARP traffic interferes with XDP programs. +/// +/// Must be declared *after* `NetNsGuard` so that it is dropped first (Rust drops +/// locals in reverse declaration order). Dropping the peer namespace also destroys +/// its end of the veth pair. +pub(crate) struct PeerNsGuard { + name: String, +} + +impl PeerNsGuard { + const IFACE: &CStr = c"veth0"; + const PEER_IFACE: &CStr = c"veth1"; + pub(crate) const IFACE_NAME: &str = "veth0"; + pub(crate) const IFACE_IP: Ipv4Addr = Ipv4Addr::new(10, 0, 0, 1); + pub(crate) const PEER_IP: Ipv4Addr = Ipv4Addr::new(10, 0, 0, 2); + + fn new(netns: &NetNsGuard) -> Self { + let name = format!("{}-peer", netns.name); + + // Create a veth pair for tests that attach XDP/TC programs. Veth supports + // native XDP (unlike loopback which only supports SKB/generic mode), so tests + // exercise the same code path used in production. + netlink::create_veth_pair(Self::IFACE, Self::PEER_IFACE).unwrap_or_else(|e| { + panic!( + "failed to create veth pair {:?}/{:?} in netns {}: {e}", + Self::IFACE, + Self::PEER_IFACE, + netns.name + ) + }); + + // Bring up veth0. + unsafe { + let idx = if_nametoindex(Self::IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found in netns {}: {}", + Self::IFACE, + netns.name, + io::Error::last_os_error() + ); + netlink::set_link_up(idx as i32).unwrap_or_else(|e| { + panic!( + "failed to set `{:?}` up in netns {}: {e}", + Self::IFACE, + netns.name + ) + }); + } + + // Create peer netns: create a persist file, then use a separate thread + // to unshare(CLONE_NEWNET) and bind-mount the new ns over the file. + let ns_path = Path::new(NetNsGuard::PERSIST_DIR).join(&name); + let _unused: fs::File = fs::File::create(&ns_path) + .unwrap_or_else(|err| panic!("fs::File::create(\"{}\"): {err:?}", ns_path.display())); + std::thread::scope(|s| { + s.spawn(|| { + nix::sched::unshare(nix::sched::CloneFlags::CLONE_NEWNET) + .expect("unshare(CLONE_NEWNET) for peer ns"); + let new_ns_path = format!("/proc/self/task/{}/ns/net", nix::unistd::gettid()); + nix::mount::mount( + Some(new_ns_path.as_str()), + &ns_path, + Some("none"), + nix::mount::MsFlags::MS_BIND, + None::<&str>, + ) + .expect("mount-bind peer netns"); + }) + .join() + .unwrap(); + }); + + // Move veth1 into peer netns. + let peer_ns = fs::File::open(&ns_path) + .unwrap_or_else(|e| panic!("open(\"{}\"): {e}", ns_path.display())); + unsafe { + let idx = if_nametoindex(Self::PEER_IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found: {}", + Self::PEER_IFACE, + io::Error::last_os_error() + ); + netlink::set_link_ns(idx as i32, peer_ns.as_raw_fd()).unwrap_or_else(|e| { + panic!( + "failed to move `{:?}` to netns {name}: {e}", + Self::PEER_IFACE + ) + }); + } + + // Assign IP to veth0 in test netns. + unsafe { + let idx = if_nametoindex(Self::IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found: {}", + Self::IFACE, + io::Error::last_os_error() + ); + netlink::add_addr_v4(idx as i32, Self::IFACE_IP, 24) + .unwrap_or_else(|e| panic!("failed to add addr to `{:?}`: {e}", Self::IFACE)); + } + + // Configure veth1 in peer netns: add addr, set link up, set lo up, get MAC. + let veth1_mac = run_in_netns(&peer_ns, || unsafe { + let idx = if_nametoindex(Self::PEER_IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found in peer netns: {}", + Self::PEER_IFACE, + io::Error::last_os_error() + ); + netlink::add_addr_v4(idx as i32, Self::PEER_IP, 24).unwrap_or_else(|e| { + panic!( + "failed to add addr to `{:?}` in peer netns: {e}", + Self::PEER_IFACE + ) + }); + netlink::set_link_up(idx as i32).unwrap_or_else(|e| { + panic!( + "failed to set `{:?}` up in peer netns: {e}", + Self::PEER_IFACE + ) + }); + + let lo_idx = if_nametoindex(c"lo".as_ptr()); + assert_ne!( + lo_idx, + 0, + "interface `lo` not found in peer netns: {}", + io::Error::last_os_error() + ); + netlink::set_link_up(lo_idx as i32) + .unwrap_or_else(|e| panic!("failed to set `lo` up in peer netns: {e}")); + + netlink::get_link_mac(idx as i32) + .unwrap_or_else(|e| panic!("failed to get MAC of `{:?}`: {e}", Self::PEER_IFACE)) + }); + + // Read veth0 MAC in test netns. + let veth0_mac = unsafe { + let idx = if_nametoindex(Self::IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found: {}", + Self::IFACE, + io::Error::last_os_error() + ); + netlink::get_link_mac(idx as i32) + .unwrap_or_else(|e| panic!("failed to get MAC of `{:?}`: {e}", Self::IFACE)) + }; + + // Static ARP in test netns: peer IP -> veth1 MAC. + unsafe { + let idx = if_nametoindex(Self::IFACE.as_ptr()); + netlink::add_neigh_v4(idx as i32, Self::PEER_IP, veth1_mac).unwrap_or_else(|e| { + panic!( + "failed to add neigh entry for {} on `{:?}`: {e}", + Self::PEER_IP, + Self::IFACE + ) + }); + } + + // Static ARP in peer netns: test IP -> veth0 MAC. + run_in_netns(&peer_ns, || unsafe { + let idx = if_nametoindex(Self::PEER_IFACE.as_ptr()); + assert_ne!( + idx, + 0, + "interface `{:?}` not found in peer netns: {}", + Self::PEER_IFACE, + io::Error::last_os_error() + ); + netlink::add_neigh_v4(idx as i32, Self::IFACE_IP, veth0_mac).unwrap_or_else(|e| { + panic!( + "failed to add neigh entry for {} on `{:?}`: {e}", + Self::IFACE_IP, + Self::PEER_IFACE + ) + }); + }); + + Self { name } + } + + /// Run a closure inside the peer network namespace. + /// + /// `f` runs on a separate thread because `setns` affects the calling thread. + /// `F` is `Send` but not `'static`, so callers can capture references whose + /// lifetime is bound by this guard. + pub(crate) fn run(&self, f: F) -> R + where + F: FnOnce() -> R + Send, + R: Send, + { + let peer_ns_path = Path::new(NetNsGuard::PERSIST_DIR).join(&self.name); + let peer_ns = fs::File::open(&peer_ns_path) + .unwrap_or_else(|e| panic!("open(\"{}\"): {e}", peer_ns_path.display())); + run_in_netns(&peer_ns, f) + } +} + +impl Drop for PeerNsGuard { + #[expect( + clippy::print_stderr, + reason = "drop handlers avoid panic-in-panic by logging errors" + )] + #[expect( + clippy::use_debug, + reason = "debug formatting preserves error context in drop" + )] + fn drop(&mut self) { + let Self { name } = self; + match (|| -> Result<()> { + let ns_path = Path::new(NetNsGuard::PERSIST_DIR).join(&name); + nix::mount::umount2(&ns_path, nix::mount::MntFlags::MNT_DETACH).with_context(|| { + format!("nix::mount::umount2(\"{}\", MNT_DETACH)", ns_path.display()) + })?; + fs::remove_file(&ns_path) + .with_context(|| format!("fs::remove_file(\"{}\")", ns_path.display()))?; + Ok(()) + })() { + Ok(()) => (), + Err(err) => { + // Avoid panic in panic. + if std::thread::panicking() { + eprintln!("{err:?}"); + } else { + panic!("{err:?}"); + } + } + } + } +} + /// If the `KernelVersion::current >= $version`, `assert!($cond)`, else `assert!(!$cond)`. macro_rules! kernel_assert { ($cond:expr, $version:expr $(,)?) => { diff --git a/xtask/src/codegen/aya.rs b/xtask/src/codegen/aya.rs index 2945084f6..27465631a 100644 --- a/xtask/src/codegen/aya.rs +++ b/xtask/src/codegen/aya.rs @@ -118,7 +118,9 @@ fn codegen_bindings(opts: &SysrootOptions, libbpf_dir: &Path) -> Result<()> { "perf_type_id", "perf_event_type", // NETLINK + "ifaddrmsg", "ifinfomsg", + "ndmsg", "tcmsg", "nlmsgerr_attrs", // ITER @@ -163,6 +165,8 @@ fn codegen_bindings(opts: &SysrootOptions, libbpf_dir: &Path) -> Result<()> { "TC_H_MIN_EGRESS", // Ringbuf "BPF_RINGBUF_.*", + // VETH + "VETH_INFO_PEER", // NETFILTER "NFPROTO_.*", ];