Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions man/io_uring_enter.2
Original file line number Diff line number Diff line change
Expand Up @@ -2130,13 +2130,6 @@ The
field in a submission queue entry is invalid.
.TP
.B EINVAL
.B IORING_OP_NOP
was specified in the submission queue entry, but the io_uring context
was set up for polling
.RB ( IORING_SETUP_IOPOLL
was specified in the call to io_uring_setup).
.TP
.B EINVAL
.B IORING_OP_READV
or
.B IORING_OP_WRITEV
Expand Down
12 changes: 9 additions & 3 deletions man/io_uring_setup.2
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,17 @@ this to work. Busy-waiting provides lower latency, but may consume
more CPU resources than interrupt driven I/O. Currently, this feature
is usable only on a file descriptor opened using the
.B O_DIRECT
flag. When a read or write is submitted to a polled context, the
flag
(if using the
.B IORING_OP_{READ,WRITE}(V)(_FIXED)
opcodes). When a read or write is submitted to a polled context, the
application must poll for completions on the CQ ring by calling
.BR io_uring_enter (2).
It is illegal to mix and match polled and non-polled I/O on an io_uring
instance.
Most non-polled I/O requests are forbidden on
.B IORING_SETUP_IOPOLL
io_uring instances (see
.BR io_uring_setup_flags (7)
for the allowed opcodes).

This is only applicable for storage devices for now, and the storage device
must be configured for polling. How to do that depends on the device type
Expand Down
29 changes: 25 additions & 4 deletions man/io_uring_setup_flags.7
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ latency for high-performance storage devices (NVMe, etc.) but requires:
.IP \(bu 2
Files opened with
.B O_DIRECT
(if using the
.B IORING_OP_{READ,WRITE}(V)(_FIXED)
opcodes)
.IP \(bu
Hardware and drivers that support polling
.IP \(bu
Expand All @@ -37,10 +40,28 @@ to reap completions (busy-polling)
.IP \(bu
Storage device configuration for polling support
.PP
IOPOLL rings cannot use IRQ-driven completion; the application must poll.
Only request types that support polling may be issued on an IOPOLL ring.
This mode is commonly used for scenarios that purely do polled I/O on
storage devices like NVMe.
Only the following opcodes are allowed on IOPOLL rings:
.IP \(bu 2
.B IORING_OP_NOP(128)
.IP \(bu
.B IORING_OP_{READ,WRITE}(V)(_FIXED)
(if the file supports busy-polling)
.IP \(bu
.B IORING_OP_FILES_UPDATE
.IP \(bu
.B IORING_OP_{PROVIDE,REMOVE}_BUFFERS
.IP \(bu
.B IORING_OP_MSG_RING
.IP \(bu
.B IORING_OP_URING_CMD(128)
.PP
Since kernel 7.1, an
.B IORING_OP_URING_CMD(128)
request will use busy-polling if the file supports it (i.e., NVMe passthrough
I/O commands).
Previously,
.B IORING_OP_URING_CMD(128)
was only allowed on files that supported busy-polling.
.PP
Using IOPOLL generally requires storage device setup. For NVMe devices,
the kernel parameter
Expand Down
1 change: 1 addition & 0 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ test_srcs := \
iopoll.c \
iopoll-leak.c \
iopoll-overflow.c \
iopoll-sync.c \
io_uring_enter.c \
io_uring_passthrough.c \
io_uring_register.c \
Expand Down
129 changes: 129 additions & 0 deletions test/iopoll-sync.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
/* SPDX-License-Identifier: MIT */
/*
* Description: test uring_cmds for files that don't support iopoll
* on IORING_SETUP_IOPOLL rings
*/

#include <liburing.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#include "helpers.h"

/*
 * Issue a linked GETSOCKOPT -> SETSOCKOPT -> GETSOCKOPT chain of socket
 * uring_cmds on an IORING_SETUP_IOPOLL ring and verify the three CQEs.
 *
 * The CQ ring is inspected directly after io_uring_submit(), without
 * waiting, so the test expects all three completions to be posted by then.
 *
 * Returns T_EXIT_PASS on success, T_EXIT_SKIP when the socket, the ring or
 * the GETSOCKOPT command is unavailable, and T_EXIT_FAIL otherwise. The
 * socket and the ring are released on every exit path.
 */
int main(void)
{
	int level = SOL_SOCKET;
	int optname = SO_REUSEADDR;
	int optval1, optval2, optval3;
	int status = T_EXIT_FAIL;
	struct io_uring_cqe_iter cqe_iter;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring ring;
	int sockfd;
	int ret;

	sockfd = socket(AF_INET, SOCK_STREAM, 0);
	if (sockfd < 0) {
		fprintf(stderr, "socket() failed: %m\n");
		return T_EXIT_SKIP;
	}

	/* Start from a known state: SO_REUSEADDR disabled. */
	optval1 = 0;
	if (setsockopt(sockfd, level, optname, &optval1, sizeof(optval1)) < 0) {
		fprintf(stderr, "setsockopt() failed: %m\n");
		status = T_EXIT_SKIP;
		goto out_close;
	}

	ret = t_create_ring(3, &ring, IORING_SETUP_IOPOLL);
	if (ret == T_SETUP_SKIP) {
		fprintf(stderr, "IORING_SETUP_IOPOLL not supported\n");
		status = T_EXIT_SKIP;
		goto out_close;
	}
	if (ret)
		goto out_close;

	/* 1: read the option back; the sentinel 123 must be overwritten by 0. */
	optval1 = 123;
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		fprintf(stderr, "io_uring_get_sqe() failed\n");
		goto out_ring;
	}
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_GETSOCKOPT, sockfd,
			       level, optname, &optval1, sizeof(optval1));
	sqe->flags |= IOSQE_IO_LINK;
	sqe->user_data = 1;

	/* 2: enable the option, linked after request 1. */
	optval2 = 1;
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		fprintf(stderr, "io_uring_get_sqe() failed\n");
		goto out_ring;
	}
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_SETSOCKOPT, sockfd,
			       level, optname, &optval2, sizeof(optval2));
	sqe->flags |= IOSQE_IO_LINK;
	sqe->user_data = 2;

	/* 3: read the option again; it ends the chain and must observe 1. */
	optval3 = 123;
	sqe = io_uring_get_sqe(&ring);
	if (!sqe) {
		fprintf(stderr, "io_uring_get_sqe() failed\n");
		goto out_ring;
	}
	io_uring_prep_cmd_sock(sqe, SOCKET_URING_OP_GETSOCKOPT, sockfd,
			       level, optname, &optval3, sizeof(optval3));
	sqe->user_data = 3;

	ret = io_uring_submit(&ring);
	if (ret != 3) {
		fprintf(stderr, "io_uring_submit() returned %d\n", ret);
		goto out_ring;
	}

	cqe_iter = io_uring_cqe_iter_init(&ring);

	/* First CQE: initial GETSOCKOPT. */
	if (!io_uring_cqe_iter_next(&cqe_iter, &cqe)) {
		fprintf(stderr, "No CQE available\n");
		goto out_ring;
	}
	if (cqe->user_data != 1) {
		fprintf(stderr, "No CQE for user_data 1\n");
		goto out_ring;
	}
	if (cqe->res == -EOPNOTSUPP) {
		fprintf(stderr, "GETSOCKOPT not supported\n");
		status = T_EXIT_SKIP;
		goto out_ring;
	}
	if (cqe->res != (int)sizeof(optval1)) {
		fprintf(stderr, "GETSOCKOPT returned %d\n", cqe->res);
		goto out_ring;
	}
	if (optval1 != 0) {
		fprintf(stderr, "optval %d != 0\n", optval1);
		goto out_ring;
	}

	/* Second CQE: SETSOCKOPT. */
	if (!io_uring_cqe_iter_next(&cqe_iter, &cqe)) {
		fprintf(stderr, "Only 1 CQE available\n");
		goto out_ring;
	}
	if (cqe->user_data != 2) {
		fprintf(stderr, "No CQE for user_data 2\n");
		goto out_ring;
	}
	if (cqe->res) {
		fprintf(stderr, "SETSOCKOPT returned %d\n", cqe->res);
		goto out_ring;
	}

	/* Third CQE: final GETSOCKOPT. */
	if (!io_uring_cqe_iter_next(&cqe_iter, &cqe)) {
		fprintf(stderr, "Only 2 CQEs available\n");
		goto out_ring;
	}
	if (cqe->user_data != 3) {
		fprintf(stderr, "No CQE for user_data 3\n");
		goto out_ring;
	}
	if (cqe->res != (int)sizeof(optval3)) {
		fprintf(stderr, "GETSOCKOPT returned %d\n", cqe->res);
		goto out_ring;
	}
	if (optval3 != 1) {
		fprintf(stderr, "optval %d != 1\n", optval3);
		goto out_ring;
	}

	/* Exactly three completions are expected. */
	if (io_uring_cqe_iter_next(&cqe_iter, &cqe)) {
		fprintf(stderr, "More than 3 CQEs available\n");
		goto out_ring;
	}

	status = T_EXIT_PASS;
out_ring:
	io_uring_queue_exit(&ring);
out_close:
	close(sockfd);
	return status;
}
Loading