597 lines
18 KiB
Groff
597 lines
18 KiB
Groff
.\" Copyright (C) 2019 Jens Axboe <axboe@kernel.dk>
|
|
.\" Copyright (C) 2019 Jon Corbet <corbet@lwn.net>
|
|
.\" Copyright (C) 2019 Red Hat, Inc.
|
|
.\"
|
|
.\" SPDX-License-Identifier: LGPL-2.0-or-later
|
|
.\"
|
|
.TH IO_URING_SETUP 2 2019-01-29 "Linux" "Linux Programmer's Manual"
|
|
.SH NAME
|
|
io_uring_setup \- set up a context for performing asynchronous I/O
|
|
.SH SYNOPSIS
|
|
.nf
|
|
.BR "#include <linux/io_uring.h>"
|
|
.PP
|
|
.BI "int io_uring_setup(u32 " entries ", struct io_uring_params *" p );
|
|
.fi
|
|
.PP
|
|
.SH DESCRIPTION
|
|
.PP
|
|
The io_uring_setup() system call sets up a submission queue (SQ) and
|
|
completion queue (CQ) with at least
|
|
.I entries
|
|
entries, and returns a file descriptor which can be used to perform
|
|
subsequent operations on the io_uring instance. The submission and
|
|
completion queues are shared between userspace and the kernel, which
|
|
eliminates the need to copy data when initiating and completing I/O.
|
|
|
|
.I params
|
|
is used by the application to pass options to the kernel, and by the
|
|
kernel to convey information about the ring buffers.
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_uring_params {
|
|
__u32 sq_entries;
|
|
__u32 cq_entries;
|
|
__u32 flags;
|
|
__u32 sq_thread_cpu;
|
|
__u32 sq_thread_idle;
|
|
__u32 features;
|
|
__u32 wq_fd;
|
|
__u32 resv[3];
|
|
struct io_sqring_offsets sq_off;
|
|
struct io_cqring_offsets cq_off;
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The
|
|
.IR flags ,
|
|
.IR sq_thread_cpu ,
|
|
and
|
|
.I sq_thread_idle
|
|
fields are used to configure the io_uring instance.
|
|
.I flags
|
|
is a bit mask of 0 or more of the following values ORed
|
|
together:
|
|
.TP
|
|
.B IORING_SETUP_IOPOLL
|
|
Perform busy-waiting for an I/O completion, as opposed to getting
|
|
notifications via an asynchronous IRQ (Interrupt Request). The file
|
|
system (if any) and block device must support polling in order for
|
|
this to work. Busy-waiting provides lower latency, but may consume
|
|
more CPU resources than interrupt driven I/O. Currently, this feature
|
|
is usable only on a file descriptor opened using the
|
|
.B O_DIRECT
|
|
flag. When a read or write is submitted to a polled context, the
|
|
application must poll for completions on the CQ ring by calling
|
|
.BR io_uring_enter (2).
|
|
It is illegal to mix and match polled and non-polled I/O on an io_uring
|
|
instance.
|
|
|
|
.TP
|
|
.B IORING_SETUP_SQPOLL
|
|
When this flag is specified, a kernel thread is created to perform
|
|
submission queue polling. An io_uring instance configured in this way
|
|
enables an application to issue I/O without ever context switching
|
|
into the kernel. By using the submission queue to fill in new
|
|
submission queue entries and watching for completions on the
|
|
completion queue, the application can submit and reap I/Os without
|
|
doing a single system call.
|
|
|
|
If the kernel thread is idle for more than
|
|
.I sq_thread_idle
|
|
milliseconds, it will set the
|
|
.B IORING_SQ_NEED_WAKEUP
|
|
bit in the
|
|
.I flags
|
|
field of the
|
|
.IR "struct io_sq_ring" .
|
|
When this happens, the application must call
|
|
.BR io_uring_enter (2)
|
|
to wake the kernel thread. If I/O is kept busy, the kernel thread
|
|
will never sleep. An application making use of this feature will need
|
|
to guard the
|
|
.BR io_uring_enter (2)
|
|
call with the following code sequence:
|
|
|
|
.in +4n
|
|
.EX
|
|
/*
|
|
* Ensure that the wakeup flag is read after the tail pointer
|
|
* has been written. It's important to use memory load acquire
|
|
* semantics for the flags read, as otherwise the application
|
|
* and the kernel might not agree on the consistency of the
|
|
* wakeup flag.
|
|
*/
|
|
unsigned flags = atomic_load_relaxed(sq_ring->flags);
|
|
if (flags & IORING_SQ_NEED_WAKEUP)
|
|
io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
|
|
.EE
|
|
.in
|
|
|
|
where
|
|
.I sq_ring
|
|
is a submission queue ring setup using the
|
|
.I struct io_sqring_offsets
|
|
described below.
|
|
.TP
|
|
.BR
|
|
Before version 5.11 of the Linux kernel, to successfully use this feature, the
|
|
application must register a set of files to be used for IO through
|
|
.BR io_uring_register (2)
|
|
using the
|
|
.B IORING_REGISTER_FILES
|
|
opcode. Failure to do so will result in submitted IO being errored with
|
|
.BR EBADF .
|
|
The presence of this feature can be detected by the
|
|
.B IORING_FEAT_SQPOLL_NONFIXED
|
|
feature flag.
|
|
In version 5.11 and later, it is no longer necessary to register files to use
|
|
this feature. 5.11 also allows using this as non-root, if the user has the
|
|
.B CAP_SYS_NICE
|
|
capability.
|
|
.TP
|
|
.B IORING_SETUP_SQ_AFF
|
|
If this flag is specified, then the poll thread will be bound to the
|
|
cpu set in the
|
|
.I sq_thread_cpu
|
|
field of the
|
|
.IR "struct io_uring_params" .
|
|
This flag is only meaningful when
|
|
.B IORING_SETUP_SQPOLL
|
|
is specified. When cgroup setting
|
|
.I cpuset.cpus
|
|
changes (typically in a container environment), the bound CPU set may be
|
|
changed as well.
|
|
.TP
|
|
.B IORING_SETUP_CQSIZE
|
|
Create the completion queue with
|
|
.IR "struct io_uring_params.cq_entries"
|
|
entries. The value must be greater than
|
|
.IR entries ,
|
|
and may be rounded up to the next power-of-two.
|
|
.TP
|
|
.B IORING_SETUP_CLAMP
|
|
If this flag is specified, and if
|
|
.IR entries
|
|
exceeds
|
|
.BR IORING_MAX_ENTRIES ,
|
|
then
|
|
.IR entries
|
|
will be clamped at
|
|
.BR IORING_MAX_ENTRIES .
|
|
If the flag
|
|
.B IORING_SETUP_CQSIZE
|
|
is set, and if the value of
|
|
.IR "struct io_uring_params.cq_entries"
|
|
exceeds
|
|
.BR IORING_MAX_CQ_ENTRIES ,
|
|
then it will be clamped at
|
|
.BR IORING_MAX_CQ_ENTRIES .
|
|
.TP
|
|
.B IORING_SETUP_ATTACH_WQ
|
|
This flag should be set in conjunction with
|
|
.IR "struct io_uring_params.wq_fd"
|
|
being set to an existing io_uring ring file descriptor. When set, the
|
|
io_uring instance being created will share the asynchronous worker
|
|
thread backend of the specified io_uring ring, rather than create a new
|
|
separate thread pool.
|
|
.TP
|
|
.B IORING_SETUP_R_DISABLED
|
|
If this flag is specified, the io_uring ring starts in a disabled state.
|
|
In this state, restrictions can be registered, but submissions are not allowed.
|
|
See
|
|
.BR io_uring_register (2)
|
|
for details on how to enable the ring. Available since 5.10.
|
|
.TP
|
|
.B IORING_SETUP_SUBMIT_ALL
|
|
Normally io_uring stops submitting a batch of requests, if one of these requests
|
|
results in an error. This can cause submission of less than what is expected,
|
|
if a request ends in error while being submitted. If the ring is created with
|
|
this flag,
|
|
.BR io_uring_enter (2)
|
|
will continue submitting requests even if it encounters an error submitting
|
|
a request. CQEs are still posted for errored requests regardless of whether or
|
|
not this flag is set at ring creation time, the only difference is if the
|
|
submit sequence is halted or continued when an error is observed. Available
|
|
since 5.18.
|
|
.TP
|
|
.B IORING_SETUP_COOP_TASKRUN
|
|
By default, io_uring will interrupt a task running in userspace when a
|
|
completion event comes in. This is to ensure that completions run in a timely
|
|
manner. For a lot of use cases, this is overkill and can cause reduced
|
|
performance from both the inter-processor interrupt used to do this, the
|
|
kernel/user transition, the needless interruption of the task's userspace
|
|
activities, and reduced batching if completions come in at a rapid rate. Most
|
|
applications don't need the forceful interruption, as the events are processed
|
|
at any kernel/user transition. The exceptions are setups where the application
|
|
uses multiple threads operating on the same ring, where the application
|
|
waiting on completions isn't the one that submitted them. For most other
|
|
use cases, setting this flag will improve performance. Available since 5.19.
|
|
.TP
|
|
.B IORING_SETUP_TASKRUN_FLAG
|
|
Used in conjunction with
|
|
.BR IORING_SETUP_COOP_TASKRUN ,
|
|
this provides a flag,
|
|
.BR IORING_SQ_TASKRUN ,
|
|
which is set in the SQ ring
|
|
.I flags
|
|
whenever completions are pending that should be processed. liburing will check
|
|
for this flag even when doing
|
|
.BR io_uring_peek_cqe (3)
|
|
and enter the kernel to process them, and applications can do the same. This
|
|
makes
|
|
.B IORING_SETUP_TASKRUN_FLAG
|
|
safe to use even when applications rely on a peek style operation on the CQ
|
|
ring to see if anything might be pending to reap. Available since 5.19.
|
|
.TP
|
|
.B IORING_SETUP_SQE128
|
|
If set, io_uring will use 128-byte SQEs rather than the normal 64-byte sized
|
|
variant. This is a requirement for using certain request types, as of 5.19
|
|
only the
|
|
.B IORING_OP_URING_CMD
|
|
passthrough command for NVMe passthrough needs this. Available since 5.19.
|
|
.TP
|
|
.B IORING_SETUP_CQE32
|
|
If set, io_uring will use 32-byte CQEs rather than the normal 16-byte sized
|
|
variant. This is a requirement for using certain request types, as of 5.19
|
|
only the
|
|
.B IORING_OP_URING_CMD
|
|
passthrough command for NVMe passthrough needs this. Available since 5.19.
|
|
.PP
|
|
If no flags are specified, the io_uring instance is set up for
|
|
interrupt driven I/O. I/O may be submitted using
|
|
.BR io_uring_enter (2)
|
|
and can be reaped by polling the completion queue.
|
|
|
|
The
|
|
.I resv
|
|
array must be initialized to zero.
|
|
|
|
.I features
|
|
is filled in by the kernel, which specifies various features supported
|
|
by current kernel version.
|
|
.TP
|
|
.B IORING_FEAT_SINGLE_MMAP
|
|
If this flag is set, the two SQ and CQ rings can be mapped with a single
|
|
.BR mmap (2)
|
|
call. The SQEs must still be allocated separately. This brings the necessary
|
|
.BR mmap (2)
|
|
calls down from three to two. Available since kernel 5.4.
|
|
.TP
|
|
.B IORING_FEAT_NODROP
|
|
If this flag is set, io_uring supports never dropping completion events.
|
|
If a completion event occurs and the CQ ring is full, the kernel stores
|
|
the event internally until such a time that the CQ ring has room for more
|
|
entries. If this overflow condition is entered, attempting to submit more
|
|
IO will fail with the
|
|
.B -EBUSY
|
|
error value, if it can't flush the overflown events to the CQ ring. If this
|
|
happens, the application must reap events from the CQ ring and attempt the
|
|
submit again. Available since kernel 5.5.
|
|
.TP
|
|
.B IORING_FEAT_SUBMIT_STABLE
|
|
If this flag is set, applications can be certain that any data for
|
|
async offload has been consumed when the kernel has consumed the SQE. Available
|
|
since kernel 5.5.
|
|
.TP
|
|
.B IORING_FEAT_RW_CUR_POS
|
|
If this flag is set, applications can specify
|
|
.I offset
|
|
==
|
|
.B -1
|
|
with
|
|
.BR IORING_OP_{READV,WRITEV} ,
|
|
.BR IORING_OP_{READ,WRITE}_FIXED ,
and
|
|
.B IORING_OP_{READ,WRITE}
|
|
to mean current file position, which behaves like
|
|
.BR preadv2 (2)
|
|
and
|
|
.BR pwritev2 (2)
|
|
with
|
|
.I offset
|
|
==
|
|
.BR -1 .
|
|
It'll use (and update) the current file position. This obviously comes
|
|
with the caveat that if the application has multiple reads or writes in flight,
|
|
then the end result will not be as expected. This is similar to threads sharing
|
|
a file descriptor and doing IO using the current file position. Available since
|
|
kernel 5.6.
|
|
.TP
|
|
.B IORING_FEAT_CUR_PERSONALITY
|
|
If this flag is set, then io_uring guarantees that both sync and async
|
|
execution of a request assumes the credentials of the task that called
|
|
.BR io_uring_enter (2)
|
|
to queue the requests. If this flag isn't set, then requests are issued with
|
|
the credentials of the task that originally registered the io_uring. If only
|
|
one task is using a ring, then this flag doesn't matter as the credentials
|
|
will always be the same. Note that this is the default behavior, tasks can
|
|
still register different personalities through
|
|
.BR io_uring_register (2)
|
|
with
|
|
.B IORING_REGISTER_PERSONALITY
|
|
and specify the personality to use in the sqe. Available since kernel 5.6.
|
|
.TP
|
|
.B IORING_FEAT_FAST_POLL
|
|
If this flag is set, then io_uring supports using an internal poll mechanism
|
|
to drive data/space readiness. This means that requests that cannot read or
|
|
write data to a file no longer need to be punted to an async thread for
|
|
handling, instead they will begin operation when the file is ready. This is
|
|
similar to doing poll + read/write in userspace, but eliminates the need to do
|
|
so. If this flag is set, requests waiting on space/data consume a lot less
|
|
resources doing so as they are not blocking a thread. Available since kernel
|
|
5.7.
|
|
.TP
|
|
.B IORING_FEAT_POLL_32BITS
|
|
If this flag is set, the
|
|
.B IORING_OP_POLL_ADD
|
|
command accepts the full 32-bit range of epoll based flags. Most notably
|
|
.B EPOLLEXCLUSIVE
|
|
which allows exclusive (waking single waiters) behavior. Available since kernel
|
|
5.9.
|
|
.TP
|
|
.B IORING_FEAT_SQPOLL_NONFIXED
|
|
If this flag is set, the
|
|
.B IORING_SETUP_SQPOLL
|
|
feature no longer requires the use of fixed files. Any normal file descriptor
|
|
can be used for IO commands without needing registration. Available since
|
|
kernel 5.11.
|
|
.TP
|
|
.B IORING_FEAT_ENTER_EXT_ARG
|
|
If this flag is set, then the
|
|
.BR io_uring_enter (2)
|
|
system call supports passing in an extended argument instead of just the
|
|
.IR "sigset_t"
|
|
of earlier kernels. This
|
|
extended argument is of type
|
|
.IR "struct io_uring_getevents_arg"
|
|
and allows the caller to pass in both a
|
|
.IR "sigset_t"
|
|
and a timeout argument for waiting on events. The struct layout is as follows:
|
|
.TP
|
|
.in +8n
|
|
.EX
|
|
struct io_uring_getevents_arg {
|
|
__u64 sigmask;
|
|
__u32 sigmask_sz;
|
|
__u32 pad;
|
|
__u64 ts;
|
|
};
|
|
.EE
|
|
|
|
and a pointer to this struct must be passed in if
|
|
.B IORING_ENTER_EXT_ARG
|
|
is set in the flags for the enter system call. Available since kernel 5.11.
|
|
.TP
|
|
.B IORING_FEAT_NATIVE_WORKERS
|
|
If this flag is set, io_uring is using native workers for its async helpers.
|
|
Previous kernels used kernel threads that assumed the identity of the
|
|
original io_uring owning task, but later kernels will actively create what
|
|
looks more like regular process threads instead. Available since kernel
|
|
5.12.
|
|
.TP
|
|
.B IORING_FEAT_RSRC_TAGS
|
|
If this flag is set, then io_uring supports a variety of features related
|
|
to fixed files and buffers. In particular, it indicates that registered
|
|
buffers can be updated in-place, whereas before the full set would have to
|
|
be unregistered first. Available since kernel 5.13.
|
|
.TP
|
|
.B IORING_FEAT_CQE_SKIP
|
|
If this flag is set, then io_uring supports setting
|
|
.B IOSQE_CQE_SKIP_SUCCESS
|
|
in the submitted SQE, indicating that no CQE should be generated for this
|
|
SQE if it executes normally. If an error happens processing the SQE, a
|
|
CQE with the appropriate error value will still be generated. Available since
|
|
kernel 5.17.
|
|
.TP
|
|
.B IORING_FEAT_LINKED_FILE
|
|
If this flag is set, then io_uring supports sane assignment of files for SQEs
|
|
that have dependencies. For example, if a chain of SQEs are submitted with
|
|
.B IOSQE_IO_LINK,
|
|
then kernels without this flag will prepare the file for each link upfront.
|
|
If a previous link opens a file with a known index, e.g., if direct descriptors
|
|
are used with open or accept, then file assignment needs to happen post
|
|
execution of that SQE. If this flag is set, then the kernel will defer
|
|
file assignment until execution of a given request is started. Available since
|
|
kernel 5.17.
|
|
|
|
.PP
|
|
The rest of the fields in the
|
|
.I struct io_uring_params
|
|
are filled in by the kernel, and provide the information necessary to
|
|
memory map the submission queue, completion queue, and the array of
|
|
submission queue entries.
|
|
.I sq_entries
|
|
specifies the number of submission queue entries allocated.
|
|
.I sq_off
|
|
describes the offsets of various ring buffer fields:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_sqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 flags;
|
|
__u32 dropped;
|
|
__u32 array;
|
|
__u32 resv[3];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
Taken together,
|
|
.I sq_entries
|
|
and
|
|
.I sq_off
|
|
provide all of the information necessary for accessing the submission
|
|
queue ring buffer and the submission queue entry array. The
|
|
submission queue can be mapped with a call like:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, sq_off.array + sq_entries * sizeof(__u32),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
where
|
|
.I sq_off
|
|
is the
|
|
.I io_sqring_offsets
|
|
structure, and
|
|
.I ring_fd
|
|
is the file descriptor returned from
|
|
.BR io_uring_setup (2).
|
|
The addition of
|
|
.I sq_off.array
|
|
to the length of the region accounts for the fact that the ring
|
|
is located at the end of the data structure. As an example, the ring
|
|
buffer head pointer can be accessed by adding
|
|
.I sq_off.head
|
|
to the address returned from
|
|
.BR mmap (2):
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
head = ptr + sq_off.head;
|
|
.EE
|
|
.in
|
|
|
|
The
|
|
.I flags
|
|
field is used by the kernel to communicate state information to the
|
|
application. Currently, it is used to inform the application when a
|
|
call to
|
|
.BR io_uring_enter (2)
|
|
is necessary. See the documentation for the
|
|
.B IORING_SETUP_SQPOLL
|
|
flag above.
|
|
The
|
|
.I dropped
|
|
member is incremented for each invalid submission queue entry
|
|
encountered in the ring buffer.
|
|
|
|
The head and tail track the ring buffer state. The tail is
|
|
incremented by the application when submitting new I/O, and the head
|
|
is incremented by the kernel when the I/O has been successfully
|
|
submitted. Determining the index of the head or tail into the ring is
|
|
accomplished by applying a mask:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
index = tail & ring_mask;
|
|
.EE
|
|
.in
|
|
.PP
|
|
The array of submission queue entries is mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
sqentries = mmap(0, sq_entries * sizeof(struct io_uring_sqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE,
|
|
ring_fd, IORING_OFF_SQES);
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is described by
|
|
.I cq_entries
|
|
and
|
|
.I cq_off
|
|
shown here:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
struct io_cqring_offsets {
|
|
__u32 head;
|
|
__u32 tail;
|
|
__u32 ring_mask;
|
|
__u32 ring_entries;
|
|
__u32 overflow;
|
|
__u32 cqes;
|
|
__u32 flags;
|
|
__u32 resv[3];
|
|
};
|
|
.EE
|
|
.in
|
|
.PP
|
|
The completion queue is simpler, since the entries are not separated
|
|
from the queue itself, and can be mapped with:
|
|
.PP
|
|
.in +4n
|
|
.EX
|
|
ptr = mmap(0, cq_off.cqes + cq_entries * sizeof(struct io_uring_cqe),
|
|
PROT_READ|PROT_WRITE, MAP_SHARED|MAP_POPULATE, ring_fd,
|
|
IORING_OFF_CQ_RING);
|
|
.EE
|
|
.in
|
|
.PP
|
|
Closing the file descriptor returned by
|
|
.BR io_uring_setup (2)
|
|
will free all resources associated with the io_uring context.
|
|
.PP
|
|
.SH RETURN VALUE
|
|
.BR io_uring_setup (2)
|
|
returns a new file descriptor on success. The application may then
|
|
provide the file descriptor in a subsequent
|
|
.BR mmap (2)
|
|
call to map the submission and completion queues, or to the
|
|
.BR io_uring_register (2)
|
|
or
|
|
.BR io_uring_enter (2)
|
|
system calls.
|
|
|
|
On error,
|
|
.B -1
|
|
is returned and
|
|
.I errno
|
|
is set appropriately.
|
|
.PP
|
|
.SH ERRORS
|
|
.TP
|
|
.B EFAULT
|
|
params is outside your accessible address space.
|
|
.TP
|
|
.B EINVAL
|
|
The resv array contains non-zero data, p.flags contains an unsupported
|
|
flag,
|
|
.I entries
|
|
is out of bounds,
|
|
.B IORING_SETUP_SQ_AFF
|
|
was specified, but
|
|
.B IORING_SETUP_SQPOLL
|
|
was not, or
|
|
.B IORING_SETUP_CQSIZE
|
|
was specified, but
|
|
.I io_uring_params.cq_entries
|
|
was invalid.
|
|
.TP
|
|
.B EMFILE
|
|
The per-process limit on the number of open file descriptors has been
|
|
reached (see the description of
|
|
.B RLIMIT_NOFILE
|
|
in
|
|
.BR getrlimit (2)).
|
|
.TP
|
|
.B ENFILE
|
|
The system-wide limit on the total number of open files has been
|
|
reached.
|
|
.TP
|
|
.B ENOMEM
|
|
Insufficient kernel resources are available.
|
|
.TP
|
|
.B EPERM
|
|
.B IORING_SETUP_SQPOLL
|
|
was specified, but the effective user ID of the caller did not have sufficient
|
|
privileges.
|
|
.SH SEE ALSO
|
|
.BR io_uring_register (2),
|
|
.BR io_uring_enter (2)
|