io_uring: make io_uring_types.h public
Author:     Pavel Begunkov <asml.silence@gmail.com>
AuthorDate: Thu, 16 Jun 2022 12:57:19 +0000 (13:57 +0100)
Commit:     Jens Axboe <axboe@kernel.dk>
CommitDate: Mon, 25 Jul 2022 00:39:14 +0000 (18:39 -0600)
Move the io_uring types to include/linux, making them public so that
tracing can see the definitions and trace/events/io_uring.h can be
cleaned up.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
Link: https://lore.kernel.org/r/a15f12e8cb7289b2de0deaddcc7518d98a132d17.1655384063.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/linux/io_uring_types.h [new file with mode: 0644]
io_uring/filetable.h
io_uring/io-wq.h
io_uring/io_uring.h
io_uring/io_uring_types.h [deleted file]
io_uring/refs.h
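
For illustration only (not part of this patch), a minimal sketch of what
making the types public enables in trace/events/io_uring.h: a trace event
can include <linux/io_uring_types.h> and dereference struct io_ring_ctx
fields directly in TP_fast_assign() instead of working through forward
declarations and opaque pointers. The event name and fields below are
hypothetical.

    #include <linux/io_uring_types.h>
    #include <linux/tracepoint.h>

    /* hypothetical event, only to show the now-visible ctx fields */
    TRACE_EVENT(io_uring_ring_geometry,

        TP_PROTO(struct io_ring_ctx *ctx),

        TP_ARGS(ctx),

        TP_STRUCT__entry(
            __field(void *,   ctx)
            __field(unsigned, sq_entries)
            __field(unsigned, cq_entries)
        ),

        TP_fast_assign(
            __entry->ctx        = ctx;
            __entry->sq_entries = ctx->sq_entries;
            __entry->cq_entries = ctx->cq_entries;
        ),

        TP_printk("ring %p, sq entries %u, cq entries %u",
                  __entry->ctx, __entry->sq_entries, __entry->cq_entries)
    );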

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
new file mode 100644 (file)
index 0000000..779c72d
--- /dev/null
@@ -0,0 +1,554 @@
+#ifndef IO_URING_TYPES_H
+#define IO_URING_TYPES_H
+
+#include <linux/blkdev.h>
+#include <linux/task_work.h>
+#include <linux/bitmap.h>
+#include <uapi/linux/io_uring.h>
+
+struct io_wq_work_node {
+       struct io_wq_work_node *next;
+};
+
+struct io_wq_work_list {
+       struct io_wq_work_node *first;
+       struct io_wq_work_node *last;
+};
+
+struct io_wq_work {
+       struct io_wq_work_node list;
+       unsigned flags;
+       /* place it here instead of io_kiocb as it fills padding and saves 4B */
+       int cancel_seq;
+};
+
+struct io_fixed_file {
+       /* file * with additional FFS_* flags */
+       unsigned long file_ptr;
+};
+
+struct io_file_table {
+       struct io_fixed_file *files;
+       unsigned long *bitmap;
+       unsigned int alloc_hint;
+};
+
+struct io_hash_bucket {
+       spinlock_t              lock;
+       struct hlist_head       list;
+} ____cacheline_aligned_in_smp;
+
+struct io_hash_table {
+       struct io_hash_bucket   *hbs;
+       unsigned                hash_bits;
+};
+
+struct io_uring {
+       u32 head ____cacheline_aligned_in_smp;
+       u32 tail ____cacheline_aligned_in_smp;
+};
+
+/*
+ * This data is shared with the application through the mmap at offsets
+ * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
+ *
+ * The offsets to the member fields are published through struct
+ * io_sqring_offsets when calling io_uring_setup.
+ */
+struct io_rings {
+       /*
+        * Head and tail offsets into the ring; the offsets need to be
+        * masked to get valid indices.
+        *
+        * The kernel controls head of the sq ring and the tail of the cq ring,
+        * and the application controls tail of the sq ring and the head of the
+        * cq ring.
+        */
+       struct io_uring         sq, cq;
+       /*
+        * Bitmasks to apply to head and tail offsets (constant, equals
+        * ring_entries - 1)
+        */
+       u32                     sq_ring_mask, cq_ring_mask;
+       /* Ring sizes (constant, power of 2) */
+       u32                     sq_ring_entries, cq_ring_entries;
+       /*
+        * Number of invalid entries dropped by the kernel due to
+        * invalid index stored in array
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application (i.e. get number of "new events" by comparing to
+        * cached value).
+        *
+        * After a new SQ head value was read by the application this
+        * counter includes all submissions that were dropped reaching
+        * the new SQ head (and possibly more).
+        */
+       u32                     sq_dropped;
+       /*
+        * Runtime SQ flags
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application.
+        *
+        * The application needs a full memory barrier before checking
+        * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
+        */
+       atomic_t                sq_flags;
+       /*
+        * Runtime CQ flags
+        *
+        * Written by the application, shouldn't be modified by the
+        * kernel.
+        */
+       u32                     cq_flags;
+       /*
+        * Number of completion events lost because the queue was full;
+        * this should be avoided by the application by making sure
+        * there are not more requests pending than there is space in
+        * the completion queue.
+        *
+        * Written by the kernel, shouldn't be modified by the
+        * application (i.e. get number of "new events" by comparing to
+        * cached value).
+        *
+        * As completion events come in out of order this counter is not
+        * ordered with any other data.
+        */
+       u32                     cq_overflow;
+       /*
+        * Ring buffer of completion events.
+        *
+        * The kernel writes completion events fresh every time they are
+        * produced, so the application is allowed to modify pending
+        * entries.
+        */
+       struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
+};
+
+struct io_restriction {
+       DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
+       DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+       u8 sqe_flags_allowed;
+       u8 sqe_flags_required;
+       bool registered;
+};
+
+struct io_submit_link {
+       struct io_kiocb         *head;
+       struct io_kiocb         *last;
+};
+
+struct io_submit_state {
+       /* inline/task_work completion list, under ->uring_lock */
+       struct io_wq_work_node  free_list;
+       /* batch completion logic */
+       struct io_wq_work_list  compl_reqs;
+       struct io_submit_link   link;
+
+       bool                    plug_started;
+       bool                    need_plug;
+       bool                    flush_cqes;
+       unsigned short          submit_nr;
+       struct blk_plug         plug;
+};
+
+struct io_ev_fd {
+       struct eventfd_ctx      *cq_ev_fd;
+       unsigned int            eventfd_async: 1;
+       struct rcu_head         rcu;
+};
+
+struct io_ring_ctx {
+       /* const or read-mostly hot data */
+       struct {
+               struct percpu_ref       refs;
+
+               struct io_rings         *rings;
+               unsigned int            flags;
+               enum task_work_notify_mode      notify_method;
+               unsigned int            compat: 1;
+               unsigned int            drain_next: 1;
+               unsigned int            restricted: 1;
+               unsigned int            off_timeout_used: 1;
+               unsigned int            drain_active: 1;
+               unsigned int            drain_disabled: 1;
+               unsigned int            has_evfd: 1;
+               unsigned int            syscall_iopoll: 1;
+       } ____cacheline_aligned_in_smp;
+
+       /* submission data */
+       struct {
+               struct mutex            uring_lock;
+
+               /*
+                * Ring buffer of indices into array of io_uring_sqe, which is
+                * mmapped by the application using the IORING_OFF_SQES offset.
+                *
+                * This indirection could e.g. be used to assign fixed
+                * io_uring_sqe entries to operations and only submit them to
+                * the queue when needed.
+                *
+                * The kernel modifies neither the indices array nor the entries
+                * array.
+                */
+               u32                     *sq_array;
+               struct io_uring_sqe     *sq_sqes;
+               unsigned                cached_sq_head;
+               unsigned                sq_entries;
+
+               /*
+                * Fixed resources fast path, should be accessed only under
+                * uring_lock, and updated through io_uring_register(2)
+                */
+               struct io_rsrc_node     *rsrc_node;
+               int                     rsrc_cached_refs;
+               atomic_t                cancel_seq;
+               struct io_file_table    file_table;
+               unsigned                nr_user_files;
+               unsigned                nr_user_bufs;
+               struct io_mapped_ubuf   **user_bufs;
+
+               struct io_submit_state  submit_state;
+
+               struct io_buffer_list   *io_bl;
+               struct xarray           io_bl_xa;
+               struct list_head        io_buffers_cache;
+
+               struct io_hash_table    cancel_table_locked;
+               struct list_head        cq_overflow_list;
+               struct list_head        apoll_cache;
+               struct xarray           personalities;
+               u32                     pers_next;
+       } ____cacheline_aligned_in_smp;
+
+       /* IRQ completion list, under ->completion_lock */
+       struct io_wq_work_list  locked_free_list;
+       unsigned int            locked_free_nr;
+
+       const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
+       struct io_sq_data       *sq_data;       /* if using sq thread polling */
+
+       struct wait_queue_head  sqo_sq_wait;
+       struct list_head        sqd_list;
+
+       unsigned long           check_cq;
+
+       struct {
+               /*
+                * We cache a range of free CQEs we can use, once exhausted it
+                * should go through a slower range setup, see __io_get_cqe()
+                */
+               struct io_uring_cqe     *cqe_cached;
+               struct io_uring_cqe     *cqe_sentinel;
+
+               unsigned                cached_cq_tail;
+               unsigned                cq_entries;
+               struct io_ev_fd __rcu   *io_ev_fd;
+               struct wait_queue_head  cq_wait;
+               unsigned                cq_extra;
+       } ____cacheline_aligned_in_smp;
+
+       struct {
+               spinlock_t              completion_lock;
+
+               /*
+                * ->iopoll_list is protected by the ctx->uring_lock for
+                * io_uring instances that don't use IORING_SETUP_SQPOLL.
+                * For SQPOLL, only the single threaded io_sq_thread() will
+                * manipulate the list, hence no extra locking is needed there.
+                */
+               struct io_wq_work_list  iopoll_list;
+               struct io_hash_table    cancel_table;
+               bool                    poll_multi_queue;
+
+               struct list_head        io_buffers_comp;
+       } ____cacheline_aligned_in_smp;
+
+       /* timeouts */
+       struct {
+               spinlock_t              timeout_lock;
+               atomic_t                cq_timeouts;
+               struct list_head        timeout_list;
+               struct list_head        ltimeout_list;
+               unsigned                cq_last_tm_flush;
+       } ____cacheline_aligned_in_smp;
+
+       /* Keep this last, we don't need it for the fast path */
+
+       struct io_restriction           restrictions;
+       struct task_struct              *submitter_task;
+
+       /* slow path rsrc auxilary data, used by update/register */
+       struct io_rsrc_node             *rsrc_backup_node;
+       struct io_mapped_ubuf           *dummy_ubuf;
+       struct io_rsrc_data             *file_data;
+       struct io_rsrc_data             *buf_data;
+
+       struct delayed_work             rsrc_put_work;
+       struct llist_head               rsrc_put_llist;
+       struct list_head                rsrc_ref_list;
+       spinlock_t                      rsrc_ref_lock;
+
+       struct list_head                io_buffers_pages;
+
+       #if defined(CONFIG_UNIX)
+               struct socket           *ring_sock;
+       #endif
+       /* hashed buffered write serialization */
+       struct io_wq_hash               *hash_map;
+
+       /* Only used for accounting purposes */
+       struct user_struct              *user;
+       struct mm_struct                *mm_account;
+
+       /* ctx exit and cancelation */
+       struct llist_head               fallback_llist;
+       struct delayed_work             fallback_work;
+       struct work_struct              exit_work;
+       struct list_head                tctx_list;
+       struct completion               ref_comp;
+
+       /* io-wq management, e.g. thread count */
+       u32                             iowq_limits[2];
+       bool                            iowq_limits_set;
+
+       struct list_head                defer_list;
+       unsigned                        sq_thread_idle;
+};
+
+enum {
+       REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
+       REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
+       REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
+       REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
+       REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
+       REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
+       REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
+
+       /* first byte is taken by user flags, shift it to not overlap */
+       REQ_F_FAIL_BIT          = 8,
+       REQ_F_INFLIGHT_BIT,
+       REQ_F_CUR_POS_BIT,
+       REQ_F_NOWAIT_BIT,
+       REQ_F_LINK_TIMEOUT_BIT,
+       REQ_F_NEED_CLEANUP_BIT,
+       REQ_F_POLLED_BIT,
+       REQ_F_BUFFER_SELECTED_BIT,
+       REQ_F_BUFFER_RING_BIT,
+       REQ_F_REISSUE_BIT,
+       REQ_F_CREDS_BIT,
+       REQ_F_REFCOUNT_BIT,
+       REQ_F_ARM_LTIMEOUT_BIT,
+       REQ_F_ASYNC_DATA_BIT,
+       REQ_F_SKIP_LINK_CQES_BIT,
+       REQ_F_SINGLE_POLL_BIT,
+       REQ_F_DOUBLE_POLL_BIT,
+       REQ_F_PARTIAL_IO_BIT,
+       REQ_F_CQE32_INIT_BIT,
+       REQ_F_APOLL_MULTISHOT_BIT,
+       REQ_F_CLEAR_POLLIN_BIT,
+       REQ_F_HASH_LOCKED_BIT,
+       /* keep async read/write and isreg together and in order */
+       REQ_F_SUPPORT_NOWAIT_BIT,
+       REQ_F_ISREG_BIT,
+
+       /* not a real bit, just to check we're not overflowing the space */
+       __REQ_F_LAST_BIT,
+};
+
+enum {
+       /* ctx owns file */
+       REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
+       /* drain existing IO first */
+       REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
+       /* linked sqes */
+       REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
+       /* doesn't sever on completion < 0 */
+       REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
+       /* IOSQE_ASYNC */
+       REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
+       /* IOSQE_BUFFER_SELECT */
+       REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
+       /* IOSQE_CQE_SKIP_SUCCESS */
+       REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
+
+       /* fail rest of links */
+       REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
+       /* on inflight list, should be cancelled and waited on exit reliably */
+       REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
+       /* read/write uses file position */
+       REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
+       /* must not punt to workers */
+       REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
+       /* has or had linked timeout */
+       REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
+       /* needs cleanup */
+       REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
+       /* already went through poll handler */
+       REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
+       /* buffer already selected */
+       REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
+       /* buffer selected from ring, needs commit */
+       REQ_F_BUFFER_RING       = BIT(REQ_F_BUFFER_RING_BIT),
+       /* caller should reissue async */
+       REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
+       /* supports async reads/writes */
+       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
+       /* regular file */
+       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
+       /* has creds assigned */
+       REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
+       /* skip refcounting if not set */
+       REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
+       /* there is a linked timeout that has to be armed */
+       REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
+       /* ->async_data allocated */
+       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
+       /* don't post CQEs while failing linked requests */
+       REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
+       /* single poll may be active */
+       REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
+       /* double poll may active */
+       REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
+       /* request has already done partial IO */
+       REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
+       /* fast poll multishot mode */
+       REQ_F_APOLL_MULTISHOT   = BIT(REQ_F_APOLL_MULTISHOT_BIT),
+       /* ->extra1 and ->extra2 are initialised */
+       REQ_F_CQE32_INIT        = BIT(REQ_F_CQE32_INIT_BIT),
+       /* recvmsg special flag, clear EPOLLIN */
+       REQ_F_CLEAR_POLLIN      = BIT(REQ_F_CLEAR_POLLIN_BIT),
+       /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
+       REQ_F_HASH_LOCKED       = BIT(REQ_F_HASH_LOCKED_BIT),
+};
+
+typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
+
+struct io_task_work {
+       union {
+               struct io_wq_work_node  node;
+               struct llist_node       fallback_node;
+       };
+       io_req_tw_func_t                func;
+};
+
+struct io_cqe {
+       __u64   user_data;
+       __s32   res;
+       /* fd initially, then cflags for completion */
+       union {
+               __u32   flags;
+               int     fd;
+       };
+};
+
+/*
+ * Each request type overlays its private data structure on top of this one.
+ * They must not exceed this one in size.
+ */
+struct io_cmd_data {
+       struct file             *file;
+       /* each command gets 56 bytes of data */
+       __u8                    data[56];
+};
+
+#define io_kiocb_to_cmd(req)   ((void *) &(req)->cmd)
+#define cmd_to_io_kiocb(ptr)   ((struct io_kiocb *) ptr)
+
+struct io_kiocb {
+       union {
+               /*
+                * NOTE! Each of the io_kiocb union members has the file pointer
+                * as the first entry in their struct definition. So you can
+                * access the file pointer through any of the sub-structs,
+                * or directly as just 'file' in this struct.
+                */
+               struct file             *file;
+               struct io_cmd_data      cmd;
+       };
+
+       u8                              opcode;
+       /* polled IO has completed */
+       u8                              iopoll_completed;
+       /*
+        * Can be either a fixed buffer index, or used with provided buffers.
+        * For the latter, before issue it points to the buffer group ID,
+        * and after selection it points to the buffer ID itself.
+        */
+       u16                             buf_index;
+       unsigned int                    flags;
+
+       struct io_cqe                   cqe;
+
+       struct io_ring_ctx              *ctx;
+       struct task_struct              *task;
+
+       struct io_rsrc_node             *rsrc_node;
+
+       union {
+               /* store used ubuf, so we can prevent reloading */
+               struct io_mapped_ubuf   *imu;
+
+               /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
+               struct io_buffer        *kbuf;
+
+               /*
+                * stores buffer ID for ring provided buffers, valid IFF
+                * REQ_F_BUFFER_RING is set.
+                */
+               struct io_buffer_list   *buf_list;
+       };
+
+       union {
+               /* used by request caches, completion batching and iopoll */
+               struct io_wq_work_node  comp_list;
+               /* cache ->apoll->events */
+               __poll_t apoll_events;
+       };
+       atomic_t                        refs;
+       atomic_t                        poll_refs;
+       struct io_task_work             io_task_work;
+       /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
+       union {
+               struct hlist_node       hash_node;
+               struct {
+                       u64             extra1;
+                       u64             extra2;
+               };
+       };
+       /* internal polling, see IORING_FEAT_FAST_POLL */
+       struct async_poll               *apoll;
+       /* opcode allocated if it needs to store data for async defer */
+       void                            *async_data;
+       /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
+       struct io_kiocb                 *link;
+       /* custom credentials, valid IFF REQ_F_CREDS is set */
+       const struct cred               *creds;
+       struct io_wq_work               work;
+};
+
+struct io_cancel_data {
+       struct io_ring_ctx *ctx;
+       union {
+               u64 data;
+               struct file *file;
+       };
+       u32 flags;
+       int seq;
+};
+
+struct io_overflow_cqe {
+       struct list_head list;
+       struct io_uring_cqe cqe;
+};
+
+struct io_mapped_ubuf {
+       u64             ubuf;
+       u64             ubuf_end;
+       unsigned int    nr_bvecs;
+       unsigned long   acct_pages;
+       struct bio_vec  bvec[];
+};
+
+#endif
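
As a usage sketch (not part of the patch) of the io_cmd_data overlay and
the io_kiocb_to_cmd()/cmd_to_io_kiocb() macros added above: each opcode
handler casts the embedded cmd area to its own per-opcode struct, which
must keep the file pointer first and fit within the 64-byte io_cmd_data.
struct io_example_cmd and io_example_issue() are made-up names; the real
per-opcode structs live in the io_uring/ sources, and the
(req, issue_flags) signature mirrors the in-tree issue handlers.

    /* hypothetical per-opcode data; must fit in io_cmd_data (64 bytes) */
    struct io_example_cmd {
        struct file *file;   /* must be first, see the io_kiocb union */
        u64          user_arg;
    };

    static int io_example_issue(struct io_kiocb *req, unsigned int issue_flags)
    {
        struct io_example_cmd *ec = io_kiocb_to_cmd(req);

        /* cmd_to_io_kiocb() maps the overlay back to its owning request */
        WARN_ON_ONCE(cmd_to_io_kiocb(ec) != req);

        return ec->user_arg ? 0 : -EINVAL;
    }
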
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index c404360f709053e75c8ff377b361a8a96aab489b..6b58aa48bc45d39667317b74a6443d912892a0fa 100644 (file)
@@ -22,17 +22,6 @@ struct io_kiocb;
 #endif
 #define FFS_MASK               ~(FFS_NOWAIT|FFS_ISREG|FFS_SCM)
 
-struct io_fixed_file {
-       /* file * with additional FFS_* flags */
-       unsigned long file_ptr;
-};
-
-struct io_file_table {
-       struct io_fixed_file *files;
-       unsigned long *bitmap;
-       unsigned int alloc_hint;
-};
-
 bool io_alloc_file_tables(struct io_file_table *table, unsigned nr_files);
 void io_free_file_tables(struct io_file_table *table);
 
diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index 3f54ee2a8eebd32a3f70636fc42720fea33a91be..10b80ef78bb8175033e9967efb5db1a2079600c5 100644 (file)
@@ -2,6 +2,7 @@
 #define INTERNAL_IO_WQ_H
 
 #include <linux/refcount.h>
+#include <linux/io_uring_types.h>
 
 struct io_wq;
 
@@ -20,15 +21,6 @@ enum io_wq_cancel {
        IO_WQ_CANCEL_NOTFOUND,  /* work not found */
 };
 
-struct io_wq_work_node {
-       struct io_wq_work_node *next;
-};
-
-struct io_wq_work_list {
-       struct io_wq_work_node *first;
-       struct io_wq_work_node *last;
-};
-
 #define wq_list_for_each(pos, prv, head)                       \
        for (pos = (head)->first, prv = NULL; pos; prv = pos, pos = (pos)->next)
 
@@ -152,13 +144,6 @@ struct io_wq_work_node *wq_stack_extract(struct io_wq_work_node *stack)
        return node;
 }
 
-struct io_wq_work {
-       struct io_wq_work_node list;
-       unsigned flags;
-       /* place it here instead of io_kiocb as it fills padding and saves 4B */
-       int cancel_seq;
-};
-
 static inline struct io_wq_work *wq_next_work(struct io_wq_work *work)
 {
        if (!work->list.next)
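
An aside, not part of this patch: io_wq_work_node and io_wq_work_list now
come from <linux/io_uring_types.h>, while list helpers such as
wq_list_for_each() stay internal in io-wq.h. A minimal, hypothetical
helper combining the two:

    #include <linux/io_uring_types.h>
    #include "io-wq.h"   /* still internal, provides the wq_list_* helpers */

    /* hypothetical: count the work items currently queued on a list */
    static unsigned int io_wq_list_len(struct io_wq_work_list *list)
    {
        struct io_wq_work_node *pos, *prv;
        unsigned int len = 0;

        wq_list_for_each(pos, prv, list)
            len++;
        return len;
    }
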
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 558a860a93fcd618816239808f109a8d02a22420..5eaa01c4697c37537eca29a4ece83bacd81d4302 100644 (file)
@@ -3,7 +3,9 @@
 
 #include <linux/errno.h>
 #include <linux/lockdep.h>
-#include "io_uring_types.h"
+#include <linux/io_uring_types.h>
+#include "io-wq.h"
+#include "filetable.h"
 
 #ifndef CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
diff --git a/io_uring/io_uring_types.h b/io_uring/io_uring_types.h
deleted file mode 100644 (file)
index 65ac7cd..0000000
+++ /dev/null
@@ -1,530 +0,0 @@
-#ifndef IO_URING_TYPES_H
-#define IO_URING_TYPES_H
-
-#include <linux/blkdev.h>
-#include <linux/task_work.h>
-#include <linux/bitmap.h>
-#include <uapi/linux/io_uring.h>
-
-#include "io-wq.h"
-#include "filetable.h"
-
-struct io_hash_bucket {
-       spinlock_t              lock;
-       struct hlist_head       list;
-} ____cacheline_aligned_in_smp;
-
-struct io_hash_table {
-       struct io_hash_bucket   *hbs;
-       unsigned                hash_bits;
-};
-
-struct io_uring {
-       u32 head ____cacheline_aligned_in_smp;
-       u32 tail ____cacheline_aligned_in_smp;
-};
-
-/*
- * This data is shared with the application through the mmap at offsets
- * IORING_OFF_SQ_RING and IORING_OFF_CQ_RING.
- *
- * The offsets to the member fields are published through struct
- * io_sqring_offsets when calling io_uring_setup.
- */
-struct io_rings {
-       /*
-        * Head and tail offsets into the ring; the offsets need to be
-        * masked to get valid indices.
-        *
-        * The kernel controls head of the sq ring and the tail of the cq ring,
-        * and the application controls tail of the sq ring and the head of the
-        * cq ring.
-        */
-       struct io_uring         sq, cq;
-       /*
-        * Bitmasks to apply to head and tail offsets (constant, equals
-        * ring_entries - 1)
-        */
-       u32                     sq_ring_mask, cq_ring_mask;
-       /* Ring sizes (constant, power of 2) */
-       u32                     sq_ring_entries, cq_ring_entries;
-       /*
-        * Number of invalid entries dropped by the kernel due to
-        * invalid index stored in array
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application (i.e. get number of "new events" by comparing to
-        * cached value).
-        *
-        * After a new SQ head value was read by the application this
-        * counter includes all submissions that were dropped reaching
-        * the new SQ head (and possibly more).
-        */
-       u32                     sq_dropped;
-       /*
-        * Runtime SQ flags
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application.
-        *
-        * The application needs a full memory barrier before checking
-        * for IORING_SQ_NEED_WAKEUP after updating the sq tail.
-        */
-       atomic_t                sq_flags;
-       /*
-        * Runtime CQ flags
-        *
-        * Written by the application, shouldn't be modified by the
-        * kernel.
-        */
-       u32                     cq_flags;
-       /*
-        * Number of completion events lost because the queue was full;
-        * this should be avoided by the application by making sure
-        * there are not more requests pending than there is space in
-        * the completion queue.
-        *
-        * Written by the kernel, shouldn't be modified by the
-        * application (i.e. get number of "new events" by comparing to
-        * cached value).
-        *
-        * As completion events come in out of order this counter is not
-        * ordered with any other data.
-        */
-       u32                     cq_overflow;
-       /*
-        * Ring buffer of completion events.
-        *
-        * The kernel writes completion events fresh every time they are
-        * produced, so the application is allowed to modify pending
-        * entries.
-        */
-       struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
-};
-
-struct io_restriction {
-       DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
-       DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
-       u8 sqe_flags_allowed;
-       u8 sqe_flags_required;
-       bool registered;
-};
-
-struct io_submit_link {
-       struct io_kiocb         *head;
-       struct io_kiocb         *last;
-};
-
-struct io_submit_state {
-       /* inline/task_work completion list, under ->uring_lock */
-       struct io_wq_work_node  free_list;
-       /* batch completion logic */
-       struct io_wq_work_list  compl_reqs;
-       struct io_submit_link   link;
-
-       bool                    plug_started;
-       bool                    need_plug;
-       bool                    flush_cqes;
-       unsigned short          submit_nr;
-       struct blk_plug         plug;
-};
-
-struct io_ev_fd {
-       struct eventfd_ctx      *cq_ev_fd;
-       unsigned int            eventfd_async: 1;
-       struct rcu_head         rcu;
-};
-
-struct io_ring_ctx {
-       /* const or read-mostly hot data */
-       struct {
-               struct percpu_ref       refs;
-
-               struct io_rings         *rings;
-               unsigned int            flags;
-               enum task_work_notify_mode      notify_method;
-               unsigned int            compat: 1;
-               unsigned int            drain_next: 1;
-               unsigned int            restricted: 1;
-               unsigned int            off_timeout_used: 1;
-               unsigned int            drain_active: 1;
-               unsigned int            drain_disabled: 1;
-               unsigned int            has_evfd: 1;
-               unsigned int            syscall_iopoll: 1;
-       } ____cacheline_aligned_in_smp;
-
-       /* submission data */
-       struct {
-               struct mutex            uring_lock;
-
-               /*
-                * Ring buffer of indices into array of io_uring_sqe, which is
-                * mmapped by the application using the IORING_OFF_SQES offset.
-                *
-                * This indirection could e.g. be used to assign fixed
-                * io_uring_sqe entries to operations and only submit them to
-                * the queue when needed.
-                *
-                * The kernel modifies neither the indices array nor the entries
-                * array.
-                */
-               u32                     *sq_array;
-               struct io_uring_sqe     *sq_sqes;
-               unsigned                cached_sq_head;
-               unsigned                sq_entries;
-
-               /*
-                * Fixed resources fast path, should be accessed only under
-                * uring_lock, and updated through io_uring_register(2)
-                */
-               struct io_rsrc_node     *rsrc_node;
-               int                     rsrc_cached_refs;
-               atomic_t                cancel_seq;
-               struct io_file_table    file_table;
-               unsigned                nr_user_files;
-               unsigned                nr_user_bufs;
-               struct io_mapped_ubuf   **user_bufs;
-
-               struct io_submit_state  submit_state;
-
-               struct io_buffer_list   *io_bl;
-               struct xarray           io_bl_xa;
-               struct list_head        io_buffers_cache;
-
-               struct io_hash_table    cancel_table_locked;
-               struct list_head        cq_overflow_list;
-               struct list_head        apoll_cache;
-               struct xarray           personalities;
-               u32                     pers_next;
-       } ____cacheline_aligned_in_smp;
-
-       /* IRQ completion list, under ->completion_lock */
-       struct io_wq_work_list  locked_free_list;
-       unsigned int            locked_free_nr;
-
-       const struct cred       *sq_creds;      /* cred used for __io_sq_thread() */
-       struct io_sq_data       *sq_data;       /* if using sq thread polling */
-
-       struct wait_queue_head  sqo_sq_wait;
-       struct list_head        sqd_list;
-
-       unsigned long           check_cq;
-
-       struct {
-               /*
-                * We cache a range of free CQEs we can use, once exhausted it
-                * should go through a slower range setup, see __io_get_cqe()
-                */
-               struct io_uring_cqe     *cqe_cached;
-               struct io_uring_cqe     *cqe_sentinel;
-
-               unsigned                cached_cq_tail;
-               unsigned                cq_entries;
-               struct io_ev_fd __rcu   *io_ev_fd;
-               struct wait_queue_head  cq_wait;
-               unsigned                cq_extra;
-       } ____cacheline_aligned_in_smp;
-
-       struct {
-               spinlock_t              completion_lock;
-
-               /*
-                * ->iopoll_list is protected by the ctx->uring_lock for
-                * io_uring instances that don't use IORING_SETUP_SQPOLL.
-                * For SQPOLL, only the single threaded io_sq_thread() will
-                * manipulate the list, hence no extra locking is needed there.
-                */
-               struct io_wq_work_list  iopoll_list;
-               struct io_hash_table    cancel_table;
-               bool                    poll_multi_queue;
-
-               struct list_head        io_buffers_comp;
-       } ____cacheline_aligned_in_smp;
-
-       /* timeouts */
-       struct {
-               spinlock_t              timeout_lock;
-               atomic_t                cq_timeouts;
-               struct list_head        timeout_list;
-               struct list_head        ltimeout_list;
-               unsigned                cq_last_tm_flush;
-       } ____cacheline_aligned_in_smp;
-
-       /* Keep this last, we don't need it for the fast path */
-
-       struct io_restriction           restrictions;
-       struct task_struct              *submitter_task;
-
-       /* slow path rsrc auxilary data, used by update/register */
-       struct io_rsrc_node             *rsrc_backup_node;
-       struct io_mapped_ubuf           *dummy_ubuf;
-       struct io_rsrc_data             *file_data;
-       struct io_rsrc_data             *buf_data;
-
-       struct delayed_work             rsrc_put_work;
-       struct llist_head               rsrc_put_llist;
-       struct list_head                rsrc_ref_list;
-       spinlock_t                      rsrc_ref_lock;
-
-       struct list_head                io_buffers_pages;
-
-       #if defined(CONFIG_UNIX)
-               struct socket           *ring_sock;
-       #endif
-       /* hashed buffered write serialization */
-       struct io_wq_hash               *hash_map;
-
-       /* Only used for accounting purposes */
-       struct user_struct              *user;
-       struct mm_struct                *mm_account;
-
-       /* ctx exit and cancelation */
-       struct llist_head               fallback_llist;
-       struct delayed_work             fallback_work;
-       struct work_struct              exit_work;
-       struct list_head                tctx_list;
-       struct completion               ref_comp;
-
-       /* io-wq management, e.g. thread count */
-       u32                             iowq_limits[2];
-       bool                            iowq_limits_set;
-
-       struct list_head                defer_list;
-       unsigned                        sq_thread_idle;
-};
-
-enum {
-       REQ_F_FIXED_FILE_BIT    = IOSQE_FIXED_FILE_BIT,
-       REQ_F_IO_DRAIN_BIT      = IOSQE_IO_DRAIN_BIT,
-       REQ_F_LINK_BIT          = IOSQE_IO_LINK_BIT,
-       REQ_F_HARDLINK_BIT      = IOSQE_IO_HARDLINK_BIT,
-       REQ_F_FORCE_ASYNC_BIT   = IOSQE_ASYNC_BIT,
-       REQ_F_BUFFER_SELECT_BIT = IOSQE_BUFFER_SELECT_BIT,
-       REQ_F_CQE_SKIP_BIT      = IOSQE_CQE_SKIP_SUCCESS_BIT,
-
-       /* first byte is taken by user flags, shift it to not overlap */
-       REQ_F_FAIL_BIT          = 8,
-       REQ_F_INFLIGHT_BIT,
-       REQ_F_CUR_POS_BIT,
-       REQ_F_NOWAIT_BIT,
-       REQ_F_LINK_TIMEOUT_BIT,
-       REQ_F_NEED_CLEANUP_BIT,
-       REQ_F_POLLED_BIT,
-       REQ_F_BUFFER_SELECTED_BIT,
-       REQ_F_BUFFER_RING_BIT,
-       REQ_F_REISSUE_BIT,
-       REQ_F_CREDS_BIT,
-       REQ_F_REFCOUNT_BIT,
-       REQ_F_ARM_LTIMEOUT_BIT,
-       REQ_F_ASYNC_DATA_BIT,
-       REQ_F_SKIP_LINK_CQES_BIT,
-       REQ_F_SINGLE_POLL_BIT,
-       REQ_F_DOUBLE_POLL_BIT,
-       REQ_F_PARTIAL_IO_BIT,
-       REQ_F_CQE32_INIT_BIT,
-       REQ_F_APOLL_MULTISHOT_BIT,
-       REQ_F_CLEAR_POLLIN_BIT,
-       REQ_F_HASH_LOCKED_BIT,
-       /* keep async read/write and isreg together and in order */
-       REQ_F_SUPPORT_NOWAIT_BIT,
-       REQ_F_ISREG_BIT,
-
-       /* not a real bit, just to check we're not overflowing the space */
-       __REQ_F_LAST_BIT,
-};
-
-enum {
-       /* ctx owns file */
-       REQ_F_FIXED_FILE        = BIT(REQ_F_FIXED_FILE_BIT),
-       /* drain existing IO first */
-       REQ_F_IO_DRAIN          = BIT(REQ_F_IO_DRAIN_BIT),
-       /* linked sqes */
-       REQ_F_LINK              = BIT(REQ_F_LINK_BIT),
-       /* doesn't sever on completion < 0 */
-       REQ_F_HARDLINK          = BIT(REQ_F_HARDLINK_BIT),
-       /* IOSQE_ASYNC */
-       REQ_F_FORCE_ASYNC       = BIT(REQ_F_FORCE_ASYNC_BIT),
-       /* IOSQE_BUFFER_SELECT */
-       REQ_F_BUFFER_SELECT     = BIT(REQ_F_BUFFER_SELECT_BIT),
-       /* IOSQE_CQE_SKIP_SUCCESS */
-       REQ_F_CQE_SKIP          = BIT(REQ_F_CQE_SKIP_BIT),
-
-       /* fail rest of links */
-       REQ_F_FAIL              = BIT(REQ_F_FAIL_BIT),
-       /* on inflight list, should be cancelled and waited on exit reliably */
-       REQ_F_INFLIGHT          = BIT(REQ_F_INFLIGHT_BIT),
-       /* read/write uses file position */
-       REQ_F_CUR_POS           = BIT(REQ_F_CUR_POS_BIT),
-       /* must not punt to workers */
-       REQ_F_NOWAIT            = BIT(REQ_F_NOWAIT_BIT),
-       /* has or had linked timeout */
-       REQ_F_LINK_TIMEOUT      = BIT(REQ_F_LINK_TIMEOUT_BIT),
-       /* needs cleanup */
-       REQ_F_NEED_CLEANUP      = BIT(REQ_F_NEED_CLEANUP_BIT),
-       /* already went through poll handler */
-       REQ_F_POLLED            = BIT(REQ_F_POLLED_BIT),
-       /* buffer already selected */
-       REQ_F_BUFFER_SELECTED   = BIT(REQ_F_BUFFER_SELECTED_BIT),
-       /* buffer selected from ring, needs commit */
-       REQ_F_BUFFER_RING       = BIT(REQ_F_BUFFER_RING_BIT),
-       /* caller should reissue async */
-       REQ_F_REISSUE           = BIT(REQ_F_REISSUE_BIT),
-       /* supports async reads/writes */
-       REQ_F_SUPPORT_NOWAIT    = BIT(REQ_F_SUPPORT_NOWAIT_BIT),
-       /* regular file */
-       REQ_F_ISREG             = BIT(REQ_F_ISREG_BIT),
-       /* has creds assigned */
-       REQ_F_CREDS             = BIT(REQ_F_CREDS_BIT),
-       /* skip refcounting if not set */
-       REQ_F_REFCOUNT          = BIT(REQ_F_REFCOUNT_BIT),
-       /* there is a linked timeout that has to be armed */
-       REQ_F_ARM_LTIMEOUT      = BIT(REQ_F_ARM_LTIMEOUT_BIT),
-       /* ->async_data allocated */
-       REQ_F_ASYNC_DATA        = BIT(REQ_F_ASYNC_DATA_BIT),
-       /* don't post CQEs while failing linked requests */
-       REQ_F_SKIP_LINK_CQES    = BIT(REQ_F_SKIP_LINK_CQES_BIT),
-       /* single poll may be active */
-       REQ_F_SINGLE_POLL       = BIT(REQ_F_SINGLE_POLL_BIT),
-       /* double poll may active */
-       REQ_F_DOUBLE_POLL       = BIT(REQ_F_DOUBLE_POLL_BIT),
-       /* request has already done partial IO */
-       REQ_F_PARTIAL_IO        = BIT(REQ_F_PARTIAL_IO_BIT),
-       /* fast poll multishot mode */
-       REQ_F_APOLL_MULTISHOT   = BIT(REQ_F_APOLL_MULTISHOT_BIT),
-       /* ->extra1 and ->extra2 are initialised */
-       REQ_F_CQE32_INIT        = BIT(REQ_F_CQE32_INIT_BIT),
-       /* recvmsg special flag, clear EPOLLIN */
-       REQ_F_CLEAR_POLLIN      = BIT(REQ_F_CLEAR_POLLIN_BIT),
-       /* hashed into ->cancel_hash_locked, protected by ->uring_lock */
-       REQ_F_HASH_LOCKED       = BIT(REQ_F_HASH_LOCKED_BIT),
-};
-
-typedef void (*io_req_tw_func_t)(struct io_kiocb *req, bool *locked);
-
-struct io_task_work {
-       union {
-               struct io_wq_work_node  node;
-               struct llist_node       fallback_node;
-       };
-       io_req_tw_func_t                func;
-};
-
-struct io_cqe {
-       __u64   user_data;
-       __s32   res;
-       /* fd initially, then cflags for completion */
-       union {
-               __u32   flags;
-               int     fd;
-       };
-};
-
-/*
- * Each request type overlays its private data structure on top of this one.
- * They must not exceed this one in size.
- */
-struct io_cmd_data {
-       struct file             *file;
-       /* each command gets 56 bytes of data */
-       __u8                    data[56];
-};
-
-#define io_kiocb_to_cmd(req)   ((void *) &(req)->cmd)
-#define cmd_to_io_kiocb(ptr)   ((struct io_kiocb *) ptr)
-
-struct io_kiocb {
-       union {
-               /*
-                * NOTE! Each of the io_kiocb union members has the file pointer
-                * as the first entry in their struct definition. So you can
-                * access the file pointer through any of the sub-structs,
-                * or directly as just 'file' in this struct.
-                */
-               struct file             *file;
-               struct io_cmd_data      cmd;
-       };
-
-       u8                              opcode;
-       /* polled IO has completed */
-       u8                              iopoll_completed;
-       /*
-        * Can be either a fixed buffer index, or used with provided buffers.
-        * For the latter, before issue it points to the buffer group ID,
-        * and after selection it points to the buffer ID itself.
-        */
-       u16                             buf_index;
-       unsigned int                    flags;
-
-       struct io_cqe                   cqe;
-
-       struct io_ring_ctx              *ctx;
-       struct task_struct              *task;
-
-       struct io_rsrc_node             *rsrc_node;
-
-       union {
-               /* store used ubuf, so we can prevent reloading */
-               struct io_mapped_ubuf   *imu;
-
-               /* stores selected buf, valid IFF REQ_F_BUFFER_SELECTED is set */
-               struct io_buffer        *kbuf;
-
-               /*
-                * stores buffer ID for ring provided buffers, valid IFF
-                * REQ_F_BUFFER_RING is set.
-                */
-               struct io_buffer_list   *buf_list;
-       };
-
-       union {
-               /* used by request caches, completion batching and iopoll */
-               struct io_wq_work_node  comp_list;
-               /* cache ->apoll->events */
-               __poll_t apoll_events;
-       };
-       atomic_t                        refs;
-       atomic_t                        poll_refs;
-       struct io_task_work             io_task_work;
-       /* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
-       union {
-               struct hlist_node       hash_node;
-               struct {
-                       u64             extra1;
-                       u64             extra2;
-               };
-       };
-       /* internal polling, see IORING_FEAT_FAST_POLL */
-       struct async_poll               *apoll;
-       /* opcode allocated if it needs to store data for async defer */
-       void                            *async_data;
-       /* linked requests, IFF REQ_F_HARDLINK or REQ_F_LINK are set */
-       struct io_kiocb                 *link;
-       /* custom credentials, valid IFF REQ_F_CREDS is set */
-       const struct cred               *creds;
-       struct io_wq_work               work;
-};
-
-struct io_cancel_data {
-       struct io_ring_ctx *ctx;
-       union {
-               u64 data;
-               struct file *file;
-       };
-       u32 flags;
-       int seq;
-};
-
-struct io_overflow_cqe {
-       struct list_head list;
-       struct io_uring_cqe cqe;
-};
-
-struct io_mapped_ubuf {
-       u64             ubuf;
-       u64             ubuf_end;
-       unsigned int    nr_bvecs;
-       unsigned long   acct_pages;
-       struct bio_vec  bvec[];
-};
-
-#endif
diff --git a/io_uring/refs.h b/io_uring/refs.h
index 334c5ead4c43d9046f6b1240395a9293fa2f4379..1336de3f2a30aa26cc51a6bd2b8dc67a9d0e5fe9 100644 (file)
@@ -2,7 +2,7 @@
 #define IOU_REQ_REF_H
 
 #include <linux/atomic.h>
-#include "io_uring_types.h"
+#include <linux/io_uring_types.h>
 
 /*
  * Shamelessly stolen from the mm implementation of page reference checking,