Printing Text with io_uring

io_uring

syscalls

io_uring currently exposes its functionality through 3 syscalls: io_uring_setup, io_uring_enter, and io_uring_register.

PS: my kernel is version 5.4; for more complete coverage, consult the latest documentation.

  • io_uring_setup

    int io_uring_setup(u32 entries, struct io_uring_params *p);

    io_uring_setup creates an instance and returns an fd (which supports mmap and poll). entries describes the size of the SQ and CQ: the SQ size is entries rounded up to a power of two, and the CQ defaults to twice the SQ size. params is used to exchange configuration with the kernel: the flags field passes options in, while features returns the features the current io_uring supports. The kernel also fills in sq_off and cq_off, where each field is an offset; adding a field's offset to the pointer returned by mmap yields the field's actual address (the program below does exactly this in mmap_user_ring).

    The most important flags are IORING_SETUP_IOPOLL and IORING_SETUP_SQPOLL. IORING_SETUP_IOPOLL makes io_uring busy-wait for IO completion; this flag currently comes with some usage restrictions. IORING_SETUP_SQPOLL makes io_uring create a kernel thread that syncs the SQ, submits IO, and updates the CQ. Note that when this flag is set, after updating the SQ tail you must check whether the SQ ring's flags field contains IORING_SQ_NEED_WAKEUP, because the kernel thread goes to sleep when idle; if IORING_SQ_NEED_WAKEUP is set, you need to call io_uring_enter with IORING_ENTER_SQ_WAKEUP to wake the kernel thread (see the sketch after this list).

  • io_uring_enter

    int io_uring_enter(unsigned int fd, unsigned int to_submit, unsigned int min_complete, unsigned int flags, sigset_t *sig);

    io_uring_enter can submit IO and wait for IO completion in a single syscall. If the flags field has IORING_ENTER_GETEVENTS set, the call waits until min_complete IOs have completed before returning.

    If min_complete is 0, the call returns immediately, reporting the IOs that have already completed. If min_complete is non-zero and the ring was set up with IORING_SETUP_IOPOLL, the call busy-waits for min_complete completion events; otherwise it blocks until woken by an IO interrupt (or a signal).

  • io_uring_register

    int io_uring_register(unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args);

    io_uring_register manages objects associated with the ring, most commonly memory and files: registering memory pins the pages up front, and registering fds caches the corresponding file objects in advance; for the other opcodes, consult the implementation (see the sketch below).
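
The full program below uses neither SQPOLL nor registration, so here is a minimal, hypothetical sketch of those two points. It reuses the user_ring struct and the raw syscall wrappers that the program defines; the helper names are made up, but the flags and opcodes are the real ones from linux/io_uring.h:

/* Hypothetical sketch: after bumping the SQ tail on a ring created with
 * IORING_SETUP_SQPOLL, check the SQ ring's flags field. The kernel thread
 * sets IORING_SQ_NEED_WAKEUP when it goes to sleep, and then has to be
 * woken via io_uring_enter with IORING_ENTER_SQ_WAKEUP. */
void wake_sq_thread_if_needed(struct user_ring *ring, unsigned to_submit) {
    if (*ring->sq.flags & IORING_SQ_NEED_WAKEUP) {
        io_uring_enter(ring->fd, to_submit, 0, IORING_ENTER_SQ_WAKEUP, NULL);
    }
    /* otherwise the kernel thread is awake and picks up the SQEs itself */
}

/* Hypothetical sketch: pinning one buffer with IORING_REGISTER_BUFFERS so
 * that later IORING_OP_READ_FIXED / IORING_OP_WRITE_FIXED requests can
 * skip per-IO page pinning. */
int register_one_buffer(int ring_fd, void *buf, size_t len) {
    struct iovec vec = { .iov_base = buf, .iov_len = len };
    return io_uring_register(ring_fd, IORING_REGISTER_BUFFERS, &vec, 1);
}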

Print to stdout
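
The program below ties the three syscalls together: it creates a ring, mmaps the SQ ring, the CQ ring, and the SQE array, queues a single IORING_OP_WRITEV targeting stdout, submits it with io_uring_enter, and then reaps the result from the CQ.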

#include <stdio.h>
#include <string.h>
#include <stdatomic.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/uio.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#define URING_ENTRIES 8

struct user_sq {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *flags;
    unsigned *dropped;
    unsigned *array;
    struct io_uring_sqe *sqes;
};

struct user_cq {
    unsigned *head;
    unsigned *tail;
    unsigned *ring_mask;
    unsigned *ring_entries;
    unsigned *overflow;
    struct io_uring_cqe *cqes;
};

struct user_ring {
    unsigned int fd;
    struct user_sq sq;
    struct user_cq cq;
};

static struct io_uring_params uring_params = {0};
static struct user_ring user_ring = {0};

static inline int io_uring_setup(__u32 entries, struct io_uring_params *p) {
    return syscall(__NR_io_uring_setup, entries, p);
}

static inline int io_uring_enter(unsigned int fd, __u32 to_submit, __u32 min_complete, __u32 flags, const sigset_t *sig) {
    return syscall(__NR_io_uring_enter, fd, to_submit, min_complete, flags, sig);
}

static inline int io_uring_register(unsigned int fd, unsigned int opcode, void *arg, unsigned int nr_args) {
    return syscall(__NR_io_uring_register, fd, opcode, arg, nr_args);
}

void munmap_user_ring() {
    if (user_ring.sq.sqes != NULL) {
        munmap(user_ring.sq.sqes, *user_ring.sq.ring_entries * sizeof(struct io_uring_sqe));
    }
    if (user_ring.sq.head != NULL) {
        size_t sq_size = uring_params.sq_off.array + uring_params.sq_entries * sizeof(__u32);
        size_t cq_size = uring_params.cq_off.cqes + uring_params.cq_entries * sizeof(struct io_uring_cqe);
        /* Mirror the sizing bump done in mmap_user_ring: with a single
         * shared mapping, the SQ mapping was sized for the larger ring. */
        if ((uring_params.features & IORING_FEAT_SINGLE_MMAP) && cq_size > sq_size) {
            sq_size = cq_size;
        }
        void *sq_ring_ptr = (void *) user_ring.sq.head - uring_params.sq_off.head;
        munmap(sq_ring_ptr, sq_size);

        if (user_ring.cq.head != NULL) {
            void *cq_ring_ptr = (void *) user_ring.cq.head - uring_params.cq_off.head;
            if (cq_ring_ptr != sq_ring_ptr) {
                munmap(cq_ring_ptr, cq_size);
            }
        }
    }
}

int mmap_user_ring() {
    struct io_sqring_offsets *sq_off = &uring_params.sq_off;
    struct io_cqring_offsets *cq_off = &uring_params.cq_off;

    size_t sq_size = sq_off->array + uring_params.sq_entries * sizeof(__u32);
    size_t cq_size = cq_off->cqes + uring_params.cq_entries * sizeof(struct io_uring_cqe);

    /* With IORING_FEAT_SINGLE_MMAP the SQ and CQ rings share one mapping,
     * so size it for the larger of the two. */
    if (uring_params.features & IORING_FEAT_SINGLE_MMAP) {
        if (cq_size > sq_size) {
            sq_size = cq_size;
        }
    }

    void *sq_ptr = mmap(NULL,
                        sq_size,
                        PROT_READ | PROT_WRITE,
                        MAP_SHARED | MAP_POPULATE,
                        user_ring.fd,
                        IORING_OFF_SQ_RING);
    if (sq_ptr == MAP_FAILED) {
        return 1;
    }

    void *cq_ptr = NULL;
    if (uring_params.features & IORING_FEAT_SINGLE_MMAP) {
        cq_ptr = sq_ptr;
    } else {
        cq_ptr = mmap(NULL,
                      cq_size,
                      PROT_READ | PROT_WRITE,
                      MAP_SHARED | MAP_POPULATE,
                      user_ring.fd,
                      IORING_OFF_CQ_RING);
        if (cq_ptr == MAP_FAILED) {
            return 1;
        }
    }

    /* Each ring field lives at a kernel-provided offset from the mmap base. */
    struct user_sq *sq = &user_ring.sq;
    sq->head = sq_ptr + sq_off->head;
    sq->tail = sq_ptr + sq_off->tail;
    sq->ring_mask = sq_ptr + sq_off->ring_mask;
    sq->ring_entries = sq_ptr + sq_off->ring_entries;
    sq->flags = sq_ptr + sq_off->flags;
    sq->dropped = sq_ptr + sq_off->dropped;
    sq->array = sq_ptr + sq_off->array;

    /* The SQE array is a separate mapping of its own. */
    size_t sqes_size = uring_params.sq_entries * sizeof(struct io_uring_sqe);
    sq->sqes = mmap(NULL,
                    sqes_size,
                    PROT_READ | PROT_WRITE,
                    MAP_SHARED | MAP_POPULATE,
                    user_ring.fd,
                    IORING_OFF_SQES);
    if (sq->sqes == MAP_FAILED) {
        return 1;
    }

    struct user_cq *cq = &user_ring.cq;
    cq->head = cq_ptr + cq_off->head;
    cq->tail = cq_ptr + cq_off->tail;
    cq->ring_mask = cq_ptr + cq_off->ring_mask;
    cq->ring_entries = cq_ptr + cq_off->ring_entries;
    cq->overflow = cq_ptr + cq_off->overflow;
    cq->cqes = cq_ptr + cq_off->cqes;

    return 0;
}

int setup_ring() {
    int fd = io_uring_setup(URING_ENTRIES, &uring_params);
    if (fd == -1) {
        perror("io_uring_setup");
        return 1;
    }
    user_ring.fd = fd;
    if (mmap_user_ring()) {
        perror("mmap");
        return 1;
    }
    return 0;
}

int cleanup_ring() {
    munmap_user_ring();
    if (user_ring.fd != 0) {
        close(user_ring.fd);
    }
    return 0;
}

int print(const char *str) {
    struct iovec vec;
    vec.iov_base = (void *) str;
    vec.iov_len = strlen(str);

    struct user_sq *sq = &user_ring.sq;

    unsigned tail = *sq->tail;
    unsigned next = tail + 1;
    const size_t idx = tail & *sq->ring_mask;

    /* Fill the SQE: one writev of a single iovec to stdout. vec lives on
     * the stack, which is safe only because we wait for the completion
     * below before returning. */
    struct io_uring_sqe *sqe = &sq->sqes[idx];
    memset(sqe, 0, sizeof(*sqe));
    sqe->opcode = IORING_OP_WRITEV;
    sqe->fd = STDOUT_FILENO;
    sqe->addr = (__u64) (unsigned long) &vec;
    sqe->len = 1;

    sq->array[idx] = idx;
    /* The release fence must come before the tail store so the SQE and
     * array writes above are visible once the kernel sees the new tail. */
    atomic_thread_fence(memory_order_release);
    *sq->tail = next;

    if (io_uring_enter(user_ring.fd, 1, 1, IORING_ENTER_GETEVENTS, NULL) < 0) {
        perror("io_uring_enter");
        return 1;
    }

    struct user_cq *cq = &user_ring.cq;

    unsigned head = *cq->head;
    /* The acquire fence pairs with the kernel's CQ tail update, so the
     * CQE contents read below are the ones the kernel finished writing. */
    atomic_thread_fence(memory_order_acquire);

    struct io_uring_cqe *cqe = &cq->cqes[head & *cq->ring_mask];
    int res = cqe->res;

    /* Consume the CQE by advancing the head past it. */
    *cq->head = head + 1;
    atomic_thread_fence(memory_order_release);

    if (res < 0) {
        fprintf(stderr, "write STDOUT: %s\n", strerror(-res));
        return 1;
    }
    return 0;
}

int main() {
    if (setup_ring()) {
        cleanup_ring();
        return 1;
    }
    print("Hello World!\n");

    cleanup_ring();
    return 0;
}
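
The program needs nothing beyond libc and kernel headers that ship linux/io_uring.h (5.1 or later), so it builds with a plain compiler invocation, e.g. (file name hypothetical) cc -o uring_print uring_print.c; running it prints Hello World! via a single IORING_OP_WRITEV round trip.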
