tpool/aio_linux.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

/* Copyright (C) 2019, 2020, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */

#include "tpool_structs.h"
#include "tpool.h"

#ifdef LINUX_NATIVE_AIO
# include <thread>
# include <atomic>
# include <libaio.h>
# include <sys/syscall.h>

/**
  Invoke the io_getevents() system call, without timeout parameter.

  @param ctx     context from io_setup()
  @param min_nr  minimum number of completion events to wait for
  @param nr      maximum number of completion events to collect
  @param ev      the collected events

  In https://pagure.io/libaio/c/7cede5af5adf01ad26155061cc476aad0804d3fc
  the io_getevents() implementation in libaio was "optimized" so that it
  would elide the system call when there are no outstanding requests
  and a timeout was specified.

  The libaio code for dereferencing ctx would occasionally trigger
  SIGSEGV if io_destroy() was concurrently invoked from another thread.
  Hence, we have to use the raw system call.

  WHY are we doing this at all?
  Because we want io_destroy() from another thread to interrupt io_getevents().

  And, WHY do we want io_destroy() from another thread to interrupt
  io_getevents()?

  Because there is no documented, libaio-friendly and race-condition-free way to
  interrupt io_getevents(). io_destroy() coupled with raw syscall seemed to work
  for us so far.

  Historical note: in the past, we used io_getevents() with timeouts. We'd
  wake up periodically, check for the shutdown flag, and return from the main
  routine. This was admittedly safer, yet it did cost periodic wakeups, which
  we are not willing to accept anymore.

  @note we also rely on the undocumented property, that io_destroy(ctx)
  will make this version of io_getevents return EINVAL.
*/
static int my_getevents(io_context_t ctx, long min_nr, long nr, io_event *ev)
{
  /* Remember the caller's errno; it is put back if the call fails. */
  const int errno_on_entry= errno;
  /* The trailing 0 is the timeout argument: no timeout is passed. */
  const int n_collected= syscall(__NR_io_getevents,
                                 reinterpret_cast<long>(ctx),
                                 min_nr, nr, ev, 0);
  if (n_collected >= 0)
    return n_collected;

  /* Report the failure as a negated error code, leaving errno untouched. */
  const int negated_error= -errno;
  errno= errno_on_entry;
  return negated_error;
}
#endif


/*
  Linux AIO implementation, based on native AIO.
  Needs libaio.h at compile time and -laio at link time.

  io_submit() is used to submit async IO.

  A single thread will collect the completion notification
  with io_getevents() and forward io completion callback to
  the worker threadpool.
*/
namespace tpool
{
#ifdef LINUX_NATIVE_AIO

class aio_linux final : public aio
{
  thread_pool *m_pool;
  io_context_t m_io_ctx;
  std::thread m_getevent_thread;
  static std::atomic<bool> shutdown_in_progress;

  static void getevent_thread_routine(aio_linux *aio)
  {
    /*
      We collect events in small batches to hopefully reduce the
      number of system calls.
    */
    constexpr unsigned MAX_EVENTS= 256;

    io_event events[MAX_EVENTS];
    for (;;)
    {
      switch (int ret= my_getevents(aio->m_io_ctx, 1, MAX_EVENTS, events)) {
      case -EINTR:
        continue;
      case -EINVAL:
        if (shutdown_in_progress)
          return;
        /* fall through */
      default:
        if (ret < 0)
        {
          fprintf(stderr, "io_getevents returned %d\n", ret);
          abort();
          return;
        }
        for (int i= 0; i < ret; i++)
        {
          const io_event &event= events[i];
          aiocb *iocb= static_cast<aiocb*>(event.obj);
          if (static_cast<int>(event.res) < 0)
          {
            iocb->m_err= -event.res;
            iocb->m_ret_len= 0;
          }
          else
          {
            iocb->m_ret_len= event.res;
            iocb->m_err= 0;
          }
          iocb->m_internal_task.m_func= iocb->m_callback;
          iocb->m_internal_task.m_arg= iocb;
          iocb->m_internal_task.m_group= iocb->m_group;
          aio->m_pool->submit_task(&iocb->m_internal_task);
        }
      }
    }
  }

public:
  aio_linux(io_context_t ctx, thread_pool *pool)
    : m_pool(pool), m_io_ctx(ctx),
    m_getevent_thread(getevent_thread_routine, this)
  {
  }

  ~aio_linux()
  {
    shutdown_in_progress= true;
    io_destroy(m_io_ctx);
    m_getevent_thread.join();
    shutdown_in_progress= false;
  }

  int submit_io(aiocb *cb) override
  {
    io_prep_pread(static_cast<iocb*>(cb), cb->m_fh, cb->m_buffer, cb->m_len,
                  cb->m_offset);
    if (cb->m_opcode != aio_opcode::AIO_PREAD)
      cb->aio_lio_opcode= IO_CMD_PWRITE;
    iocb *icb= static_cast<iocb*>(cb);
    int ret= io_submit(m_io_ctx, 1, &icb);
    if (ret == 1)
      return 0;
    errno= -ret;
    return -1;
  }

  int bind(native_file_handle&) override { return 0; }
  int unbind(const native_file_handle&) override { return 0; }
};

std::atomic<bool> aio_linux::shutdown_in_progress;

/**
  Create an AIO object backed by Linux native AIO.

  @param pool    thread pool that will run the completion callbacks
  @param max_io  number of events passed to io_setup()
  @return an aio_linux object allocated with new,
          or nullptr if io_setup() failed
*/
aio *create_linux_aio(thread_pool *pool, int max_io)
{
  io_context_t ctx;
  /* The context is zero-filled before it is handed to io_setup(). */
  memset(&ctx, 0, sizeof ctx);

  const int ret= io_setup(max_io, &ctx);
  if (ret == 0)
    return new aio_linux(ctx, pool);

  fprintf(stderr, "io_setup(%d) returned %d\n", max_io, ret);
  return nullptr;
}
#else
/**
  Variant compiled when LINUX_NATIVE_AIO is not defined.
  Both parameters are ignored.

  @return nullptr, always
*/
aio *create_linux_aio(thread_pool*, int)
{
  return nullptr;
}
#endif
}