summaryrefslogtreecommitdiffstats
path: root/src/blk/aio/aio.cc
blob: 00a12bfd16af81cff24a3edb145ad5acf3b73510 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include <algorithm>
#include "aio.h"

std::ostream& operator<<(std::ostream& os, const aio_t& aio)
{
  unsigned i = 0;
  os << "aio: ";
  for (auto& iov : aio.iov) {
    os << "\n [" << i++ << "] 0x"
       << std::hex << iov.iov_base << "~" << iov.iov_len << std::dec;
  }
  return os;
}

int aio_queue_t::submit_batch(aio_iter begin, aio_iter end, 
			      uint16_t aios_size, void *priv, 
			      int *retries)
{
  // 2^16 * 125us = ~8 seconds, so max sleep is ~16 seconds
  int attempts = 16;
  int delay = 125;
  int r;

  aio_iter cur = begin;
  struct aio_t *piocb[aios_size];
  int left = 0;
  while (cur != end) {
    cur->priv = priv;
    *(piocb+left) = &(*cur);
    ++left;
    ++cur;
  }
  ceph_assert(aios_size >= left);
  int done = 0;
  while (left > 0) {
#if defined(HAVE_LIBAIO)
    r = io_submit(ctx, std::min(left, max_iodepth), (struct iocb**)(piocb + done));
#elif defined(HAVE_POSIXAIO)
    if (piocb[done]->n_aiocb == 1) {
      // TODO: consider batching multiple reads together with lio_listio
      piocb[done]->aio.aiocb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
      piocb[done]->aio.aiocb.aio_sigevent.sigev_notify_kqueue = ctx;
      piocb[done]->aio.aiocb.aio_sigevent.sigev_value.sival_ptr = piocb[done];
      r = aio_read(&piocb[done]->aio.aiocb);
    } else {
      struct sigevent sev;
      sev.sigev_notify = SIGEV_KEVENT;
      sev.sigev_notify_kqueue = ctx;
      sev.sigev_value.sival_ptr = piocb[done];
      r = lio_listio(LIO_NOWAIT, &piocb[done]->aio.aiocbp, piocb[done]->n_aiocb, &sev);
    }
#endif
    if (r < 0) {
      if (r == -EAGAIN && attempts-- > 0) {
	usleep(delay);
	delay *= 2;
	(*retries)++;
	continue;
      }
      return r;
    }
    ceph_assert(r > 0);
    done += r;
    left -= r;
    attempts = 16;
    delay = 125;
  }
  return done;
}

int aio_queue_t::get_next_completed(int timeout_ms, aio_t **paio, int max)
{
#if defined(HAVE_LIBAIO)
  io_event events[max];
#elif defined(HAVE_POSIXAIO)
  struct kevent events[max];
#endif
  struct timespec t = {
    timeout_ms / 1000,
    (timeout_ms % 1000) * 1000 * 1000
  };

  int r = 0;
  do {
#if defined(HAVE_LIBAIO)
    r = io_getevents(ctx, 1, max, events, &t);
#elif defined(HAVE_POSIXAIO)
    r = kevent(ctx, NULL, 0, events, max, &t);
    if (r < 0)
      r = -errno;
#endif
  } while (r == -EINTR);

  for (int i=0; i<r; ++i) {
#if defined(HAVE_LIBAIO)
    paio[i] = (aio_t *)events[i].obj;
    paio[i]->rval = events[i].res;
#else
    paio[i] = (aio_t*)events[i].udata;
    if (paio[i]->n_aiocb == 1) {
      paio[i]->rval = aio_return(&paio[i]->aio.aiocb);
    } else {
      // Emulate the return value of pwritev.  I can't find any documentation
      // for what the value of io_event.res is supposed to be.  I'm going to
      // assume that it's just like pwritev/preadv/pwrite/pread.
      paio[i]->rval = 0;
      for (int j = 0; j < paio[i]->n_aiocb; j++) {
	int res = aio_return(&paio[i]->aio.aiocbp[j]);
	if (res < 0) {
	  paio[i]->rval = res;
	  break;
	} else {
	  paio[i]->rval += res;
	}
      }
      free(paio[i]->aio.aiocbp);
    }
#endif
  }
  return r;
}