summaryrefslogtreecommitdiffstats
path: root/debian/patches/2541-fix-stack-overflow-in-pinbox-allocator.patch
blob: bbc1af6b4ecf9575e8fa640ec30c4d8a00c0f0d4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
Forwarded: https://github.com/MariaDB/server/pull/2541
Origin: https://patch-diff.githubusercontent.com/raw/MariaDB/server/pull/2541.patch
Bug: https://jira.mariadb.org/browse/MDEV-31151
From: Hugo Wen <wenhug@amazon.com>
Date: Sat, 11 Mar 2023 00:27:42 +0000
Subject: [PATCH] Fix a stack overflow in pinbox allocator

MariaDB supports a "wait-free concurrent allocator based on pinning addresses".
In `lf_pinbox_real_free()` it tries to sort the pinned addresses for better
performance to use binary search during "real free". `alloca()` was used to
allocate stack memory and copy addresses.

To prevent a stack overflow when allocating the stack memory the function checks
if there's enough stack space. However, the available stack size was calculated
inaccurately which eventually caused database crash due to stack overflow.

The crash was seen on MariaDB 10.6.11 but the same code defect exists on all
MariaDB versions.

A similar issue happened previously and the fix in fc2c1e43 was to add a
`ALLOCA_SAFETY_MARGIN` which is 8192 bytes. However, that safety margin is not
enough during high connection workloads.

MySQL also had a similar issue and the fix
https://github.com/mysql/mysql-server/commit/b086fda was to remove the use of
`alloca` and replace qsort approach by a linear scan through all pointers (pins)
owned by each thread.

This commit is mostly the same as it is the only way to solve this issue as:
1. Frame sizes in different architecture can be different.
2. Number of active (non-null) pinned addresses varies, so the frame
   size for the recursive sorting function `msort_with_tmp` is also hard
   to predict.
3. Allocating big memory blocks in stack doesn't seem to be a very good
   practice.

For further details see the mentioned commit in MySQL and the inline comments.

All new code of the whole pull request, including one or several files
that are either new files or modified ones, are contributed under the
BSD-new license. I am contributing on behalf of my employer Amazon Web
Services, Inc.
---
 mysys/lf_alloc-pin.c | 180 ++++++++++++++-----------------------------
 1 file changed, 59 insertions(+), 121 deletions(-)

--- a/mysys/lf_alloc-pin.c
+++ b/mysys/lf_alloc-pin.c
@@ -103,12 +103,6 @@
 #include <lf.h>
 #include "my_cpu.h"
 
-/*
-  when using alloca() leave at least that many bytes of the stack -
-  for functions we might be calling from within this stack frame
-*/
-#define ALLOCA_SAFETY_MARGIN 8192
-
 #define LF_PINBOX_MAX_PINS 65536
 
 static void lf_pinbox_real_free(LF_PINS *pins);
@@ -239,24 +233,21 @@ void lf_pinbox_put_pins(LF_PINS *pins)
   } while (!my_atomic_cas32((int32 volatile*) &pinbox->pinstack_top_ver,
                             (int32*) &top_ver,
                             top_ver-pins->link+nr+LF_PINBOX_MAX_PINS));
-  return;
 }
 
-static int ptr_cmp(void **a, void **b)
+/*
+  Get the next pointer in the purgatory list.
+  Note that next_node is not used to avoid the extra volatile.
+*/
+#define pnext_node(P, X) (*((void **)(((char *)(X)) + (P)->free_ptr_offset)))
+
+static inline void add_to_purgatory(LF_PINS *pins, void *addr)
 {
-  return *a < *b ? -1 : *a == *b ? 0 : 1;
+  pnext_node(pins->pinbox, addr)= pins->purgatory;
+  pins->purgatory= addr;
+  pins->purgatory_count++;
 }
 
-#define add_to_purgatory(PINS, ADDR)                                    \
-  do                                                                    \
-  {                                                                     \
-    my_atomic_storeptr_explicit(                                        \
-      (void **)((char *)(ADDR)+(PINS)->pinbox->free_ptr_offset),        \
-      (PINS)->purgatory, MY_MEMORY_ORDER_RELEASE);                      \
-    (PINS)->purgatory= (ADDR);                                          \
-    (PINS)->purgatory_count++;                                          \
-  } while (0)
-
 /*
   Free an object allocated via pinbox allocator
 
@@ -274,138 +265,85 @@ void lf_pinbox_free(LF_PINS *pins, void
                       lf_pinbox_real_free(pins););
 }
 
-struct st_harvester {
-  void **granary;
-  int npins;
+struct st_match_and_save_arg {
+  LF_PINS *pins;
+  LF_PINBOX *pinbox;
+  void *old_purgatory;
 };
 
 /*
-  callback forlf_dynarray_iterate:
-  scan all pins of all threads and accumulate all pins
+  Callback for lf_dynarray_iterate:
+  Scan all pins of all threads, for each active (non-null) pin,
+  scan the current thread's purgatory. If present there, move it
+  to a new purgatory. At the end, the old purgatory will contain
+  pointers not pinned by any thread.
 */
-static int harvest_pins(LF_PINS *el, struct st_harvester *hv)
+static int match_and_save(LF_PINS *el, struct st_match_and_save_arg *arg)
 {
   int i;
-  LF_PINS *el_end= el+MY_MIN(hv->npins, LF_DYNARRAY_LEVEL_LENGTH);
+  LF_PINS *el_end= el + LF_DYNARRAY_LEVEL_LENGTH;
   for (; el < el_end; el++)
   {
     for (i= 0; i < LF_PINBOX_PINS; i++)
     {
       void *p= el->pin[i];
       if (p)
-        *hv->granary++= p;
+      {
+        void *cur= arg->old_purgatory;
+        void **list_prev= &arg->old_purgatory;
+        while (cur)
+        {
+          void *next= pnext_node(arg->pinbox, cur);
+
+          if (p == cur)
+          {
+            /* pinned - keeping */
+            add_to_purgatory(arg->pins, cur);
+            /* unlink from old purgatory */
+            *list_prev= next;
+          }
+          else
+            list_prev= (void **)((char *)cur+arg->pinbox->free_ptr_offset);
+          cur= next;
+        }
+        if (!arg->old_purgatory)
+          return 1;
+      }
     }
   }
-  /*
-    hv->npins may become negative below, but it means that
-    we're on the last dynarray page and harvest_pins() won't be
-    called again. We don't bother to make hv->npins() correct
-    (that is 0) in this case.
-  */
-  hv->npins-= LF_DYNARRAY_LEVEL_LENGTH;
   return 0;
 }
 
 /*
-  callback forlf_dynarray_iterate:
-  scan all pins of all threads and see if addr is present there
-*/
-static int match_pins(LF_PINS *el, void *addr)
-{
-  int i;
-  LF_PINS *el_end= el+LF_DYNARRAY_LEVEL_LENGTH;
-  for (; el < el_end; el++)
-    for (i= 0; i < LF_PINBOX_PINS; i++)
-      if (el->pin[i] == addr)
-        return 1;
-  return 0;
-}
-
-#define next_node(P, X) (*((uchar * volatile *)(((uchar *)(X)) + (P)->free_ptr_offset)))
-#define anext_node(X) next_node(&allocator->pinbox, (X))
-
-/*
   Scan the purgatory and free everything that can be freed
 */
 static void lf_pinbox_real_free(LF_PINS *pins)
 {
-  int npins;
-  void *list;
-  void **addr= NULL;
-  void *first= NULL, *last= NULL;
-  struct st_my_thread_var *var= my_thread_var;
-  void *stack_ends_here= var ? var->stack_ends_here : NULL;
   LF_PINBOX *pinbox= pins->pinbox;
 
-  npins= pinbox->pins_in_array+1;
+  /* Store info about current purgatory. */
+  struct st_match_and_save_arg arg = {pins, pinbox, pins->purgatory};
+  /* Reset purgatory. */
+  pins->purgatory= NULL;
+  pins->purgatory_count= 0;
 
-#ifdef HAVE_ALLOCA
-  if (stack_ends_here != NULL)
-  {
-    int alloca_size= sizeof(void *)*LF_PINBOX_PINS*npins;
-    /* create a sorted list of pinned addresses, to speed up searches */
-    if (available_stack_size(&pinbox, stack_ends_here) >
-        alloca_size + ALLOCA_SAFETY_MARGIN)
-    {
-      struct st_harvester hv;
-      addr= (void **) alloca(alloca_size);
-      hv.granary= addr;
-      hv.npins= npins;
-      /* scan the dynarray and accumulate all pinned addresses */
-     lf_dynarray_iterate(&pinbox->pinarray,
-                           (lf_dynarray_func)harvest_pins, &hv);
-
-      npins= (int)(hv.granary-addr);
-      /* and sort them */
-      if (npins)
-        qsort(addr, npins, sizeof(void *), (qsort_cmp)ptr_cmp);
-    }
-  }
-#endif
 
-  list= pins->purgatory;
-  pins->purgatory= 0;
-  pins->purgatory_count= 0;
-  while (list)
+  lf_dynarray_iterate(&pinbox->pinarray,
+                      (lf_dynarray_func)match_and_save, &arg);
+
+  if (arg.old_purgatory)
   {
-    void *cur= list;
-    list= *(void **)((char *)cur+pinbox->free_ptr_offset);
-    if (npins)
-    {
-      if (addr) /* use binary search */
-      {
-        void **a, **b, **c;
-        for (a= addr, b= addr+npins-1, c= a+(b-a)/2; (b-a) > 1; c= a+(b-a)/2)
-          if (cur == *c)
-            a= b= c;
-          else if (cur > *c)
-            a= c;
-          else
-            b= c;
-        if (cur == *a || cur == *b)
-          goto found;
-      }
-      else /* no alloca - no cookie. linear search here */
-      {
-        if (lf_dynarray_iterate(&pinbox->pinarray,
-                                 (lf_dynarray_func)match_pins, cur))
-          goto found;
-      }
-    }
-    /* not pinned - freeing */
-    if (last)
-      last= next_node(pinbox, last)= (uchar *)cur;
-    else
-      first= last= (uchar *)cur;
-    continue;
-found:
-    /* pinned - keeping */
-    add_to_purgatory(pins, cur);
+    /* Some objects in the old purgatory were not pinned, free them. */
+    void *last= arg.old_purgatory;
+    while (pnext_node(pinbox, last))
+      last= pnext_node(pinbox, last);
+    pinbox->free_func(arg.old_purgatory, last, pinbox->free_func_arg);
   }
-  if (last)
-    pinbox->free_func(first, last, pinbox->free_func_arg);
 }
 
+#define next_node(P, X) (*((uchar * volatile *)(((uchar *)(X)) + (P)->free_ptr_offset)))
+#define anext_node(X) next_node(&allocator->pinbox, (X))
+
 /* lock-free memory allocator for fixed-size objects */
 
 /*