.\"
.\" Copyright (C) 2002 Robert Love
.\" and Copyright (C) 2006, 2015 Michael Kerrisk
.\"
.\" SPDX-License-Identifier: GPL-2.0-or-later
.\"
.\" 2002-11-19 Robert Love <rml@tech9.net> - initial version
.\" 2004-04-20 mtk - fixed description of return value
.\" 2004-04-22 aeb - added glibc prototype history
.\" 2005-05-03 mtk - noted that sched_setaffinity may cause thread
.\" migration and that CPU affinity is a per-thread attribute.
.\" 2006-02-03 mtk -- Major rewrite
.\" 2008-11-12, mtk, removed CPU_*() macro descriptions to a
.\" separate CPU_SET(3) page.
.\"
.TH sched_setaffinity 2 2023-03-30 "Linux man-pages 6.04"
.SH NAME
sched_setaffinity, sched_getaffinity \- \
set and get a thread's CPU affinity mask
.SH LIBRARY
Standard C library
.RI ( libc ", " \-lc )
.SH SYNOPSIS
.nf
.BR "#define _GNU_SOURCE" " /* See feature_test_macros(7) */"
.B #include <sched.h>
.PP
.BI "int sched_setaffinity(pid_t " pid ", size_t " cpusetsize ,
.BI "                      const cpu_set_t *" mask );
.BI "int sched_getaffinity(pid_t " pid ", size_t " cpusetsize ,
.BI "                      cpu_set_t *" mask );
.fi
.SH DESCRIPTION
A thread's CPU affinity mask determines the set of CPUs on which
it is eligible to run.
On a multiprocessor system, setting the CPU affinity mask
can be used to obtain performance benefits.
For example,
by dedicating one CPU to a particular thread
(i.e., setting the affinity mask of that thread to specify a single CPU,
and setting the affinity mask of all other threads to exclude that CPU),
it is possible to ensure maximum execution speed for that thread.
Restricting a thread to run on a single CPU also avoids
the performance cost caused by the cache invalidation that occurs
when a thread ceases to execute on one CPU and then
recommences execution on a different CPU.
.PP
A CPU affinity mask is represented by the
.I cpu_set_t
structure, a "CPU set", pointed to by
.IR mask .
A set of macros for manipulating CPU sets is described in
.BR CPU_SET (3).
.PP
.BR sched_setaffinity ()
sets the CPU affinity mask of the thread whose ID is
.I pid
to the value specified by
.IR mask .
If
.I pid
is zero, then the calling thread is used.
The argument
.I cpusetsize
is the length (in bytes) of the data pointed to by
.IR mask .
Normally this argument would be specified as
.IR "sizeof(cpu_set_t)" .
.PP
If the thread specified by
.I pid
is not currently running on one of the CPUs specified in
.IR mask ,
then that thread is migrated to one of the CPUs specified in
.IR mask .
.PP
.BR sched_getaffinity ()
writes the affinity mask of the thread whose ID is
.I pid
into the
.I cpu_set_t
structure pointed to by
.IR mask .
The
.I cpusetsize
argument specifies the size (in bytes) of
.IR mask .
If
.I pid
is zero, then the mask of the calling thread is returned.
.SH RETURN VALUE
On success,
.BR sched_setaffinity ()
and
.BR sched_getaffinity ()
return 0 (but see "C library/kernel differences" below,
which notes that the underlying
.BR sched_getaffinity ()
differs in its return value).
On failure, \-1 is returned, and
.I errno
is set to indicate the error.
.SH ERRORS
.TP
.B EFAULT
A supplied memory address was invalid.
.TP
.B EINVAL
The affinity bit mask
.I mask
contains no processors that are currently physically on the system
and permitted to the thread according to any restrictions that
may be imposed by
.I cpuset
cgroups or the "cpuset" mechanism described in
.BR cpuset (7).
.TP
.B EINVAL
.RB ( sched_getaffinity ()
and, before Linux 2.6.9,
.BR sched_setaffinity ())
.I cpusetsize
is smaller than the size of the affinity mask used by the kernel.
.TP
.B EPERM
.RB ( sched_setaffinity ())
The calling thread does not have appropriate privileges.
The caller needs an effective user ID equal to the real user ID
or effective user ID of the thread identified by
.IR pid ,
or it must possess the
.B CAP_SYS_NICE
capability in the user namespace of the thread
.IR pid .
.TP
.B ESRCH
The thread whose ID is \fIpid\fP could not be found.
.SH STANDARDS
Linux.
.SH HISTORY
Linux 2.5.8,
glibc 2.3.
.PP
Initially, the glibc interfaces included a
.I cpusetsize
argument, typed as
.IR "unsigned int" .
In glibc 2.3.3, the
.I cpusetsize
argument was removed, but was then restored in glibc 2.3.4, with type
.IR size_t .
.SH NOTES
After a call to
.BR sched_setaffinity (),
the set of CPUs on which the thread will actually run is
the intersection of the set specified in the
.I mask
argument and the set of CPUs actually present on the system.
The system may further restrict the set of CPUs on which the thread
runs if the "cpuset" mechanism described in
.BR cpuset (7)
is being used.
These restrictions on the actual set of CPUs on which the thread
will run are silently imposed by the kernel.
.PP
There are various ways of determining the number of CPUs
available on the system, including: inspecting the contents of
.IR /proc/cpuinfo ;
using
.BR sysconf (3)
to obtain the values of the
.B _SC_NPROCESSORS_CONF
and
.B _SC_NPROCESSORS_ONLN
parameters; and inspecting the list of CPU directories under
.IR /sys/devices/system/cpu/ .
.PP
.BR sched (7)
has a description of the Linux scheduling scheme.
.PP
The affinity mask is a per-thread attribute that can be
adjusted independently for each of the threads in a thread group.
The value returned from a call to
.BR gettid (2)
can be passed in the argument
.IR pid .
Specifying
.I pid
as 0 will set the attribute for the calling thread,
and passing the value returned from a call to
.BR getpid (2)
will set the attribute for the main thread of the thread group.
(If you are using the POSIX threads API, then use
.BR pthread_setaffinity_np (3)
instead of
.BR sched_setaffinity ().)
.PP
The
.I isolcpus
boot option can be used to isolate one or more CPUs at boot time,
so that no processes are scheduled onto those CPUs.
Following the use of this boot option,
the only way to schedule processes onto the isolated CPUs is via
.BR sched_setaffinity ()
or the
.BR cpuset (7)
mechanism.
For further information, see the kernel source file
.IR Documentation/admin\-guide/kernel\-parameters.txt .
As noted in that file,
.I isolcpus
is the preferred mechanism of isolating CPUs
(versus the alternative of manually setting the CPU affinity
of all processes on the system).
.PP
A child created via
.BR fork (2)
inherits its parent's CPU affinity mask.
The affinity mask is preserved across an
.BR execve (2).
.SS C library/kernel differences
This manual page describes the glibc interface for the CPU affinity calls.
The actual system call interface is slightly different, with the
.I mask
being typed as
.IR "unsigned long\ *" ,
reflecting the fact that the underlying implementation of CPU
sets is a simple bit mask.
.PP
On success, the raw
.BR sched_getaffinity ()
system call returns the number of bytes copied into the
.I mask
buffer;
this will be the minimum of
.I cpusetsize
and the size (in bytes) of the
.I cpumask_t
data type that is used internally by the kernel to
represent the CPU set bit mask.
.SS Handling systems with large CPU affinity masks
The underlying system calls (which represent CPU masks as bit masks of type
.IR "unsigned long\ *" )
impose no restriction on the size of the CPU mask.
However, the
.I cpu_set_t
data type used by glibc has a fixed size of 128 bytes,
meaning that the maximum CPU number that can be represented is 1023.
.\" FIXME . See https://sourceware.org/bugzilla/show_bug.cgi?id=15630
.\" and https://sourceware.org/ml/libc-alpha/2013-07/msg00288.html
If the kernel CPU affinity mask is larger than 1024 bits,
then calls of the form:
.PP
.in +4n
.EX
sched_getaffinity(pid, sizeof(cpu_set_t), &mask);
.EE
.in
.PP
fail with the error
.BR EINVAL ,
the error produced by the underlying system call for the case where the
.I mask
size specified in
.I cpusetsize
is smaller than the size of the affinity mask used by the kernel.
(Depending on the system CPU topology, the kernel affinity mask can
be substantially larger than the number of active CPUs in the system.)
.PP
When working on systems with large kernel CPU affinity masks,
one must dynamically allocate the
.I mask
argument (see
.BR CPU_ALLOC (3)).
Currently, the only way to do this is by probing for the size
of the required mask using
.BR sched_getaffinity ()
calls with increasing mask sizes (until the call does not fail with the error
.BR EINVAL ).
.PP
Be aware that
.BR CPU_ALLOC (3)
may allocate a slightly larger CPU set than requested
(because CPU sets are implemented as bit masks allocated in units of
.IR sizeof(long) ).
Consequently,
.BR sched_getaffinity ()
can set bits beyond the requested allocation size, because the kernel
sees a few additional bits.
Therefore, the caller should iterate over the bits in the returned set,
counting those which are set, and stop upon reaching the value returned by
.BR CPU_COUNT (3)
(rather than iterating over the number of bits
requested to be allocated).
.SH EXAMPLES
The program below creates a child process.
The parent and child then each assign themselves to a specified CPU
and execute identical loops that consume some CPU time.
Before terminating, the parent waits for the child to complete.
The program takes three command-line arguments:
the CPU number for the parent,
the CPU number for the child,
and the number of loop iterations that both processes should perform.
.PP
As the sample runs below demonstrate, the amount of real and CPU time
consumed when running the program will depend on intra-core caching effects
and whether the processes are using the same CPU.
.PP
We first employ
.BR lscpu (1)
to determine that this (x86)
system has two cores, each with two CPUs:
.PP
.in +4n
.EX
$ \fBlscpu | egrep \-i \[aq]core.*:|socket\[aq]\fP
Thread(s) per core: 2
Core(s) per socket: 2
Socket(s): 1
.EE
.in
.PP
We then time the operation of the example program for three cases:
both processes running on the same CPU;
both processes running on different CPUs on the same core;
and both processes running on different CPUs on different cores.
.PP
.in +4n
.EX
$ \fBtime \-p ./a.out 0 0 100000000\fP
real 14.75
user 3.02
sys 11.73
$ \fBtime \-p ./a.out 0 1 100000000\fP
real 11.52
user 3.98
sys 19.06
$ \fBtime \-p ./a.out 0 3 100000000\fP
real 7.89
user 3.29
sys 12.07
.EE
.in
.SS Program source
\&
.\" SRC BEGIN (sched_setaffinity.c)
.EX
#define _GNU_SOURCE
#include <err.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <unistd.h>
int
main(int argc, char *argv[])
{
int parentCPU, childCPU;
cpu_set_t set;
unsigned int nloops;
if (argc != 4) {
fprintf(stderr, "Usage: %s parent\-cpu child\-cpu num\-loops\en",
argv[0]);
exit(EXIT_FAILURE);
}
parentCPU = atoi(argv[1]);
childCPU = atoi(argv[2]);
nloops = atoi(argv[3]);
CPU_ZERO(&set);
switch (fork()) {
case \-1: /* Error */
err(EXIT_FAILURE, "fork");
case 0: /* Child */
CPU_SET(childCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == \-1)
err(EXIT_FAILURE, "sched_setaffinity");
for (unsigned int j = 0; j < nloops; j++)
getppid();
exit(EXIT_SUCCESS);
default: /* Parent */
CPU_SET(parentCPU, &set);
if (sched_setaffinity(getpid(), sizeof(set), &set) == \-1)
err(EXIT_FAILURE, "sched_setaffinity");
for (unsigned int j = 0; j < nloops; j++)
getppid();
wait(NULL); /* Wait for child to terminate */
exit(EXIT_SUCCESS);
}
}
.EE
.\" SRC END
.SH SEE ALSO
.ad l
.nh
.BR lscpu (1),
.BR nproc (1),
.BR taskset (1),
.BR clone (2),
.BR getcpu (2),
.BR getpriority (2),
.BR gettid (2),
.BR nice (2),
.BR sched_get_priority_max (2),
.BR sched_get_priority_min (2),
.BR sched_getscheduler (2),
.BR sched_setscheduler (2),
.BR setpriority (2),
.BR CPU_SET (3),
.BR get_nprocs (3),
.BR pthread_setaffinity_np (3),
.BR sched_getcpu (3),
.BR capabilities (7),
.BR cpuset (7),
.BR sched (7),
.BR numactl (8)