PostgreSQL Source Code git master
pg_numa.c
Go to the documentation of this file.
1/*-------------------------------------------------------------------------
2 *
3 * pg_numa.c
4 * Basic NUMA portability routines
5 *
6 *
7 * Copyright (c) 2025, PostgreSQL Global Development Group
8 *
9 *
10 * IDENTIFICATION
11 * src/port/pg_numa.c
12 *
13 *-------------------------------------------------------------------------
14 */
15
16#include "c.h"
17#include <unistd.h>
18
19#include "miscadmin.h"
20#include "port/pg_numa.h"
21
22/*
23 * At this point we provide support only for Linux thanks to libnuma, but in
24 * future support for other platforms e.g. Win32 or FreeBSD might be possible
25 * too. For Win32 NUMA APIs see
26 * https://learn.microsoft.com/en-us/windows/win32/procthread/numa-support
27 */
28#ifdef USE_LIBNUMA
29
30#include <numa.h>
31#include <numaif.h>
32
/*
 * Number of pointers handed to a single numa_move_pages() call.
 *
 * On 32-bit builds this must not exceed 16: the kernel's do_pages_stat()
 * processes its input in chunks of DO_PAGES_STAT_CHUNK_NR entries and has a
 * bug there.  Querying in chunks of the same size sidesteps the bug, so the
 * code also works on kernels that have not been fixed.
 *
 * 64-bit systems are not affected by the bug, so they use much larger
 * chunks.
 */
#if SIZEOF_SIZE_T == 4
#define NUMA_QUERY_CHUNK_SIZE 16
#else
#define NUMA_QUERY_CHUNK_SIZE 1024
#endif
45
/*
 * pg_numa_init
 *		Initialize libnuma, as numa(3) requires on Linux before any other
 *		libnuma call.
 *
 * Returns -1 when NUMA must be treated as unavailable, otherwise whatever
 * numa_available() returns.
 */
int
pg_numa_init(void)
{
	/*
	 * XXX libnuma releases older than 2.0.19 do not disable NUMA when
	 * get_mempolicy() fails with EPERM, which causes unexpected failures
	 * later on.  That happens in containers whose seccomp profile blocks
	 * get_mempolicy, so probe for it here and report NUMA as unavailable.
	 */
	if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == EPERM)
		return -1;

	return numa_available();
}
64
65/*
66 * We use move_pages(2) syscall here - instead of get_mempolicy(2) - as the
67 * first one allows us to batch and query about many memory pages in one single
68 * giant system call that is way faster.
69 *
70 * We call numa_move_pages() for smaller chunks of the whole array. The first
71 * reason is to work around a kernel bug, but also to allow interrupting the
72 * query between the calls (for many pointers processing the whole array can
73 * take a lot of time).
74 */
75int
76pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
77{
78 unsigned long next = 0;
79 int ret = 0;
80
81 /*
82 * Chunk pointers passed to numa_move_pages to NUMA_QUERY_CHUNK_SIZE
83 * items, to work around a kernel bug in do_pages_stat().
84 */
85 while (next < count)
86 {
87 unsigned long count_chunk = Min(count - next,
88 NUMA_QUERY_CHUNK_SIZE);
89
91
92 /*
93 * Bail out if any of the chunks errors out (ret<0). We ignore (ret>0)
94 * which is used to return number of nonmigrated pages, but we're not
95 * migrating any pages here.
96 */
97 ret = numa_move_pages(pid, count_chunk, &pages[next], NULL, &status[next], 0);
98 if (ret < 0)
99 {
100 /* plain error, return as is */
101 return ret;
102 }
103
104 next += count_chunk;
105 }
106
107 /* should have consumed the input array exactly */
108 Assert(next == count);
109
110 return 0;
111}
112
/*
 * pg_numa_get_max_node
 *		Thin wrapper that returns the result of libnuma's numa_max_node()
 *		unchanged (per numa(3), the highest node number on the system).
 */
int
pg_numa_get_max_node(void)
{
	return numa_max_node();
}
118
119#else
120
121/* Empty wrappers */
/*
 * pg_numa_init
 *		Stub used when PostgreSQL is built without libnuma.
 *
 * Always returns -1, i.e. NUMA is reported as not available.
 */
int
pg_numa_init(void)
{
	/* We state that NUMA is not available */
	return -1;
}
128
/*
 * pg_numa_query_pages
 *		Stub used when PostgreSQL is built without libnuma.
 *
 * Ignores all of its arguments, leaves the status array untouched and
 * always returns 0.
 */
int
pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
{
	/* nothing to query without NUMA support */
	return 0;
}
134
/*
 * pg_numa_get_max_node
 *		Stub used when PostgreSQL is built without libnuma.
 *
 * Always returns 0.
 */
int
pg_numa_get_max_node(void)
{
	return 0;
}
140
141#endif
static int32 next
Definition: blutils.c:224
#define Min(x, y)
Definition: c.h:1008
Assert(PointerIsAligned(start, uint64))
#define CHECK_FOR_INTERRUPTS()
Definition: miscadmin.h:123
int pg_numa_query_pages(int pid, unsigned long count, void **pages, int *status)
Definition: pg_numa.c:130
int pg_numa_init(void)
Definition: pg_numa.c:123
int pg_numa_get_max_node(void)
Definition: pg_numa.c:136