Commit a6fbe36
nptl: Add support for setup guard pages with MADV_GUARD_INSTALL
Linux 6.13 (662df3e5c3766) added a lightweight way to define guard areas through the madvise syscall. Instead of PROT_NONE-protecting the guard region through mprotect, userland can madvise the same area with a special flag, and the kernel ensures that accessing the area triggers a SIGSEGV (as for a PROT_NONE mapping).

The madvise approach has the advantage of lower kernel memory consumption for the process page table (one fewer VMA per guard area) and slightly less kernel contention (also due to the fewer VMA areas being tracked).

pthread_create allocates a new thread stack in two ways: if a guard area is set (the default), it allocates the required memory range with PROT_NONE and then mprotects the usable stack area. Otherwise, if a guard page is not set, it allocates the region with the required flags directly.

For MADV_GUARD_INSTALL support, the stack region is allocated with the required flags and then the guard region is installed. If the kernel does not support it, the usual way is used instead (and MADV_GUARD_INSTALL is disabled for future stack creations).

The stack allocation strategy is recorded in the pthread struct, and it is used in case the guard region needs to be resized. To avoid needing an extra field, 'user_stack' is repurposed and renamed to 'stack_mode'.

This patch also adds a proper test for the pthread guard.

I checked on x86_64, aarch64, powerpc64le, and hppa with kernel 6.13.0-rc7.

Reviewed-by: DJ Delorie <[email protected]>
1 parent 8e86549 commit a6fbe36
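
The core idea is easy to demonstrate outside glibc. Below is a minimal, self-contained sketch of the strategy the commit message describes: map the stack with its final protection flags, try MADV_GUARD_INSTALL on the guard range, and fall back to the classic mprotect/PROT_NONE guard when the kernel predates 6.13. This is an illustration, not the patch's code; the MADV_GUARD_INSTALL fallback define uses the Linux 6.13 uapi value, and error handling is reduced to the minimum.

/* Sketch of the guard-setup fallback described above.  Not glibc code. */
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
# define MADV_GUARD_INSTALL 102   /* Linux 6.13 uapi value (assumed here). */
#endif

int
main (void)
{
  long pagesize = sysconf (_SC_PAGESIZE);
  size_t guardsize = pagesize;
  size_t stacksize = 16 * pagesize;

  /* Allocate the whole range with the final protection flags; with
     MADV_GUARD_INSTALL no extra PROT_NONE mapping is needed.  */
  char *mem = mmap (NULL, stacksize, PROT_READ | PROT_WRITE,
                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
  if (mem == MAP_FAILED)
    return 1;

  if (madvise (mem, guardsize, MADV_GUARD_INSTALL) == 0)
    puts ("guard set with MADV_GUARD_INSTALL (no extra VMA)");
  /* An error here means the kernel does not know the advice; fall back
     to the classic PROT_NONE guard, which splits the VMA.  */
  else if (mprotect (mem, guardsize, PROT_NONE) == 0)
    puts ("guard set with mprotect/PROT_NONE (extra VMA)");
  else
    return 1;

  munmap (mem, stacksize);
  return 0;
}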

File tree: 10 files changed (+561, -95 lines)

nptl/Makefile

Lines changed: 1 addition & 0 deletions
@@ -289,6 +289,7 @@ tests = \
   tst-dlsym1 \
   tst-exec4 \
   tst-exec5 \
+  tst-guard1 \
   tst-initializers1 \
   tst-initializers1-c11 \
   tst-initializers1-c89 \

nptl/TODO-testing

Lines changed: 0 additions & 4 deletions
@@ -1,7 +1,3 @@
-pthread_attr_setguardsize
-
-  test effectiveness
-
 pthread_attr_[sg]etschedparam
 
   what to test?

nptl/allocatestack.c

Lines changed: 177 additions & 86 deletions
@@ -146,10 +146,37 @@ get_cached_stack (size_t *sizep, void **memp)
   return result;
 }
 
+/* Assume support for MADV_GUARD_INSTALL; setup_stack_prot will disable it
+   and fall back to ALLOCATE_GUARD_PROT_NONE if the madvise call fails.  */
+static int allocate_stack_mode = ALLOCATE_GUARD_MADV_GUARD;
+
+static inline int stack_prot (void)
+{
+  return (PROT_READ | PROT_WRITE
+          | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
+}
+
+static void *
+allocate_thread_stack (size_t size, size_t guardsize)
+{
+  /* MADV_GUARD_INSTALL does not require an additional PROT_NONE mapping.  */
+  int prot = stack_prot ();
+
+  if (atomic_load_relaxed (&allocate_stack_mode) == ALLOCATE_GUARD_PROT_NONE)
+    /* If a guard page is required, avoid committing memory by first
+       allocating with PROT_NONE and then reserving with the required
+       permissions, excluding the guard page.  */
+    prot = guardsize == 0 ? prot : PROT_NONE;
+
+  return __mmap (NULL, size, prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK,
+                 -1, 0);
+}
+
 /* Return the guard page position on allocated stack.  */
 static inline char *
 __attribute ((always_inline))
-guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
+guard_position (void *mem, size_t size, size_t guardsize, const struct pthread *pd,
                 size_t pagesize_m1)
 {
 #if _STACK_GROWS_DOWN
@@ -159,27 +186,131 @@ guard_position (void *mem, size_t size, size_t guardsize, struct pthread *pd,
 #endif
 }
 
-/* Based on stack allocated with PROT_NONE, setup the required portions with
-   'prot' flags based on the guard page position.  */
-static inline int
-setup_stack_prot (char *mem, size_t size, char *guard, size_t guardsize,
-                  const int prot)
+/* Set up the MEM thread stack of SIZE bytes with the required protection
+   flags, along with a guard area of GUARDSIZE bytes.  It first tries
+   MADV_GUARD_INSTALL, and then falls back to setting up the guard area
+   using an extra PROT_NONE mapping.  Update PD with the type of guard
+   area setup.  */
+static inline bool
+setup_stack_prot (char *mem, size_t size, struct pthread *pd,
+                  size_t guardsize, size_t pagesize_m1)
 {
-  char *guardend = guard + guardsize;
+  if (__glibc_unlikely (guardsize == 0))
+    return true;
+
+  char *guard = guard_position (mem, size, guardsize, pd, pagesize_m1);
+  if (atomic_load_relaxed (&allocate_stack_mode) == ALLOCATE_GUARD_MADV_GUARD)
+    {
+      if (__madvise (guard, guardsize, MADV_GUARD_INSTALL) == 0)
+        {
+          pd->stack_mode = ALLOCATE_GUARD_MADV_GUARD;
+          return true;
+        }
+
+      /* If madvise fails it means the kernel does not support the guard
+         advice (we assume that the syscall is available, guard is
+         page-aligned, and length is non-negative).  The stack already has
+         the expected protection flags, so it just needs to PROT_NONE the
+         guard area.  */
+      atomic_store_relaxed (&allocate_stack_mode, ALLOCATE_GUARD_PROT_NONE);
+      if (__mprotect (guard, guardsize, PROT_NONE) != 0)
+        return false;
+    }
+  else
+    {
+      const int prot = stack_prot ();
+      char *guardend = guard + guardsize;
 #if _STACK_GROWS_DOWN
-  /* As defined at guard_position, for architectures with downward stack
-     the guard page is always at start of the allocated area.  */
-  if (__mprotect (guardend, size - guardsize, prot) != 0)
-    return errno;
+      /* As defined at guard_position, for architectures with downward stack
+         the guard page is always at the start of the allocated area.  */
+      if (__mprotect (guardend, size - guardsize, prot) != 0)
+        return false;
 #else
-  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
-  if (__mprotect (mem, mprots1, prot) != 0)
-    return errno;
-  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
-  if (__mprotect (guardend, mprots2, prot) != 0)
-    return errno;
+      size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
+      if (__mprotect (mem, mprots1, prot) != 0)
+        return false;
+      size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
+      if (__mprotect (guardend, mprots2, prot) != 0)
+        return false;
 #endif
-  return 0;
+    }
+
+  pd->stack_mode = ALLOCATE_GUARD_PROT_NONE;
+  return true;
+}
+
+/* Update the guard area of the thread stack MEM of size SIZE with the new
+   GUARDSIZE.  It uses the method defined by PD stack_mode.  */
+static inline bool
+adjust_stack_prot (char *mem, size_t size, const struct pthread *pd,
+                   size_t guardsize, size_t pagesize_m1)
+{
+  /* The required guard area is larger than the current one.  For
+     _STACK_GROWS_DOWN it means the guard should increase as:
+
+       |guard|---------------------------------stack|
+       |new guard--|---------------------------stack|
+
+     while for _STACK_GROWS_UP:
+
+       |stack---------------------------|guard|-----|
+       |stack--------------------|new guard---|-----|
+
+     Both madvise and mprotect allow the required region to overlap,
+     so use the new guard placement with the new size.  */
+  if (guardsize > pd->guardsize)
+    {
+      char *guard = guard_position (mem, size, guardsize, pd, pagesize_m1);
+      if (pd->stack_mode == ALLOCATE_GUARD_MADV_GUARD)
+        return __madvise (guard, guardsize, MADV_GUARD_INSTALL) == 0;
+      else if (pd->stack_mode == ALLOCATE_GUARD_PROT_NONE)
+        return __mprotect (guard, guardsize, PROT_NONE) == 0;
+    }
+  /* The current guard area is larger than the required one.  For
+     _STACK_GROWS_DOWN it means changing the guard as:
+
+       |guard-------|-------------------------stack|
+       |new guard|----------------------------stack|
+
+     And for _STACK_GROWS_UP:
+
+       |stack---------------------|guard-------|---|
+       |stack------------------------|new guard|---|
+
+     For ALLOCATE_GUARD_MADV_GUARD it means removing the slack area
+     (the disjoint region between guard and new guard), while for
+     ALLOCATE_GUARD_PROT_NONE it requires mprotect with the stack
+     protection flags.  */
+  else if (pd->guardsize > guardsize)
+    {
+      size_t slacksize = pd->guardsize - guardsize;
+      if (pd->stack_mode == ALLOCATE_GUARD_MADV_GUARD)
+        {
+          void *slack =
+#if _STACK_GROWS_DOWN
+            mem + guardsize;
+#else
+            guard_position (mem, size, pd->guardsize, pd, pagesize_m1);
+#endif
+          return __madvise (slack, slacksize, MADV_GUARD_REMOVE) == 0;
+        }
+      else if (pd->stack_mode == ALLOCATE_GUARD_PROT_NONE)
+        {
+          const int prot = stack_prot ();
+#if _STACK_GROWS_DOWN
+          return __mprotect (mem + guardsize, slacksize, prot) == 0;
+#else
+          char *new_guard = (char *)(((uintptr_t) pd - guardsize)
+                                     & ~pagesize_m1);
+          char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
                                     & ~pagesize_m1);
+          /* The guard size difference might be > 0, but once rounded
+             to the nearest page the size difference might be zero.  */
+          if (new_guard > old_guard
+              && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
+            return false;
+#endif
+        }
+    }
+  return true;
 }
 
 /* Mark the memory of the stack as usable to the kernel.  It frees everything
@@ -291,7 +422,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 
       /* This is a user-provided stack.  It will not be queued in the
         stack cache nor will the memory (except the TLS memory) be freed.  */
-      pd->user_stack = true;
+      pd->stack_mode = ALLOCATE_GUARD_USER;
 
       /* This is at least the second thread.  */
       pd->header.multiple_threads = 1;
@@ -325,10 +456,7 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
   /* Allocate some anonymous memory.  If possible use the cache.  */
   size_t guardsize;
   size_t reported_guardsize;
-  size_t reqsize;
   void *mem;
-  const int prot = (PROT_READ | PROT_WRITE
-                    | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 
   /* Adjust the stack size for alignment.  */
   size &= ~tls_static_align_m1;
@@ -358,16 +486,10 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
     return EINVAL;
 
   /* Try to get a stack from the cache.  */
-  reqsize = size;
   pd = get_cached_stack (&size, &mem);
   if (pd == NULL)
     {
-      /* If a guard page is required, avoid committing memory by first
-         allocate with PROT_NONE and then reserve with required permission
-         excluding the guard page.  */
-      mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
-                    MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
-
+      mem = allocate_thread_stack (size, guardsize);
       if (__glibc_unlikely (mem == MAP_FAILED))
        return errno;
 
@@ -394,15 +516,10 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 #endif
 
      /* Now mprotect the required region excluding the guard area.  */
-      if (__glibc_likely (guardsize > 0))
+      if (!setup_stack_prot (mem, size, pd, guardsize, pagesize_m1))
       {
-         char *guard = guard_position (mem, size, guardsize, pd,
-                                       pagesize_m1);
-         if (setup_stack_prot (mem, size, guard, guardsize, prot) != 0)
-           {
-             __munmap (mem, size);
-             return errno;
-           }
+         __munmap (mem, size);
+         return errno;
       }
 
      /* Remember the stack-related values.  */
@@ -456,59 +573,31 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
        which will be read next.  */
     }
 
-  /* Create or resize the guard area if necessary.  */
-  if (__glibc_unlikely (guardsize > pd->guardsize))
+  /* Create or resize the guard area if necessary on an already
+     allocated stack.  */
+  if (!adjust_stack_prot (mem, size, pd, guardsize, pagesize_m1))
     {
-      char *guard = guard_position (mem, size, guardsize, pd,
-                                    pagesize_m1);
-      if (__mprotect (guard, guardsize, PROT_NONE) != 0)
-        {
-        mprot_error:
-          lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
-
-          /* Remove the thread from the list.  */
-          __nptl_stack_list_del (&pd->list);
+      lll_lock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
-          lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
+      /* Remove the thread from the list.  */
+      __nptl_stack_list_del (&pd->list);
 
-          /* Get rid of the TLS block we allocated.  */
-          _dl_deallocate_tls (TLS_TPADJ (pd), false);
+      lll_unlock (GL (dl_stack_cache_lock), LLL_PRIVATE);
 
-          /* Free the stack memory regardless of whether the size
-             of the cache is over the limit or not.  If this piece
-             of memory caused problems we better do not use it
-             anymore.  Uh, and we ignore possible errors.  There
-             is nothing we could do.  */
-          (void) __munmap (mem, size);
+      /* Get rid of the TLS block we allocated.  */
+      _dl_deallocate_tls (TLS_TPADJ (pd), false);
 
-          return errno;
-        }
+      /* Free the stack memory regardless of whether the size
+         of the cache is over the limit or not.  If this piece
+         of memory caused problems we better do not use it
+         anymore.  Uh, and we ignore possible errors.  There
+         is nothing we could do.  */
+      (void) __munmap (mem, size);
 
-      pd->guardsize = guardsize;
+      return errno;
     }
-  else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
-                             0))
-    {
-      /* The old guard area is too large.  */
-
-#if _STACK_GROWS_DOWN
-      if (__mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
-                      prot) != 0)
-        goto mprot_error;
-#elif _STACK_GROWS_UP
-      char *new_guard = (char *)(((uintptr_t) pd - guardsize)
-                                 & ~pagesize_m1);
-      char *old_guard = (char *)(((uintptr_t) pd - pd->guardsize)
-                                 & ~pagesize_m1);
-      /* The guard size difference might be > 0, but once rounded
-         to the nearest page the size difference might be zero.  */
-      if (new_guard > old_guard
-          && __mprotect (old_guard, new_guard - old_guard, prot) != 0)
-        goto mprot_error;
-#endif
 
-      pd->guardsize = guardsize;
-    }
+  pd->guardsize = guardsize;
   /* The pthread_getattr_np() calls need to get passed the size
      requested in the attribute, regardless of how large the
      actually used guardsize is.  */
@@ -568,19 +657,21 @@ allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 static void
 name_stack_maps (struct pthread *pd, bool set)
 {
+  size_t adjust = pd->stack_mode == ALLOCATE_GUARD_PROT_NONE
+                  ? pd->guardsize : 0;
 #if _STACK_GROWS_DOWN
-  void *stack = pd->stackblock + pd->guardsize;
+  void *stack = pd->stackblock + adjust;
 #else
   void *stack = pd->stackblock;
 #endif
-  size_t stacksize = pd->stackblock_size - pd->guardsize;
+  size_t stacksize = pd->stackblock_size - adjust;
 
   if (!set)
-    __set_vma_name (stack, stacksize, NULL);
+    __set_vma_name (stack, stacksize, " glibc: unused stack");
  else
    {
      unsigned int tid = pd->tid;
-      if (pd->user_stack)
+      if (pd->stack_mode == ALLOCATE_GUARD_USER)
       SET_STACK_NAME (" glibc: pthread user stack: ", stack, stacksize, tid);
      else
       SET_STACK_NAME (" glibc: pthread stack: ", stack, stacksize, tid);

nptl/descr.h

Lines changed: 7 additions & 1 deletion
@@ -125,6 +125,12 @@ struct priority_protection_data
   unsigned int priomap[];
 };
 
+enum allocate_stack_mode_t
+{
+  ALLOCATE_GUARD_MADV_GUARD = 0,
+  ALLOCATE_GUARD_PROT_NONE = 1,
+  ALLOCATE_GUARD_USER = 2,
+};
 
 /* Thread descriptor data structure.  */
 struct pthread
@@ -324,7 +330,7 @@ struct pthread
   bool report_events;
 
   /* True if the user provided the stack.  */
-  bool user_stack;
+  enum allocate_stack_mode_t stack_mode;
 
   /* True if thread must stop at startup time.  */
   bool stopped_start;

nptl/nptl-stack.c

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ __nptl_deallocate_stack (struct pthread *pd)
      not reset the 'used' flag in the 'tid' field.  This is done by
      the kernel.  If no thread has been created yet this field is
      still zero.  */
-  if (__glibc_likely (! pd->user_stack))
+  if (__glibc_likely (pd->stack_mode != ALLOCATE_GUARD_USER))
     (void) queue_stack (pd);
   else
     /* Free the memory associated with the ELF TLS.  */
