Beepboop
Beepboop

Reputation: 41

Problem after installing a signal handler with 'sigaction' in a standalone multi-threaded executable: SIGSEGV from the kernel when the handled signal is received

I am trying to make a standalone (without libc) multithreaded executable on Linux that can be controlled with signals and a Unix-domain socket, but I am having trouble setting up the signal handler.

This is the part of the code that I am having trouble with.

#include <linux/futex.h>
#include <linux/signal.h>
#include <linux/time.h>
#include <kernel.h>
#include <mm.h>
#include <fs.h>
#include <bit.h>
#include <text-format.h>

/* Size in bytes of each thread stack: used as the mmap length and as clone_args.stack_size in create_thread(). */
#define STACKSIZE 16384
/* Number of worker threads; also the length of the tprops array. */
#define CORES_COUNT 1
/* coreID given to the first worker; worker i gets STARTING_CORE + i (see threads_test()). */
#define STARTING_CORE 1

/*
 * Per-thread bookkeeping shared between the creating thread and the worker.
 * The tid and stopped members are used as futex words.
 */
struct thread_props {
    /* Set to gettid() by the new thread in create_thread(); reset to 0 and
     * futex-woken at the end of thread_function(). wait_thread() waits on it. */
    pid_t tid;
    /* Copy of tid written by the new thread; wait_thread() passes it as the
     * expected futex value. */
    pid_t tret;
    /* CPU number the worker adds to its affinity mask in thread_function(). */
    int32_t coreID;
    /* Futex word: set to 1 in create_thread(); the worker does FUTEX_WAIT on it
     * with expected value 1 and wake_threads() issues FUTEX_WAKE on it. */
    int32_t stopped;
    /* CLOCK_THREAD_CPUTIME_ID readings taken before and after the work section. */
    struct timespec start;
    struct timespec end;
    /* Argument block passed to clone3() by create_thread(). */
    struct clone_args clargs;
};

/*
 * Minimal CPU affinity mask helpers for sched_setaffinity().
 *
 * A mask is an array of CPUSET_PARTS words of type cpuset_part_t, covering
 * CPUs 0 .. MAX_CPUSET_BITS - 1. The macros do no range checking: passing a
 * cpu >= MAX_CPUSET_BITS indexes past the end of the array.
 *
 * Every macro argument is parenthesized so that expressions such as
 * CPUSET_ADD(set, base + i) index and shift as intended. The cpu argument
 * is still evaluated more than once, so it must not have side effects.
 *
 * NOTE(review): the "+ 1" suggests BIT_SET/BIT_CLR/BIT_TOGGLE from <bit.h>
 * take a 1-based bit position -- confirm against bit.h.
 */
typedef uint32_t cpuset_part_t;
#define MAX_CPUSET_BITS 32
#define CPUSET_PART_BITS (sizeof(cpuset_part_t) * 8)
#define CPUSET_PARTS ((MAX_CPUSET_BITS / 8) / sizeof(cpuset_part_t))
/* Clear every CPU in the mask. */
#define CPUSET_ZERO(cpuset) memset((uint8_t *) (cpuset), 0, CPUSET_PARTS * sizeof(cpuset_part_t))
/* Add, remove or flip a single CPU in the mask. */
#define CPUSET_ADD(cpuset, cpu) ((cpuset)[(cpu) / CPUSET_PART_BITS] = BIT_SET((cpuset)[(cpu) / CPUSET_PART_BITS], ((cpu) % CPUSET_PART_BITS) + 1))
#define CPUSET_RM(cpuset, cpu) ((cpuset)[(cpu) / CPUSET_PART_BITS] = BIT_CLR((cpuset)[(cpu) / CPUSET_PART_BITS], ((cpu) % CPUSET_PART_BITS) + 1))
#define CPUSET_TOGGLE(cpuset, cpu) ((cpuset)[(cpu) / CPUSET_PART_BITS] = BIT_TOGGLE((cpuset)[(cpu) / CPUSET_PART_BITS], ((cpu) % CPUSET_PART_BITS) + 1))
/* Select every CPU in the mask; memset takes a byte value, so use 0xFF. */
#define CPUSET_ALLON(cpuset) memset((uint8_t *) (cpuset), 0xFF, CPUSET_PARTS * sizeof(cpuset_part_t))

/*
 * Body of a worker thread.
 *
 * Pins the calling thread to CPU tp->coreID, parks on the tp->stopped futex
 * until it is woken, records the thread CPU time before and after the work
 * section in tp->start / tp->end, then clears tp->tid and wakes one waiter
 * on it to report completion.
 *
 * tp: bookkeeping slot of this thread; tp->stopped is expected to be 1 on
 *     entry (create_thread() sets it).
 * Returns 0.
 */
static int32_t thread_function(struct thread_props *tp)
{
    cpuset_part_t cpuset[CPUSET_PARTS];
    CPUSET_ZERO(cpuset);
    CPUSET_ADD(cpuset, tp->coreID);
    sched_setaffinity(0, sizeof(cpuset), (uint64_t) cpuset);

    printf("before stop: tid %sl:d\n", tp->tid);

    /* Sleep for as long as the futex word still holds 1; returns at once if it no longer does. */
    futex((uint64_t) &tp->stopped, FUTEX_WAIT, 1, (uint64_t) NULL, (uint64_t) NULL, 0);

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, (uint64_t) &tp->start);

    /* The terminating semicolon was missing here, which broke compilation. */
    printf("doing some works here\n");

    clock_gettime(CLOCK_THREAD_CPUTIME_ID, (uint64_t) &tp->end);

    /* Publish completion: change the futex word first, then wake the waiter in wait_thread(). */
    tp->tid = 0;
    futex((uint64_t) &tp->tid, FUTEX_WAKE, 1, (uint64_t) NULL, (uint64_t) NULL, 0);
    return 0;
}

/*
 * Start one worker thread with clone3().
 *
 * Maps a private STACKSIZE-byte stack, fills tp->clargs and clones a thread
 * that shares the address space, filesystem info, file table and signal
 * handlers with the caller. In the new thread this function records the
 * thread id in tp->tid / tp->tret, runs thread_function(tp) and calls
 * exit(0); it never returns there. In the calling thread it returns right
 * after clone3().
 *
 * tp: bookkeeping slot for the thread; tp->coreID must be set by the caller.
 *
 * On failure (mmap or clone3) a message is printed and the slot is marked
 * with tid = 0 and tret = -1. Because tid then differs from tret, a later
 * wait_thread(tp) returns immediately instead of blocking forever on a
 * thread that was never started.
 */
void create_thread(struct thread_props *tp) {
    printf("Started\n");
    tp->stopped = 1;

    tp->clargs.flags = (CLONE_SYSVSEM | CLONE_IO | CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_THREAD);
    tp->clargs.exit_signal = 0;
    tp->clargs.parent_tid = 0;
    tp->clargs.pidfd = 0;
    tp->clargs.cgroup = 0;
    tp->clargs.set_tid = 0;
    tp->clargs.set_tid_size = 0;
    tp->clargs.tls = 0;
    tp->clargs.child_tid = 0;

    /* clone3 takes the lowest address of the stack area plus its size. */
    tp->clargs.stack = (uint64_t) mmap((uint64_t) NULL, STACKSIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
    /* NOTE(review): assumes the mmap wrapper reports failure as -1 -- confirm against mm.h. */
    if(tp->clargs.stack == (uint64_t) -1) {
        printf("mmap failed");
        tp->tid = 0;
        tp->tret = -1;
        return;
    }
    printf("Initialized: coreID = %sl:d\n", tp->coreID);
    tp->clargs.stack_size = STACKSIZE;

    int64_t clret = clone3((uint64_t) &tp->clargs, (uint64_t) sizeof(struct clone_args));
    if(clret < 0) {
        /* Previously ignored: no thread exists, so nothing will ever clear tp->tid. */
        /* NOTE(review): the stack mapping is not released here; unmap it if mm.h offers munmap. */
        printf("clone3 failed\n");
        tp->tid = 0;
        tp->tret = -1;
        return;
    }
    if(clret == 0) {
        /* New thread, running on the freshly mapped stack. */
        tp->tid = gettid();
        tp->tret = tp->tid;
        printf("entered tid: %sl:d\n", tp->tret);
        thread_function(tp);
        printf("exiting tid: %sl:d\n", gettid());
        /* NOTE(review): presumably exit() ends only this thread; verify it is not exit_group. */
        exit(0);
    }
}

void wait_thread(struct thread_props *tp) {
    futex((uint64_t) &tp->tid, FUTEX_WAIT, tp->tret, (uint64_t) NULL, (uint64_t) NULL, 0);
}

/* One bookkeeping slot per worker thread; file scope so that the signal handler wake_threads() can reach the stop futexes. */
static struct thread_props tprops[CORES_COUNT];

/*
 * SIGUSR1 handler: release every worker parked on its stop futex.
 *
 * The futex word is cleared before FUTEX_WAKE is issued. A bare wake-up is
 * lost if it arrives before the worker has entered FUTEX_WAIT; with the word
 * set to 0, a worker that waits later sees a value different from the
 * expected 1 and continues immediately.
 *
 * signo: number of the delivered signal (unused).
 */
void wake_threads(int32_t signo) {
    (void) signo;
    printf("signaled\n");
    for(int32_t core = 0; core < CORES_COUNT; ++core) {
        tprops[core].stopped = 0;
        futex((uint64_t) &tprops[core].stopped, FUTEX_WAKE, CORES_COUNT, (uint64_t) NULL, (uint64_t) NULL, 0);
    }
}

int32_t threads_test() {
    cpuset_part_t cpuset[CPUSET_PARTS];
    CPUSET_ZERO(cpuset);
    CPUSET_ADD(cpuset, 0); // core 0 here
    sched_setaffinity(0, sizeof(cpuset), (uint64_t) cpuset);

    printf("entered tgid: %sl:d\n", gettid());

    struct sigaction siganew = {};
    struct sigaction sigaold = {};
    siganew.sa_handler = wake_threads;
    siganew.sa_restorer = NULL;
    siganew.sa_mask = 0;
    siganew.sa_flags = SA_RESTART; // for the futexes to restart after signal
    rt_sigaction(SIGUSR1, (uint64_t) &siganew, (uint64_t) &sigaold, sizeof(sigset_t));

    for(int32_t core = 0; core < CORES_COUNT; ++core) {
        tprops[core].coreID = STARTING_CORE + core;
        create_thread(&tprops[core]);
    }

    struct timespec waittime;
    waittime.tv_sec = 1;
    waittime.tv_nsec = 0;
    // this will effectively give enough time for the threads to hook up on their stop futex
    clock_nanosleep(CLOCK_MONOTONIC, 0, (uint64_t) &waittime, (uint64_t) NULL);

    for(int32_t core = 0; core < CORES_COUNT; ++core)
        wait_thread(&tprops[core]);

    printf("exiting tgid: %sl:d\n", gettid());
    return 0;
}

/*
 * Program entry point: announces the test, runs threads_test() and reports
 * completion. The result of threads_test() is deliberately ignored.
 * Always returns 0.
 */
int32_t main() {
    printf("entering threads test\n");

    (void) threads_test();

    printf("finished threads test\n");
    return 0;
}

I used gdb to find out what is happening, but a close look at the strace output shows that the SIGSEGV is sent by the kernel (si_code=SI_KERNEL).

This is the strace output before I send the signal. The program is correctly waiting for it.

$ strace -f output64/test.threads64
execve("output64/test.threads64", ["output64/test.threads64"], 0x7ffd61325aa8 /* 44 vars */) = 0
write(1, "entering threads test\n", 22entering threads test
) = 22
sched_setaffinity(0, 4, [0])            = 0
gettid()                                = 23568
write(1, "entered tgid: 23568\n", 20entered tgid: 23568
)   = 20
rt_sigaction(SIGUSR1, {sa_handler=0x7fb47a12f720, sa_mask=[], sa_flags=SA_RESTART}, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0}, 8) = 0
write(1, "Started\n", 8Started
)                = 8
mmap(NULL, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fb47a121000
write(1, "Initialized: coreID = 1\n", 24Initialized: coreID = 1
) = 24
clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_IO, exit_signal=0, stack=0x7fb47a121000, stack_size=0x4000}, 88strace: Process 23569 attached
) = 23569
[pid 23569] gettid( <unfinished ...>
[pid 23568] clock_nanosleep(CLOCK_MONOTONIC, 0, {tv_sec=1, tv_nsec=0},  <unfinished ...>
[pid 23569] <... gettid resumed>)       = 23569
[pid 23569] write(1, "entered tid: 23569\n", 19entered tid: 23569
) = 19
[pid 23569] sched_setaffinity(0, 4, [1]) = 0
[pid 23569] write(1, "before stop: tid 23569\n", 23before stop: tid 23569
) = 23
[pid 23569] futex(0x7fb47a132014, FUTEX_WAIT, 1, NULL <unfinished ...>
[pid 23568] <... clock_nanosleep resumed>NULL) = 0
[pid 23568] futex(0x7fb47a132000, FUTEX_WAIT, 23569, NULL

Then from another console, sending a signal with kill:

$ kill -SIGUSR1 23568

And the strace exits with:

[pid 23568] --- SIGUSR1 {si_signo=SIGUSR1, si_code=SI_USER, si_pid=23649, si_uid=1000} ---
[pid 23568] --- SIGSEGV {si_signo=SIGSEGV, si_code=SI_KERNEL, si_addr=NULL} ---
[pid 23569] <... futex resumed>)        = ?
[pid 23569] +++ killed by SIGSEGV +++
+++ killed by SIGSEGV +++

I am compiling the code with

gcc -c -nostdinc -m64 -march=raptorlake -mtune=raptorlake -fno-stack-protector -ffreestanding -fno-builtin -pipe -O0 -g3 -ggdb -I /usr/lib/modules/6.13.1/build/headers/include -I include -Wall -Wextra -o output64/threads.c.o test/threads.c

And finally link with

ld output64/libc.c.o output64/arch/x86/start64.S.o output64/threads.c.o -x -pie -nostdlib --no-dynamic-linker -m elf_x86_64 -o output64/test.threads64

The program exits normally if I remove the stop futex from the child thread. The SIGSEGV happens only when the process receives the signal.

I searched the internet for similar problems and found nothing relevant. I have also tried blocking the signal (with both SIG_BLOCK and SIG_SETMASK):

sigset_t sigset = (sigset_t)(1 << (((SIGUSR1) - 1) % (sizeof(sigset_t) * 8)));
rt_sigprocmask(SIG_BLOCK, (uint64_t) &sigset, (uint64_t) NULL, sizeof(sigset_t));

placed right after clone3, in the clret == 0 branch (i.e. in the child thread), and I got the same result.

Update: @CraigEstey mentioned the sigreturn() function, which might indeed be the problem, but I have no idea how the signal trampoline concept works (as it is described in the manual), so I am going to look into it.

Update: After some research into musl libc I found out that its default restorer is simply __restore_rt (or __restore), an assembly function that does nothing but invoke the rt_sigreturn (or sigreturn) syscall. So I put a restorer function that only invokes the rt_sigreturn syscall into the sa_restorer member of struct sigaction, added the SA_RESTORER flag to sa_flags, and BOOM!!! It worked as expected!

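Why this is needed is still not entirely clear to me. In a few words, the manual says that the restorer is how the kernel restores the process context (signal mask, stack frame, etc.) when the signal handler returns: the handler returns into the restorer, which invokes rt_sigreturn. On x86-64 the kernel provides no default trampoline, so a handler registered without SA_RESTORER cannot be entered, and the kernel sends SIGSEGV instead.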

Thank you @CraigEstey!

Upvotes: 4

Views: 87

Answers (0)

Related Questions