Reputation: 4517
I wrote a basic kprobe Linux kernel module that registers a handler on fork (`_do_fork`); in the handler I print the value of the `orig_ax` register.
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>
/* Declare the module's license to the kernel. */
MODULE_LICENSE("GPL");
/* The single probe this module installs; its fields are filled in by hello_init(). */
static struct kprobe kp;
/* Name of the kernel symbol to probe; defaults to "_do_fork". */
static char *name = "_do_fork";
/*
 * Expose "name" as a string module parameter so it can be overridden when
 * the module is loaded. NOTE(review): perm 0 presumably means the parameter
 * gets no sysfs entry - confirm against the module_param documentation.
 */
module_param(name, charp, 0);
/*
 * Pre-handler for the probe: logs the orig_ax and ax fields of the register
 * snapshot that kprobes passes in.
 *
 * p:    the kprobe that fired (unused).
 * regs: register snapshot supplied by kprobes for this probe hit.
 *
 * Always returns 0.
 */
static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	unsigned long saved_orig_ax = regs->orig_ax;
	unsigned long saved_ax = regs->ax;

	printk("orig_ax regs:%lu \t ax:%lu\n", saved_orig_ax, saved_ax);

	return 0;
}
/*
 * Post-handler for the probe. Intentionally a no-op: all reporting is done
 * in the pre-handler, and this function exists only so that kp.post_handler
 * can be given a valid callback.
 */
static void post_handler(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
{
	/* nothing to do */
}
/*
 * Module entry point: fill in the kprobe and register it on the symbol
 * named by the "name" module parameter.
 *
 * Returns 0 on success. If register_kprobe() fails (for example because the
 * symbol does not exist or may not be probed), its negative errno is
 * returned so the module load is aborted; otherwise the module would load
 * "successfully" without a probe and hello_exit() would later unregister a
 * kprobe that was never installed.
 */
static int __init hello_init(void)
{
	int ret;

	/* set the handler functions */
	kp.pre_handler = pre_handler;
	kp.post_handler = post_handler;
	kp.symbol_name = name;

	ret = register_kprobe(&kp);
	if (ret < 0) {
		printk(KERN_ERR "register_kprobe(%s) failed: %d\n", name, ret);
		return ret;
	}

	return 0;
}
/* Module exit point: remove the probe stored in kp. */
static void __exit hello_exit(void)
{
	struct kprobe *probe = &kp;

	unregister_kprobe(probe);
}
/* Register hello_init() and hello_exit() as the module's load and unload hooks. */
module_init(hello_init);
module_exit(hello_exit);
I expected 57 (the fork syscall number on x86_64), but I get a different value:
[ 9251.954392] orig_ax regs:0 ax:18446661681273651032
Am I making a mistake?
Upvotes: 1
Views: 346
Reputation: 937
What happened:
A kprobe is implemented by replacing the original instruction with an `int 3` instruction, which causes the CPU to raise a software interrupt (the breakpoint exception). When that happens, the CPU context is saved on the kernel stack and then your handler is executed. So the `regs` you receive is the context of the `int 3`, not the context of the `syscall` instruction that glibc uses to trigger a kernel system call. The `orig_ax` value you are getting is the error-code slot that is filled in when a CPU interrupt/exception occurs. Its value is zero because the `int 3` exception doesn't generate an error code, so the kernel pushes zero onto the stack as a placeholder, which makes the whole implementation more generic.
What should you do:
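If you want to fetch the syscall number, you should plant a probe on `do_syscall_64`, which is the first C function executed when a system call is invoked. Note that inside the pre-handler `regs` is still the `int 3` context, so with the signature shown below the syscall number is the function's first argument (`regs->di` on x86_64), not `regs->orig_ax`. Or you can probe `entry_SYSCALL_64`, which is the interrupt handler for the `syscall`/`int 0x80` assembly instruction.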
DETAILS:
The syscall mechanism is implemented using a CPU trap gate. When you call `fork()` in C, glibc executes the `syscall` assembly instruction with the syscall number stored in `rax`, as you already know. The CPU then generates a software interrupt and starts executing the syscall's interrupt handler, whose address is stored in the IDT.
The following code is the syscall's interrupt handler on x86_64.
ENTRY(entry_SYSCALL_64)
UNWIND_HINT_EMPTY
/*
* Interrupts are off on entry.
* We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
* it is too small to ever cause noticeable irq latency.
*/
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
TRACE_IRQS_OFF
/* IRQs are off. */
movq %rax, %rdi
movq %rsp, %rsi
call do_syscall_64 /* returns with IRQs disabled */
The `pushq %rax` instruction saves `rax` (i.e. the syscall number) on the kernel stack as `pt_regs->orig_ax`, and the code then calls `do_syscall_64`.
/*
 * C-level dispatcher for 64-bit system calls.
 *
 * nr:   syscall number; entry_SYSCALL_64 passes the user's rax here
 *       (movq %rax, %rdi).
 * regs: the struct pt_regs that entry_SYSCALL_64 built on the kernel stack
 *       (movq %rsp, %rsi).
 *
 * The selected handler's return value is stored in regs->ax. If nr matches
 * no table, no handler is called and regs->ax is left untouched.
 */
__visible void do_syscall_64(unsigned long nr, struct pt_regs *regs)
{
struct thread_info *ti;
enter_from_user_mode();
local_irq_enable();
ti = current_thread_info();
/*
 * Syscall-entry work is flagged for this thread: dispatch on whatever
 * number syscall_trace_enter() returns instead of the original nr.
 */
if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY)
nr = syscall_trace_enter(regs);
/* Native table: bound the index, then call the handler with regs. */
if (likely(nr < NR_syscalls)) {
nr = array_index_nospec(nr, NR_syscalls);
regs->ax = sys_call_table[nr](regs);
#ifdef CONFIG_X86_X32_ABI
/*
 * x32 path: nr carries __X32_SYSCALL_BIT; strip the bit, bound the
 * remaining index, and dispatch through the x32 table.
 */
} else if (likely((nr & __X32_SYSCALL_BIT) &&
(nr & ~__X32_SYSCALL_BIT) < X32_NR_syscalls)) {
nr = array_index_nospec(nr & ~__X32_SYSCALL_BIT,
X32_NR_syscalls);
regs->ax = x32_sys_call_table[nr](regs);
#endif
}
/* NOTE(review): presumably performs the exit-to-user-mode work - confirm. */
syscall_return_slowpath(regs);
}
The most important statement is `regs->ax = sys_call_table[nr](regs);`, which invokes the fork-related functions. By the time `_do_fork` is called, the syscall's `regs` are no longer what your handler receives, so you can't get any information related to the system call from the handler's `regs` there.
Upvotes: 2