#include "monitor_trace.h" #include #include #include // for syscall_get_nr #include #include #include // for get_task_mm #include #include mm_tree mm_tree_struct; struct diag_variant_buffer load_monitor_variant_buffer; typedef struct { struct rcu_head rcu_head; pid_t pid; struct mm_struct *mm; char cgroup_buf[256]; char argv[256]; } mm_info; struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; int skip; /* input argument: How many entries to skip */ }; struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; static inline int diag_get_task_type(struct task_struct *tsk) { if (orig_get_task_type) return orig_get_task_type(&tsk->se); return 0; } static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { if (orig_kernfs_name && cgrp && cgrp->kn) { return orig_kernfs_name(cgrp->kn, buf, buflen); } else { return 0; } } static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) { mm_info *info; if (mm == NULL) return NULL; info = radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm); return info; } static void __diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { int cgroup_id = cpuacct_cgrp_id; memset(buf, 0, count); if (cgroup == 1) { cgroup_id = cpuset_cgrp_id; } if (tsk && tsk->cgroups && tsk->cgroups->subsys && tsk->cgroups->subsys[cgroup_id] && tsk->cgroups->subsys[cgroup_id]->cgroup) { orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count); } } static void diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { __diag_cgroup_name(tsk, buf, count, cgroup); } static int copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; ret = 1; pagefault_disable(); if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) ret = 0; pagefault_enable(); return ret; } static int copy_stack_frame_remote(struct task_struct *tsk, const void __user *fp, struct stack_frame_user *frame) { int ret; struct mm_struct *mm; mm = get_task_mm(tsk); if (!mm) { printk("copy_stack_frame_remote %d get_task_mm fail\n", tsk->pid); return 0; } ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0); printk("copy_stack_frame_remote %d ret:%d\n", tsk->pid, ret); mmput(mm); return ret; } static inline void save_stack_trace_user_remote(struct task_struct *tsk, struct stack_trace *trace) { const struct pt_regs *regs = task_pt_regs(tsk); const void __user *fp = (const void __user *)regs->bp; int count = 0; if (in_atomic()) { printk("save_stack_trace_user_remote %d in_atomic\n", tsk->pid); return; } if (irqs_disabled()) { printk("save_stack_trace_user_remote %d irqs_disabled\n", tsk->pid); return; } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame_remote(tsk, fp, &frame)) { printk("save_stack_trace_user_remote %d copy_stack_frame_remote fail\n", tsk->pid); break; } if ((unsigned long)fp < regs->sp) { printk("save_stack_trace_user_remote %d fp < sp count:%d\n", tsk->pid, count); break; // 如果fp小于sp,说明已经到了栈底,退出 } // 如果返回地址不为0,说明是一个有效的栈帧,保存返回地址 if (frame.ret_addr) { trace->entries[trace->nr_entries++] = frame.ret_addr; printk("save_stack_trace_user_remote %d ret_addr:%lx\n", tsk->pid, frame.ret_addr); } else { printk("save_stack_trace_user_remote %d no ret_addr", tsk->pid); break; // continue; } // 如果fp指向自己,说明已经到了栈底,退出 
static inline void __save_stack_trace_user(struct stack_trace *trace)
{
	const struct pt_regs *regs = task_pt_regs(current);
	const void __user *fp = (const void __user *)regs->bp;
	int count = 0;

	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = regs->ip;

	while (trace->nr_entries < trace->max_entries) {
		struct stack_frame_user frame;

		frame.next_fp = NULL;
		frame.ret_addr = 0;
		if (!copy_stack_frame(fp, &frame))
			break;
		if ((unsigned long)fp < regs->sp)
			break;
		if (frame.ret_addr)
			trace->entries[trace->nr_entries++] = frame.ret_addr;
		if (fp == frame.next_fp)
			break;
		fp = frame.next_fp;
		count++;

		/*
		 * A hard lockup was observed here in production, so force
		 * an exit after a bounded number of frames.
		 */
		if (count >= trace->max_entries || count >= 100)
			break;
	}
}

static void perfect_save_stack_trace_user(struct stack_trace *trace)
{
	/*
	 * Trace the user stack if we are not a kernel thread.
	 */
	if (current->mm)
		__save_stack_trace_user(trace);

	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = ULONG_MAX;
}

static void diagnose_save_stack_trace_user(unsigned long *backtrace)
{
	struct stack_trace trace;

	memset(&trace, 0, sizeof(trace));
	memset(backtrace, 0, BACKTRACE_DEPTH * sizeof(unsigned long));
	trace.max_entries = BACKTRACE_DEPTH;
	trace.entries = backtrace;

	perfect_save_stack_trace_user(&trace);
}

static void diagnose_save_stack_trace_user_remote(struct task_struct *tsk,
						  unsigned long *backtrace)
{
	struct stack_trace trace;
	int i;

	memset(&trace, 0, sizeof(trace));
	memset(backtrace, 0, BACKTRACE_DEPTH * sizeof(unsigned long));
	trace.max_entries = BACKTRACE_DEPTH;
	trace.entries = backtrace;

	/*
	 * Trace the user stack if we are not a kernel thread.
	 */
	if (tsk->mm) {
		printk("save_stack_trace_user_remote %d mm\n", tsk->pid);
		save_stack_trace_user_remote(tsk, &trace);
	}

	if (trace.nr_entries < trace.max_entries)
		trace.entries[trace.nr_entries++] = ULONG_MAX;

	printk("save_stack_trace_user_remote %d, stack: [", tsk->pid);
	for (i = 0; i < BACKTRACE_DEPTH; i++)
		printk("%lx, ", backtrace[i]);
	printk("]\n");
}

/* Copy @n raw bytes of another task's user stack; <0 on error, else 0. */
static int diagnose_task_raw_stack_remote(struct task_struct *tsk, void *to,
					  void __user *from, unsigned long n)
{
	int ret;
	struct mm_struct *mm;

	if (in_atomic()) {
		printk("task_raw_stack_remote %d in_atomic\n", tsk->pid);
		return 0;
	}
	if (irqs_disabled()) {
		printk("task_raw_stack_remote %d irqs_disabled\n", tsk->pid);
		return 0;
	}

	mm = get_task_mm(tsk);
	if (!mm)
		return 0;

	ret = orig_access_remote_vm(mm, (unsigned long)from, to, n, 0);
	mmput(mm);
	printk("task_raw_stack_remote %d access_remote_vm ret: %d\n",
	       tsk->pid, ret);

	return ret < 0 ? ret : 0;
}
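/*
 * Hedged usage sketch: how a caller might consume the ULONG_MAX-terminated
 * trace that diagnose_save_stack_trace_user() fills in. "dump_user_trace"
 * is a hypothetical helper, not part of this module.
 */
// static void dump_user_trace(void)
// {
//	unsigned long backtrace[BACKTRACE_DEPTH];
//	int i;
//
//	diagnose_save_stack_trace_user(backtrace);
//	for (i = 0; i < BACKTRACE_DEPTH; i++) {
//		/* ULONG_MAX marks the end of the trace; 0 is an empty slot */
//		if (backtrace[i] == ULONG_MAX || backtrace[i] == 0)
//			break;
//		printk("  #%d 0x%lx\n", i, backtrace[i]);
//	}
// }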
void diag_task_brief(struct task_struct *tsk, task_detail *detail)
{
	struct pid_namespace *ns;
	struct pt_regs *task_regs;
	struct task_struct *leader;
	struct pt_regs *irq_regs;

	if (!detail)
		return;

	memset(detail, 0, sizeof(task_detail));

	/* skip zombies */
	if (!tsk || tsk->exit_state == EXIT_ZOMBIE)
		return;
	leader = tsk->group_leader;
	if (!leader || leader->exit_state == EXIT_ZOMBIE)
		return;

	if (tsk != current) {
		/* not the current task */
		detail->user_mode = -1;
		detail->syscallno = -1;
	} else if (!tsk->mm) {
		/* current task, but a kernel thread */
		detail->user_mode = 0;
		detail->syscallno = -1;
	} else {
		/* current task and a user thread */
		irq_regs = get_irq_regs();
		task_regs = task_pt_regs(tsk);
		if ((irq_regs && user_mode(irq_regs)) ||
		    (task_regs && user_mode(task_regs)))
			detail->user_mode = 1;	/* user mode */
		else
			detail->user_mode = 0;	/* kernel mode */
		if (task_regs)
			detail->syscallno = syscall_get_nr(tsk, task_regs);
	}

	if (tsk->sched_class == orig_idle_sched_class)
		detail->sys_task = 2;	/* idle task */
	else if (!tsk->mm)
		detail->sys_task = 1;	/* kernel thread */
	else
		detail->sys_task = 0;

	detail->pid = tsk->pid;
	detail->tgid = tsk->tgid;
	detail->state = tsk->__state;
	detail->task_type = diag_get_task_type(tsk);

	/* pid/tgid as seen from the task's pid namespace (container) */
	ns = task_active_pid_ns(tsk);
	if (ns && ns != &init_pid_ns) {
		detail->container_pid = task_pid_nr_ns(tsk, ns);
		detail->container_tgid = task_tgid_nr_ns(tsk, ns);
	} else {
		detail->container_pid = tsk->pid;
		detail->container_tgid = tsk->tgid;
	}

	strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
	detail->comm[TASK_COMM_LEN - 1] = 0;

	diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
	diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
	detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0;
	detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0;
}

void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail)
{
	struct pt_regs *regs;
	struct task_struct *leader;

	if (!detail)
		return;

	detail->stack[0] = 0;
	if (!tsk || !tsk->mm)
		return;
	leader = tsk->group_leader;
	if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE)
		return;

	/* the original dereferenced regs unconditionally; guard it instead */
	regs = task_pt_regs(tsk);
	if (!regs)
		return;

	detail->regs = *regs;
	detail->sp = regs->sp;
	detail->ip = regs->ip;
	detail->bp = regs->bp;

	if (tsk == current) {
		printk("diag_task_user_stack %d current\n", tsk->pid);
		diagnose_save_stack_trace_user(detail->stack);
	} else {
		printk("diag_task_user_stack %d not current\n", tsk->pid);
		diagnose_save_stack_trace_user_remote(tsk, detail->stack);
	}
}

unsigned int diag_task_kern_stack(struct task_struct *tsk,
				  kern_stack_detail *detail)
{
	return orig_stack_trace_save_tsk(tsk, detail->stack,
					 BACKTRACE_DEPTH, 0);
}

void dump_proc_chains_argv(int style, struct task_struct *tsk,
			   mm_tree *mm_tree, proc_chains_detail *detail)
{
	struct task_struct *walker;
	mm_info *mm_info;
	int cnt = 0;
	int i = 0;
	struct task_struct *leader;

	for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
		detail->chains[i][0] = 0;
		detail->tgid[i] = 0;
	}

	if (style == 0)
		return;
	if (!tsk || !tsk->mm)
		return;
	leader = tsk->group_leader;
	/* bail out if the leader is a zombie or has no mm */
	if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE)
		return;

	/* walk up the parent chain, recording argv (or comm) per process */
	rcu_read_lock();
	walker = tsk;
	while (walker->pid > 0) {
		if (!thread_group_leader(walker))
			walker = rcu_dereference(walker->group_leader);

		mm_info = find_mm_info(mm_tree, walker->mm);
		if (mm_info) {
			if (mm_info->cgroup_buf[0] == 0)
				diag_cgroup_name(walker, mm_info->cgroup_buf,
						 255, 0);
			strncpy(detail->chains[cnt], mm_info->argv,
				PROCESS_ARGV_LEN);
			detail->full_argv[cnt] = 1;
		} else {
			strncpy(detail->chains[cnt], walker->comm,
				TASK_COMM_LEN);
			detail->full_argv[cnt] = 0;
		}
		detail->tgid[cnt] = walker->pid;
		// if ((detail->tgid[cnt] != 0) | (detail->full_argv[cnt] != 0)) {
		//	printk("pid: %d, full_argv: %d, chains: %s, cnt:%d\n",
		//	       detail->tgid[cnt], detail->full_argv[cnt],
		//	       detail->chains[cnt], cnt);
		// }
		walker = rcu_dereference(walker->real_parent);
		cnt++;
		if (cnt >= PROCESS_CHAINS_COUNT)
			break;
	}
	rcu_read_unlock();
}
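/*
 * Hedged sketch of the producer side that find_mm_info() and the argv
 * lookup above assume: mm_info entries keyed by the mm_struct pointer are
 * inserted into the radix tree and freed through their rcu_head.
 * "record_mm_info" is a hypothetical name; real callers would also need
 * the tree's lock and a kfree_rcu()-based removal path.
 */
// static int record_mm_info(mm_tree *tree, struct task_struct *tsk)
// {
//	mm_info *info;
//
//	if (!tsk->mm)
//		return -EINVAL;
//
//	info = kzalloc(sizeof(*info), GFP_ATOMIC);
//	if (!info)
//		return -ENOMEM;
//
//	info->pid = tsk->pid;
//	info->mm = tsk->mm;
//	if (radix_tree_insert(&tree->mm_tree, (unsigned long)tsk->mm, info)) {
//		kfree(info);
//		return -EEXIST;
//	}
//
//	return 0;
// }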
/**
 * @brief copy the task's raw user stack, 1 KiB at a time
 *
 * @param tsk     task whose stack is copied
 * @param detail  output buffer and metadata
 */
void diag_task_raw_stack(struct task_struct *tsk, raw_stack_detail *detail)
{
	struct pt_regs *regs;
	int i;
	int ret;
	unsigned long sp;
	char *stack;

	memset(detail->stack, 0, DIAG_USER_STACK_SIZE);
	detail->stack_size = 0;

	if (!tsk || !tsk->mm)
		return;

	regs = task_pt_regs(tsk);
	if (!regs)
		return;

	sp = regs->sp;
	detail->regs = *regs;
	detail->sp = sp;
	detail->ip = regs->ip;
	detail->bp = regs->bp;

	stack = (char *)&detail->stack[0];
	for (i = 0; i < (DIAG_USER_STACK_SIZE / 1024); i++) {
		if (tsk == current) {
			pagefault_disable();
			ret = __copy_from_user_inatomic(stack,
				(void __user *)sp + detail->stack_size, 1024);
			pagefault_enable();
		} else {
			ret = diagnose_task_raw_stack_remote(tsk, stack,
				(void __user *)sp + detail->stack_size, 1024);
		}
		printk("diag_task_raw_stack %d i:%d ret:%d\n",
		       tsk->pid, i, ret);
		if (ret)
			break;
		detail->stack_size += 1024;
		stack += 1024;
	}
}
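/*
 * Hedged usage sketch: dumping the raw stack captured above with the
 * standard print_hex_dump() helper. "dump_raw_stack" and the static
 * detail buffer are hypothetical (raw_stack_detail is too large to put
 * on a kernel stack).
 */
// static raw_stack_detail raw_detail;
//
// static void dump_raw_stack(struct task_struct *tsk)
// {
//	diag_task_raw_stack(tsk, &raw_detail);
//	print_hex_dump(KERN_DEBUG, "raw-stack: ", DUMP_PREFIX_OFFSET,
//		       16, sizeof(unsigned long), raw_detail.stack,
//		       raw_detail.stack_size, false);
// }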
/// @brief print the stack of every runnable task
// static void print_task_stack(void)
// {
//	struct task_struct *g, *p;	/* g: task group; p: task */
//	unsigned long backtrace[BACKTRACE_DEPTH];	/* saved stack */
//	unsigned int nr_bt;		/* stack depth */
//	unsigned long long current_time;	/* current timestamp */
//
//	current_time = ktime_get_real();
//	printk("Timestamp (ns): %lld\n", current_time);
//	printk("Recent Load: %lu.%02lu, %lu.%02lu, %lu.%02lu\n",
//	       LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]),
//	       LOAD_INT(avenrun[1]), LOAD_FRAC(avenrun[1]),
//	       LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
//
//	rcu_read_lock();
//	// printk("Running task\n");
//	do_each_thread(g, p) {
//		if (p->__state == TASK_RUNNING ||
//		    __task_contributes_to_load(p) ||
//		    p->__state == TASK_IDLE) {
//			printk("task: %s, pid %d, state %d\n",
//			       p->comm, p->pid, p->__state);	//! todo
//			nr_bt = orig_stack_trace_save_tsk(p, backtrace,
//							  BACKTRACE_DEPTH, 0);
//			stack_trace_print(backtrace, nr_bt, 0);
//		}
//	} while_each_thread(g, p);
//	rcu_read_unlock();
// }

// void diag_printf_kern_stack(kern_stack_detail *kern_stack, int reverse)
// {
//	int i;
//	symbol sym;
//
//	printf(" kernel stack:\n");
//	if (reverse) {
//		for (i = BACKTRACE_DEPTH - 1; i >= 0; i--) {
//			if (kern_stack->stack[i] == (size_t)-1 ||
//			    kern_stack->stack[i] == 0)
//				continue;
//			sym.reset(kern_stack->stack[i]);
//			if (g_symbol_parser.find_kernel_symbol(sym)) {
//				printf("#@ 0x%lx %s ([kernel.kallsyms])\n",
//				       kern_stack->stack[i], sym.name.c_str());
//			} else {
//				printf("#@ 0x%lx %s\n",
//				       kern_stack->stack[i], "UNKNOWN");
//			}
//		}
//	} else {
//		for (i = 0; i < BACKTRACE_DEPTH; i++) {
//			if (kern_stack->stack[i] == (size_t)-1 ||
//			    kern_stack->stack[i] == 0)
//				break;
//			sym.reset(kern_stack->stack[i]);
//			if (g_symbol_parser.find_kernel_symbol(sym)) {
//				printf("#@ 0x%lx %s ([kernel.kallsyms])\n",
//				       kern_stack->stack[i], sym.name.c_str());
//			} else {
//				printf("#@ 0x%lx %s\n",
//				       kern_stack->stack[i], "UNKNOWN");
//			}
//		}
//	}
// }

// void diag_printf_kern_stack(struct diag_kern_stack_detail *kern_stack)
// {
//	diag_printf_kern_stack(kern_stack, 0);
// }