#include "monitor_trace.h" #include #include #include // for syscall_get_nr #include #include #include // for get_task_mm #include #include mm_tree mm_tree_struct; struct diag_variant_buffer load_monitor_variant_buffer; struct diag_variant_buffer stand_alone_buffer; typedef struct { struct rcu_head rcu_head; pid_t pid; struct mm_struct *mm; char cgroup_buf[256]; char argv[256]; } mm_info; struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; int skip; /* input argument: How many entries to skip */ }; struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; static inline int diag_get_task_type(struct task_struct *tsk) { if (orig_get_task_type) return orig_get_task_type(&tsk->se); return 0; } static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { if (orig_kernfs_name && cgrp && cgrp->kn) { return orig_kernfs_name(cgrp->kn, buf, buflen); } else { return 0; } } /** * @brief find mm_info by mm * * @param mm_tree * @param mm * @return mm_info* */ static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) { mm_info *info; if (mm == NULL) return NULL; info = radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm); return info; } static void __diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { int cgroup_id = cpuacct_cgrp_id; memset(buf, 0, count); if (cgroup == 1) { cgroup_id = cpuset_cgrp_id; } if (tsk && tsk->cgroups && tsk->cgroups->subsys && tsk->cgroups->subsys[cgroup_id] && tsk->cgroups->subsys[cgroup_id]->cgroup) { orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count); } } static void diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { __diag_cgroup_name(tsk, buf, count, cgroup); } /** * @brief copy stack frame by fp * * @param fp * @param frame * @return int */ static int copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; ret = 1; pagefault_disable(); if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) ret = 0; pagefault_enable(); return ret; } static int copy_stack_frame_remote(struct task_struct *tsk, const void __user *fp, struct stack_frame_user *frame) { int ret; struct mm_struct *mm; mm = get_task_mm(tsk); if (!mm) { printk(KERN_INFO "copy_stack_frame_remote %d get_task_mm fail\n", tsk->pid); return 0; } ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0); // printk(KERN_INFO "copy_stack_frame_remote %d ret:%d\n", tsk->pid, ret); mmput(mm); return ret; } /** * @brief save stack trace | not current task * * @param tsk * @param trace */ static inline void save_stack_trace_user_remote(struct task_struct *tsk, struct stack_trace *trace) { const struct pt_regs *regs = task_pt_regs(tsk); const void __user *fp = (const void __user *)regs->bp; int count = 0; // if (in_atomic()) { // printk(KERN_INFO "save_stack_trace_user_remote %d: task in_atomic\n", // tsk->pid); // return; // } // if (irqs_disabled()) { // printk(KERN_INFO "save_stack_trace_user_remote %d: task in irqs_disabled\n", // tsk->pid); // return; // } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame_remote(tsk, fp, &frame)) { // printk(KERN_INFO "save_stack_trace_user_remote %d // copy_stack_frame_remote fail\n", // tsk->pid); break; } if ((unsigned long)fp < regs->sp) { // printk(KERN_INFO 
"save_stack_trace_user_remote %d fp < sp count:%d\n", // tsk->pid, // count); break; // 如果fp小于sp,说明已经到了栈底,退出 } // 如果返回地址不为0,说明是一个有效的栈帧,保存返回地址 if (frame.ret_addr) { trace->entries[trace->nr_entries++] = frame.ret_addr; // printk(KERN_INFO "save_stack_trace_user_remote %d ret_addr:%lx\n", // tsk->pid, // frame.ret_addr); } else { // printk(KERN_INFO "save_stack_trace_user_remote %d no ret_addr", // tsk->pid); break; } // 如果fp指向自己,说明已经到了栈底,退出 if (fp == frame.next_fp) { // printk(KERN_INFO "save_stack_trace_user_remote %d fp == next_fp", // tsk->pid); break; } fp = frame.next_fp; // 否则,继续向下遍历 count++; /** * 线上环境发现这里有hardlockup,这里强制退出 */ if (count >= trace->max_entries || count >= 100) break; } } static inline void __save_stack_trace_user(struct stack_trace *trace) { const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; int count = 0; if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame(fp, &frame)) break; if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { trace->entries[trace->nr_entries++] = frame.ret_addr; } if (fp == frame.next_fp) break; fp = frame.next_fp; count++; /** * 线上环境发现这里有hardlockup,这里强制退出 */ if (count >= trace->max_entries || count >= 100) break; } } // static void perfect_save_stack_trace_user(struct stack_trace *trace) { // /* // * Trace user stack if we are not a kernel thread // */ // if (current->mm) { // __save_stack_trace_user(trace); // } // if (trace->nr_entries < trace->max_entries) // trace->entries[trace->nr_entries++] = ULONG_MAX; // } /** * @brief save stack trace | current task * * @param backtrace */ // static void diagnose_save_stack_trace_user(unsigned long *backtrace) { // struct stack_trace trace; // memset(&trace, 0, sizeof(trace)); // memset(backtrace, 0, BACKTRACE_DEPTH * sizeof(unsigned long)); // trace.max_entries = BACKTRACE_DEPTH; // trace.entries = backtrace; // perfect_save_stack_trace_user(&trace); // } /** * @brief save stack trace | not current task * * @param tsk * @param backtrace */ // static void diagnose_save_stack_trace_user_remote(struct task_struct *tsk, // unsigned long *backtrace) { // struct stack_trace trace; // memset(&trace, 0, sizeof(trace)); // memset(backtrace, 0, BACKTRACE_DEPTH * sizeof(unsigned long)); // trace.max_entries = BACKTRACE_DEPTH; // trace.entries = backtrace; // /* // * Trace user stack if we are not a kernel thread // */ // if (tsk->mm) { // // printk(KERN_INFO "save_stack_trace_user_remote %d mm\n", tsk->pid); // save_stack_trace_user_remote(tsk, &trace); // } // if (trace.nr_entries < trace.max_entries) // trace.entries[trace.nr_entries++] = ULONG_MAX; // } static int diagnose_task_raw_stack_remote(struct task_struct *tsk, void *to, void __user *from, unsigned long n) { int ret; struct mm_struct *mm; // if (in_atomic()) { // printk(KERN_INFO "task_raw_stack_remote %d in_atomic\n", tsk->pid); // return 0; // } // if (irqs_disabled()) { // printk(KERN_INFO "task_raw_stack_remote %d irqs_disabled\n", tsk->pid); // return 0; // } if (in_atomic() || irqs_disabled()) { return 0; } mm = get_task_mm(tsk); if (!mm) return 0; ret = orig_access_remote_vm(mm, (unsigned long)from, to, n, 0); mmput(mm); // printk(KERN_INFO "task_raw_stack_remote %d access_remote_vm ret: %d\n", // tsk->pid, ret); return ret < 0 ? 
void diag_task_brief(struct task_struct *tsk, task_detail *detail)
{
        struct pid_namespace *ns;
        struct pt_regs *task_regs;
        struct task_struct *leader;
        struct pt_regs *irq_regs;

        if (!detail)
                return;

        memset(detail, 0, sizeof(task_detail));
        if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie
                return;
        leader = tsk->group_leader;
        if (!leader || leader->exit_state == EXIT_ZOMBIE)
                return;

        if (tsk != current) {
                // not the current task
                detail->user_mode = -1;
                detail->syscallno = -1;
        } else if (!tsk->mm) {
                // current task, but a kernel thread
                detail->user_mode = 0;
                detail->syscallno = -1;
        } else {
                // current task and a user thread
                irq_regs = get_irq_regs(); // registers at interrupt entry
                task_regs = task_pt_regs(tsk);
                if ((irq_regs && user_mode(irq_regs)) ||
                    (task_regs && user_mode(task_regs))) {
                        detail->user_mode = 1; // user mode
                } else {
                        detail->user_mode = 0; // kernel mode
                }
                if (task_regs) {
                        // current syscall number
                        detail->syscallno = syscall_get_nr(tsk, task_regs);
                }
        }

        if (tsk->sched_class == orig_idle_sched_class) // idle task
                detail->sys_task = 2;
        else if (!tsk->mm) // kernel thread
                detail->sys_task = 1;
        else
                detail->sys_task = 0;

        detail->pid = tsk->pid;
        detail->tgid = tsk->tgid;
        detail->state = tsk->__state;
        detail->task_type = diag_get_task_type(tsk);

        // pid/tgid as seen from the task's (container) pid namespace
        ns = task_active_pid_ns(tsk);
        if (ns && ns != &init_pid_ns) {
                detail->container_pid = task_pid_nr_ns(tsk, ns);
                detail->container_tgid = task_tgid_nr_ns(tsk, ns);
        } else {
                detail->container_pid = tsk->pid;
                detail->container_tgid = tsk->tgid;
        }

        strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
        detail->comm[TASK_COMM_LEN - 1] = 0; // comm name
        diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
        diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
        detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0; // cgroup name
        detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name
}

// void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail)
// {
//         struct pt_regs *regs;
//         unsigned long sp, ip, bp;
//         struct task_struct *leader;
//
//         if (!detail) {
//                 return;
//         }
//         detail->stack[0] = 0;
//         if (!tsk || !tsk->mm) {
//                 return;
//         }
//         leader = tsk->group_leader;
//         if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
//                 return;
//         }
//
//         sp = 0;
//         ip = 0;
//         bp = 0;
//         regs = task_pt_regs(tsk);
//         if (regs) {
//                 sp = regs->sp;
//                 ip = regs->ip;
//                 bp = regs->bp;
//         }
//         detail->regs = *regs;
//         detail->sp = sp;
//         detail->ip = ip;
//         detail->bp = bp;
//
//         if (tsk == current) {
//                 // printk(KERN_INFO "diag_task_user_stack %d current\n", tsk->pid);
//                 diagnose_save_stack_trace_user(detail->stack);
//         } else {
//                 // printk(KERN_INFO "diag_task_user_stack %d no current\n", tsk->pid);
//                 diagnose_save_stack_trace_user_remote(tsk, detail->stack);
//         }
// }

/**
 * @brief diag task kernel stack | -> to orig_stack_trace_save_tsk
 *
 * @param tsk
 * @param detail
 * @return unsigned int
 */
unsigned int diag_task_kern_stack(struct task_struct *tsk,
                                  kern_stack_detail *detail)
{
        return orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH,
                                         0);
}
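/*
 * Sketch of the insert side of the mm_tree consumed by find_mm_info()
 * earlier in this file (a hypothetical helper; the real population code
 * lives elsewhere in the module, and error handling and locking around
 * radix_tree_insert() are omitted here). Entries are keyed by the raw
 * mm_struct pointer, matching the radix_tree_lookup() in find_mm_info().
 */
// static void example_put_mm_info(mm_tree *tree, struct task_struct *tsk,
//                                 mm_info *info)
// {
//         info->pid = tsk->pid;
//         info->mm = tsk->mm;
//         // argv and cgroup_buf are filled in lazily by the consumers
//         radix_tree_insert(&tree->mm_tree, (unsigned long)tsk->mm, info);
// }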
/**
 * @brief diag task proc chains
 *
 * @param style
 * @param tsk
 * @param mm_tree
 * @param detail
 */
void dump_proc_chains_argv(int style, struct task_struct *tsk,
                           mm_tree *mm_tree, proc_chains_detail *detail)
{
        struct task_struct *walker;
        mm_info *mm_info;
        int cnt = 0;
        int i = 0;
        struct task_struct *leader;

        for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
                detail->chains[i][0] = 0;
                detail->tgid[i] = 0;
        }
        if (style == 0)
                return;
        if (!tsk || !tsk->mm)
                return;
        leader = tsk->group_leader;
        if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
                // leader is a zombie or has no mm
                return;
        }

        // walk from the task up through its parents, recording the full
        // argv when the mm is known to the mm_tree, else falling back to comm
        rcu_read_lock();
        walker = tsk;
        while (walker->pid > 0) {
                if (!thread_group_leader(walker))
                        walker = rcu_dereference(walker->group_leader);
                mm_info = find_mm_info(mm_tree, walker->mm);
                if (mm_info) {
                        if (mm_info->cgroup_buf[0] == 0)
                                diag_cgroup_name(walker, mm_info->cgroup_buf,
                                                 255, 0);
                        strncpy(detail->chains[cnt], mm_info->argv,
                                PROCESS_ARGV_LEN);
                        detail->full_argv[cnt] = 1;
                } else {
                        strncpy(detail->chains[cnt], walker->comm,
                                TASK_COMM_LEN);
                        detail->full_argv[cnt] = 0;
                }
                detail->tgid[cnt] = walker->pid;
                walker = rcu_dereference(walker->real_parent);
                cnt++;
                if (cnt >= PROCESS_CHAINS_COUNT)
                        break;
        }
        rcu_read_unlock();
}

/**
 * @brief copy task raw stack
 *
 * @param tsk
 * @param detail
 */
void diag_task_raw_stack(struct task_struct *tsk, raw_stack_detail *detail)
{
        struct pt_regs *regs;
        int i;
        int ret;
        unsigned long sp, ip, bp;
        char *stack;

        memset(detail->stack, 0, DIAG_USER_STACK_SIZE);
        detail->stack_size = 0;
        if (!tsk || !tsk->mm)
                return;

        regs = task_pt_regs(tsk);
        if (!regs)
                return;
        sp = regs->sp;
        ip = regs->ip;
        bp = regs->bp;
        detail->regs = *regs;
        detail->sp = sp;
        detail->ip = ip;
        detail->bp = bp;

        // copy the user stack 1 KiB at a time, stopping at the first
        // chunk that cannot be read
        stack = (char *)&detail->stack[0];
        for (i = 0; i < (DIAG_USER_STACK_SIZE / 1024); i++) {
                if (tsk == current) {
                        pagefault_disable();
                        ret = __copy_from_user_inatomic(
                                stack,
                                (void __user *)sp + detail->stack_size,
                                1024);
                        pagefault_enable();
                } else {
                        ret = diagnose_task_raw_stack_remote(
                                tsk, stack,
                                (void __user *)sp + detail->stack_size,
                                1024);
                }
                // printk(KERN_INFO "diag_task_raw_stack %d i:%d ret:%d\n",
                //        tsk->pid, i, ret);
                if (ret)
                        break;
                detail->stack_size += 1024;
                stack += 1024;
        }
}
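/*
 * End-to-end sketch (hypothetical caller; the detail buffers are assumed
 * to be preallocated, e.g. in per-cpu storage): gather everything this
 * file exposes for one task, using the global mm_tree_struct for the
 * argv lookup in the process chain.
 */
// static void example_snapshot(struct task_struct *tsk, task_detail *brief,
//                              kern_stack_detail *kern,
//                              raw_stack_detail *raw,
//                              proc_chains_detail *chains)
// {
//         diag_task_brief(tsk, brief);
//         diag_task_kern_stack(tsk, kern);
//         diag_task_raw_stack(tsk, raw);
//         dump_proc_chains_argv(1, tsk, &mm_tree_struct, chains);
// }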