#include "monitor_kernel_task.h" #include #include #include // for syscall_get_nr #include #include // for get_task_mm #include #include typedef struct { struct rcu_head rcu_head; pid_t pid; struct mm_struct *mm; char cgroup_buf[256]; char argv[256]; } mm_info; struct stack_trace { unsigned int nr_entries, max_entries; unsigned long *entries; int skip; /* input argument: How many entries to skip */ }; struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; static inline int diag_get_task_type(struct task_struct *tsk) { if (orig_get_task_type) return orig_get_task_type(&tsk->se); return 0; } static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { if (orig_kernfs_name && cgrp && cgrp->kn) { return orig_kernfs_name(cgrp->kn, buf, buflen); } else { return 0; } } static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) { mm_info *info; if (mm == NULL) return NULL; info = radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm); return info; } static void __diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { int cgroup_id = cpuacct_cgrp_id; memset(buf, 0, count); if (cgroup == 1) { cgroup_id = cpuset_cgrp_id; } if (tsk && tsk->cgroups && tsk->cgroups->subsys && tsk->cgroups->subsys[cgroup_id] && tsk->cgroups->subsys[cgroup_id]->cgroup) { orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count); } } static void diag_cgroup_name(struct task_struct *tsk, char *buf, unsigned int count, int cgroup) { __diag_cgroup_name(tsk, buf, count, cgroup); } static int copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; ret = 1; pagefault_disable(); if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) ret = 0; pagefault_enable(); return ret; } static int copy_stack_frame_remote(struct task_struct *tsk, const void __user *fp, struct stack_frame_user *frame) { int ret; struct mm_struct *mm; mm = get_task_mm(tsk); if (!mm) return 0; ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0); mmput(mm); return ret; } static inline void save_stack_trace_user_remote(struct task_struct *tsk, struct stack_trace *trace) { const struct pt_regs *regs = task_pt_regs(tsk); const void __user *fp = (const void __user *)regs->bp; int count = 0; if (in_atomic() || irqs_disabled()) { return; } if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame_remote(tsk, fp, &frame)) { break; } if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { trace->entries[trace->nr_entries++] = frame.ret_addr; } else break; if (fp == frame.next_fp) break; fp = frame.next_fp; count++; /** * 线上环境发现这里有hardlockup,这里强制退出 */ if (count >= trace->max_entries || count >= 100) break; } } static inline void __save_stack_trace_user(struct stack_trace *trace) { const struct pt_regs *regs = task_pt_regs(current); const void __user *fp = (const void __user *)regs->bp; int count = 0; if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; if (!copy_stack_frame(fp, &frame)) break; if ((unsigned long)fp < regs->sp) break; if (frame.ret_addr) { trace->entries[trace->nr_entries++] = frame.ret_addr; } if (fp == frame.next_fp) break; fp = 
static inline void __save_stack_trace_user(struct stack_trace *trace)
{
	const struct pt_regs *regs = task_pt_regs(current);
	const void __user *fp = (const void __user *)regs->bp;
	int count = 0;

	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = regs->ip;

	while (trace->nr_entries < trace->max_entries) {
		struct stack_frame_user frame;

		frame.next_fp = NULL;
		frame.ret_addr = 0;
		if (!copy_stack_frame(fp, &frame))
			break;
		if ((unsigned long)fp < regs->sp)
			break;
		if (frame.ret_addr)
			trace->entries[trace->nr_entries++] = frame.ret_addr;
		if (fp == frame.next_fp)
			break;
		fp = frame.next_fp;

		count++;
		/*
		 * A hard lockup was observed here in production,
		 * so force the walk to terminate.
		 */
		if (count >= trace->max_entries || count >= 100)
			break;
	}
}

static void perfect_save_stack_trace_user(struct stack_trace *trace)
{
	/*
	 * Trace user stack if we are not a kernel thread
	 */
	if (current->mm)
		__save_stack_trace_user(trace);

	if (trace->nr_entries < trace->max_entries)
		trace->entries[trace->nr_entries++] = ULONG_MAX;
}

static void diagnose_save_stack_trace_user(unsigned long *backtrace)
{
	struct stack_trace trace;

	memset(&trace, 0, sizeof(trace));
	memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
	trace.max_entries = BACKTRACE_DEPTH2;
	trace.entries = backtrace;
	perfect_save_stack_trace_user(&trace);
}

static void diagnose_save_stack_trace_user_remote(struct task_struct *tsk, unsigned long *backtrace)
{
	struct stack_trace trace;

	memset(&trace, 0, sizeof(trace));
	memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
	trace.max_entries = BACKTRACE_DEPTH2;
	trace.entries = backtrace;

	/*
	 * Trace user stack if we are not a kernel thread
	 */
	if (tsk->mm)
		save_stack_trace_user_remote(tsk, &trace);

	if (trace.nr_entries < trace.max_entries)
		trace.entries[trace.nr_entries++] = ULONG_MAX;
}

/* Fill in a summary of @tsk: ids, state, task type, syscall number and cgroup names. */
void diag_task_brief(struct task_struct *tsk, task_detail *detail)
{
	struct pid_namespace *ns;
	struct pt_regs *task_regs;
	struct task_struct *leader;
	struct pt_regs *irq_regs;

	if (!detail)
		return;

	memset(detail, 0, sizeof(task_detail));

	if (!tsk || tsk->exit_state == EXIT_ZOMBIE)	/* zombie */
		return;

	leader = tsk->group_leader;
	if (!leader || leader->exit_state == EXIT_ZOMBIE)
		return;

	if (tsk != current) {
		/* not the current task */
		detail->user_mode = -1;
		detail->syscallno = -1;
	} else if (!tsk->mm) {
		/* current task, but a kernel thread */
		detail->user_mode = 0;
		detail->syscallno = -1;
	} else {
		/* current task and a user thread */
		irq_regs = get_irq_regs();	/* registers saved at irq entry, if any */
		task_regs = task_pt_regs(tsk);
		if ((irq_regs && user_mode(irq_regs))
				|| (task_regs && user_mode(task_regs)))
			detail->user_mode = 1;	/* user mode */
		else
			detail->user_mode = 0;	/* kernel mode */

		if (task_regs)
			detail->syscallno = syscall_get_nr(tsk, task_regs);
	}

	if (tsk->sched_class == orig_idle_sched_class)	/* idle task */
		detail->sys_task = 2;
	else if (!tsk->mm)	/* kernel thread */
		detail->sys_task = 1;
	else
		detail->sys_task = 0;

	detail->pid = tsk->pid;
	detail->tgid = tsk->tgid;
	detail->state = tsk->__state;
	detail->task_type = diag_get_task_type(tsk);

	/* pid/tgid as seen from the task's own pid namespace (container) */
	ns = task_active_pid_ns(tsk);
	if (ns && ns != &init_pid_ns) {
		detail->container_pid = task_pid_nr_ns(tsk, ns);
		detail->container_tgid = task_tgid_nr_ns(tsk, ns);
	} else {
		detail->container_pid = tsk->pid;
		detail->container_tgid = tsk->tgid;
	}

	strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
	detail->comm[TASK_COMM_LEN - 1] = 0;

	diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
	diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
	detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0;
	detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0;
}
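/*
 * Illustrative sketch, not part of the original module: a minimal caller of
 * diag_task_brief() for the current task. The function name and the printk
 * format are assumptions made for this example; task_detail comes from
 * monitor_kernel_task.h and its fields are assumed int-compatible here.
 */
static void __maybe_unused example_brief_current(void)
{
	/* task_detail is fairly large, so keep it off the kernel stack (not thread-safe) */
	static task_detail detail;

	diag_task_brief(current, &detail);
	printk(KERN_DEBUG "task %d (%s): user_mode=%d syscallno=%d cgroup=%s\n",
			(int)detail.pid, detail.comm, (int)detail.user_mode,
			(int)detail.syscallno, detail.cgroup_buf);
}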
/* Capture the user-mode registers and user-space backtrace of @tsk. */
void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail)
{
	struct pt_regs *regs;
	unsigned long sp, ip, bp;
	struct task_struct *leader;

	if (!detail)
		return;

	detail->stack[0] = 0;
	if (!tsk || !tsk->mm)
		return;

	leader = tsk->group_leader;
	if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE)
		return;

	sp = 0;
	ip = 0;
	bp = 0;
	regs = task_pt_regs(tsk);
	if (regs) {
		sp = regs->sp;
#if defined(DIAG_ARM64)
		ip = regs->pc;
		bp = regs->sp;
#else
		ip = regs->ip;
		bp = regs->bp;
#endif
	}

#if defined(DIAG_ARM64)
	detail->regs = regs->user_regs;
#else
	detail->regs = *regs;
#endif
	detail->sp = sp;
	detail->ip = ip;
	detail->bp = bp;

	if (tsk == current)
		diagnose_save_stack_trace_user(detail->stack);
	else
		diagnose_save_stack_trace_user_remote(tsk, detail->stack);
}

void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail)
{
	orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0);
}

void dump_proc_chains_argv(int style, struct task_struct *tsk,
		mm_tree *mm_tree, proc_chains_detail *detail)
{
	struct task_struct *walker;
	mm_info *mm_info;
	int cnt = 0;
	int i = 0;
	struct task_struct *leader;

	for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
		detail->chains[i][0] = 0;
		detail->tgid[i] = 0;
	}

	if (style == 0)
		return;
	if (!tsk || !tsk->mm)
		return;

	leader = tsk->group_leader;
	if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
		/* leader is a zombie or has no mm */
		return;
	}

	/*
	 * Walk up the process tree from @tsk, recording the cached argv (if
	 * present in @mm_tree) or the comm of each ancestor's group leader.
	 */
	rcu_read_lock();
	walker = tsk;
	while (walker->pid > 0) {
		if (!thread_group_leader(walker))
			walker = rcu_dereference(walker->group_leader);

		mm_info = find_mm_info(mm_tree, walker->mm);
		if (mm_info) {
			if (mm_info->cgroup_buf[0] == 0)
				diag_cgroup_name(walker, mm_info->cgroup_buf, 255, 0);
			strncpy(detail->chains[cnt], mm_info->argv, PROCESS_ARGV_LEN);
			detail->full_argv[cnt] = 1;
		} else {
			strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN);
			detail->full_argv[cnt] = 0;
		}
		/* walker is a group leader here, so its pid equals its tgid */
		detail->tgid[cnt] = walker->pid;

		walker = rcu_dereference(walker->real_parent);
		cnt++;
		if (cnt >= PROCESS_CHAINS_COUNT)
			break;
	}
	rcu_read_unlock();
}
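/*
 * Illustrative sketch, not part of the original module: capturing both the
 * user-space and kernel backtrace of a task with the helpers above. The
 * function name is an assumption; user_stack_detail / kern_stack_detail are
 * the types taken by diag_task_user_stack() / diag_task_kern_stack(), and
 * both stack arrays are assumed to hold at least BACKTRACE_DEPTH2 entries,
 * as the fill helpers require.
 */
static void __maybe_unused example_dump_task_stacks(struct task_struct *tsk)
{
	/* static to keep the large buffers off the kernel stack (not thread-safe) */
	static user_stack_detail user_stack;
	static kern_stack_detail kern_stack;
	int i;

	memset(&user_stack, 0, sizeof(user_stack));
	memset(&kern_stack, 0, sizeof(kern_stack));

	diag_task_user_stack(tsk, &user_stack);
	diag_task_kern_stack(tsk, &kern_stack);

	for (i = 0; i < BACKTRACE_DEPTH2 && kern_stack.stack[i]; i++)
		printk(KERN_DEBUG "  kern #%d: %pS\n", i, (void *)kern_stack.stack[i]);
	for (i = 0; i < BACKTRACE_DEPTH2 && user_stack.stack[i]
			&& user_stack.stack[i] != ULONG_MAX; i++)
		printk(KERN_DEBUG "  user #%d: 0x%lx\n", i, user_stack.stack[i]);
}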