commit 8950e1280197f2d571b449be6d1b15b96656d5b6 Author: zy Date: Thu Nov 16 13:17:49 2023 +0800 Initial commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ce62a8e --- /dev/null +++ b/.gitignore @@ -0,0 +1,63 @@ +# Prerequisites +*.d + +# Object files +*.o +*.ko +*.obj +*.elf + +# Linker output +*.ilk +*.map +*.exp + +# Precompiled Headers +*.gch +*.pch + +# Libraries +*.lib +*.a +*.la +*.lo + +# Shared objects (inc. Windows DLLs) +*.dll +*.so +*.so.* +*.dylib + +# Executables +*.exe +*.out +*.app +*.i*86 +*.x86_64 +*.hex + +# Debug files +*.dSYM/ +*.su +*.idb +*.pdb + +# Kernel Module Compile Results +*.mod* +*.cmd +.tmp_versions/ +modules.order +Module.symvers +Mkfile.old +dkms.conf +watch +linux-5.17.15/** +linux-5.17.15.tar.xz +helloworld +hptest +linux-5.17.15/.clang-format +linux-5.17.15/.gitignore +linux-5.17.15/.mailmap +linux-5.17.15/.cocciconfig +linux-5.17.15/.get_maintainer.ignore +linux-5.17.15/.cocciconfig diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..d294f8b --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,28 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "../linux-5.17.15/include/**", + "../linux-5.17.15/arch/x86/include/**", + "../linux-5.17.15/arch/x86/include/generated/**" + ], + "forcedInclude": [ + "../linux-5.17.15/include/generated/autoconf.h" + ], + "defines": [ + "__GNUC__", + "__KERNEL__", + "__linux__", + "__x86_64__", + "_GNU_SOURCE" + ], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c89", + "compilerArgs": [], + "intelliSenseMode": "linux-gcc-x64", + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..54cbfab --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,46 @@ +{ + // 使用 IntelliSense 了解相关属性。 + // 悬停以查看现有属性的描述。 + // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 + 
"version": "0.2.0", + "configurations": [ + { + "name": "(gdb) linux kernel", + "type": "cppdbg", + "request": "launch", + //"preLaunchTask": "centos7", + // socat pty,link=./rootkit.pty,raw,echo=0 EXEC:"/mnt/c/ProgramData/chocolatey/lib/npiperelay/tools/npiperelay.exe -ep -s //./pipe/rootkit",nofork + "program": "${workspaceFolder}/linux-5.17.15/vmlinux", + //"miDebuggerServerAddress": "localhost:1234", + //"debugServerPath": "${workspaceFolder}/rootkit.pty", + "miDebuggerPath": "/usr/bin/gdb", + "miDebuggerArgs": "-ex 'set serial baud 115200 target remote ./rootkit.pty'", + "args": [], + "stopAtEntry": true, + "cwd": "${workspaceFolder}", + "environment": [], + "externalConsole": false, + "MIMode": "gdb", + //"miDebuggerArgs": "-n", + "targetArchitecture": "x64", + "setupCommands": [ // 或许在这里添加的 set serial baud 115200 | target remote ./rootkit.pty + { + "text": "set arch i386:x86-64:intel", + "ignoreFailures": false + }, + { + "text": "dir .", + "ignoreFailures": false + }, + { + "text": "add-auto-load-safe-path ./", + "ignoreFailures": false + }, + { + "text": "-enable-pretty-printing", + "ignoreFailures": true + } + ] + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..ac3c01a --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,27 @@ +{ + "files.associations": { + "hrtimer.h": "c", + "ktime.h": "c", + "typeinfo": "c", + "signal.h": "c", + "module.h": "c", + "watch_module.h": "c", + "kernel.h": "c", + "device.h": "c", + "mm.h": "c", + "fs.h": "c", + "sched.h": "c", + "monitor_user.h": "c" + }, + "clangd.arguments": [ + "--compile-commands-dir=${workspaceFolder}/linux-5.17.15", + "--background-index", + "--completion-style=detailed", + "--header-insertion=never", + "-log=info" + ], + "C_Cpp.autocomplete": "disabled", + "C_Cpp.codeFolding": "disabled", + "C_Cpp.configurationWarnings": "disabled", + "C_Cpp.intelliSenseEngine": "disabled" +} \ No newline at end of file diff --git 
a/.vscode/tasks.json b/.vscode/tasks.json new file mode 100644 index 0000000..d33a6a8 --- /dev/null +++ b/.vscode/tasks.json @@ -0,0 +1,33 @@ +{ + "tasks": [ + { + "label": "centos7", + "type": "shell", + "command": "./run.sh", + "presentation": { + "echo": true, + "clear": true, + "group": "vm" + }, + "isBackground": true, + "problemMatcher": [ + { + "pattern": [ + { + "regexp": ".", + "file": 1, + "location": 2, + "message": 3 + } + ], + "background": { + "activeOnStart": true, + "beginsPattern": ".", + "endsPattern": ".", + } + } + ] + } + ], + "version": "2.0.0" +} \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..7870185 --- /dev/null +++ b/Makefile @@ -0,0 +1,36 @@ +CC = gcc +CFLAGS = -Wall + +PROG = helloworld +HPTEST = hptest + +UDIR = $(PWD)/user +MDIR := $(PWD)/kernel +# kernel source directory (comment kept off the assignment so KDIR carries no trailing blanks) +KDIR := $(PWD)/linux-5.17.15 +TDIR := $(PWD)/testcase + +BUILD_DIR := $(PWD)/build +OUTPUT_DIR = $(PWD)/build + + +# KMOD = variable_monitor +# obj-m := kernel/$(KMOD).o +# $(KMOD)-objs := kernel/monitor_kernel.o + +.PHONY: all module clean + +all: $(PROG) $(HPTEST) module + +# user-space demo built from helloworld.c and the user-space interface +$(PROG): $(TDIR)/helloworld.c $(UDIR)/monitor_user.c + $(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(PROG) $(TDIR)/helloworld.c $(UDIR)/monitor_user.c + +# user-space demo built from hptest.c; the prerequisite must be the file that is compiled +$(HPTEST): $(TDIR)/hptest.c $(UDIR)/monitor_user.c + $(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(HPTEST) $(TDIR)/hptest.c $(UDIR)/monitor_user.c + +# $(MAKE) instead of make so flags and the jobserver reach the kernel build +module: + $(MAKE) -C $(KDIR) M=$(MDIR) modules + +# module: +# make -C linux-5.17.15 M=$(PWD)/kernel modules + +clean: + rm -f $(OUTPUT_DIR)/* + $(MAKE) -C $(KDIR) M=$(MDIR) clean \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..409f05b --- /dev/null +++ b/README.md @@ -0,0 +1,150 @@ +## Variable Monitor + +Monitor numerical variables (given address, length), and print system stack information when the set conditions are exceeded. + +Number of simultaneous monitoring +- Monitoring with the same timing length will be grouped into one group, corresponding to one timer. 
+- A set of up to 32 variables, after which a new timer is allocated. +- The global maximum number of timers is 128. +- The above quantity limit is defined in the `watch_module.h` header macro. + +Currently, monitoring is limited to the same application, and simultaneous calls from multiple applications are not currently supported. +- Multiple applications can work normally if only one program calls `cancel_all_watch();`. + +## Usage + +Example: helloworld.c +- Add `#include "watch.h"` +- Set each variable that needs to be monitored: name && address && length, set threshold, comparison method, timer interval (ns), etc. +- `start_watch(watch_arg);` Start monitoring +- Call `cancel_all_watch();` when you need to cancel monitoring + +When the set conditions are exceeded, the system stack information is printed and viewed with `dmesg`, as shown in the following example: +- Within a timer, if multiple variables exceed the threshold, the stack information will not be output repeatedly; +- The timer restart time after printing the stack is 1s, and the next round of monitoring will start after 1s. 
+ +```log +[86245.364861] ------------------------------------- +[86245.364864] -------------watch monitor----------- +[86245.364865] Threshold reached: + name: temp0, threshold: 150 +[86245.364866] Timestamp (ns): 1699589000606300743 +[86245.364867] Recent Load: 116.65, 126.83, 151.17 +[86245.365669] task: name lcore-worker-4, pid 803327 +[86245.365672] task: name lcore-worker-5, pid 803328 +[86245.365673] task: name lcore-worker-6, pid 803329 +[86245.365674] task: name lcore-worker-7, pid 803330 +[86245.365676] task: name lcore-worker-8, pid 803331 +[86245.365677] task: name lcore-worker-9, pid 803332 +[86245.365679] task: name lcore-worker-10, pid 803333 +[86245.365681] task: name lcore-worker-11, pid 803334 +[86245.365682] task: name lcore-worker-68, pid 803335 +[86245.365683] task: name lcore-worker-69, pid 803336 +[86245.365684] task: name lcore-worker-70, pid 803337 +[86245.365685] task: name lcore-worker-71, pid 803338 +[86245.365686] task: name lcore-worker-72, pid 803339 +[86245.365687] task: name lcore-worker-73, pid 803340 +[86245.365688] task: name lcore-worker-74, pid 803341 +[86245.365689] task: name lcore-worker-75, pid 803342 +[86245.365694] task: name pkt:worker-0, pid 803638 +[86245.365702] hrtimer_nanosleep+0x8d/0x120 +[86245.365709] __x64_sys_nanosleep+0x96/0xd0 +[86245.365711] do_syscall_64+0x37/0x80 +[86245.365716] entry_SYSCALL_64_after_hwframe+0x44/0xae +[86245.365718] task: name pkt:worker-1, pid 803639 +[86245.365721] hrtimer_nanosleep+0x8d/0x120 +[86245.365724] __x64_sys_nanosleep+0x96/0xd0 +[86245.365726] do_syscall_64+0x37/0x80 +[86245.365728] entry_SYSCALL_64_after_hwframe+0x44/0xae +[86245.365730] task: name pkt:worker-2, pid 803640 +[86245.365732] hrtimer_nanosleep+0x8d/0x120 +[86245.365734] __x64_sys_nanosleep+0x96/0xd0 +[86245.365737] do_syscall_64+0x37/0x80 +[86245.365739] entry_SYSCALL_64_after_hwframe+0x44/0xae +[86245.365740] task: name pkt:worker-3, pid 803641 +[86245.365743] hrtimer_nanosleep+0x8d/0x120 +``` + +### Parameter 
Description + +start_watch passes in the watch_arg structure. The meaning of each field is as follows +- name limit `MAX_NAME_LEN`(15) valid characters + +```c +typedef struct +{ + pid_t task_id; // current process id + char name[MAX_NAME_LEN + 1]; // name (15+1) + void *ptr; // virtual address + int length_byte; // byte + long long threshold; // threshold value + unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed) + unsigned char greater_flag; // reverse flag (true: >, false: <) + unsigned long time_ns; // timer interval (ns) +} watch_arg; +``` + +An initialization example + +```c +watch_args = (watch_arg){ + .task_id = getpid(), + .ptr = &temp, + .name = "temp", + .length_byte = sizeof(int), + .threshold = 150 + i, + .unsigned_flag = 0, + .greater_flag = 1, + .time_ns = 2000 + (i / 33) * 5000 +}; +``` + +## demo + +In the main project directory: + +```bash +make && insmod watch_module.ko +./watch +``` + +You can see the printed stack information in dmesg + +```bash +# Unload module and clean compile files +rmmod watch_module.ko && make clean +``` + +Only tested on kernel 5.17.15-1.el8.x86_64. + +## Other + +The program is divided into two parts: character device and user space interface, both of which communicate through ioctl. + +User space address access +- The variable virtual address passed in by the user program, use `get_user_pages_remote` to obtain the memory page where the address is located, and `kmap` maps it to the kernel. + - In the 192.168.40.204 environment, the HugeTLB Pages test mounts normally. +- The memory page address + offset is stored in the `kernel_watch_arg` corresponding to the timer, and hrTimer accesses `kernel_watch_arg` when polling to get the real value. + +timer grouping +- The hrTimer data structure is defined in the global array `kernel_wtimer_list`. When allocating a timer, it will check the traversal `kernel_wtimer_list` to compare the timer interval. 
+- Watches with the same timing interval are assigned to the same group and correspond to the same hrTimer. +- If the number of variables monitored by a timer exceeds `TIMER_MAX_WATCH_NUM` (32), a new hrTimer will be created. +- The total number of hrTimers (`kernel_wtimer_list` array length) is limited to `MAX_TIMER_NUM` (128). + +Memory page mount/unmount +- `get_user_pages_remote`/`kmap` increase the corresponding reference counts, so each call must be balanced by a matching `put_page`/`kunmap`. +- A module-global linked list, `watch_local_memory_list`, stores the page and the mapped kernel address (`kaddr`) of each successfully mounted variable. When the character device is closed, the list is traversed and the entries are released. + +Stack output conditions: The conditions are referenced from [diagnose-tools::load.c](https://github.com/alibaba/diagnose-tools/blob/e285bc4626a7d207eabd4a69cb276e1a3b1b7c76/SOURCE/module/kernel/load.c#L209) +- A task's stack is printed when the task is in `TASK_RUNNING`, satisfies `__task_contributes_to_load`, or is in `TASK_IDLE`. +- `__task_contributes_to_load` corresponds to the kernel macro `task_contributes_to_load`. + +```c +// https://www.spinics.net/lists/kernel/msg3582022.html +// remove from 5.8.rc3,but it still work +// whether the task contributes to the load +#define __task_contributes_to_load(task) \ + ((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && (task->flags & PF_FROZEN) == 0 && \ + (READ_ONCE(task->__state) & TASK_NOLOAD) == 0) +``` \ No newline at end of file diff --git a/README_zh.md b/README_zh.md new file mode 100644 index 0000000..ec0bebb --- /dev/null +++ b/README_zh.md @@ -0,0 +1,174 @@ +## Variable Monitor + +changelog + +```log +11.9 多个变量监控支持 +11.10 按照 pid 区分不同内核结构, 支持每个进程单独申请取消自己的监控. +11.13 用户接口 cancel_all_watch -> cancel_watch, 每个进程互不干扰. +``` + +## 说明 + +监控 数值变量(给定 地址,长度), 超过设定条件打印系统堆栈信息. + +同时监控数量 +- 相同定时长度的监控 会被分为一组,对应一个定时器. +- 一组最多 32 个变量,超过后会分配一个新的定时器. +- 定时器数量全局最多 128 个. +- 以上数量限制定义在 `watch_module.h` 头部宏. 
+ +## 使用 + +示例如 helloworld.c +- 添加 `#include "watch.h"` +- 对每个需要监控的变量 设置: 名称 && 地址 && 长度, 设置阈值, 比较方式, 定时器间隔(ns) 等. +- `start_watch(watch_arg);` 启动监控 +- 需要取消监控时调用 `cancel_watch();` + +超出设定条件时,打印系统堆栈信息, `dmesg` 查看,如下示例: +- 一个定时器内,多个变量超过阈值,堆栈信息不会重复输出; +- 打印堆栈后定时器再启动时间为 1s, 1s 后开始下一个轮次监控. + +```log +[ 713.225894] ------------------------------------- +[ 713.225900] -------------watch monitor----------- +[ 713.225900] Threshold reached: +[ 713.225901] name: temp0, threshold: 150, pid: 4261 +[ 713.225902] name: temp1, threshold: 151, pid: 4261 +[ 713.225903] name: temp2, threshold: 152, pid: 4261 +[ 713.225904] name: temp3, threshold: 153, pid: 4261 +[ 713.225904] name: temp4, threshold: 154, pid: 4261 +[ 713.225905] name: temp5, threshold: 155, pid: 4261 +[ 713.225905] name: temp6, threshold: 156, pid: 4261 +[ 713.225906] name: temp7, threshold: 157, pid: 4261 +[ 713.225906] name: temp8, threshold: 158, pid: 4261 +[ 713.225907] name: temp9, threshold: 159, pid: 4261 +[ 713.225907] name: temp10, threshold: 160, pid: 4261 +[ 713.225908] name: temp11, threshold: 161, pid: 4261 +[ 713.225908] name: temp12, threshold: 162, pid: 4261 +[ 713.225909] name: temp13, threshold: 163, pid: 4261 +[ 713.225909] name: temp14, threshold: 164, pid: 4261 +[ 713.225910] name: temp15, threshold: 165, pid: 4261 +[ 713.225910] name: temp16, threshold: 166, pid: 4261 +[ 713.225911] name: temp17, threshold: 167, pid: 4261 +[ 713.225911] name: temp18, threshold: 168, pid: 4261 +[ 713.225912] name: temp19, threshold: 169, pid: 4261 +[ 713.225912] name: temp20, threshold: 170, pid: 4261 +[ 713.225913] name: temp21, threshold: 171, pid: 4261 +[ 713.225913] name: temp22, threshold: 172, pid: 4261 +[ 713.225914] name: temp23, threshold: 173, pid: 4261 +[ 713.225914] name: temp24, threshold: 174, pid: 4261 +[ 713.225915] name: temp25, threshold: 175, pid: 4261 +[ 713.225915] name: temp26, threshold: 176, pid: 4261 +[ 713.225916] name: temp27, threshold: 177, pid: 4261 +[ 713.225916] name: temp28, 
threshold: 178, pid: 4261 +[ 713.225916] name: temp29, threshold: 179, pid: 4261 +[ 713.225917] name: temp30, threshold: 180, pid: 4261 +[ 713.225917] name: temp31, threshold: 181, pid: 4261 +[ 713.225918] Timestamp (ns): 1699846710299420862 +[ 713.225919] Recent Load: 0.05, 0.12, 0.08 +[ 713.225921] task: name rcu_gp, pid 3, state 1026 +[ 713.225926] rescuer_thread+0x290/0x390 +[ 713.225931] kthread+0xd7/0x100 +[ 713.225932] ret_from_fork+0x1f/0x30 +[ 713.225935] task: name rcu_par_gp, pid 4, state 1026 +[ 713.225936] rescuer_thread+0x290/0x390 +[ 713.225937] kthread+0xd7/0x100 +[ 713.225938] ret_from_fork+0x1f/0x30 +[ 713.225940] task: name netns, pid 5, state 1026 +[ 713.225941] rescuer_thread+0x290/0x390 +[ 713.225942] kthread+0xd7/0x100 +``` + +### 参数说明 + +start_watch 传入的是 watch_arg 结构体.各个字段意义如下 +- name 限制 `MAX_NAME_LEN`(15) 个有效字符 + +```c +typedef struct +{ + pid_t task_id; // current process id + char name[MAX_NAME_LEN + 1]; // name (15+1) + void *ptr; // virtual address + int length_byte; // byte + long long threshold; // threshold value + unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed) + unsigned char greater_flag; // reverse flag (true: >, false: <) + unsigned long time_ns; // timer interval (ns) +} watch_arg; +``` + +一个初始化示例 + +```c +watch_args = (watch_arg){ + .task_id = getpid(), + .ptr = &temp, + .name = "temp", + .length_byte = sizeof(int), + .threshold = 150 + i, + .unsigned_flag = 0, + .greater_flag = 1, + .time_ns = 2000 + (i / 33) * 5000 +}; +``` + +## demo + +项目主文件下 +- `helloworld.c`: 测试大量变量监控 +- `hptest.c`: 测试 hugePage 挂载 + +```bash +# 编译加载模块 +make && insmod variable_monitor.ko +./helloworld +``` + +dmesg 可以看到打印的堆栈信息 + +```bash +# 卸载模块,清理编译文件 +rmmod variable_monitor.ko && make clean +``` + +仅在 `kernel 5.17.15-1.el8.x86_64` 测试,其他内核版本未测试. + +## 其他 + +程序分为两部分: 字符设备 和 用户空间接口, 两者通过 ioctl 通信. + +用户空间地址访问 +- 用户程序传入的变量 虚拟地址, 使用 `get_user_pages_remote` 获取地址所在内存页, `kmap` 将其映射到内核. + - 192.168.40.204 环境下,HugeTLB Pages 测试挂载正常. 
+- 内存页地址 + 偏移量存入定时器对应的 `kernel_watch_arg` 中, hrTimer 轮询时访问 `kernel_watch_arg` 得到真实值. + +定时器分组 +- hrTimer 数据结构定义在全局数组 `kernel_wtimer_list`.分配定时器时,会检查遍历 `kernel_wtimer_list` 比较定时器间隔, +- 相同定时间隔的 watch 分配到同一组,对应同一个 hrTimer. +- 若一个定时器监控变量数量超过 `TIMER_MAX_WATCH_NUM` (32),则会创建一个新的 hrTimer. +- hrTimer 的总数量(`kernel_wtimer_list` 数组长度)限制是 `MAX_TIMER_NUM`(128). + +内存页 mount/unmount +- `get_user_pages_remote`/ `kmap` 会增加对应的计数,需要对等的 `put_page`/`kunmap`. +- 一个模块内全局链表 `watch_local_memory_list` 存储每一个成功挂载的变量对应的 page 和 kt,执行字符设备的 close 操作时,遍历并卸载. + +variable monitor 添加/删除 +- kernel_watch_arg 数据结构中有 pid 的成员变量,但添加变量监控时,不按照进程区分. +- 删除时遍历全部监控变量,比较 pid. +- 删除造成的缺位,将最后的变量移动到空位, sentinel--; hrTimer 同理. + +堆栈输出条件: 条件参考自 [diagnose-tools::load.c](https://github.com/alibaba/diagnose-tools/blob/e285bc4626a7d207eabd4a69cb276e1a3b1b7c76/SOURCE/module/kernel/load.c#L209) +- `TASK` 要满足 TASK_RUNNING 和 `__task_contributes_to_load` 和 `TASK_IDLE`(可能有阻塞进程). +- `__task_contributes_to_load` 对应内核宏 `task_contributes_to_loa`. + +```c +// https://www.spinics.net/lists/kernel/msg3582022.html +// remove from 5.8.rc3,but it still work +// whether the task contributes to the load +#define __task_contributes_to_load(task) \ + ((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && (task->flags & PF_FROZEN) == 0 && \ + (READ_ONCE(task->__state) & TASK_NOLOAD) == 0) +``` \ No newline at end of file diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 0000000..d3ede68 --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,3 @@ +KMOD = variable_monitor +obj-m := $(KMOD).o +$(KMOD)-objs := monitor_kernel.o diff --git a/kernel/monitor_kernel.c b/kernel/monitor_kernel.c new file mode 100644 index 0000000..1d17d64 --- /dev/null +++ b/kernel/monitor_kernel.c @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include + +#include "monitor_kernel_lib.c" +#include "monitor_kernel_task.c" + +#define DEVICE_NAME "variable_monitor" + +// for character device +static dev_t dev_num; +static struct cdev 
*watch_cdev; +static struct class *watch_class; + +struct my_device_data { + pid_t pid; +}; + +static int device_open(struct inode *inode, struct file *file) { + struct my_device_data *data; + printk(KERN_INFO "%s: with pid %d\n", __FUNCTION__, current->pid); + // save pid + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->pid = current->pid; + file->private_data = data; + return 0; +} + +static int device_release(struct inode *inode, struct file *file) { + // printk(KERN_INFO "%s\n", __FUNCTION__); + // load pid + struct my_device_data *data = file->private_data; + // clear watch with pid + clear_watch(data->pid); + kfree(data); // free data memory + return 0; +} + +/// @brief ioctl handler: register one watched variable described by a user watch_arg +/// @param file: opened device file (not used) +/// @param ioctl_num: ioctl command number (not inspected, every command adds a watch) +/// @param ioctl_param: user space pointer to a watch_arg +/// @return 0 on success; -EACCES if the argument or the watched address cannot be +/// accessed, -EINVAL for an unsupported length, -EBUSY if no timer is free +static long device_ioctl(struct file *file, unsigned int ioctl_num, + unsigned long ioctl_param) { + watch_arg warg; + void *kptr; + kernel_watch_timer *timer = NULL; + kernel_watch_arg k_watch_arg; + // copy watch_arg + if (copy_from_user(&warg, (watch_arg *)ioctl_param, sizeof(warg))) { + return -EACCES; + } + // name comes from user space and may lack a terminator; force one before it + // is printed with %s + warg.name[MAX_NAME_LEN] = '\0'; + + printk(KERN_INFO "Watch_arg: task_id=%d, name=%s, ptr=%p, length_byte=%d, " + "time_ns=%ld, threshold=%lld\n", + warg.task_id, warg.name, warg.ptr, warg.length_byte, warg.time_ns, + warg.threshold); + // check length first, so a rejected request never pins a user page + if (warg.length_byte != 1 && warg.length_byte != 2 && warg.length_byte != 4 && + warg.length_byte != 8) { + printk(KERN_ERR "Invalid length %d\n", warg.length_byte); + return -EINVAL; + } + // user space address to kernel space address + kptr = convert_user_space_ptr(warg.task_id, (unsigned long)warg.ptr); + if (kptr == NULL) { + printk(KERN_ERR "Cannot access user space\n"); + return -EACCES; + } + // k_watch_arg init + w_arg2k_w_arg(kptr, warg, &k_watch_arg); + timer = get_timer(warg.time_ns); // get a valuable timer + if (timer == NULL) { + // get_timer returns NULL when no timer slot is available + // NOTE(review): the page pinned above stays on watch_local_memory_list; + // presumably clear_watch() releases it on close - confirm + printk(KERN_ERR "Cannot get timer\n"); + return -EBUSY; + } + + printk(KERN_INFO "ptr transform kptr: %p\n", kptr); + printk(KERN_INFO "timer: %p\n", timer); + printk(KERN_INFO "timer->sentinel: %d, timer->time_ns: %lld\n", + timer->sentinel, timer->time_ns); + printk(KERN_INFO 
"timer->hr_timer: %p\n", &timer->hr_timer); + + TIMER_CANCEL(timer); // just in case + timer_add_watch(timer, k_watch_arg); + TIMER_START(timer); + + printk(KERN_INFO "Start watching var: %s\n", warg.name); + return 0; +} + +static struct file_operations fops = { + .open = device_open, + .release = device_release, + .unlocked_ioctl = device_ioctl, +}; + +/// @brief module entry: resolve the non-exported kernel symbols, then register +/// the character device named DEVICE_NAME +/// @return 0 on success, negative errno on failure (everything acquired so far +/// is released again) +int init_module(void) { + struct device *dev; + int ret; + + printk(KERN_INFO "%s\n", __FUNCTION__); + + // Resolve symbols before anything is allocated: LOOKUP_SYMS returns -EINVAL + // straight out of this function, so there must be nothing to undo yet, and + // the device must not be reachable before the orig_* pointers are set. + fn_kallsyms_lookup_name_init(); // init kallsyms_lookup_name + if (!diag_kallsyms_lookup_name) { + // LOOKUP_SYMS calls through this pointer; never call it while NULL + printk(KERN_ALERT "Failed to resolve kallsyms_lookup_name\n"); + return -EINVAL; + } + LOOKUP_SYMS(stack_trace_save_tsk); // stack_trace_save_tsk + LOOKUP_SYMS(show_stack); // show_stack + LOOKUP_SYMS(idle_sched_class); // idle_sched_class + LOOKUP_SYMS(access_remote_vm); // access_remote_vm + + LOOKUP_SYMS_NORET(get_task_type); // get_task_type + LOOKUP_SYMS_NORET(kernfs_name); // kernfs_name + + ret = alloc_chrdev_region(&dev_num, 0, 1, DEVICE_NAME); + if (ret < 0) { + printk(KERN_ALERT "Failed to register device number\n"); + return ret; + } + + watch_cdev = cdev_alloc(); + if (watch_cdev == NULL) { + printk(KERN_ALERT "Failed to allocate cdev structure\n"); + ret = -ENOMEM; + goto err_region; + } + // cdev_alloc() already initialised the object; cdev_init() on top of it + // would replace the release hook that frees it, so only fill in the fields + watch_cdev->owner = THIS_MODULE; + watch_cdev->ops = &fops; + + ret = cdev_add(watch_cdev, dev_num, 1); + if (ret < 0) { + printk(KERN_ALERT "Failed to add cdev structure\n"); + goto err_cdev; // class and device do not exist yet + } + + watch_class = class_create(THIS_MODULE, DEVICE_NAME); + if (IS_ERR(watch_class)) { // failure is reported as ERR_PTR, not NULL + printk(KERN_ALERT "Failed to create class\n"); + ret = PTR_ERR(watch_class); + goto err_cdev; + } + + dev = device_create(watch_class, NULL, dev_num, NULL, DEVICE_NAME); + if (IS_ERR(dev)) { // failure is reported as ERR_PTR, not NULL + printk(KERN_ALERT "Failed to create device\n"); + ret = PTR_ERR(dev); + goto err_class; + } + + printk(KERN_INFO "dev number: %d\n", dev_num); + printk(KERN_INFO "path: /dev/%s %d\n", DEVICE_NAME, dev_num); + + return 0; + +err_class: + class_destroy(watch_class); +err_cdev: + cdev_del(watch_cdev); +err_region: + unregister_chrdev_region(dev_num, 1); + return ret; +} + +void cleanup_module(void) { + printk(KERN_INFO 
"%s\n", __FUNCTION__); + // clear all timer and page list + clear_all_watch(); + // unmount + device_destroy(watch_class, dev_num); + class_destroy(watch_class); + cdev_del(watch_cdev); + unregister_chrdev_region(dev_num, 1); +} + +MODULE_LICENSE("GPL"); \ No newline at end of file diff --git a/kernel/monitor_kernel.h b/kernel/monitor_kernel.h new file mode 100644 index 0000000..47e36d0 --- /dev/null +++ b/kernel/monitor_kernel.h @@ -0,0 +1,179 @@ +#include +#include +#include +#include +#include /* for kmalloc */ +#include + +#include +#include +#include +#include +#include /* for avenrun, LOAD_* */ +#include +#include +#include /* for stack_trace_print */ + +#define MAX_TIMER_NUM (128) // max timer number +#define TIMER_MAX_WATCH_NUM (32) // A timer max watch number at once time +#define MAX_NAME_LEN (15) // max name length +typedef struct { + pid_t task_id; // current process id + char name[MAX_NAME_LEN + 1]; // name + void *ptr; // virtual address + int length_byte; // byte + long long threshold; // threshold value + unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed) + unsigned char greater_flag; // reverse flag (true: >, false: <) + unsigned long time_ns; // timer interval (ns) +} watch_arg; + +typedef struct { + pid_t task_id; // current process id + char name[MAX_NAME_LEN + 2]; // name, last char automatically add '\0' + void *kptr; // kernel address + offset + int length_byte; // byte + long long threshold; // threshold value + unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed) + unsigned char greater_flag; // reverse flag (true: >, false: <) +} kernel_watch_arg; + +typedef struct { + unsigned long long time_ns; // hrTimer time interval (ns) + struct hrtimer hr_timer; // hrTimer + ktime_t kt; // hrTimer time + unsigned sentinel; // sentinel + kernel_watch_arg + k_watch_args[TIMER_MAX_WATCH_NUM]; // all watched kernel_watch_arg +} kernel_watch_timer; + +#define TIMER_FILLED(timer) ((timer)->sentinel >= 
TIMER_MAX_WATCH_NUM) +#define TIMER_EMPTY(timer) (!((timer)->time_ns | (timer)->sentinel)) +#define TIMER_NO_KWARG(timer) ((timer)->sentinel == 0) + +#define TIMER_START(timer) \ + (hrtimer_start(&timer->hr_timer, timer->kt, HRTIMER_MODE_REL)) +#define TIMER_CANCEL(timer) (hrtimer_cancel(&timer->hr_timer)) + +kernel_watch_timer kernel_wtimer_list[MAX_TIMER_NUM] = { + 0}; // all kernel_watch_timer +int kernel_wtimer_num = 0; // current kernel_watch_timer number + +EXPORT_SYMBOL(kernel_wtimer_list); // export kernel_watch_timer_list +EXPORT_SYMBOL(kernel_wtimer_num); // export kernel_watch_timer_num + +// Helper function +unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg, + kernel_watch_arg *k_watch_arg); + +// for timer +kernel_watch_timer *get_timer(unsigned long long time_ns); +unsigned char timer_add_watch(kernel_watch_timer *timer, + kernel_watch_arg k_watch_arg); +unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid); + +// for memory access +typedef struct { + pid_t task_id; // current process id + struct page *page; + void *kaddr; + struct list_head entry; +} watch_local_memory; + +static LIST_HEAD(watch_local_memory_list); + +void free_page_list(pid_t task_id); +void free_all_page_list(void); + +// static struct page *page = NULL; +// static void *kaddr = NULL; + +void *convert_user_space_ptr(pid_t pid, unsigned long kaddr); + +// for timer +// #define US2NS (1000) // Interval in microseconds +// static struct hrtimer hr_timer; +// static ktime_t kt; + +// hrTimer +enum hrtimer_restart check_variable_cb(struct hrtimer *timer); +void start_all_hrTimer(void); +void cancel_all_hrTimer(void); + +unsigned char read_and_compare(kernel_watch_arg *k_arg); + +// for diag_kallsyms_lookup_name +unsigned long (*diag_kallsyms_lookup_name)(const char *name); +static struct kprobe kprobe_kallsyms_lookup_name = {.symbol_name = + "kallsyms_lookup_name"}; + +int fn_kallsyms_lookup_name_init(void); // init kallsyms_lookup_name + +// form +// 
https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/module/internal.h#L65 +// look for current function address, all the function with prefix "orig_" are +#define LOOKUP_SYMS(name) \ + do { \ + orig_##name = (void *)diag_kallsyms_lookup_name(#name); \ + if (!orig_##name) { \ + printk(KERN_ERR "kallsyms_lookup_name: %s\n", #name); \ + return -EINVAL; \ + } \ + } while (0) + +#define LOOKUP_SYMS_NORET(name) \ + do { \ + orig_##name = (void *)diag_kallsyms_lookup_name(#name); \ + if (!orig_##name) \ + pr_err("kallsyms_lookup_name: %s\n", #name); \ + } while (0) + +#define BACKTRACE_DEPTH 20 // max stack depth + +// LOOKUP_SYMS(stack_trace_save_tsk); +unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task, + unsigned long *store, + unsigned int size, + unsigned int skipnr); +// LOOKUP_SYMS(show_stack); +void (*orig_show_stack)(struct task_struct *task, unsigned long *sp, + const char *loglvl); + +// https://www.spinics.net/lists/kernel/msg3582022.html +// remove from 5.8.rc3,but it still work +// whether the task contributes to the load +#define __task_contributes_to_load(task) \ + ((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && \ + (task->flags & PF_FROZEN) == 0 && \ + (READ_ONCE(task->__state) & TASK_NOLOAD) == 0) + +/// @brief print all task stack +/// @param +static void print_task_stack(void) { + struct task_struct *g, *p; // g: task group; p: task + unsigned long backtrace[BACKTRACE_DEPTH]; // save stack + unsigned int nr_bt; // stack depth + unsigned long long current_time; // last time + current_time = ktime_get_real(); + printk("Timestamp (ns): %lld\n", current_time); + printk("Recent Load: %lu.%02lu, %lu.%02lu, %lu.%02lu\n", // recent load + LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]), LOAD_INT(avenrun[1]), + LOAD_FRAC(avenrun[1]), LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2])); + rcu_read_lock(); // lock run queue + // printk("Running task\n"); + do_each_thread(g, p) { + if (p->__state == 
TASK_RUNNING || __task_contributes_to_load(p) || + p->__state == TASK_IDLE) { + printk("task: %s, pid %d, state %d\n", p->comm, p->pid, + p->__state); //! todo + nr_bt = orig_stack_trace_save_tsk(p, backtrace, BACKTRACE_DEPTH, 0); + stack_trace_print(backtrace, nr_bt, 0); // print + } + } + while_each_thread(g, p); + rcu_read_unlock(); // unlock run queue +} + +unsigned char del_all_kwarg_by_pid(pid_t pid); +void clear_watch(pid_t pid); +void clear_all_watch(void); \ No newline at end of file diff --git a/kernel/monitor_kernel_lib.c b/kernel/monitor_kernel_lib.c new file mode 100644 index 0000000..3dea0cd --- /dev/null +++ b/kernel/monitor_kernel_lib.c @@ -0,0 +1,427 @@ +#include "monitor_kernel.h" + +unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg, + kernel_watch_arg *k_watch_arg) { + // k_watch_arg init + k_watch_arg->task_id = warg.task_id; + strncpy(k_watch_arg->name, warg.name, MAX_NAME_LEN + 1); // name + k_watch_arg->name[MAX_NAME_LEN + 1] = '\0'; // just in case + k_watch_arg->kptr = ptr; + k_watch_arg->length_byte = warg.length_byte; + k_watch_arg->threshold = warg.threshold; + k_watch_arg->unsigned_flag = warg.unsigned_flag; + k_watch_arg->greater_flag = warg.greater_flag; + return 0; +} + +/// @brief get a valuable timer +/// @param time_ns +/// @return kernel_watch_timer *, NULL means fail +kernel_watch_timer *get_timer(unsigned long long time_ns) { + int i = 0; + kernel_watch_timer *timer = NULL; + // chose a timer + for (i = 0; i < kernel_wtimer_num; i++) { + timer = &kernel_wtimer_list[i]; + + if (TIMER_EMPTY(timer)) { + break; + } + if ((timer->time_ns == time_ns) && (!TIMER_FILLED(timer))) { + break; + } + } + // if all timer is full + if (i >= MAX_TIMER_NUM) { + printk(KERN_ERR "No timer available\n"); + return NULL; + } + // if a new timer, init it + if (i > kernel_wtimer_num - 1) { + printk(KERN_INFO "New timer\n"); + + kernel_wtimer_list[i].time_ns = time_ns; + kernel_wtimer_list[i].sentinel = 0; + + kernel_wtimer_list[i].kt = 
ktime_set(0, (unsigned long)time_ns); // ns + // CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time + hrtimer_init(&(kernel_wtimer_list[i].hr_timer), CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + kernel_wtimer_list[i].hr_timer.function = + check_variable_cb; // callback function + + kernel_wtimer_num = i + 1; + } + printk(KERN_INFO "now, we have %d timers\n", kernel_wtimer_num); + return &kernel_wtimer_list[i]; +} + +/// @brief append one watch to a timer's k_watch_args array +/// @param timer: timer that receives the watch +/// @param k_watch_arg: watch description, copied by value into the timer +/// @return 0 on success, (unsigned char)-1 when the timer already holds +/// TIMER_MAX_WATCH_NUM watches +unsigned char timer_add_watch(kernel_watch_timer *timer, + kernel_watch_arg k_watch_arg) { + if (TIMER_FILLED(timer)) { + printk(KERN_ERR "Timer is full\n"); + return -1; + } + memcpy(&timer->k_watch_args[timer->sentinel], &k_watch_arg, + sizeof(k_watch_arg)); + // timer->k_watch_args[timer->sentinel] = k_watch_arg; + timer->sentinel++; + return 0; +} + +/// @brief remove every watch whose task_id equals pid from a timer +/// @param timer: timer to scan +/// @param pid: task_id whose watches are removed +/// @return always 0 +unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid) { + int i = 0; + for (i = 0; i < timer->sentinel; i++) { + // on a pid match, overwrite the slot with the last watch and shrink the + // array; i-- makes the next iteration re-check the element just moved in + if (timer->k_watch_args[i].task_id == pid) { + if (i != timer->sentinel - 1) { + memcpy(&timer->k_watch_args[i], + &timer->k_watch_args[timer->sentinel - 1], + sizeof(kernel_watch_arg)); + } + timer->sentinel--; + i--; + } + } + return 0; +} + +/// @brief pin the user page that contains addr, map it into kernel space and +/// record it on watch_local_memory_list so it can be released later +/// @param pid: process id that owns the address +/// @param addr: user space virtual address +/// @return kernel space address + offset, NULL on failure +void *convert_user_space_ptr(pid_t pid, unsigned long addr) { + struct task_struct *task; + struct mm_struct *mm; + int ret; + + // unsigned long aligned_addr = 0; + // unsigned long offset = 0; + + watch_local_memory *node; + + // if (addr < TASK_SIZE || addr > -PAGE_SIZE) + // { + // printk(KERN_ERR "Invalid address\n"); + // return NULL; + // } + + // split addr into its page base and in-page offset for get_user_pages_remote + unsigned long aligned_addr = addr & 
PAGE_MASK; + unsigned long offset = addr & ~PAGE_MASK; + + printk(KERN_INFO "%s\n", __FUNCTION__); + + node = kmalloc(sizeof(watch_local_memory), GFP_KERNEL); + node->task_id = pid; + + // Find the task with pid + rcu_read_lock(); + task = pid_task(find_vpid(pid), PIDTYPE_PID); + rcu_read_unlock(); + + if (!task) { + printk(KERN_ERR "Cannot find task for PID %d\n", pid); + kfree(node); // careful there is kfree + return NULL; + } + // Get memory descriptor + mm = get_task_mm(task); + if (!mm) { + printk(KERN_ERR "Cannot get memory descriptor\n"); + kfree(node); // careful there is kfree + return NULL; + } + down_read(&task->mm->mmap_lock); + ret = get_user_pages_remote(task->mm, aligned_addr, 1, FOLL_FORCE, + &(node->page), NULL, NULL); + up_read(&task->mm->mmap_lock); + + if (ret != 1) { + printk(KERN_ERR "Cannot get user page\n"); + kfree(node); // careful there is kfree + return NULL; + } + // Map the page to kernel space + node->kaddr = kmap(node->page); + list_add_tail(&node->entry, &watch_local_memory_list); // add to list + // printk(KERN_INFO "node->kaddr: %p, aligned_addr: %ld, offset: %ld\n", + // node->kaddr, aligned_addr, offset); + return (void *)((unsigned long)(node->kaddr) + offset); +} + +/// @brief free page in watch_local_memory_list with task_id +/// @param task_id +void free_page_list(pid_t task_id) { + watch_local_memory *node, *next; + list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) { + if (node == NULL) + break; + if (node->task_id == task_id) { + // unmap and release the page + if (node->kaddr) + kunmap(node->kaddr); + if (node->page) + put_page(node->page); + list_del(&node->entry); + kfree(node); // careful there is kfree + } + } +} + +/// @brief free all page in watch_local_memory_list +/// @param +void free_all_page_list(void) { + watch_local_memory *node, *next; + list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) { + if (node == NULL) + break; + // unmap and release the page + if (node->kaddr) 
+ kunmap(node->kaddr); + if (node->page) + put_page(node->page); + list_del(&node->entry); + kfree(node); // careful there is kfree + } +} + +/// @brief hrTimer handler +enum hrtimer_restart check_variable_cb(struct hrtimer *timer) { + kernel_watch_timer *k_watch_timer = + container_of(timer, kernel_watch_timer, hr_timer); + int i = 0, j = 0; + int buffer[TIMER_MAX_WATCH_NUM]; // Buffer to store the messages + + // check all watched kernel_watch_arg + for (i = 0; i < k_watch_timer->sentinel; i++) { + if (read_and_compare(&k_watch_timer->k_watch_args[i])) { + // snprintf(buffer + strlen(buffer), sizeof(buffer) - strlen(buffer), " + // name: %s, threshold: %lld, pid: %d\n", + // k_watch_timer->k_watch_args[i].name, + // k_watch_timer->k_watch_args[i].threshold, + // k_watch_timer->k_watch_args[i].task_id); + buffer[j] = i; + j++; + + // printk(KERN_INFO "j: name %s, threshold: %lld\n", + // k_watch_timer->k_watch_args[i].name, + // k_watch_timer->k_watch_args[i].threshold); + // printk(KERN_INFO "j: %d\n", j); + } + } + if (j > 0) // if any threshold reached + { + printk("-------------------------------------\n"); + printk("-------------watch monitor-----------\n"); + printk("Threshold reached:\n"); + + for (i = 0; i < j; i++) { + printk(" name: %s, threshold: %lld, pid: %d\n", + k_watch_timer->k_watch_args[buffer[i]].name, //! todo + k_watch_timer->k_watch_args[buffer[i]].threshold, + k_watch_timer->k_watch_args[buffer[i]].task_id); + } + print_task_stack(); + // restart timer after 1s + hrtimer_forward(timer, timer->base->get_time(), ktime_set(1, 0)); //! 
todo + printk("-------------------------------------\n"); + } else { + // keep frequency + hrtimer_forward(timer, timer->base->get_time(), k_watch_timer->kt); + } + return HRTIMER_RESTART; // restart timer +} + +/// @brief start hrTimer +/// @param timeout: timeout in us +/// @return 0 is success +// int start_hrTimer(unsigned long timeout) +// { +// printk("HrTimer Start\n"); + +// kt = ktime_set(0, (unsigned long)timeout); // us -> ns +// // CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time +// hrtimer_init(&hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +// hr_timer.function = check_variable_cb; +// // mode the same as hrtimer_init +// hrtimer_start(&hr_timer, kt, HRTIMER_MODE_REL); +// return 0; +// } + +/// @brief start all hrTimer +/// @param +void start_all_hrTimer(void) { + int i = 0; + kernel_watch_timer *timer = NULL; + for (i = 0; i < kernel_wtimer_num; i++) { + timer = &(kernel_wtimer_list[i]); + TIMER_START(timer); + } + printk("HrTimer start,module keep %d hrtimer for now\n", kernel_wtimer_num); +} + +/// @brief cancel hrTimer +/// @param +void cancel_all_hrTimer(void) { + int i = 0; + kernel_watch_timer *timer = NULL; + for (i = 0; i < kernel_wtimer_num; i++) { + timer = &(kernel_wtimer_list[i]); + TIMER_CANCEL(timer); + } + + printk("HrTimer cancel,module keep %d hrtimer for now\n", kernel_wtimer_num); +} + +// for read_and_compare +typedef unsigned char (*compare_func)(void *, long long); + +unsigned char compare_1_byte_signed(void *ptr, long long threshold) { + // printk("compare_1_byte_signed: value %d, biss: %lld\n", *(char *)ptr, + // threshold); + return *(char *)ptr > threshold; +} +unsigned char compare_1_byte_unsigned(void *ptr, long long threshold) { + // printk("compare_1_byte_unsigned: value %d, biss: %lld\n", *(unsigned char + // *)ptr, threshold); + return *(unsigned char *)ptr > threshold; +} +unsigned char compare_2_byte_signed(void *ptr, long long threshold) { + // printk("compare_2_byte_signed: value %d, biss: 
%lld\n", *(short int *)ptr, + // threshold); + return *(short int *)ptr > threshold; +} +unsigned char compare_2_byte_unsigned(void *ptr, long long threshold) { + // printk("compare_2_byte_unsigned: value %d, biss: %lld\n", *(unsigned short + // int *)ptr, threshold); + return *(unsigned short int *)ptr > threshold; +} +unsigned char compare_4_byte_signed(void *ptr, long long threshold) { + // printk("compare_4_byte_signed: value %d, biss: %lld\n", *(int *)ptr, + // threshold); + return *(int *)ptr > threshold; +} +unsigned char compare_4_byte_unsigned(void *ptr, long long threshold) { + // printk("compare_4_byte_unsigned: value %d, biss: %lld\n", *(unsigned int + // *)ptr, threshold); + return *(unsigned int *)ptr > threshold; +} +unsigned char compare_8_byte_signed(void *ptr, long long threshold) { + // printk("compare_8_byte_signed: value %lld, biss: %lld\n", *(long long + // *)ptr, threshold); + return *(long long *)ptr > threshold; +} +unsigned char compare_8_byte_unsigned(void *ptr, long long threshold) { + // printk("compare_8_byte_unsigned: value %lld, biss: %lld\n", *(unsigned long + // long *)ptr, threshold); + return *(unsigned long long *)ptr > threshold; +} +// list of compare functions +static compare_func compare_funcs[8] = { + compare_1_byte_signed, compare_2_byte_signed, compare_4_byte_signed, + compare_8_byte_signed, compare_1_byte_unsigned, compare_2_byte_unsigned, + compare_4_byte_unsigned, compare_8_byte_unsigned}; + +static int func_indices[2][9] = {{0, 0, 1, 0, 2, 0, 0, 0, 3}, + {0, 4, 5, 0, 6, 0, 0, 0, 7}}; + +/// @brief read k_arg->kptr and compare with threshold +/// @param k_arg +/// @return result of compare +unsigned char read_and_compare(kernel_watch_arg *k_arg) { + void *ptr = k_arg->kptr; + int len = k_arg->length_byte; + unsigned char is_unsigned = k_arg->unsigned_flag; + long long threshold = k_arg->threshold; + + unsigned char result = 0; + + // if (len != 1 && len != 2 && len != 4 && len != 8) + // { + // printk(KERN_ERR "Invalid 
length\n"); + // return 0; + // } + + result = compare_funcs[func_indices[is_unsigned][len]](ptr, threshold); + + // printk(KERN_INFO "read_and_compare: name %s, value %d, biss: %lld, result: + // %d \n", k_arg->name, *(int *)ptr, + // threshold, result); + + if (k_arg->greater_flag) + return result; + else + return !result; +} + +/// @brief init kallsyms_lookup_name +/// @param +/// @return 0 is success +int fn_kallsyms_lookup_name_init(void) { + register_kprobe(&kprobe_kallsyms_lookup_name); + diag_kallsyms_lookup_name = (void *)kprobe_kallsyms_lookup_name.addr; + unregister_kprobe(&kprobe_kallsyms_lookup_name); + + printk("xby-debug, diag_kallsyms_lookup_name is %p\n", + diag_kallsyms_lookup_name); + + if (!diag_kallsyms_lookup_name) { + return -EINVAL; + } + return 0; +} + +unsigned char del_all_kwarg_by_pid(pid_t pid) { + int i = 0; + kernel_watch_timer *timer = NULL; + + printk(KERN_INFO "del kwarg..."); + + for (i = 0; i < kernel_wtimer_num; i++) { + timer = &(kernel_wtimer_list[i]); + timer_del_watch_by_pid(timer, pid); + } + for (i = 0; i < kernel_wtimer_num; i++) { + timer = &(kernel_wtimer_list[i]); + if (TIMER_NO_KWARG(timer)) // no available kwarg + { + if (i != kernel_wtimer_num - 1) { + memcpy(timer, &kernel_wtimer_list[kernel_wtimer_num - 1], + sizeof(kernel_watch_timer)); + } + kernel_wtimer_num--; + i--; + } + } + return 0; +} + +/// @brief clear watch with pid +/// @param pid +void clear_watch(pid_t pid) { + printk(KERN_INFO "clear pid %d 's watch variable\n", pid); + cancel_all_hrTimer(); // just in case + del_all_kwarg_by_pid(pid); // delete all kwarg with pid + free_page_list(pid); // free page with pid + start_all_hrTimer(); // restart timer +} + +/// @brief clear all watch and reset kernel_wtimer_list/kernel_wtimer_num +/// @param +void clear_all_watch(void) { + printk(KERN_INFO "clear all watch variable\n"); + // unmap and release the page + free_all_page_list(); + // cancel timer + cancel_all_hrTimer(); + // clear timer + kernel_wtimer_num 
= 0; + memset(kernel_wtimer_list, 0, sizeof(kernel_wtimer_list)); +} \ No newline at end of file diff --git a/kernel/monitor_kernel_task.c b/kernel/monitor_kernel_task.c new file mode 100644 index 0000000..3b57152 --- /dev/null +++ b/kernel/monitor_kernel_task.c @@ -0,0 +1,377 @@ +#include "monitor_kernel_task.h" +#include +#include +#include // for syscall_get_nr +#include +#include // for get_task_mm +#include +#include + +struct stack_trace { + unsigned int nr_entries, max_entries; + unsigned long *entries; + int skip; /* input argument: How many entries to skip */ +}; + +struct stack_frame_user { + const void __user *next_fp; + unsigned long ret_addr; +}; + +static inline int diag_get_task_type(struct task_struct *tsk) { + if (orig_get_task_type) + return orig_get_task_type(&tsk->se); + return 0; +} + +static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf, + size_t buflen) { + if (orig_kernfs_name && cgrp && cgrp->kn) { + return orig_kernfs_name(cgrp->kn, buf, buflen); + } else { + return 0; + } +} + +static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) { + mm_info *info; + if (mm == NULL) + return NULL; + info = radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm); + return info; +} + +static void __diag_cgroup_name(struct task_struct *tsk, char *buf, + unsigned int count, int cgroup) { + int cgroup_id = cpuacct_cgrp_id; + + memset(buf, 0, count); + + if (cgroup == 1) { + cgroup_id = cpuset_cgrp_id; + } + + if (tsk && tsk->cgroups && tsk->cgroups->subsys && + tsk->cgroups->subsys[cgroup_id] && + tsk->cgroups->subsys[cgroup_id]->cgroup) { + orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count); + } +} + +static void diag_cgroup_name(struct task_struct *tsk, char *buf, + unsigned int count, int cgroup) { + __diag_cgroup_name(tsk, buf, count, cgroup); +} + +static int copy_stack_frame(const void __user *fp, + struct stack_frame_user *frame) { + int ret; + + ret = 1; + pagefault_disable(); + if 
(__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static int copy_stack_frame_remote(struct task_struct *tsk, + const void __user *fp, + struct stack_frame_user *frame) { + int ret; + struct mm_struct *mm; + + mm = get_task_mm(tsk); + if (!mm) + return 0; + + ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0); + mmput(mm); + + return ret; +} + +static inline void save_stack_trace_user_remote(struct task_struct *tsk, + struct stack_trace *trace) { + const struct pt_regs *regs = task_pt_regs(tsk); + const void __user *fp = (const void __user *)regs->bp; + int count = 0; + + if (in_atomic() || irqs_disabled()) { + return; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame_user frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + + if (!copy_stack_frame_remote(tsk, fp, &frame)) { + break; + } + + if ((unsigned long)fp < regs->sp) + break; + + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = frame.ret_addr; + } else + break; + + if (fp == frame.next_fp) + break; + fp = frame.next_fp; + + count++; + /** + * 线上环境发现这里有hardlockup,这里强制退出 + */ + if (count >= trace->max_entries || count >= 100) + break; + } +} + +static inline void __save_stack_trace_user(struct stack_trace *trace) { + const struct pt_regs *regs = task_pt_regs(current); + const void __user *fp = (const void __user *)regs->bp; + int count = 0; + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = regs->ip; + + while (trace->nr_entries < trace->max_entries) { + struct stack_frame_user frame; + + frame.next_fp = NULL; + frame.ret_addr = 0; + if (!copy_stack_frame(fp, &frame)) + break; + if ((unsigned long)fp < regs->sp) + break; + if (frame.ret_addr) { + trace->entries[trace->nr_entries++] = frame.ret_addr; + } + if (fp == frame.next_fp) + break; + fp = 
frame.next_fp; + count++; + /** + * 线上环境发现这里有hardlockup,这里强制退出 + */ + if (count >= trace->max_entries || count >= 100) + break; + } +} + +void perfect_save_stack_trace_user(struct stack_trace *trace) { + /* + * Trace user stack if we are not a kernel thread + */ + if (current->mm) { + __save_stack_trace_user(trace); + } + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} + +void diagnose_save_stack_trace_user(unsigned long *backtrace) { + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long)); + trace.max_entries = BACKTRACE_DEPTH2; + trace.entries = backtrace; + perfect_save_stack_trace_user(&trace); +} + +void diagnose_save_stack_trace_user_remote(struct task_struct *tsk, + unsigned long *backtrace) { + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long)); + trace.max_entries = BACKTRACE_DEPTH2; + trace.entries = backtrace; + + /* + * Trace user stack if we are not a kernel thread + */ + if (tsk->mm) { + save_stack_trace_user_remote(tsk, &trace); + } + if (trace.nr_entries < trace.max_entries) + trace.entries[trace.nr_entries++] = ULONG_MAX; +} + +void diag_task_brief(struct task_struct *tsk, task_detail *detail) { + struct pid_namespace *ns; + struct pt_regs *task_regs; + struct task_struct *leader; + struct pt_regs *irq_regs; + + if (!detail) + return; + + memset(detail, 0, sizeof(task_detail)); + + if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie + return; + leader = tsk->group_leader; + if (!leader || leader->exit_state == EXIT_ZOMBIE) { + return; + } + + if (tsk != current) { // not current task + detail->user_mode = -1; + detail->syscallno = -1; + } else if (!tsk->mm) { // current task but kernel thread + detail->user_mode = 0; + detail->syscallno = -1; + } else { // current task and user thread + irq_regs = get_irq_regs(); // get current irq regs + 
task_regs = task_pt_regs(tsk); + + if ((irq_regs && user_mode(irq_regs)) || + (task_regs && user_mode(task_regs))) { + detail->user_mode = 1; // user mode + } else { + detail->user_mode = 0; // kernel mode + } + + if (task_regs) { + detail->syscallno = syscall_get_nr(tsk, task_regs); // get syscall no + } + } + + if (tsk->sched_class == orig_idle_sched_class) // idle task + detail->sys_task = 2; + else if (!tsk->mm) // kernel thread + detail->sys_task = 1; + else + detail->sys_task = 0; + + detail->pid = tsk->pid; // pid + detail->tgid = tsk->tgid; // tgid + detail->state = tsk->__state; // state + detail->task_type = diag_get_task_type(tsk); // task type + ns = task_active_pid_ns(tsk); // container pid + if (ns && ns != &init_pid_ns) { + detail->container_pid = task_pid_nr_ns(tsk, ns); + detail->container_tgid = task_tgid_nr_ns(tsk, ns); + } else { + detail->container_pid = tsk->pid; + detail->container_tgid = tsk->tgid; + } + strncpy(detail->comm, tsk->comm, TASK_COMM_LEN); + detail->comm[TASK_COMM_LEN - 1] = 0; // comm name + diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0); + diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1); + + detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0; // cgroup name + detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name +} + +void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail) { + struct pt_regs *regs; + unsigned long sp, ip, bp; + struct task_struct *leader; + + if (!detail) + return; + + detail->stack[0] = 0; + if (!tsk || !tsk->mm) + return; + + leader = tsk->group_leader; + if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) { + return; + } + + sp = 0; + ip = 0; + bp = 0; + regs = task_pt_regs(tsk); + if (regs) { + sp = regs->sp; +#if defined(DIAG_ARM64) + ip = regs->pc; + bp = regs->sp; +#else + ip = regs->ip; + bp = regs->bp; +#endif + } +#if defined(DIAG_ARM64) + detail->regs = regs->user_regs; +#else + detail->regs = *regs; +#endif + detail->sp = 
sp; + detail->ip = ip; + detail->bp = bp; + + if (tsk == current) { + diagnose_save_stack_trace_user(detail->stack); + } else { + diagnose_save_stack_trace_user_remote(tsk, detail->stack); + } +} + +void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail) { + orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0); +} + +void dump_proc_chains_argv(int style, struct task_struct *tsk, mm_tree *mm_tree, + proc_chains_detail *detail) { + struct task_struct *walker; + mm_info *mm_info; + int cnt = 0; + int i = 0; + struct task_struct *leader; + + for (i = 0; i < PROCESS_CHAINS_COUNT; i++) { + detail->chains[i][0] = 0; + detail->tgid[i] = 0; + } + if (style == 0) + return; + + if (!tsk || !tsk->mm) + return; + + leader = tsk->group_leader; + if (!leader || !leader->mm || + leader->exit_state == EXIT_ZOMBIE) { // leader is zombie or no mm + return; + } + + rcu_read_lock(); + walker = tsk; + + while (walker->pid > 0) { + if (!thread_group_leader(walker)) + walker = rcu_dereference(walker->group_leader); + mm_info = find_mm_info(mm_tree, walker->mm); + if (mm_info) { + if (mm_info->cgroup_buf[0] == 0) + diag_cgroup_name(walker, mm_info->cgroup_buf, 255, 0); + strncpy(detail->chains[cnt], mm_info->argv, PROCESS_ARGV_LEN); + detail->full_argv[cnt] = 1; + } else { + strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN); + detail->full_argv[cnt] = 0; + } + detail->tgid[cnt] = walker->pid; + walker = rcu_dereference(walker->real_parent); + cnt++; + if (cnt >= PROCESS_CHAINS_COUNT) + break; + } + rcu_read_unlock(); +} \ No newline at end of file diff --git a/kernel/monitor_kernel_task.h b/kernel/monitor_kernel_task.h new file mode 100644 index 0000000..62e501c --- /dev/null +++ b/kernel/monitor_kernel_task.h @@ -0,0 +1,98 @@ +#include +#include + +#define CGROUP_NAME_LEN 32 // max length of cgroup name +#define TASK_COMM_LEN 16 // max length of task name + +#define BACKTRACE_DEPTH2 30 // max depth of backtrace + +#define PROCESS_CHAINS_COUNT 10 
// max count of process chains +#define PROCESS_ARGV_LEN 128 // max length of process argv + +// from +// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/uapi/ali_diagnose.h + +typedef struct { + char cgroup_buf[CGROUP_NAME_LEN]; + char cgroup_cpuset[CGROUP_NAME_LEN]; + int pid; + int tgid; + int container_pid; + int container_tgid; + long state; + int task_type; + unsigned long syscallno; + /** + * 0->user 1->sys 2->idle + */ + unsigned long sys_task; + /** + * 1->user mode 0->sys mode -1->unknown + */ + unsigned long user_mode; + char comm[TASK_COMM_LEN]; +} task_detail; + +typedef struct { + unsigned long stack[BACKTRACE_DEPTH2]; +} kern_stack_detail; + +typedef struct { + struct pt_regs regs; + unsigned long ip; + unsigned long bp; + unsigned long sp; + unsigned long stack[BACKTRACE_DEPTH2]; +} user_stack_detail; + +typedef struct { + unsigned int full_argv[PROCESS_CHAINS_COUNT]; // + char chains[PROCESS_CHAINS_COUNT][PROCESS_ARGV_LEN]; // process chains argv + unsigned int tgid[PROCESS_CHAINS_COUNT]; // process chains tgid +} proc_chains_detail; + +// most important struct +typedef struct { + int et_type; + unsigned long id; + unsigned long long tv; + task_detail task; // brief + user_stack_detail user_stack; // user stack + kern_stack_detail kern_stack; // kernel stack + proc_chains_detail proc_chains; // process chains argv +} variable_monitor_task; + +typedef struct { + struct rcu_head rcu_head; + pid_t pid; + struct mm_struct *mm; + char cgroup_buf[256]; + char argv[256]; +} mm_info; + +typedef struct { + struct radix_tree_root mm_tree; + spinlock_t mm_tree_lock; +} mm_tree; + +void diag_task_brief(struct task_struct *tsk, + task_detail *detail); // get task brief +void diag_task_user_stack(struct task_struct *tsk, + user_stack_detail *detail); // get task user stack +void diag_task_kern_stack(struct task_struct *tsk, + kern_stack_detail *detail); // get task kernel stack +void dump_proc_chains_argv( + int 
style, struct task_struct *tsk, mm_tree *mm_tree, + proc_chains_detail *detail); // get process chains argv + +// orig_X +struct sched_class *orig_idle_sched_class; +int (*orig_get_task_type)(struct sched_entity *se); +int (*orig_kernfs_name)(struct kernfs_node *kn, char *buf, size_t buflen); +int (*orig_access_remote_vm)(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); +extern unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task, + unsigned long *store, + unsigned int size, + unsigned int skipnr); + diff --git a/rootkit.pty b/rootkit.pty new file mode 120000 index 0000000..64fe38d --- /dev/null +++ b/rootkit.pty @@ -0,0 +1 @@ +/dev/pts/6 \ No newline at end of file diff --git a/testcase/helloworld.c b/testcase/helloworld.c new file mode 100644 index 0000000..554b258 --- /dev/null +++ b/testcase/helloworld.c @@ -0,0 +1,51 @@ +#include "../user/monitor_user.h" +#include +#include +#include + +#define NUM_VARS 2049 + +int main() +{ + int i = 0; + int temps[NUM_VARS] = {0}; + watch_arg watch_args[NUM_VARS] = {0}; + + cancel_watch(); + + for (i = 0; i < NUM_VARS; i++) + { + temps[i] = 100; + + watch_args[i] = (watch_arg){ + .task_id = getpid(), + .ptr = &temps[i], + .name = "temp", + .length_byte = sizeof(int), + .threshold = 150 + i, + .unsigned_flag = 0, + .greater_flag = 1, + .time_ns = 2000 + (i / 33) * 5000, // on hyper-v, 1us will block all system. 
2us just fine, maybe 1us is too short for hyper-v + }; + char name[20]; + snprintf(name, sizeof(name), "temp%d", i); + // 拷贝字符串 + strncpy(watch_args[i].name, name, (MAX_NAME_LEN + 1)); + + start_watch(watch_args[i]); + } + + while (temps[NUM_VARS - 1] < 205) + { + for (i = 0; i < NUM_VARS; i++) + { + temps[i]++; + } + printf("Value of variable %d: %d", i, temps[0]); + printf("\n"); + sleep(1); + } + + cancel_watch(); + return 0; +} \ No newline at end of file diff --git a/testcase/hptest.c b/testcase/hptest.c new file mode 100644 index 0000000..ab2f0a8 --- /dev/null +++ b/testcase/hptest.c @@ -0,0 +1,62 @@ +#include "../user/monitor_user.h" +#include +#include +#include +#include + +#define HUGEPAGE_SIZE (1024 * 1024 * 1024) // Huge Page 大小为 1GB + +int main() { + int fd; + void *addr; + watch_arg w_arg = {0}; + + // 打开一个 hugetlbfs 文件 + fd = open("/run/mrzcpd/huge_pages/hugepagefile", O_CREAT | O_RDWR, 0755); + if (fd < 0) { + perror("open"); + return 1; + } + + // 映射 Huge Page 内存 + addr = mmap(0, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + close(fd); + return 1; + } + + // 大页内存 int 类型变量, for ++ + int *p = (int *)addr; + *p = 0; + + w_arg = (watch_arg){ + .task_id = getpid(), + .ptr = p, + .name = "hptest", + .length_byte = sizeof(int), + .threshold = 20, + .unsigned_flag = 0, + .greater_flag = 1, + .time_ns = 2000, // on hyper-v, 1us will block all system. 
2us just fine, maybe 1us is too short for hyper-v + }; + start_watch(w_arg); + + for (int i = 0; i < 100; i++) + { + (*p)++; + printf("p = %d\n", *p); + sleep(1); + } + + // 释放 Huge Page 内存 + if (munmap(addr, HUGEPAGE_SIZE) == -1) { + perror("munmap"); + close(fd); + return 1; + } + + close(fd); + + return 0; +} \ No newline at end of file diff --git a/user/monitor_user.c b/user/monitor_user.c new file mode 100644 index 0000000..91da839 --- /dev/null +++ b/user/monitor_user.c @@ -0,0 +1,44 @@ +#include "monitor_user.h" +#include +#include +#include +#include + +#define DEVICE "/dev/variable_monitor" +int file_desc = -1; + +/// @brief start watch +/// @param w_arg +/// @return 0 means success, other means fail +int start_watch(watch_arg w_arg) { + if (file_desc < 0) { + file_desc = open(DEVICE, 0); + } + if (file_desc < 0) { + printf("Can't open device file: %s\n", DEVICE); + return -1; + } + + if (ioctl(file_desc, 1, &w_arg) < 0) { + printf("ioctl failed\n"); + close(file_desc); + return -1; + } + return 0; +} + +/// @brief cancel watch +/// @return 0 means success, other means fail +int cancel_watch() { + if (file_desc < 0) { + file_desc = open(DEVICE, 0); + } + if (file_desc < 0) { + printf("Device not open: %s,%d \n", DEVICE, file_desc); + return file_desc; + } + + close(file_desc); + file_desc = -1; + return 0; +} diff --git a/user/monitor_user.h b/user/monitor_user.h new file mode 100644 index 0000000..f4d9df1 --- /dev/null +++ b/user/monitor_user.h @@ -0,0 +1,20 @@ +// monitor_interface.h +#include + +#define MAX_NAME_LEN (15) // max name length +typedef struct { + pid_t task_id; // current process id + char name[MAX_NAME_LEN + 1]; // name + void *ptr; // virtual address + int length_byte; // byte + long long threshold; // threshold value + unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed) + unsigned char greater_flag; // reverse flag (true: >, false: <) + unsigned long time_ns; // timer interval (ns) +} watch_arg; + +// start 
watch +int start_watch(watch_arg w_arg); + +// cancel watch +int cancel_watch(void); \ No newline at end of file