Initial commit
This commit is contained in:
63
.gitignore
vendored
Normal file
63
.gitignore
vendored
Normal file
@@ -0,0 +1,63 @@
|
||||
# Prerequisites
|
||||
*.d
|
||||
|
||||
# Object files
|
||||
*.o
|
||||
*.ko
|
||||
*.obj
|
||||
*.elf
|
||||
|
||||
# Linker output
|
||||
*.ilk
|
||||
*.map
|
||||
*.exp
|
||||
|
||||
# Precompiled Headers
|
||||
*.gch
|
||||
*.pch
|
||||
|
||||
# Libraries
|
||||
*.lib
|
||||
*.a
|
||||
*.la
|
||||
*.lo
|
||||
|
||||
# Shared objects (inc. Windows DLLs)
|
||||
*.dll
|
||||
*.so
|
||||
*.so.*
|
||||
*.dylib
|
||||
|
||||
# Executables
|
||||
*.exe
|
||||
*.out
|
||||
*.app
|
||||
*.i*86
|
||||
*.x86_64
|
||||
*.hex
|
||||
|
||||
# Debug files
|
||||
*.dSYM/
|
||||
*.su
|
||||
*.idb
|
||||
*.pdb
|
||||
|
||||
# Kernel Module Compile Results
|
||||
*.mod*
|
||||
*.cmd
|
||||
.tmp_versions/
|
||||
modules.order
|
||||
Module.symvers
|
||||
Mkfile.old
|
||||
dkms.conf
|
||||
watch
|
||||
linux-5.17.15/**
|
||||
linux-5.17.15.tar.xz
|
||||
helloworld
|
||||
hptest
|
||||
linux-5.17.15/.clang-format
|
||||
linux-5.17.15/.gitignore
|
||||
linux-5.17.15/.mailmap
|
||||
linux-5.17.15/.cocciconfig
|
||||
linux-5.17.15/.get_maintainer.ignore
|
||||
linux-5.17.15/.cocciconfig
|
||||
28
.vscode/c_cpp_properties.json
vendored
Normal file
28
.vscode/c_cpp_properties.json
vendored
Normal file
@@ -0,0 +1,28 @@
|
||||
{
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Linux",
|
||||
"includePath": [
|
||||
"${workspaceFolder}/**",
|
||||
"../linux-5.17.15/include/**",
|
||||
"../linux-5.17.15/arch/x86/include/**",
|
||||
"../linux-5.17.15/arch/x86/include/generated/**"
|
||||
],
|
||||
"forcedInclude": [
|
||||
"../linux-5.17.15/include/generated/autoconf.h"
|
||||
],
|
||||
"defines": [
|
||||
"__GNUC__",
|
||||
"__KERNEL__",
|
||||
"__linux__",
|
||||
"__x86_64__",
|
||||
"_GNU_SOURCE"
|
||||
],
|
||||
"compilerPath": "/usr/bin/gcc",
|
||||
"cStandard": "c89",
|
||||
"compilerArgs": [],
|
||||
"intelliSenseMode": "linux-gcc-x64",
|
||||
}
|
||||
],
|
||||
"version": 4
|
||||
}
|
||||
46
.vscode/launch.json
vendored
Normal file
46
.vscode/launch.json
vendored
Normal file
@@ -0,0 +1,46 @@
|
||||
{
|
||||
// 使用 IntelliSense 了解相关属性。
|
||||
// 悬停以查看现有属性的描述。
|
||||
// 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "(gdb) linux kernel",
|
||||
"type": "cppdbg",
|
||||
"request": "launch",
|
||||
//"preLaunchTask": "centos7",
|
||||
// socat pty,link=./rootkit.pty,raw,echo=0 EXEC:"/mnt/c/ProgramData/chocolatey/lib/npiperelay/tools/npiperelay.exe -ep -s //./pipe/rootkit",nofork
|
||||
"program": "${workspaceFolder}/linux-5.17.15/vmlinux",
|
||||
//"miDebuggerServerAddress": "localhost:1234",
|
||||
//"debugServerPath": "${workspaceFolder}/rootkit.pty",
|
||||
"miDebuggerPath": "/usr/bin/gdb",
|
||||
"miDebuggerArgs": "-ex 'set serial baud 115200 target remote ./rootkit.pty'",
|
||||
"args": [],
|
||||
"stopAtEntry": true,
|
||||
"cwd": "${workspaceFolder}",
|
||||
"environment": [],
|
||||
"externalConsole": false,
|
||||
"MIMode": "gdb",
|
||||
//"miDebuggerArgs": "-n",
|
||||
"targetArchitecture": "x64",
|
||||
"setupCommands": [ // 或许在这里添加的 set serial baud 115200 | target remote ./rootkit.pty
|
||||
{
|
||||
"text": "set arch i386:x86-64:intel",
|
||||
"ignoreFailures": false
|
||||
},
|
||||
{
|
||||
"text": "dir .",
|
||||
"ignoreFailures": false
|
||||
},
|
||||
{
|
||||
"text": "add-auto-load-safe-path ./",
|
||||
"ignoreFailures": false
|
||||
},
|
||||
{
|
||||
"text": "-enable-pretty-printing",
|
||||
"ignoreFailures": true
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
27
.vscode/settings.json
vendored
Normal file
27
.vscode/settings.json
vendored
Normal file
@@ -0,0 +1,27 @@
|
||||
{
|
||||
"files.associations": {
|
||||
"hrtimer.h": "c",
|
||||
"ktime.h": "c",
|
||||
"typeinfo": "c",
|
||||
"signal.h": "c",
|
||||
"module.h": "c",
|
||||
"watch_module.h": "c",
|
||||
"kernel.h": "c",
|
||||
"device.h": "c",
|
||||
"mm.h": "c",
|
||||
"fs.h": "c",
|
||||
"sched.h": "c",
|
||||
"monitor_user.h": "c"
|
||||
},
|
||||
"clangd.arguments": [
|
||||
"--compile-commands-dir=${workspaceFolder}/linux-5.17.15",
|
||||
"--background-index",
|
||||
"--completion-style=detailed",
|
||||
"--header-insertion=never",
|
||||
"-log=info"
|
||||
],
|
||||
"C_Cpp.autocomplete": "disabled",
|
||||
"C_Cpp.codeFolding": "disabled",
|
||||
"C_Cpp.configurationWarnings": "disabled",
|
||||
"C_Cpp.intelliSenseEngine": "disabled"
|
||||
}
|
||||
33
.vscode/tasks.json
vendored
Normal file
33
.vscode/tasks.json
vendored
Normal file
@@ -0,0 +1,33 @@
|
||||
{
|
||||
"tasks": [
|
||||
{
|
||||
"label": "centos7",
|
||||
"type": "shell",
|
||||
"command": "./run.sh",
|
||||
"presentation": {
|
||||
"echo": true,
|
||||
"clear": true,
|
||||
"group": "vm"
|
||||
},
|
||||
"isBackground": true,
|
||||
"problemMatcher": [
|
||||
{
|
||||
"pattern": [
|
||||
{
|
||||
"regexp": ".",
|
||||
"file": 1,
|
||||
"location": 2,
|
||||
"message": 3
|
||||
}
|
||||
],
|
||||
"background": {
|
||||
"activeOnStart": true,
|
||||
"beginsPattern": ".",
|
||||
"endsPattern": ".",
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"version": "2.0.0"
|
||||
}
|
||||
36
Makefile
Normal file
36
Makefile
Normal file
@@ -0,0 +1,36 @@
|
||||
# Top-level build: two user-space test programs plus the kernel module.
CC = gcc
CFLAGS = -Wall

PROG = helloworld
HPTEST = hptest

UDIR = $(PWD)/user
MDIR := $(PWD)/kernel
# Kernel source tree. The comment is kept on its own line so that the value of
# KDIR does not pick up trailing whitespace.
KDIR := $(PWD)/linux-5.17.15
TDIR := $(PWD)/testcase

BUILD_DIR := $(PWD)/build
OUTPUT_DIR = $(PWD)/build

# KMOD = variable_monitor
# obj-m := kernel/$(KMOD).o
# $(KMOD)-objs := kernel/monitor_kernel.o

# The program targets write into $(OUTPUT_DIR), not into a file named after
# the target, so they are declared phony together with the other targets.
.PHONY: all module clean $(PROG) $(HPTEST)

all: $(PROG) $(HPTEST) module

# Create the output directory on demand (order-only prerequisite below).
$(OUTPUT_DIR):
	mkdir -p $@

$(PROG): $(TDIR)/helloworld.c $(UDIR)/monitor_user.c | $(OUTPUT_DIR)
	$(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(PROG) $(TDIR)/helloworld.c $(UDIR)/monitor_user.c

$(HPTEST): $(TDIR)/hptest.c $(UDIR)/monitor_user.c | $(OUTPUT_DIR)
	$(CC) $(CFLAGS) -o $(OUTPUT_DIR)/$(HPTEST) $(TDIR)/hptest.c $(UDIR)/monitor_user.c

# Use $(MAKE) so that flags such as -j and -n propagate to the kernel build.
module:
	$(MAKE) -C $(KDIR) M=$(MDIR) modules

# module:
# 	make -C linux-5.17.15 M=$(PWD)/kernel modules

clean:
	rm -f $(OUTPUT_DIR)/*
	$(MAKE) -C $(KDIR) M=$(MDIR) clean
|
||||
150
README.md
Normal file
150
README.md
Normal file
@@ -0,0 +1,150 @@
|
||||
## Variable Monitor
|
||||
|
||||
Monitor numerical variables (given address, length), and print system stack information when the set conditions are exceeded.
|
||||
|
||||
Number of simultaneous monitoring
|
||||
- Monitoring with the same timing length will be grouped into one group, corresponding to one timer.
|
||||
- A set of up to 32 variables, after which a new timer is allocated.
|
||||
- The global maximum number of timers is 128.
|
||||
- The above limits are defined as macros near the top of the `monitor_kernel.h` header.
|
||||
|
||||
Currently, monitoring is limited to the same application, and simultaneous calls from multiple applications are not currently supported.
|
||||
- Multiple applications can work normally if only one program calls `cancel_all_watch();`.
|
||||
|
||||
## Usage
|
||||
|
||||
Example: helloworld.c
|
||||
- Add `#include "watch.h"`
|
||||
- Set each variable that needs to be monitored: name && address && length, set threshold, comparison method, timer interval (ns), etc.
|
||||
- `start_watch(watch_arg);` Start monitoring
|
||||
- Call `cancel_all_watch();` when you need to cancel monitoring
|
||||
|
||||
When the set conditions are exceeded, the system stack information is printed and viewed with `dmesg`, as shown in the following example:
|
||||
- Within a timer, if multiple variables exceed the threshold, the stack information will not be output repeatedly;
|
||||
- The timer restart time after printing the stack is 1s, and the next round of monitoring will start after 1s.
|
||||
|
||||
```log
|
||||
[86245.364861] -------------------------------------
|
||||
[86245.364864] -------------watch monitor-----------
|
||||
[86245.364865] Threshold reached:
|
||||
name: temp0, threshold: 150
|
||||
[86245.364866] Timestamp (ns): 1699589000606300743
|
||||
[86245.364867] Recent Load: 116.65, 126.83, 151.17
|
||||
[86245.365669] task: name lcore-worker-4, pid 803327
|
||||
[86245.365672] task: name lcore-worker-5, pid 803328
|
||||
[86245.365673] task: name lcore-worker-6, pid 803329
|
||||
[86245.365674] task: name lcore-worker-7, pid 803330
|
||||
[86245.365676] task: name lcore-worker-8, pid 803331
|
||||
[86245.365677] task: name lcore-worker-9, pid 803332
|
||||
[86245.365679] task: name lcore-worker-10, pid 803333
|
||||
[86245.365681] task: name lcore-worker-11, pid 803334
|
||||
[86245.365682] task: name lcore-worker-68, pid 803335
|
||||
[86245.365683] task: name lcore-worker-69, pid 803336
|
||||
[86245.365684] task: name lcore-worker-70, pid 803337
|
||||
[86245.365685] task: name lcore-worker-71, pid 803338
|
||||
[86245.365686] task: name lcore-worker-72, pid 803339
|
||||
[86245.365687] task: name lcore-worker-73, pid 803340
|
||||
[86245.365688] task: name lcore-worker-74, pid 803341
|
||||
[86245.365689] task: name lcore-worker-75, pid 803342
|
||||
[86245.365694] task: name pkt:worker-0, pid 803638
|
||||
[86245.365702] hrtimer_nanosleep+0x8d/0x120
|
||||
[86245.365709] __x64_sys_nanosleep+0x96/0xd0
|
||||
[86245.365711] do_syscall_64+0x37/0x80
|
||||
[86245.365716] entry_SYSCALL_64_after_hwframe+0x44/0xae
|
||||
[86245.365718] task: name pkt:worker-1, pid 803639
|
||||
[86245.365721] hrtimer_nanosleep+0x8d/0x120
|
||||
[86245.365724] __x64_sys_nanosleep+0x96/0xd0
|
||||
[86245.365726] do_syscall_64+0x37/0x80
|
||||
[86245.365728] entry_SYSCALL_64_after_hwframe+0x44/0xae
|
||||
[86245.365730] task: name pkt:worker-2, pid 803640
|
||||
[86245.365732] hrtimer_nanosleep+0x8d/0x120
|
||||
[86245.365734] __x64_sys_nanosleep+0x96/0xd0
|
||||
[86245.365737] do_syscall_64+0x37/0x80
|
||||
[86245.365739] entry_SYSCALL_64_after_hwframe+0x44/0xae
|
||||
[86245.365740] task: name pkt:worker-3, pid 803641
|
||||
[86245.365743] hrtimer_nanosleep+0x8d/0x120
|
||||
```
|
||||
|
||||
### Parameter Description
|
||||
|
||||
start_watch passes in the watch_arg structure. The meaning of each field is as follows
|
||||
- name limit `MAX_NAME_LEN`(15) valid characters
|
||||
|
||||
```c
|
||||
typedef struct
|
||||
{
|
||||
pid_t task_id; // current process id
|
||||
char name[MAX_NAME_LEN + 1]; // name (15+1)
|
||||
void *ptr; // virtual address
|
||||
int length_byte; // byte
|
||||
long long threshold; // threshold value
|
||||
unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
|
||||
unsigned char greater_flag; // reverse flag (true: >, false: <)
|
||||
unsigned long time_ns; // timer interval (ns)
|
||||
} watch_arg;
|
||||
```
|
||||
|
||||
An initialization example
|
||||
|
||||
```c
|
||||
watch_args = (watch_arg){
|
||||
.task_id = getpid(),
|
||||
.ptr = &temp,
|
||||
.name = "temp",
|
||||
.length_byte = sizeof(int),
|
||||
.threshold = 150 + i,
|
||||
.unsigned_flag = 0,
|
||||
.greater_flag = 1,
|
||||
.time_ns = 2000 + (i / 33) * 5000
|
||||
};
|
||||
```
|
||||
|
||||
## demo
|
||||
|
||||
In the main project directory:
|
||||
|
||||
```bash
|
||||
make && insmod variable_monitor.ko
|
||||
./watch
|
||||
```
|
||||
|
||||
You can see the printed stack information in dmesg
|
||||
|
||||
```bash
|
||||
# Unload module and clean compile files
|
||||
rmmod variable_monitor.ko && make clean
|
||||
```
|
||||
|
||||
Only tested on kernel 5.17.15-1.el8.x86_64.
|
||||
|
||||
## Other
|
||||
|
||||
The program is divided into two parts: character device and user space interface, both of which communicate through ioctl.
|
||||
|
||||
User space address access
|
||||
- For the virtual address of a variable passed in by the user program, `get_user_pages_remote` is used to obtain the memory page containing that address, and `kmap` maps that page into the kernel.
|
||||
- In the 192.168.40.204 environment, the HugeTLB Pages test mounts normally.
|
||||
- The memory page address + offset is stored in the `kernel_watch_arg` corresponding to the timer, and hrTimer accesses `kernel_watch_arg` when polling to get the real value.
|
||||
|
||||
timer grouping
|
||||
- The hrTimer data structures are stored in the global array `kernel_wtimer_list`. When a timer is allocated, `kernel_wtimer_list` is traversed and the timer intervals are compared.
|
||||
- Watches with the same timing interval are assigned to the same group and correspond to the same hrTimer.
|
||||
- If the number of variables monitored by a timer exceeds `TIMER_MAX_WATCH_NUM` (32), a new hrTimer will be created.
|
||||
- The total number of hrTimers (`kernel_wtimer_list` array length) limit is `MAX_TIMER_NUM`(128).
|
||||
|
||||
Memory page mount/unmount
|
||||
- `get_user_pages_remote`/ `kmap` will increase the corresponding count and requires the equivalent `put_page`/`kunmap`.
|
||||
- A global linked list in the module `watch_local_memory_list` stores the page and kt corresponding to each successfully mounted variable. When performing the close operation of the character device, it is traversed and unloaded.
|
||||
|
||||
Stack output conditions: The conditions are referenced from [diagnose-tools::load.c](https://github.com/alibaba/diagnose-tools/blob/e285bc4626a7d207eabd4a69cb276e1a3b1b7c76/SOURCE/module/kernel/load.c#L209)
|
||||
- A task's stack is printed when the task is in `TASK_RUNNING`, or satisfies `__task_contributes_to_load`, or is in `TASK_IDLE`.
|
||||
- `__task_contributes_to_load` corresponds to the kernel macro `task_contributes_to_load`.
|
||||
|
||||
```c
|
||||
// https://www.spinics.net/lists/kernel/msg3582022.html
|
||||
// remove from 5.8.rc3,but it still work
|
||||
// whether the task contributes to the load
|
||||
#define __task_contributes_to_load(task) \
|
||||
((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && (task->flags & PF_FROZEN) == 0 && \
|
||||
(READ_ONCE(task->__state) & TASK_NOLOAD) == 0)
|
||||
```
|
||||
174
README_zh.md
Normal file
174
README_zh.md
Normal file
@@ -0,0 +1,174 @@
|
||||
## Variable Monitor
|
||||
|
||||
changelog
|
||||
|
||||
```log
|
||||
11.9 多个变量监控支持
|
||||
11.10 按照 pid 区分不同内核结构, 支持每个进程单独申请取消自己的监控.
|
||||
11.13 用户接口 cancel_all_watch -> cancel_watch, 每个进程互不干扰.
|
||||
```
|
||||
|
||||
## 说明
|
||||
|
||||
监控 数值变量(给定 地址,长度), 超过设定条件打印系统堆栈信息.
|
||||
|
||||
同时监控数量
|
||||
- 相同定时长度的监控 会被分为一组,对应一个定时器.
|
||||
- 一组最多 32 个变量,超过后会分配一个新的定时器.
|
||||
- 定时器数量全局最多 128 个.
|
||||
- 以上数量限制定义在 `watch_module.h` 头部宏.
|
||||
|
||||
## 使用
|
||||
|
||||
示例如 helloworld.c
|
||||
- 添加 `#include "watch.h"`
|
||||
- 对每个需要监控的变量 设置: 名称 && 地址 && 长度, 设置阈值, 比较方式, 定时器间隔(ns) 等.
|
||||
- `start_watch(watch_arg);` 启动监控
|
||||
- 需要取消监控时调用 `cancel_watch();`
|
||||
|
||||
超出设定条件时,打印系统堆栈信息, `dmesg` 查看,如下示例:
|
||||
- 一个定时器内,多个变量超过阈值,堆栈信息不会重复输出;
|
||||
- 打印堆栈后定时器再启动时间为 1s, 1s 后开始下一个轮次监控.
|
||||
|
||||
```log
|
||||
[ 713.225894] -------------------------------------
|
||||
[ 713.225900] -------------watch monitor-----------
|
||||
[ 713.225900] Threshold reached:
|
||||
[ 713.225901] name: temp0, threshold: 150, pid: 4261
|
||||
[ 713.225902] name: temp1, threshold: 151, pid: 4261
|
||||
[ 713.225903] name: temp2, threshold: 152, pid: 4261
|
||||
[ 713.225904] name: temp3, threshold: 153, pid: 4261
|
||||
[ 713.225904] name: temp4, threshold: 154, pid: 4261
|
||||
[ 713.225905] name: temp5, threshold: 155, pid: 4261
|
||||
[ 713.225905] name: temp6, threshold: 156, pid: 4261
|
||||
[ 713.225906] name: temp7, threshold: 157, pid: 4261
|
||||
[ 713.225906] name: temp8, threshold: 158, pid: 4261
|
||||
[ 713.225907] name: temp9, threshold: 159, pid: 4261
|
||||
[ 713.225907] name: temp10, threshold: 160, pid: 4261
|
||||
[ 713.225908] name: temp11, threshold: 161, pid: 4261
|
||||
[ 713.225908] name: temp12, threshold: 162, pid: 4261
|
||||
[ 713.225909] name: temp13, threshold: 163, pid: 4261
|
||||
[ 713.225909] name: temp14, threshold: 164, pid: 4261
|
||||
[ 713.225910] name: temp15, threshold: 165, pid: 4261
|
||||
[ 713.225910] name: temp16, threshold: 166, pid: 4261
|
||||
[ 713.225911] name: temp17, threshold: 167, pid: 4261
|
||||
[ 713.225911] name: temp18, threshold: 168, pid: 4261
|
||||
[ 713.225912] name: temp19, threshold: 169, pid: 4261
|
||||
[ 713.225912] name: temp20, threshold: 170, pid: 4261
|
||||
[ 713.225913] name: temp21, threshold: 171, pid: 4261
|
||||
[ 713.225913] name: temp22, threshold: 172, pid: 4261
|
||||
[ 713.225914] name: temp23, threshold: 173, pid: 4261
|
||||
[ 713.225914] name: temp24, threshold: 174, pid: 4261
|
||||
[ 713.225915] name: temp25, threshold: 175, pid: 4261
|
||||
[ 713.225915] name: temp26, threshold: 176, pid: 4261
|
||||
[ 713.225916] name: temp27, threshold: 177, pid: 4261
|
||||
[ 713.225916] name: temp28, threshold: 178, pid: 4261
|
||||
[ 713.225916] name: temp29, threshold: 179, pid: 4261
|
||||
[ 713.225917] name: temp30, threshold: 180, pid: 4261
|
||||
[ 713.225917] name: temp31, threshold: 181, pid: 4261
|
||||
[ 713.225918] Timestamp (ns): 1699846710299420862
|
||||
[ 713.225919] Recent Load: 0.05, 0.12, 0.08
|
||||
[ 713.225921] task: name rcu_gp, pid 3, state 1026
|
||||
[ 713.225926] rescuer_thread+0x290/0x390
|
||||
[ 713.225931] kthread+0xd7/0x100
|
||||
[ 713.225932] ret_from_fork+0x1f/0x30
|
||||
[ 713.225935] task: name rcu_par_gp, pid 4, state 1026
|
||||
[ 713.225936] rescuer_thread+0x290/0x390
|
||||
[ 713.225937] kthread+0xd7/0x100
|
||||
[ 713.225938] ret_from_fork+0x1f/0x30
|
||||
[ 713.225940] task: name netns, pid 5, state 1026
|
||||
[ 713.225941] rescuer_thread+0x290/0x390
|
||||
[ 713.225942] kthread+0xd7/0x100
|
||||
```
|
||||
|
||||
### 参数说明
|
||||
|
||||
start_watch 传入的是 watch_arg 结构体.各个字段意义如下
|
||||
- name 限制 `MAX_NAME_LEN`(15) 个有效字符
|
||||
|
||||
```c
|
||||
typedef struct
|
||||
{
|
||||
pid_t task_id; // current process id
|
||||
char name[MAX_NAME_LEN + 1]; // name (15+1)
|
||||
void *ptr; // virtual address
|
||||
int length_byte; // byte
|
||||
long long threshold; // threshold value
|
||||
unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
|
||||
unsigned char greater_flag; // reverse flag (true: >, false: <)
|
||||
unsigned long time_ns; // timer interval (ns)
|
||||
} watch_arg;
|
||||
```
|
||||
|
||||
一个初始化示例
|
||||
|
||||
```c
|
||||
watch_args = (watch_arg){
|
||||
.task_id = getpid(),
|
||||
.ptr = &temp,
|
||||
.name = "temp",
|
||||
.length_byte = sizeof(int),
|
||||
.threshold = 150 + i,
|
||||
.unsigned_flag = 0,
|
||||
.greater_flag = 1,
|
||||
.time_ns = 2000 + (i / 33) * 5000
|
||||
};
|
||||
```
|
||||
|
||||
## demo
|
||||
|
||||
项目主文件下
|
||||
- `helloworld.c`: 测试大量变量监控
|
||||
- `hptest.c`: 测试 hugePage 挂载
|
||||
|
||||
```bash
|
||||
# 编译加载模块
|
||||
make && insmod variable_monitor.ko
|
||||
./helloworld
|
||||
```
|
||||
|
||||
dmesg 可以看到打印的堆栈信息
|
||||
|
||||
```bash
|
||||
# 卸载模块,清理编译文件
|
||||
rmmod variable_monitor.ko && make clean
|
||||
```
|
||||
|
||||
仅在 `kernel 5.17.15-1.el8.x86_64` 测试,其他内核版本未测试.
|
||||
|
||||
## 其他
|
||||
|
||||
程序分为两部分: 字符设备 和 用户空间接口, 两者通过 ioctl 通信.
|
||||
|
||||
用户空间地址访问
|
||||
- 用户程序传入的变量 虚拟地址, 使用 `get_user_pages_remote` 获取地址所在内存页, `kmap` 将其映射到内核.
|
||||
- 192.168.40.204 环境下,HugeTLB Pages 测试挂载正常.
|
||||
- 内存页地址 + 偏移量存入定时器对应的 `kernel_watch_arg` 中, hrTimer 轮询时访问 `kernel_watch_arg` 得到真实值.
|
||||
|
||||
定时器分组
|
||||
- hrTimer 数据结构定义在全局数组 `kernel_wtimer_list`.分配定时器时,会检查遍历 `kernel_wtimer_list` 比较定时器间隔,
|
||||
- 相同定时间隔的 watch 分配到同一组,对应同一个 hrTimer.
|
||||
- 若一个定时器监控变量数量超过 `TIMER_MAX_WATCH_NUM` (32),则会创建一个新的 hrTimer.
|
||||
- hrTimer 的总数量(`kernel_wtimer_list` 数组长度)限制是 `MAX_TIMER_NUM`(128).
|
||||
|
||||
内存页 mount/unmount
|
||||
- `get_user_pages_remote`/ `kmap` 会增加对应的计数,需要对等的 `put_page`/`kunmap`.
|
||||
- 一个模块内全局链表 `watch_local_memory_list` 存储每一个成功挂载的变量对应的 page 和 kt,执行字符设备的 close 操作时,遍历并卸载.
|
||||
|
||||
variable monitor 添加/删除
|
||||
- kernel_watch_arg 数据结构中有 pid 的成员变量,但添加变量监控时,不按照进程区分.
|
||||
- 删除时遍历全部监控变量,比较 pid.
|
||||
- 删除造成的缺位,将最后的变量移动到空位, sentinel--; hrTimer 同理.
|
||||
|
||||
堆栈输出条件: 条件参考自 [diagnose-tools::load.c](https://github.com/alibaba/diagnose-tools/blob/e285bc4626a7d207eabd4a69cb276e1a3b1b7c76/SOURCE/module/kernel/load.c#L209)
|
||||
- `TASK` 要满足 TASK_RUNNING 和 `__task_contributes_to_load` 和 `TASK_IDLE`(可能有阻塞进程).
|
||||
- `__task_contributes_to_load` 对应内核宏 `task_contributes_to_load`.
|
||||
|
||||
```c
|
||||
// https://www.spinics.net/lists/kernel/msg3582022.html
|
||||
// remove from 5.8.rc3,but it still work
|
||||
// whether the task contributes to the load
|
||||
#define __task_contributes_to_load(task) \
|
||||
((READ_ONCE(task->__state) & TASK_UNINTERRUPTIBLE) != 0 && (task->flags & PF_FROZEN) == 0 && \
|
||||
(READ_ONCE(task->__state) & TASK_NOLOAD) == 0)
|
||||
```
|
||||
3
kernel/Makefile
Normal file
3
kernel/Makefile
Normal file
@@ -0,0 +1,3 @@
|
||||
KMOD = variable_monitor
|
||||
obj-m := $(KMOD).o
|
||||
$(KMOD)-objs := monitor_kernel.o
|
||||
157
kernel/monitor_kernel.c
Normal file
157
kernel/monitor_kernel.c
Normal file
@@ -0,0 +1,157 @@
|
||||
#include <linux/device.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include "monitor_kernel_lib.c"
|
||||
#include "monitor_kernel_task.c"
|
||||
|
||||
#define DEVICE_NAME "variable_monitor"
|
||||
|
||||
// for character device
|
||||
static dev_t dev_num;
|
||||
static struct cdev *watch_cdev;
|
||||
static struct class *watch_class;
|
||||
|
||||
struct my_device_data {
|
||||
pid_t pid;
|
||||
};
|
||||
|
||||
static int device_open(struct inode *inode, struct file *file) {
|
||||
struct my_device_data *data;
|
||||
printk(KERN_INFO "%s: with pid %d\n", __FUNCTION__, current->pid);
|
||||
// save pid
|
||||
data = kmalloc(sizeof(*data), GFP_KERNEL);
|
||||
if (!data)
|
||||
return -ENOMEM;
|
||||
data->pid = current->pid;
|
||||
file->private_data = data;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int device_release(struct inode *inode, struct file *file) {
|
||||
// printk(KERN_INFO "%s\n", __FUNCTION__);
|
||||
// load pid
|
||||
struct my_device_data *data = file->private_data;
|
||||
// clear watch with pid
|
||||
clear_watch(data->pid);
|
||||
kfree(data); // free data memory
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long device_ioctl(struct file *file, unsigned int ioctl_num,
|
||||
unsigned long ioctl_param) {
|
||||
watch_arg warg;
|
||||
void *kptr;
|
||||
kernel_watch_timer *timer = NULL;
|
||||
kernel_watch_arg k_watch_arg;
|
||||
// copy watch_arg
|
||||
if (copy_from_user(&warg, (watch_arg *)ioctl_param, sizeof(warg))) {
|
||||
return -EACCES;
|
||||
}
|
||||
|
||||
printk(KERN_INFO "Watch_arg: task_id=%d, name=%s, ptr=%p, length_byte=%d, "
|
||||
"time_ns=%ld, threshold=%lld\n",
|
||||
warg.task_id, warg.name, warg.ptr, warg.length_byte, warg.time_ns,
|
||||
warg.threshold);
|
||||
// user space address to kernel space address
|
||||
kptr = convert_user_space_ptr(warg.task_id, (unsigned long)warg.ptr);
|
||||
if (kptr == NULL) {
|
||||
printk(KERN_ERR "Cannot access user space\n");
|
||||
return -EACCES;
|
||||
}
|
||||
// check length
|
||||
if (warg.length_byte != 1 && warg.length_byte != 2 && warg.length_byte != 4 &&
|
||||
warg.length_byte != 8) {
|
||||
printk(KERN_ERR "Invalid length %d\n", warg.length_byte);
|
||||
return -EINVAL;
|
||||
}
|
||||
// k_watch_arg init
|
||||
w_arg2k_w_arg(kptr, warg, &k_watch_arg);
|
||||
timer = get_timer(warg.time_ns); // get a valuable timer
|
||||
|
||||
printk(KERN_INFO "ptr transform kptr: %p\n", kptr);
|
||||
printk(KERN_INFO "timer: %p\n", timer);
|
||||
printk(KERN_INFO "timer->sentinel: %d, timer->time_ns: %lld\n",
|
||||
timer->sentinel, timer->time_ns);
|
||||
printk(KERN_INFO "timer->hr_timer: %p\n", &timer->hr_timer);
|
||||
|
||||
TIMER_CANCEL(timer); // just in case
|
||||
timer_add_watch(timer, k_watch_arg);
|
||||
TIMER_START(timer);
|
||||
|
||||
printk(KERN_INFO "Start watching var: %s\n", warg.name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct file_operations fops = {
|
||||
.open = device_open,
|
||||
.release = device_release,
|
||||
.unlocked_ioctl = device_ioctl,
|
||||
};
|
||||
|
||||
int init_module(void) {
|
||||
printk(KERN_INFO "%s\n", __FUNCTION__);
|
||||
if (alloc_chrdev_region(&dev_num, 0, 1, DEVICE_NAME) < 0) {
|
||||
printk(KERN_ALERT "Failed to register device number\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((watch_cdev = cdev_alloc()) == NULL) {
|
||||
printk(KERN_ALERT "Failed to allocate cdev structure\n");
|
||||
unregister_chrdev_region(dev_num, 1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
cdev_init(watch_cdev, &fops);
|
||||
if (cdev_add(watch_cdev, dev_num, 1) == -1) {
|
||||
printk(KERN_ALERT "Failed to add cdev structure\n");
|
||||
device_destroy(watch_class, dev_num);
|
||||
class_destroy(watch_class);
|
||||
unregister_chrdev_region(dev_num, 1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if ((watch_class = class_create(THIS_MODULE, DEVICE_NAME)) == NULL) {
|
||||
printk(KERN_ALERT "Failed to create class\n");
|
||||
cdev_del(watch_cdev);
|
||||
unregister_chrdev_region(dev_num, 1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (device_create(watch_class, NULL, dev_num, NULL, DEVICE_NAME) == NULL) {
|
||||
printk(KERN_ALERT "Failed to create device\n");
|
||||
class_destroy(watch_class);
|
||||
cdev_del(watch_cdev);
|
||||
unregister_chrdev_region(dev_num, 1);
|
||||
return -1;
|
||||
}
|
||||
|
||||
printk(KERN_INFO "dev number: %d\n", dev_num);
|
||||
printk(KERN_INFO "path: /dev/%s %d\n", DEVICE_NAME, dev_num);
|
||||
|
||||
fn_kallsyms_lookup_name_init(); // init kallsyms_lookup_name
|
||||
LOOKUP_SYMS(stack_trace_save_tsk); // stack_trace_save_tsk
|
||||
LOOKUP_SYMS(show_stack); // show_stack
|
||||
LOOKUP_SYMS(idle_sched_class); // idle_sched_class
|
||||
LOOKUP_SYMS(access_remote_vm); // access_remote_vm
|
||||
|
||||
LOOKUP_SYMS_NORET(get_task_type); // get_task_type
|
||||
LOOKUP_SYMS_NORET(kernfs_name); // kernfs_name
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cleanup_module(void) {
|
||||
printk(KERN_INFO "%s\n", __FUNCTION__);
|
||||
// clear all timer and page list
|
||||
clear_all_watch();
|
||||
// unmount
|
||||
device_destroy(watch_class, dev_num);
|
||||
class_destroy(watch_class);
|
||||
cdev_del(watch_cdev);
|
||||
unregister_chrdev_region(dev_num, 1);
|
||||
}
|
||||
|
||||
MODULE_LICENSE("GPL");
|
||||
179
kernel/monitor_kernel.h
Normal file
179
kernel/monitor_kernel.h
Normal file
@@ -0,0 +1,179 @@
|
||||
#include <linux/hrtimer.h>
|
||||
#include <linux/kprobes.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/slab.h> /* for kmalloc */
|
||||
#include <linux/string.h>
|
||||
|
||||
#include <asm/uaccess.h>
|
||||
#include <linux/cdev.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/loadavg.h> /* for avenrun, LOAD_* */
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/stacktrace.h> /* for stack_trace_print */
|
||||
|
||||
#define MAX_TIMER_NUM (128) // max timer number
|
||||
#define TIMER_MAX_WATCH_NUM (32) // A timer max watch number at once time
|
||||
#define MAX_NAME_LEN (15) // max name length
|
||||
typedef struct {
|
||||
pid_t task_id; // current process id
|
||||
char name[MAX_NAME_LEN + 1]; // name
|
||||
void *ptr; // virtual address
|
||||
int length_byte; // byte
|
||||
long long threshold; // threshold value
|
||||
unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
|
||||
unsigned char greater_flag; // reverse flag (true: >, false: <)
|
||||
unsigned long time_ns; // timer interval (ns)
|
||||
} watch_arg;
|
||||
|
||||
typedef struct {
|
||||
pid_t task_id; // current process id
|
||||
char name[MAX_NAME_LEN + 2]; // name, last char automatically add '\0'
|
||||
void *kptr; // kernel address + offset
|
||||
int length_byte; // byte
|
||||
long long threshold; // threshold value
|
||||
unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
|
||||
unsigned char greater_flag; // reverse flag (true: >, false: <)
|
||||
} kernel_watch_arg;
|
||||
|
||||
typedef struct {
|
||||
unsigned long long time_ns; // hrTimer time interval (ns)
|
||||
struct hrtimer hr_timer; // hrTimer
|
||||
ktime_t kt; // hrTimer time
|
||||
unsigned sentinel; // sentinel
|
||||
kernel_watch_arg
|
||||
k_watch_args[TIMER_MAX_WATCH_NUM]; // all watched kernel_watch_arg
|
||||
} kernel_watch_timer;
|
||||
|
||||
/*
 * Helpers operating on a kernel_watch_timer pointer.
 *
 * Every use of the macro argument is parenthesised so that expressions such
 * as TIMER_START(&list[i]) or TIMER_CANCEL(base + i) expand correctly.
 * Note that the argument may be evaluated more than once; do not pass an
 * expression with side effects.
 */

/* True when the timer already holds TIMER_MAX_WATCH_NUM watches. */
#define TIMER_FILLED(timer) ((timer)->sentinel >= TIMER_MAX_WATCH_NUM)
/* True when both the interval and the watch count are zero. */
#define TIMER_EMPTY(timer) (!((timer)->time_ns | (timer)->sentinel))
/* True when the timer has no watches attached. */
#define TIMER_NO_KWARG(timer) ((timer)->sentinel == 0)

/* Arm the hrtimer with the timer's stored relative interval. */
#define TIMER_START(timer) \
  (hrtimer_start(&(timer)->hr_timer, (timer)->kt, HRTIMER_MODE_REL))
/* Cancel the hrtimer. */
#define TIMER_CANCEL(timer) (hrtimer_cancel(&(timer)->hr_timer))
|
||||
|
||||
kernel_watch_timer kernel_wtimer_list[MAX_TIMER_NUM] = {
|
||||
0}; // all kernel_watch_timer
|
||||
int kernel_wtimer_num = 0; // current kernel_watch_timer number
|
||||
|
||||
EXPORT_SYMBOL(kernel_wtimer_list); // export kernel_watch_timer_list
|
||||
EXPORT_SYMBOL(kernel_wtimer_num); // export kernel_watch_timer_num
|
||||
|
||||
// Helper function
|
||||
unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg,
|
||||
kernel_watch_arg *k_watch_arg);
|
||||
|
||||
// for timer
|
||||
kernel_watch_timer *get_timer(unsigned long long time_ns);
|
||||
unsigned char timer_add_watch(kernel_watch_timer *timer,
|
||||
kernel_watch_arg k_watch_arg);
|
||||
unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid);
|
||||
|
||||
// for memory access
|
||||
typedef struct {
|
||||
pid_t task_id; // current process id
|
||||
struct page *page;
|
||||
void *kaddr;
|
||||
struct list_head entry;
|
||||
} watch_local_memory;
|
||||
|
||||
static LIST_HEAD(watch_local_memory_list);
|
||||
|
||||
void free_page_list(pid_t task_id);
|
||||
void free_all_page_list(void);
|
||||
|
||||
// static struct page *page = NULL;
|
||||
// static void *kaddr = NULL;
|
||||
|
||||
void *convert_user_space_ptr(pid_t pid, unsigned long kaddr);
|
||||
|
||||
// for timer
|
||||
// #define US2NS (1000) // Interval in microseconds
|
||||
// static struct hrtimer hr_timer;
|
||||
// static ktime_t kt;
|
||||
|
||||
// hrTimer
|
||||
enum hrtimer_restart check_variable_cb(struct hrtimer *timer);
|
||||
void start_all_hrTimer(void);
|
||||
void cancel_all_hrTimer(void);
|
||||
|
||||
unsigned char read_and_compare(kernel_watch_arg *k_arg);
|
||||
|
||||
// for diag_kallsyms_lookup_name
|
||||
unsigned long (*diag_kallsyms_lookup_name)(const char *name);
|
||||
static struct kprobe kprobe_kallsyms_lookup_name = {.symbol_name =
|
||||
"kallsyms_lookup_name"};
|
||||
|
||||
int fn_kallsyms_lookup_name_init(void); // init kallsyms_lookup_name
|
||||
|
||||
// Adapted from
// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/module/internal.h#L65
//
// LOOKUP_SYMS(name) resolves the kernel symbol `name` through
// diag_kallsyms_lookup_name() and stores its address in the variable
// `orig_<name>`, which must already be declared. If the symbol is not found
// it logs an error and RETURNS -EINVAL from the enclosing function, so it may
// only be used inside functions returning int.
#define LOOKUP_SYMS(name)                                     \
  do {                                                        \
    orig_##name = (void *)diag_kallsyms_lookup_name(#name);   \
    if (!orig_##name) {                                       \
      printk(KERN_ERR "kallsyms_lookup_name: %s\n", #name);   \
      return -EINVAL;                                         \
    }                                                         \
  } while (0)

// Same lookup as LOOKUP_SYMS(), but a missing symbol is only logged; the
// enclosing function keeps running and `orig_<name>` is left NULL.
#define LOOKUP_SYMS_NORET(name)                               \
  do {                                                        \
    orig_##name = (void *)diag_kallsyms_lookup_name(#name);   \
    if (!orig_##name) {                                       \
      pr_err("kallsyms_lookup_name: %s\n", #name);            \
    }                                                         \
  } while (0)
|
||||
|
||||
#define BACKTRACE_DEPTH 20 // max stack depth
|
||||
|
||||
// LOOKUP_SYMS(stack_trace_save_tsk);
|
||||
unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
|
||||
unsigned long *store,
|
||||
unsigned int size,
|
||||
unsigned int skipnr);
|
||||
// LOOKUP_SYMS(show_stack);
|
||||
void (*orig_show_stack)(struct task_struct *task, unsigned long *sp,
|
||||
const char *loglvl);
|
||||
|
||||
// https://www.spinics.net/lists/kernel/msg3582022.html
// This helper was removed from the kernel in 5.8-rc3; it is re-created here.
// True when the task contributes to the load average: it is in
// TASK_UNINTERRUPTIBLE, not frozen (PF_FROZEN) and not marked TASK_NOLOAD.
// The argument is parenthesized so that any pointer expression can be passed;
// note that it is evaluated more than once.
#define __task_contributes_to_load(task)                                       \
  ((READ_ONCE((task)->__state) & TASK_UNINTERRUPTIBLE) != 0 &&                 \
   ((task)->flags & PF_FROZEN) == 0 &&                                         \
   (READ_ONCE((task)->__state) & TASK_NOLOAD) == 0)
|
||||
|
||||
/// @brief print all task stack
|
||||
/// @param
|
||||
static void print_task_stack(void) {
|
||||
struct task_struct *g, *p; // g: task group; p: task
|
||||
unsigned long backtrace[BACKTRACE_DEPTH]; // save stack
|
||||
unsigned int nr_bt; // stack depth
|
||||
unsigned long long current_time; // last time
|
||||
current_time = ktime_get_real();
|
||||
printk("Timestamp (ns): %lld\n", current_time);
|
||||
printk("Recent Load: %lu.%02lu, %lu.%02lu, %lu.%02lu\n", // recent load
|
||||
LOAD_INT(avenrun[0]), LOAD_FRAC(avenrun[0]), LOAD_INT(avenrun[1]),
|
||||
LOAD_FRAC(avenrun[1]), LOAD_INT(avenrun[2]), LOAD_FRAC(avenrun[2]));
|
||||
rcu_read_lock(); // lock run queue
|
||||
// printk("Running task\n");
|
||||
do_each_thread(g, p) {
|
||||
if (p->__state == TASK_RUNNING || __task_contributes_to_load(p) ||
|
||||
p->__state == TASK_IDLE) {
|
||||
printk("task: %s, pid %d, state %d\n", p->comm, p->pid,
|
||||
p->__state); //! todo
|
||||
nr_bt = orig_stack_trace_save_tsk(p, backtrace, BACKTRACE_DEPTH, 0);
|
||||
stack_trace_print(backtrace, nr_bt, 0); // print
|
||||
}
|
||||
}
|
||||
while_each_thread(g, p);
|
||||
rcu_read_unlock(); // unlock run queue
|
||||
}
|
||||
|
||||
unsigned char del_all_kwarg_by_pid(pid_t pid);
|
||||
void clear_watch(pid_t pid);
|
||||
void clear_all_watch(void);
|
||||
427
kernel/monitor_kernel_lib.c
Normal file
427
kernel/monitor_kernel_lib.c
Normal file
@@ -0,0 +1,427 @@
|
||||
#include "monitor_kernel.h"
|
||||
|
||||
unsigned char w_arg2k_w_arg(void *ptr, watch_arg warg,
|
||||
kernel_watch_arg *k_watch_arg) {
|
||||
// k_watch_arg init
|
||||
k_watch_arg->task_id = warg.task_id;
|
||||
strncpy(k_watch_arg->name, warg.name, MAX_NAME_LEN + 1); // name
|
||||
k_watch_arg->name[MAX_NAME_LEN + 1] = '\0'; // just in case
|
||||
k_watch_arg->kptr = ptr;
|
||||
k_watch_arg->length_byte = warg.length_byte;
|
||||
k_watch_arg->threshold = warg.threshold;
|
||||
k_watch_arg->unsigned_flag = warg.unsigned_flag;
|
||||
k_watch_arg->greater_flag = warg.greater_flag;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// @brief get a valuable timer
|
||||
/// @param time_ns
|
||||
/// @return kernel_watch_timer *, NULL means fail
|
||||
kernel_watch_timer *get_timer(unsigned long long time_ns) {
|
||||
int i = 0;
|
||||
kernel_watch_timer *timer = NULL;
|
||||
// chose a timer
|
||||
for (i = 0; i < kernel_wtimer_num; i++) {
|
||||
timer = &kernel_wtimer_list[i];
|
||||
|
||||
if (TIMER_EMPTY(timer)) {
|
||||
break;
|
||||
}
|
||||
if ((timer->time_ns == time_ns) && (!TIMER_FILLED(timer))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// if all timer is full
|
||||
if (i >= MAX_TIMER_NUM) {
|
||||
printk(KERN_ERR "No timer available\n");
|
||||
return NULL;
|
||||
}
|
||||
// if a new timer, init it
|
||||
if (i > kernel_wtimer_num - 1) {
|
||||
printk(KERN_INFO "New timer\n");
|
||||
|
||||
kernel_wtimer_list[i].time_ns = time_ns;
|
||||
kernel_wtimer_list[i].sentinel = 0;
|
||||
|
||||
kernel_wtimer_list[i].kt = ktime_set(0, (unsigned long)time_ns); // ns
|
||||
// CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time
|
||||
hrtimer_init(&(kernel_wtimer_list[i].hr_timer), CLOCK_MONOTONIC,
|
||||
HRTIMER_MODE_REL);
|
||||
kernel_wtimer_list[i].hr_timer.function =
|
||||
check_variable_cb; // callback function
|
||||
|
||||
kernel_wtimer_num = i + 1;
|
||||
}
|
||||
printk(KERN_INFO "now, we have %d timers\n", kernel_wtimer_num);
|
||||
return &kernel_wtimer_list[i];
|
||||
}
|
||||
|
||||
/// @brief hrTimer add watch
|
||||
/// @param timer
|
||||
/// @param k_watch_arg
|
||||
/// @return 0 is success
|
||||
unsigned char timer_add_watch(kernel_watch_timer *timer,
|
||||
kernel_watch_arg k_watch_arg) {
|
||||
if (TIMER_FILLED(timer)) {
|
||||
printk(KERN_ERR "Timer is full\n");
|
||||
return -1;
|
||||
}
|
||||
memcpy(&timer->k_watch_args[timer->sentinel], &k_watch_arg,
|
||||
sizeof(k_watch_arg));
|
||||
// timer->k_watch_args[timer->sentinel] = k_watch_arg;
|
||||
timer->sentinel++;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// @brief remove every watch of a timer that belongs to pid
/// A removed entry is overwritten by the last entry, so the order of the
/// remaining watches is not preserved.
/// @param timer timer to clean
/// @param pid owner of the watches to remove
/// @return always 0
unsigned char timer_del_watch_by_pid(kernel_watch_timer *timer, pid_t pid) {
  int idx = 0;

  while (idx < timer->sentinel) {
    if (timer->k_watch_args[idx].task_id != pid) {
      idx++;
      continue;
    }

    // fill the hole with the last entry (unless the hole is the last entry)
    if (idx != timer->sentinel - 1) {
      memcpy(&timer->k_watch_args[idx],
             &timer->k_watch_args[timer->sentinel - 1],
             sizeof(kernel_watch_arg));
    }
    timer->sentinel--;
    // idx is not advanced: the entry just moved here must be checked too
  }
  return 0;
}
|
||||
|
||||
/// @brief transfer user space address to kernel space address
|
||||
/// change static global "kaddr" and "page" value
|
||||
/// @param pid: process id
|
||||
/// @param kaddr: user space address
|
||||
/// @return kernel space address + offset
|
||||
void *convert_user_space_ptr(pid_t pid, unsigned long addr) {
|
||||
struct task_struct *task;
|
||||
struct mm_struct *mm;
|
||||
int ret;
|
||||
|
||||
// unsigned long aligned_addr = 0;
|
||||
// unsigned long offset = 0;
|
||||
|
||||
watch_local_memory *node;
|
||||
|
||||
// if (addr < TASK_SIZE || addr > -PAGE_SIZE)
|
||||
// {
|
||||
// printk(KERN_ERR "Invalid address\n");
|
||||
// return NULL;
|
||||
// }
|
||||
|
||||
// for get_user_pages_remote
|
||||
unsigned long aligned_addr = addr & PAGE_MASK;
|
||||
unsigned long offset = addr & ~PAGE_MASK;
|
||||
|
||||
printk(KERN_INFO "%s\n", __FUNCTION__);
|
||||
|
||||
node = kmalloc(sizeof(watch_local_memory), GFP_KERNEL);
|
||||
node->task_id = pid;
|
||||
|
||||
// Find the task with pid
|
||||
rcu_read_lock();
|
||||
task = pid_task(find_vpid(pid), PIDTYPE_PID);
|
||||
rcu_read_unlock();
|
||||
|
||||
if (!task) {
|
||||
printk(KERN_ERR "Cannot find task for PID %d\n", pid);
|
||||
kfree(node); // careful there is kfree
|
||||
return NULL;
|
||||
}
|
||||
// Get memory descriptor
|
||||
mm = get_task_mm(task);
|
||||
if (!mm) {
|
||||
printk(KERN_ERR "Cannot get memory descriptor\n");
|
||||
kfree(node); // careful there is kfree
|
||||
return NULL;
|
||||
}
|
||||
down_read(&task->mm->mmap_lock);
|
||||
ret = get_user_pages_remote(task->mm, aligned_addr, 1, FOLL_FORCE,
|
||||
&(node->page), NULL, NULL);
|
||||
up_read(&task->mm->mmap_lock);
|
||||
|
||||
if (ret != 1) {
|
||||
printk(KERN_ERR "Cannot get user page\n");
|
||||
kfree(node); // careful there is kfree
|
||||
return NULL;
|
||||
}
|
||||
// Map the page to kernel space
|
||||
node->kaddr = kmap(node->page);
|
||||
list_add_tail(&node->entry, &watch_local_memory_list); // add to list
|
||||
// printk(KERN_INFO "node->kaddr: %p, aligned_addr: %ld, offset: %ld\n",
|
||||
// node->kaddr, aligned_addr, offset);
|
||||
return (void *)((unsigned long)(node->kaddr) + offset);
|
||||
}
|
||||
|
||||
/// @brief free page in watch_local_memory_list with task_id
|
||||
/// @param task_id
|
||||
void free_page_list(pid_t task_id) {
|
||||
watch_local_memory *node, *next;
|
||||
list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) {
|
||||
if (node == NULL)
|
||||
break;
|
||||
if (node->task_id == task_id) {
|
||||
// unmap and release the page
|
||||
if (node->kaddr)
|
||||
kunmap(node->kaddr);
|
||||
if (node->page)
|
||||
put_page(node->page);
|
||||
list_del(&node->entry);
|
||||
kfree(node); // careful there is kfree
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief free all page in watch_local_memory_list
|
||||
/// @param
|
||||
void free_all_page_list(void) {
|
||||
watch_local_memory *node, *next;
|
||||
list_for_each_entry_safe(node, next, &watch_local_memory_list, entry) {
|
||||
if (node == NULL)
|
||||
break;
|
||||
// unmap and release the page
|
||||
if (node->kaddr)
|
||||
kunmap(node->kaddr);
|
||||
if (node->page)
|
||||
put_page(node->page);
|
||||
list_del(&node->entry);
|
||||
kfree(node); // careful there is kfree
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief hrTimer handler
|
||||
enum hrtimer_restart check_variable_cb(struct hrtimer *timer) {
|
||||
kernel_watch_timer *k_watch_timer =
|
||||
container_of(timer, kernel_watch_timer, hr_timer);
|
||||
int i = 0, j = 0;
|
||||
int buffer[TIMER_MAX_WATCH_NUM]; // Buffer to store the messages
|
||||
|
||||
// check all watched kernel_watch_arg
|
||||
for (i = 0; i < k_watch_timer->sentinel; i++) {
|
||||
if (read_and_compare(&k_watch_timer->k_watch_args[i])) {
|
||||
// snprintf(buffer + strlen(buffer), sizeof(buffer) - strlen(buffer), "
|
||||
// name: %s, threshold: %lld, pid: %d\n",
|
||||
// k_watch_timer->k_watch_args[i].name,
|
||||
// k_watch_timer->k_watch_args[i].threshold,
|
||||
// k_watch_timer->k_watch_args[i].task_id);
|
||||
buffer[j] = i;
|
||||
j++;
|
||||
|
||||
// printk(KERN_INFO "j: name %s, threshold: %lld\n",
|
||||
// k_watch_timer->k_watch_args[i].name,
|
||||
// k_watch_timer->k_watch_args[i].threshold);
|
||||
// printk(KERN_INFO "j: %d\n", j);
|
||||
}
|
||||
}
|
||||
if (j > 0) // if any threshold reached
|
||||
{
|
||||
printk("-------------------------------------\n");
|
||||
printk("-------------watch monitor-----------\n");
|
||||
printk("Threshold reached:\n");
|
||||
|
||||
for (i = 0; i < j; i++) {
|
||||
printk(" name: %s, threshold: %lld, pid: %d\n",
|
||||
k_watch_timer->k_watch_args[buffer[i]].name, //! todo
|
||||
k_watch_timer->k_watch_args[buffer[i]].threshold,
|
||||
k_watch_timer->k_watch_args[buffer[i]].task_id);
|
||||
}
|
||||
print_task_stack();
|
||||
// restart timer after 1s
|
||||
hrtimer_forward(timer, timer->base->get_time(), ktime_set(1, 0)); //! todo
|
||||
printk("-------------------------------------\n");
|
||||
} else {
|
||||
// keep frequency
|
||||
hrtimer_forward(timer, timer->base->get_time(), k_watch_timer->kt);
|
||||
}
|
||||
return HRTIMER_RESTART; // restart timer
|
||||
}
|
||||
|
||||
/// @brief start hrTimer
|
||||
/// @param timeout: timeout in us
|
||||
/// @return 0 is success
|
||||
// int start_hrTimer(unsigned long timeout)
|
||||
// {
|
||||
// printk("HrTimer Start\n");
|
||||
|
||||
// kt = ktime_set(0, (unsigned long)timeout); // us -> ns
|
||||
// // CLOCK_MONOTONIC: time since boot | HRTIMER_MODE_REL : relative time
|
||||
// hrtimer_init(&hr_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
||||
// hr_timer.function = check_variable_cb;
|
||||
// // mode the same as hrtimer_init
|
||||
// hrtimer_start(&hr_timer, kt, HRTIMER_MODE_REL);
|
||||
// return 0;
|
||||
// }
|
||||
|
||||
/// @brief start all hrTimer
|
||||
/// @param
|
||||
void start_all_hrTimer(void) {
|
||||
int i = 0;
|
||||
kernel_watch_timer *timer = NULL;
|
||||
for (i = 0; i < kernel_wtimer_num; i++) {
|
||||
timer = &(kernel_wtimer_list[i]);
|
||||
TIMER_START(timer);
|
||||
}
|
||||
printk("HrTimer start,module keep %d hrtimer for now\n", kernel_wtimer_num);
|
||||
}
|
||||
|
||||
/// @brief cancel hrTimer
|
||||
/// @param
|
||||
void cancel_all_hrTimer(void) {
|
||||
int i = 0;
|
||||
kernel_watch_timer *timer = NULL;
|
||||
for (i = 0; i < kernel_wtimer_num; i++) {
|
||||
timer = &(kernel_wtimer_list[i]);
|
||||
TIMER_CANCEL(timer);
|
||||
}
|
||||
|
||||
printk("HrTimer cancel,module keep %d hrtimer for now\n", kernel_wtimer_num);
|
||||
}
|
||||
|
||||
// for read_and_compare
|
||||
typedef unsigned char (*compare_func)(void *, long long);
|
||||
|
||||
/// @brief compare a signed 1-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_1_byte_signed(void *ptr, long long threshold) {
  // plain `char` may be unsigned depending on the ABI / compiler flags;
  // `signed char` makes the signed interpretation explicit
  const signed char *value = ptr;

  return *value > threshold;
}
|
||||
/// @brief compare an unsigned 1-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_1_byte_unsigned(void *ptr, long long threshold) {
  const unsigned char *value = ptr;

  return (long long)*value > threshold;
}
|
||||
/// @brief compare a signed 2-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_2_byte_signed(void *ptr, long long threshold) {
  const short int *value = ptr;

  return (long long)*value > threshold;
}
|
||||
/// @brief compare an unsigned 2-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_2_byte_unsigned(void *ptr, long long threshold) {
  const unsigned short int *value = ptr;

  return (long long)*value > threshold;
}
|
||||
/// @brief compare a signed 4-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_4_byte_signed(void *ptr, long long threshold) {
  const int *value = ptr;

  return (long long)*value > threshold;
}
|
||||
/// @brief compare an unsigned 4-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_4_byte_unsigned(void *ptr, long long threshold) {
  const unsigned int *value = ptr;

  // every unsigned int fits in long long, so this is the conversion the
  // mixed comparison performs anyway
  return (long long)*value > threshold;
}
|
||||
/// @brief compare a signed 8-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_8_byte_signed(void *ptr, long long threshold) {
  const long long *value = ptr;

  return *value > threshold;
}
|
||||
/// @brief compare an unsigned 8-byte value with threshold
/// @param ptr address of the value
/// @param threshold value to compare against
/// @return 1 if *ptr > threshold, else 0
unsigned char compare_8_byte_unsigned(void *ptr, long long threshold) {
  const unsigned long long *value = ptr;

  // A mixed unsigned long long / long long comparison converts the threshold
  // to unsigned, turning a negative threshold into a huge positive one. Every
  // unsigned value is greater than a negative threshold, as in the narrower
  // unsigned helpers.
  if (threshold < 0)
    return 1;
  return *value > (unsigned long long)threshold;
}
|
||||
// list of compare functions
|
||||
static compare_func compare_funcs[8] = {
|
||||
compare_1_byte_signed, compare_2_byte_signed, compare_4_byte_signed,
|
||||
compare_8_byte_signed, compare_1_byte_unsigned, compare_2_byte_unsigned,
|
||||
compare_4_byte_unsigned, compare_8_byte_unsigned};
|
||||
|
||||
// func_indices[is_unsigned][length_in_bytes] -> slot in compare_funcs.
// Only lengths 1, 2, 4 and 8 are meaningful; every other length is 0.
static int func_indices[2][9] = {
    {[1] = 0, [2] = 1, [4] = 2, [8] = 3}, // signed
    {[1] = 4, [2] = 5, [4] = 6, [8] = 7}, // unsigned
};
|
||||
|
||||
/// @brief read k_arg->kptr and compare with threshold
|
||||
/// @param k_arg
|
||||
/// @return result of compare
|
||||
unsigned char read_and_compare(kernel_watch_arg *k_arg) {
|
||||
void *ptr = k_arg->kptr;
|
||||
int len = k_arg->length_byte;
|
||||
unsigned char is_unsigned = k_arg->unsigned_flag;
|
||||
long long threshold = k_arg->threshold;
|
||||
|
||||
unsigned char result = 0;
|
||||
|
||||
// if (len != 1 && len != 2 && len != 4 && len != 8)
|
||||
// {
|
||||
// printk(KERN_ERR "Invalid length\n");
|
||||
// return 0;
|
||||
// }
|
||||
|
||||
result = compare_funcs[func_indices[is_unsigned][len]](ptr, threshold);
|
||||
|
||||
// printk(KERN_INFO "read_and_compare: name %s, value %d, biss: %lld, result:
|
||||
// %d \n", k_arg->name, *(int *)ptr,
|
||||
// threshold, result);
|
||||
|
||||
if (k_arg->greater_flag)
|
||||
return result;
|
||||
else
|
||||
return !result;
|
||||
}
|
||||
|
||||
/// @brief init kallsyms_lookup_name
|
||||
/// @param
|
||||
/// @return 0 is success
|
||||
int fn_kallsyms_lookup_name_init(void) {
|
||||
register_kprobe(&kprobe_kallsyms_lookup_name);
|
||||
diag_kallsyms_lookup_name = (void *)kprobe_kallsyms_lookup_name.addr;
|
||||
unregister_kprobe(&kprobe_kallsyms_lookup_name);
|
||||
|
||||
printk("xby-debug, diag_kallsyms_lookup_name is %p\n",
|
||||
diag_kallsyms_lookup_name);
|
||||
|
||||
if (!diag_kallsyms_lookup_name) {
|
||||
return -EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// @brief drop every watch owned by pid, then compact the timer list
/// A timer left without watches is overwritten by the last timer of the list
/// and kernel_wtimer_num shrinks by one.
/// @param pid owner of the watches to delete
/// @return always 0
unsigned char del_all_kwarg_by_pid(pid_t pid) {
  kernel_watch_timer *timer = NULL;
  int idx;

  printk(KERN_INFO "del kwarg...");

  // pass 1: remove the watches of pid from every timer
  for (idx = 0; idx < kernel_wtimer_num; idx++) {
    timer = &(kernel_wtimer_list[idx]);
    timer_del_watch_by_pid(timer, pid);
  }

  // pass 2: remove timers that have no watch left
  idx = 0;
  while (idx < kernel_wtimer_num) {
    timer = &(kernel_wtimer_list[idx]);
    if (TIMER_NO_KWARG(timer)) // no available kwarg
    {
      if (idx != kernel_wtimer_num - 1) {
        memcpy(timer, &kernel_wtimer_list[kernel_wtimer_num - 1],
               sizeof(kernel_watch_timer));
      }
      kernel_wtimer_num--;
      // idx is not advanced: the timer just moved here must be checked too
    } else {
      idx++;
    }
  }
  return 0;
}
|
||||
|
||||
/// @brief clear watch with pid
|
||||
/// @param pid
|
||||
void clear_watch(pid_t pid) {
|
||||
printk(KERN_INFO "clear pid %d 's watch variable\n", pid);
|
||||
cancel_all_hrTimer(); // just in case
|
||||
del_all_kwarg_by_pid(pid); // delete all kwarg with pid
|
||||
free_page_list(pid); // free page with pid
|
||||
start_all_hrTimer(); // restart timer
|
||||
}
|
||||
|
||||
/// @brief clear all watch and reset kernel_wtimer_list/kernel_wtimer_num
|
||||
/// @param
|
||||
void clear_all_watch(void) {
|
||||
printk(KERN_INFO "clear all watch variable\n");
|
||||
// unmap and release the page
|
||||
free_all_page_list();
|
||||
// cancel timer
|
||||
cancel_all_hrTimer();
|
||||
// clear timer
|
||||
kernel_wtimer_num = 0;
|
||||
memset(kernel_wtimer_list, 0, sizeof(kernel_wtimer_list));
|
||||
}
|
||||
377
kernel/monitor_kernel_task.c
Normal file
377
kernel/monitor_kernel_task.c
Normal file
@@ -0,0 +1,377 @@
|
||||
#include "monitor_kernel_task.h"
|
||||
#include <asm/processor.h>
|
||||
#include <asm/ptrace.h>
|
||||
#include <asm/syscall.h> // for syscall_get_nr
|
||||
#include <linux/irq.h>
|
||||
#include <linux/sched/mm.h> // for get_task_mm
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/tracehook.h>
|
||||
|
||||
// Cursor used while collecting a backtrace into a caller supplied array.
struct stack_trace {
  unsigned int nr_entries;  // entries written so far
  unsigned int max_entries; // capacity of entries[]
  unsigned long *entries;   // output array
  int skip;                 /* input argument: How many entries to skip */
};
|
||||
|
||||
struct stack_frame_user {
|
||||
const void __user *next_fp;
|
||||
unsigned long ret_addr;
|
||||
};
|
||||
|
||||
static inline int diag_get_task_type(struct task_struct *tsk) {
|
||||
if (orig_get_task_type)
|
||||
return orig_get_task_type(&tsk->se);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline int orig_diag_cgroup_name(struct cgroup *cgrp, char *buf,
|
||||
size_t buflen) {
|
||||
if (orig_kernfs_name && cgrp && cgrp->kn) {
|
||||
return orig_kernfs_name(cgrp->kn, buf, buflen);
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Look up the mm_info stored for mm in the radix tree (keyed by the mm
// pointer value). Returns NULL for a NULL mm or when nothing is stored.
static inline mm_info *find_mm_info(mm_tree *mm_tree, struct mm_struct *mm) {
  if (mm == NULL)
    return NULL;

  return radix_tree_lookup(&mm_tree->mm_tree, (unsigned long)mm);
}
|
||||
|
||||
static void __diag_cgroup_name(struct task_struct *tsk, char *buf,
|
||||
unsigned int count, int cgroup) {
|
||||
int cgroup_id = cpuacct_cgrp_id;
|
||||
|
||||
memset(buf, 0, count);
|
||||
|
||||
if (cgroup == 1) {
|
||||
cgroup_id = cpuset_cgrp_id;
|
||||
}
|
||||
|
||||
if (tsk && tsk->cgroups && tsk->cgroups->subsys &&
|
||||
tsk->cgroups->subsys[cgroup_id] &&
|
||||
tsk->cgroups->subsys[cgroup_id]->cgroup) {
|
||||
orig_diag_cgroup_name(tsk->cgroups->subsys[cgroup_id]->cgroup, buf, count);
|
||||
}
|
||||
}
|
||||
|
||||
// Thin wrapper: forwards unchanged to __diag_cgroup_name().
static void diag_cgroup_name(struct task_struct *tsk, char *buf,
                             unsigned int count, int cgroup) {
  __diag_cgroup_name(tsk, buf, count, cgroup);
}
|
||||
|
||||
// Read one stack frame record of the *current* task from user address fp.
// Page faults are disabled, so a non-resident page makes the copy fail
// instead of sleeping.
// Returns 1 when the whole frame was copied, 0 otherwise.
static int copy_stack_frame(const void __user *fp,
                            struct stack_frame_user *frame) {
  int ret = 1;

  // fp comes from the user's frame-pointer chain and is fully user
  // controlled; __copy_from_user_inatomic() does no range check of its own,
  // so reject anything that is not a user-space range.
  if (!access_ok(fp, sizeof(*frame)))
    return 0;

  pagefault_disable();
  if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
    ret = 0;
  pagefault_enable();

  return ret;
}
|
||||
|
||||
static int copy_stack_frame_remote(struct task_struct *tsk,
|
||||
const void __user *fp,
|
||||
struct stack_frame_user *frame) {
|
||||
int ret;
|
||||
struct mm_struct *mm;
|
||||
|
||||
mm = get_task_mm(tsk);
|
||||
if (!mm)
|
||||
return 0;
|
||||
|
||||
ret = orig_access_remote_vm(mm, (unsigned long)fp, frame, sizeof(*frame), 0);
|
||||
mmput(mm);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline void save_stack_trace_user_remote(struct task_struct *tsk,
|
||||
struct stack_trace *trace) {
|
||||
const struct pt_regs *regs = task_pt_regs(tsk);
|
||||
const void __user *fp = (const void __user *)regs->bp;
|
||||
int count = 0;
|
||||
|
||||
if (in_atomic() || irqs_disabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (trace->nr_entries < trace->max_entries)
|
||||
trace->entries[trace->nr_entries++] = regs->ip;
|
||||
|
||||
while (trace->nr_entries < trace->max_entries) {
|
||||
struct stack_frame_user frame;
|
||||
|
||||
frame.next_fp = NULL;
|
||||
frame.ret_addr = 0;
|
||||
|
||||
if (!copy_stack_frame_remote(tsk, fp, &frame)) {
|
||||
break;
|
||||
}
|
||||
|
||||
if ((unsigned long)fp < regs->sp)
|
||||
break;
|
||||
|
||||
if (frame.ret_addr) {
|
||||
trace->entries[trace->nr_entries++] = frame.ret_addr;
|
||||
} else
|
||||
break;
|
||||
|
||||
if (fp == frame.next_fp)
|
||||
break;
|
||||
fp = frame.next_fp;
|
||||
|
||||
count++;
|
||||
/**
|
||||
* 线上环境发现这里有hardlockup,这里强制退出
|
||||
*/
|
||||
if (count >= trace->max_entries || count >= 100)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static inline void __save_stack_trace_user(struct stack_trace *trace) {
|
||||
const struct pt_regs *regs = task_pt_regs(current);
|
||||
const void __user *fp = (const void __user *)regs->bp;
|
||||
int count = 0;
|
||||
|
||||
if (trace->nr_entries < trace->max_entries)
|
||||
trace->entries[trace->nr_entries++] = regs->ip;
|
||||
|
||||
while (trace->nr_entries < trace->max_entries) {
|
||||
struct stack_frame_user frame;
|
||||
|
||||
frame.next_fp = NULL;
|
||||
frame.ret_addr = 0;
|
||||
if (!copy_stack_frame(fp, &frame))
|
||||
break;
|
||||
if ((unsigned long)fp < regs->sp)
|
||||
break;
|
||||
if (frame.ret_addr) {
|
||||
trace->entries[trace->nr_entries++] = frame.ret_addr;
|
||||
}
|
||||
if (fp == frame.next_fp)
|
||||
break;
|
||||
fp = frame.next_fp;
|
||||
count++;
|
||||
/**
|
||||
* 线上环境发现这里有hardlockup,这里强制退出
|
||||
*/
|
||||
if (count >= trace->max_entries || count >= 100)
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void perfect_save_stack_trace_user(struct stack_trace *trace) {
|
||||
/*
|
||||
* Trace user stack if we are not a kernel thread
|
||||
*/
|
||||
if (current->mm) {
|
||||
__save_stack_trace_user(trace);
|
||||
}
|
||||
if (trace->nr_entries < trace->max_entries)
|
||||
trace->entries[trace->nr_entries++] = ULONG_MAX;
|
||||
}
|
||||
|
||||
void diagnose_save_stack_trace_user(unsigned long *backtrace) {
|
||||
struct stack_trace trace;
|
||||
|
||||
memset(&trace, 0, sizeof(trace));
|
||||
memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
|
||||
trace.max_entries = BACKTRACE_DEPTH2;
|
||||
trace.entries = backtrace;
|
||||
perfect_save_stack_trace_user(&trace);
|
||||
}
|
||||
|
||||
void diagnose_save_stack_trace_user_remote(struct task_struct *tsk,
|
||||
unsigned long *backtrace) {
|
||||
struct stack_trace trace;
|
||||
|
||||
memset(&trace, 0, sizeof(trace));
|
||||
memset(backtrace, 0, BACKTRACE_DEPTH2 * sizeof(unsigned long));
|
||||
trace.max_entries = BACKTRACE_DEPTH2;
|
||||
trace.entries = backtrace;
|
||||
|
||||
/*
|
||||
* Trace user stack if we are not a kernel thread
|
||||
*/
|
||||
if (tsk->mm) {
|
||||
save_stack_trace_user_remote(tsk, &trace);
|
||||
}
|
||||
if (trace.nr_entries < trace.max_entries)
|
||||
trace.entries[trace.nr_entries++] = ULONG_MAX;
|
||||
}
|
||||
|
||||
// Fill detail with a summary of tsk: ids (host and container), state, task
// type, user/kernel mode, syscall number, comm and cgroup names.
// detail is zeroed first and stays zeroed when tsk is NULL or when tsk or its
// group leader is a zombie.
void diag_task_brief(struct task_struct *tsk, task_detail *detail) {
  struct task_struct *leader;
  struct pid_namespace *ns;

  if (!detail)
    return;

  memset(detail, 0, sizeof(task_detail));

  if (!tsk || tsk->exit_state == EXIT_ZOMBIE) // zombie
    return;
  leader = tsk->group_leader;
  if (!leader || leader->exit_state == EXIT_ZOMBIE)
    return;

  if (tsk != current) {
    // not current task: mode and syscall are unknown
    detail->user_mode = -1;
    detail->syscallno = -1;
  } else if (!tsk->mm) {
    // current task but kernel thread
    detail->user_mode = 0;
    detail->syscallno = -1;
  } else {
    // current task and user thread
    struct pt_regs *irq_regs = get_irq_regs(); // get current irq regs
    struct pt_regs *task_regs = task_pt_regs(tsk);
    int in_user = (irq_regs && user_mode(irq_regs)) ||
                  (task_regs && user_mode(task_regs));

    detail->user_mode = in_user ? 1 : 0; // 1: user mode, 0: kernel mode

    if (task_regs)
      detail->syscallno = syscall_get_nr(tsk, task_regs); // get syscall no
  }

  // 2: idle task, 1: kernel thread, 0: user task
  if (tsk->sched_class == orig_idle_sched_class)
    detail->sys_task = 2;
  else
    detail->sys_task = tsk->mm ? 0 : 1;

  detail->pid = tsk->pid;
  detail->tgid = tsk->tgid;
  detail->state = tsk->__state;
  detail->task_type = diag_get_task_type(tsk);

  // container ids: namespace-relative when the task lives outside the
  // initial pid namespace, host ids otherwise
  ns = task_active_pid_ns(tsk);
  if (!ns || ns == &init_pid_ns) {
    detail->container_pid = tsk->pid;
    detail->container_tgid = tsk->tgid;
  } else {
    detail->container_pid = task_pid_nr_ns(tsk, ns);
    detail->container_tgid = task_tgid_nr_ns(tsk, ns);
  }

  strncpy(detail->comm, tsk->comm, TASK_COMM_LEN);
  detail->comm[TASK_COMM_LEN - 1] = 0; // comm name, always terminated

  diag_cgroup_name(tsk, detail->cgroup_buf, CGROUP_NAME_LEN, 0);
  diag_cgroup_name(tsk, detail->cgroup_cpuset, CGROUP_NAME_LEN, 1);
  detail->cgroup_buf[CGROUP_NAME_LEN - 1] = 0;    // cgroup name
  detail->cgroup_cpuset[CGROUP_NAME_LEN - 1] = 0; // cgroup cpuset name
}
|
||||
|
||||
// Record the user-mode registers and user stack backtrace of tsk in detail.
// Only detail->stack[0] is cleared when tsk is NULL, is a kernel thread, or
// its group leader has no mm or is a zombie.
void diag_task_user_stack(struct task_struct *tsk, user_stack_detail *detail) {
  struct pt_regs *regs;
  struct task_struct *leader;

  if (!detail)
    return;

  detail->stack[0] = 0;
  if (!tsk || !tsk->mm)
    return;

  leader = tsk->group_leader;
  if (!leader || !leader->mm || leader->exit_state == EXIT_ZOMBIE) {
    return;
  }

  regs = task_pt_regs(tsk);
  if (!regs) {
    // Previously *regs was copied outside the NULL guard. Without registers
    // there is nothing to walk either, so report zeros and stop.
    memset(&detail->regs, 0, sizeof(detail->regs));
    detail->sp = 0;
    detail->ip = 0;
    detail->bp = 0;
    return;
  }

  detail->sp = regs->sp;
#if defined(DIAG_ARM64)
  detail->ip = regs->pc;
  detail->bp = regs->sp;
  detail->regs = regs->user_regs;
#else
  detail->ip = regs->ip;
  detail->bp = regs->bp;
  detail->regs = *regs;
#endif

  if (tsk == current) {
    diagnose_save_stack_trace_user(detail->stack);
  } else {
    diagnose_save_stack_trace_user_remote(tsk, detail->stack);
  }
}
|
||||
|
||||
// Save up to BACKTRACE_DEPTH2 kernel stack entries of tsk in detail->stack.
// The array is zeroed first so that entries beyond the saved depth are 0.
// It stays zeroed when tsk is NULL or the stack-save helper is unresolved.
void diag_task_kern_stack(struct task_struct *tsk, kern_stack_detail *detail) {
  // same defensive checks as diag_task_brief() / diag_task_user_stack()
  if (!detail)
    return;

  memset(detail->stack, 0, sizeof(detail->stack));

  // the helper is set through a symbol lookup and may be NULL
  if (!tsk || !orig_stack_trace_save_tsk)
    return;

  orig_stack_trace_save_tsk(tsk, detail->stack, BACKTRACE_DEPTH2, 0);
}
|
||||
|
||||
void dump_proc_chains_argv(int style, struct task_struct *tsk, mm_tree *mm_tree,
|
||||
proc_chains_detail *detail) {
|
||||
struct task_struct *walker;
|
||||
mm_info *mm_info;
|
||||
int cnt = 0;
|
||||
int i = 0;
|
||||
struct task_struct *leader;
|
||||
|
||||
for (i = 0; i < PROCESS_CHAINS_COUNT; i++) {
|
||||
detail->chains[i][0] = 0;
|
||||
detail->tgid[i] = 0;
|
||||
}
|
||||
if (style == 0)
|
||||
return;
|
||||
|
||||
if (!tsk || !tsk->mm)
|
||||
return;
|
||||
|
||||
leader = tsk->group_leader;
|
||||
if (!leader || !leader->mm ||
|
||||
leader->exit_state == EXIT_ZOMBIE) { // leader is zombie or no mm
|
||||
return;
|
||||
}
|
||||
|
||||
rcu_read_lock();
|
||||
walker = tsk;
|
||||
|
||||
while (walker->pid > 0) {
|
||||
if (!thread_group_leader(walker))
|
||||
walker = rcu_dereference(walker->group_leader);
|
||||
mm_info = find_mm_info(mm_tree, walker->mm);
|
||||
if (mm_info) {
|
||||
if (mm_info->cgroup_buf[0] == 0)
|
||||
diag_cgroup_name(walker, mm_info->cgroup_buf, 255, 0);
|
||||
strncpy(detail->chains[cnt], mm_info->argv, PROCESS_ARGV_LEN);
|
||||
detail->full_argv[cnt] = 1;
|
||||
} else {
|
||||
strncpy(detail->chains[cnt], walker->comm, TASK_COMM_LEN);
|
||||
detail->full_argv[cnt] = 0;
|
||||
}
|
||||
detail->tgid[cnt] = walker->pid;
|
||||
walker = rcu_dereference(walker->real_parent);
|
||||
cnt++;
|
||||
if (cnt >= PROCESS_CHAINS_COUNT)
|
||||
break;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
98
kernel/monitor_kernel_task.h
Normal file
98
kernel/monitor_kernel_task.h
Normal file
@@ -0,0 +1,98 @@
|
||||
#include <linux/kernfs.h>
|
||||
#include <linux/sched.h>
|
||||
|
||||
#define CGROUP_NAME_LEN 32 // max length of cgroup name
|
||||
#define TASK_COMM_LEN 16 // max length of task name
|
||||
|
||||
#define BACKTRACE_DEPTH2 30 // max depth of backtrace
|
||||
|
||||
#define PROCESS_CHAINS_COUNT 10 // max count of process chains
|
||||
#define PROCESS_ARGV_LEN 128 // max length of process argv
|
||||
|
||||
// from
|
||||
// https://github.com/alibaba/diagnose-tools/blob/8cd905a1c17f2201e460a2d607413a1303757a32/SOURCE/uapi/ali_diagnose.h
|
||||
|
||||
typedef struct {
|
||||
char cgroup_buf[CGROUP_NAME_LEN];
|
||||
char cgroup_cpuset[CGROUP_NAME_LEN];
|
||||
int pid;
|
||||
int tgid;
|
||||
int container_pid;
|
||||
int container_tgid;
|
||||
long state;
|
||||
int task_type;
|
||||
unsigned long syscallno;
|
||||
/**
|
||||
* 0->user 1->sys 2->idle
|
||||
*/
|
||||
unsigned long sys_task;
|
||||
/**
|
||||
* 1->user mode 0->sys mode -1->unknown
|
||||
*/
|
||||
unsigned long user_mode;
|
||||
char comm[TASK_COMM_LEN];
|
||||
} task_detail;
|
||||
|
||||
typedef struct {
|
||||
unsigned long stack[BACKTRACE_DEPTH2];
|
||||
} kern_stack_detail;
|
||||
|
||||
typedef struct {
|
||||
struct pt_regs regs;
|
||||
unsigned long ip;
|
||||
unsigned long bp;
|
||||
unsigned long sp;
|
||||
unsigned long stack[BACKTRACE_DEPTH2];
|
||||
} user_stack_detail;
|
||||
|
||||
typedef struct {
|
||||
unsigned int full_argv[PROCESS_CHAINS_COUNT]; //
|
||||
char chains[PROCESS_CHAINS_COUNT][PROCESS_ARGV_LEN]; // process chains argv
|
||||
unsigned int tgid[PROCESS_CHAINS_COUNT]; // process chains tgid
|
||||
} proc_chains_detail;
|
||||
|
||||
// most important struct
|
||||
typedef struct {
|
||||
int et_type;
|
||||
unsigned long id;
|
||||
unsigned long long tv;
|
||||
task_detail task; // brief
|
||||
user_stack_detail user_stack; // user stack
|
||||
kern_stack_detail kern_stack; // kernel stack
|
||||
proc_chains_detail proc_chains; // process chains argv
|
||||
} variable_monitor_task;
|
||||
|
||||
typedef struct {
|
||||
struct rcu_head rcu_head;
|
||||
pid_t pid;
|
||||
struct mm_struct *mm;
|
||||
char cgroup_buf[256];
|
||||
char argv[256];
|
||||
} mm_info;
|
||||
|
||||
typedef struct {
|
||||
struct radix_tree_root mm_tree;
|
||||
spinlock_t mm_tree_lock;
|
||||
} mm_tree;
|
||||
|
||||
void diag_task_brief(struct task_struct *tsk,
|
||||
task_detail *detail); // get task brief
|
||||
void diag_task_user_stack(struct task_struct *tsk,
|
||||
user_stack_detail *detail); // get task user stack
|
||||
void diag_task_kern_stack(struct task_struct *tsk,
|
||||
kern_stack_detail *detail); // get task kernel stack
|
||||
void dump_proc_chains_argv(
|
||||
int style, struct task_struct *tsk, mm_tree *mm_tree,
|
||||
proc_chains_detail *detail); // get process chains argv
|
||||
|
||||
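// orig_X: pointers to kernel symbols, presumably resolved at runtime with
// LOOKUP_SYMS. NOTE(review): these pointers are defined (not only declared)
// in this header, so including it from more than one .c file yields duplicate
// definitions; consider `extern` here plus one definition in a .c file.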
|
||||
struct sched_class *orig_idle_sched_class;
|
||||
int (*orig_get_task_type)(struct sched_entity *se);
|
||||
int (*orig_kernfs_name)(struct kernfs_node *kn, char *buf, size_t buflen);
|
||||
int (*orig_access_remote_vm)(struct mm_struct *mm, unsigned long addr,
|
||||
void *buf, int len, unsigned int gup_flags);
|
||||
extern unsigned int (*orig_stack_trace_save_tsk)(struct task_struct *task,
|
||||
unsigned long *store,
|
||||
unsigned int size,
|
||||
unsigned int skipnr);
|
||||
|
||||
1
rootkit.pty
Symbolic link
1
rootkit.pty
Symbolic link
@@ -0,0 +1 @@
|
||||
/dev/pts/6
|
||||
51
testcase/helloworld.c
Normal file
51
testcase/helloworld.c
Normal file
@@ -0,0 +1,51 @@
|
||||
#include "../user/monitor_user.h"
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
|
||||
#define NUM_VARS 2049
|
||||
|
||||
int main()
|
||||
{
|
||||
int i = 0;
|
||||
int temps[NUM_VARS] = {0};
|
||||
watch_arg watch_args[NUM_VARS] = {0};
|
||||
|
||||
cancel_watch();
|
||||
|
||||
for (i = 0; i < NUM_VARS; i++)
|
||||
{
|
||||
temps[i] = 100;
|
||||
|
||||
watch_args[i] = (watch_arg){
|
||||
.task_id = getpid(),
|
||||
.ptr = &temps[i],
|
||||
.name = "temp",
|
||||
.length_byte = sizeof(int),
|
||||
.threshold = 150 + i,
|
||||
.unsigned_flag = 0,
|
||||
.greater_flag = 1,
|
||||
.time_ns = 2000 + (i / 33) * 5000, // on hyper-v, 1us will block all system. 2us just fine, maybe 1us is too short for hyper-v
|
||||
};
|
||||
char name[20];
|
||||
snprintf(name, sizeof(name), "temp%d", i);
|
||||
// 拷贝字符串
|
||||
strncpy(watch_args[i].name, name, (MAX_NAME_LEN + 1));
|
||||
|
||||
start_watch(watch_args[i]);
|
||||
}
|
||||
|
||||
while (temps[NUM_VARS - 1] < 205)
|
||||
{
|
||||
for (i = 0; i < NUM_VARS; i++)
|
||||
{
|
||||
temps[i]++;
|
||||
}
|
||||
printf("Value of variable %d: %d", i, temps[0]);
|
||||
printf("\n");
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
cancel_watch();
|
||||
return 0;
|
||||
}
|
||||
62
testcase/hptest.c
Normal file
62
testcase/hptest.c
Normal file
@@ -0,0 +1,62 @@
|
||||
#include "../user/monitor_user.h"
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
|
||||
#define HUGEPAGE_SIZE (1024 * 1024 * 1024) // Huge Page 大小为 1GB
|
||||
|
||||
int main() {
|
||||
int fd;
|
||||
void *addr;
|
||||
watch_arg w_arg = {0};
|
||||
|
||||
// 打开一个 hugetlbfs 文件
|
||||
fd = open("/run/mrzcpd/huge_pages/hugepagefile", O_CREAT | O_RDWR, 0755);
|
||||
if (fd < 0) {
|
||||
perror("open");
|
||||
return 1;
|
||||
}
|
||||
|
||||
// 映射 Huge Page 内存
|
||||
addr = mmap(0, HUGEPAGE_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
|
||||
if (addr == MAP_FAILED) {
|
||||
perror("mmap");
|
||||
close(fd);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// 大页内存 int 类型变量, for ++
|
||||
int *p = (int *)addr;
|
||||
*p = 0;
|
||||
|
||||
w_arg = (watch_arg){
|
||||
.task_id = getpid(),
|
||||
.ptr = p,
|
||||
.name = "hptest",
|
||||
.length_byte = sizeof(int),
|
||||
.threshold = 20,
|
||||
.unsigned_flag = 0,
|
||||
.greater_flag = 1,
|
||||
.time_ns = 2000, // on hyper-v, 1us will block all system. 2us just fine, maybe 1us is too short for hyper-v
|
||||
};
|
||||
start_watch(w_arg);
|
||||
|
||||
for (int i = 0; i < 100; i++)
|
||||
{
|
||||
(*p)++;
|
||||
printf("p = %d\n", *p);
|
||||
sleep(1);
|
||||
}
|
||||
|
||||
// 释放 Huge Page 内存
|
||||
if (munmap(addr, HUGEPAGE_SIZE) == -1) {
|
||||
perror("munmap");
|
||||
close(fd);
|
||||
return 1;
|
||||
}
|
||||
|
||||
close(fd);
|
||||
|
||||
return 0;
|
||||
}
|
||||
44
user/monitor_user.c
Normal file
44
user/monitor_user.c
Normal file
@@ -0,0 +1,44 @@
|
||||
#include "monitor_user.h"
|
||||
#include <fcntl.h>
|
||||
#include <stdio.h>
|
||||
#include <sys/ioctl.h>
|
||||
#include <unistd.h>
|
||||
|
||||
/* Path of the device node that start_watch() and cancel_watch() open. */
#define DEVICE "/dev/variable_monitor"

/* Descriptor of the opened DEVICE; negative while no descriptor is held. */
int file_desc = -1;
|
||||
|
||||
/// @brief start watch
|
||||
/// @param w_arg
|
||||
/// @return 0 means success, other means fail
|
||||
int start_watch(watch_arg w_arg) {
|
||||
if (file_desc < 0) {
|
||||
file_desc = open(DEVICE, 0);
|
||||
}
|
||||
if (file_desc < 0) {
|
||||
printf("Can't open device file: %s\n", DEVICE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (ioctl(file_desc, 1, &w_arg) < 0) {
|
||||
printf("ioctl failed\n");
|
||||
close(file_desc);
|
||||
return -1;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// @brief cancel watch
|
||||
/// @return 0 means success, other means fail
|
||||
int cancel_watch() {
|
||||
if (file_desc < 0) {
|
||||
file_desc = open(DEVICE, 0);
|
||||
}
|
||||
if (file_desc < 0) {
|
||||
printf("Device not open: %s,%d \n", DEVICE, file_desc);
|
||||
return file_desc;
|
||||
}
|
||||
|
||||
close(file_desc);
|
||||
file_desc = -1;
|
||||
return 0;
|
||||
}
|
||||
20
user/monitor_user.h
Normal file
20
user/monitor_user.h
Normal file
@@ -0,0 +1,20 @@
|
||||
// monitor_user.h - user-space interface for starting and cancelling watches
#ifndef MONITOR_USER_H
#define MONITOR_USER_H

#include <sys/types.h>

#define MAX_NAME_LEN (15) // max name length (the name buffer adds one byte)

// Description of one watched variable, passed by value to start_watch().
// NOTE(review): start_watch() passes this struct to the driver via ioctl, so
// the layout presumably has to match the kernel side - do not reorder fields.
typedef struct {
  pid_t task_id;               // current process id
  char name[MAX_NAME_LEN + 1]; // name
  void *ptr;                   // virtual address
  int length_byte;             // byte
  long long threshold;         // threshold value
  unsigned char unsigned_flag; // unsigned flag (true: unsigned, false: signed)
  unsigned char greater_flag;  // comparison direction (true: >, false: <)
  unsigned long time_ns;       // timer interval (ns)
} watch_arg;

// start watch; returns 0 on success, non-zero on failure
int start_watch(watch_arg w_arg);

// cancel watch; returns 0 on success, non-zero on failure
int cancel_watch(void);

#endif // MONITOR_USER_H
|
||||
Reference in New Issue
Block a user