diff --git a/clickhouse/monitor/README.md b/clickhouse/monitor/README.md
new file mode 100644
--- /dev/null
+++ b/clickhouse/monitor/README.md
@@ -0,0 +1,5 @@
+## Description
+- run_ck_monitor:放置/etc/cron.d目录下,每分钟执行一次
+- clickhouse_exporter.sh:输出ClickHouse系统监控指标,部署至每一台节点
+  - 放置目录/opt/tsg/olap/clickhouse/monitor
+  - 输出Metrics至/opt/tsg/olap/node-exporter/prom/clickhouse_metrics.prom文件
diff --git a/clickhouse/monitor/clickhouse_exporter.sh b/clickhouse/monitor/clickhouse_exporter.sh
new file mode 100755
--- /dev/null
+++ b/clickhouse/monitor/clickhouse_exporter.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+#
+# Export ClickHouse health metrics in Prometheus text format for the
+# node_exporter textfile collector. Run from cron (see run_ck_monitor).
+
+# Configuration
+CK_USER="default" # Change to your own ClickHouse username
+# NOTE(review): hard-coded credential, also passed on the command line below
+# where it may be visible in the process list - consider a protected file.
+CK_PIN="galaxy2019"
+CK_HOST="127.0.0.1"
+CK_PORT="9001"
+METRICS_FILE="/opt/tsg/olap/node-exporter/prom/clickhouse_metrics.prom"
+TIMEOUT=100
+
+# Metrics are assembled in a temporary file next to METRICS_FILE and renamed
+# into place once complete, so a reader never sees a half-written file.
+TMP_FILE="${METRICS_FILE}.$$"
+trap 'rm -f -- "$TMP_FILE"' EXIT
+
+# Query ClickHouse data: run the SQL statement in $1 and print its result.
+query_clickhouse() {
+  clickhouse-client -h "$CK_HOST" --port "$CK_PORT" -m -u "$CK_USER" --password "$CK_PIN" --max_execution_time="$TIMEOUT" --query="$1"
+}
+
+# Print one gauge ($1 = name, $2 = help text, $3 = value) to stdout.
+# Prints nothing unless the value is a non-negative integer: a failed query
+# yields an empty value, and a sample line without a value would make the
+# whole metrics file unparsable.
+write_gauge() {
+  case "$3" in
+    '' | *[!0-9]*) return 0 ;;
+  esac
+  printf '# HELP %s %s\n# TYPE %s gauge\n%s %s\n' "$1" "$2" "$1" "$1" "$3"
+}
+
+# Atomically replace METRICS_FILE with the finished temporary file.
+publish_metrics() {
+  mv -f -- "$TMP_FILE" "$METRICS_FILE"
+}
+
+# Check if ClickHouse service is running
+if pgrep -f "clickhouse-server/config.xml" > /dev/null; then
+  clickhouse_up=1
+else
+  clickhouse_up=0
+fi
+write_gauge clickhouse_up "ClickHouse service status (1=up, 0=down)" "$clickhouse_up" > "$TMP_FILE"
+
+if [ "$clickhouse_up" -eq 0 ]; then
+  publish_metrics
+  exit 1 # Exit the script if ClickHouse is not running
+fi
+
+# Record start time
+start_time=$(date +%s)
+
+# Query metrics
+process_count=$(query_clickhouse "SELECT count(*) FROM system.processes")
+merge_count=$(query_clickhouse "SELECT count(*) FROM system.merges")
+expired_parts_count=$(query_clickhouse "SELECT count() FROM system.parts WHERE delete_ttl_info_max < (now() - INTERVAL 3 DAY) AND delete_ttl_info_max > toDateTime('2000-01-01 00:00:00')")
+
+# Record end time and calculate execution time
+end_time=$(date +%s)
+execution_time=$((end_time - start_time))
+
+# Write Prometheus format metrics to the file
+{
+  write_gauge ck_processes_count "Number of running processes in ClickHouse" "$process_count"
+  write_gauge ck_merges_count "Number of running merges in ClickHouse" "$merge_count"
+  write_gauge ck_connect_time "Time taken to connect and run queries in seconds" "$execution_time"
+  write_gauge ck_expired_parts_count "Number of expired parts in ClickHouse" "$expired_parts_count"
+} >> "$TMP_FILE"
+
+publish_metrics
diff --git a/clickhouse/monitor/run_ck_monitor b/clickhouse/monitor/run_ck_monitor
new file mode 100644
--- /dev/null
+++ b/clickhouse/monitor/run_ck_monitor
@@ -0,0 +1,3 @@
+#Ansible: Check Clickhouse up to node_exporter
+*/1 * * * * root cd /opt/tsg/olap/clickhouse/monitor && bash clickhouse_exporter.sh
+
diff --git a/druid/monitor/README.md b/druid/monitor/README.md
new file mode 100644
--- /dev/null
+++ b/druid/monitor/README.md
@@ -0,0 +1,6 @@
+## Description
+- run_druid_monitor:放置/etc/cron.d目录下,每分钟执行一次
+- druid_exporter.sh:输出Druid系统监控指标,可以部署至每一台节点
+  - 放置目录/opt/tsg/olap/druid/monitor
+  - 输出Metrics至/opt/tsg/olap/node-exporter/prom/druid_metrics.prom文件
+  - 更改状态druid_index_task_healthy_flag状态:1- healthy,0- unhealthy
diff --git a/druid/monitor/druid_exporter.sh b/druid/monitor/druid_exporter.sh
new file mode 100755
--- /dev/null
+++ b/druid/monitor/druid_exporter.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+#
+# Export Druid supervisor and task metrics in Prometheus text format for the
+# node_exporter textfile collector. Run from cron (see run_druid_monitor).
+
+druid_ip=127.0.0.1
+druid_port=8089
+druid_monitor_prom_file=/opt/tsg/olap/node-exporter/prom/druid_metrics.prom
+
+# Metrics are assembled in a temporary file next to the target and renamed
+# into place once complete, so a reader never sees a half-written file.
+tmp_prom_file="${druid_monitor_prom_file}.$$"
+trap 'rm -f -- "$tmp_prom_file"' EXIT
+
+# JSON files whose .ioConfig.taskCount values are summed by checkIndexStatus.
+# The path is relative to the working directory, so start the script from its
+# own directory (the cron entry does a cd first).
+mapfile -t task_name < <(find "../topology/tasks" -name "*.json")
+
+# Print one gauge ($1 = name, $2 = help text, $3 = value) to stdout.
+# Prints nothing unless the value is a non-negative integer: a failed request
+# yields an empty value, and a sample line without a value would make the
+# whole metrics file unparsable.
+function write_gauge() {
+  case "$3" in
+    '' | *[!0-9]*) return 0 ;;
+  esac
+  printf '# HELP %s %s\n# TYPE %s gauge\n%s %s\n' "$1" "$2" "$1" "$1" "$3"
+}
+
+# Print the number of tasks returned by the indexer endpoint $1
+# (runningTasks, waitingTasks or pendingTasks) for the query string $2.
+function count_tasks() {
+  curl -G -d "$2" -s "$druid_ip:$druid_port/druid/indexer/v1/$1" | jq '. | length'
+}
+
+# Function to retrieve supervisor status information
+function getSupervisorStatus() {
+  local supervisors
+  # One request serves both counts.
+  supervisors=$(curl -G -d 'state=true' -s "$druid_ip:$druid_port/druid/indexer/v1/supervisor")
+
+  # A supervisor counts as healthy only while its state is RUNNING.
+  druid_supervisor_healthy_num=$(jq '[ .[] | select(.state == "RUNNING")] | length' <<< "$supervisors")
+  write_gauge druid_supervisor_healthy_num "Number of healthy supervisors in Druid." "$druid_supervisor_healthy_num"
+
+  druid_supervisor_unhealthy_num=$(jq '[ .[] | select(.state != "RUNNING")] | length' <<< "$supervisors")
+  write_gauge druid_supervisor_unhealthy_num "Number of unhealthy supervisors in Druid." "$druid_supervisor_unhealthy_num"
+}
+
+# Function to retrieve Druid task status information
+function getTaskStatus() {
+  druid_index_running_task_num=$(count_tasks runningTasks 'type=index_kafka')
+  write_gauge druid_index_running_task_num "Number of running index_kafka tasks in Druid." "$druid_index_running_task_num"
+
+  druid_index_waiting_task_num=$(count_tasks waitingTasks 'type=index_kafka')
+  write_gauge druid_index_waiting_task_num "Number of waiting index_kafka tasks in Druid." "$druid_index_waiting_task_num"
+
+  druid_index_pending_task_num=$(count_tasks pendingTasks 'type=index_kafka')
+  write_gauge druid_index_pending_task_num "Number of pending index_kafka tasks in Druid." "$druid_index_pending_task_num"
+
+  druid_compact_pending_task_num=$(count_tasks pendingTasks 'type=compact')
+  write_gauge druid_compact_pending_task_num "Number of pending compact tasks in Druid." "$druid_compact_pending_task_num"
+
+  druid_compact_waiting_task_num=$(count_tasks waitingTasks 'type=compact')
+  write_gauge druid_compact_waiting_task_num "Number of waiting compact tasks in Druid." "$druid_compact_waiting_task_num"
+}
+
+# Function to check if the running index task count matches the specified count in the configuration
+# If the running count is greater than or equal to the configured count, it is considered healthy (1), otherwise unhealthy (0)
+# A running count that could not be obtained is also reported as unhealthy.
+function checkIndexStatus() {
+  local var druid_task_num
+  druid_task_sum=0
+  for var in "${task_name[@]}"; do
+    druid_task_num=$(jq .ioConfig.taskCount "$var")
+    # Files without a numeric taskCount are left out of the sum.
+    [[ "$druid_task_num" =~ ^[0-9]+$ ]] || continue
+    druid_task_sum=$((druid_task_sum + druid_task_num))
+  done
+
+  druid_index_task_healthy_flag=1
+  if ! [[ "$druid_index_running_task_num" =~ ^[0-9]+$ ]] ||
+    [ "$druid_index_running_task_num" -lt "$druid_task_sum" ]; then
+    druid_index_task_healthy_flag=0
+  fi
+
+  write_gauge druid_index_task_healthy_flag "Health flag for index tasks in Druid (1 = healthy, 0 = unhealthy)." "$druid_index_task_healthy_flag"
+}
+
+# Call the functions to collect and write the metrics
+{
+  getTaskStatus
+  getSupervisorStatus
+  checkIndexStatus
+} > "$tmp_prom_file"
+mv -f -- "$tmp_prom_file" "$druid_monitor_prom_file"
diff --git a/druid/monitor/run_druid_monitor b/druid/monitor/run_druid_monitor
new file mode 100644
--- /dev/null
+++ b/druid/monitor/run_druid_monitor
@@ -0,0 +1,2 @@
+#Ansible: Check druid up to node_exporter
+* * * * * root cd /opt/tsg/olap/druid/monitor/ && bash druid_exporter.sh