优化ClickHouse/Druid系统监控指标

2024-11-14 17:20:57 +08:00
parent 1346da854c
commit de08ed2109
6 changed files with 148 additions and 0 deletions
--- a/clickhouse/monitor/README.md
+++ b/clickhouse/monitor/README.md
@@ -0,0 +1,5 @@
+## Description
+- run_ck_monitor：放置于/etc/cron.d目录下，每分钟执行一次
+- clickhouse_exporter.sh：输出ClickHouse系统监控指标，部署至每一台节点
+  - 放置目录/opt/tsg/olap/clickhouse/monitor
+  - 输出Metrics至/opt/tsg/olap/node-exporter/prom/clickhouse_metrics.prom文件
--- a/clickhouse/monitor/clickhouse_exporter.sh
+++ b/clickhouse/monitor/clickhouse_exporter.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Configuration
+CK_USER="default"  # Change to your own ClickHouse username
+CK_PIN="galaxy2019"
+CK_HOST="127.0.0.1"
+CK_PORT="9001"
+METRICS_FILE="/opt/tsg/olap/node-exporter/prom/clickhouse_metrics.prom"
+TIMEOUT=100
+
+# Query ClickHouse data
+query_clickhouse() {
+    clickhouse-client -h "$CK_HOST" --port "$CK_PORT" -m -u "$CK_USER" --password "$CK_PIN" --max_execution_time="$TIMEOUT" --query="$1"
+}
+
+# Check if ClickHouse service is running
+if pgrep -f "clickhouse-server/config.xml" > /dev/null; then
+    echo "# HELP clickhouse_up ClickHouse service status (1=up, 0=down)" > $METRICS_FILE
+    echo "# TYPE clickhouse_up gauge" >> $METRICS_FILE 
+    echo "clickhouse_up 1" >> $METRICS_FILE
+else
+    echo "# HELP clickhouse_up ClickHouse service status (1=up, 0=down)" > $METRICS_FILE
+    echo "# TYPE clickhouse_up gauge" >> $METRICS_FILE
+    echo "clickhouse_up 0" >> $METRICS_FILE
+    exit 1  # Exit the script if ClickHouse is not running
+fi
+
+# Record start time
+start_time=$(date +%s)
+
+# Query metrics
+process_count=$(query_clickhouse "SELECT count(*) FROM system.processes")
+merge_count=$(query_clickhouse "SELECT count(*) FROM system.merges")
+expired_parts_count=$(query_clickhouse "SELECT count() FROM system.parts WHERE delete_ttl_info_max < (now() - INTERVAL 3 DAY) AND delete_ttl_info_max > toDateTime('2000-01-01 00:00:00')")
+
+# Record end time and calculate execution time
+end_time=$(date +%s)
+execution_time=$((end_time - start_time))
+
+# Write Prometheus format metrics to the file
+{
+    echo "# HELP ck_processes_count Number of running processes in ClickHouse"
+    echo "# TYPE ck_processes_count gauge"
+    echo "ck_processes_count $process_count"
+    echo "# HELP ck_merges_count Number of running merges in ClickHouse"
+    echo "# TYPE ck_merges_count gauge"
+    echo "ck_merges_count $merge_count"
+    echo "# HELP ck_connect_time Time taken to connect and run queries in seconds"
+    echo "# TYPE ck_connect_time gauge"
+    echo "ck_connect_time $execution_time"
+    echo "# HELP ck_expired_parts_count Number of expired parts in ClickHouse"
+    echo "# TYPE ck_expired_parts_count gauge"
+    echo "ck_expired_parts_count $expired_parts_count"
+} >> "$METRICS_FILE"
+
--- a/clickhouse/monitor/run_ck_monitor
+++ b/clickhouse/monitor/run_ck_monitor
@@ -0,0 +1,3 @@
+#Ansible: Check Clickhouse up to node_exporter
+*/1 * * * * root cd /opt/tsg/olap/clickhouse/monitor && bash clickhouse_exporter.sh
+
--- a/druid/monitor/README.md
+++ b/druid/monitor/README.md
@@ -0,0 +1,6 @@
+## Description
+- run_druid_monitor：放置于/etc/cron.d目录下，每分钟执行一次
+- druid_exporter.sh：输出Druid系统监控指标，可以部署至每一台节点
+  - 放置目录/opt/tsg/olap/druid/monitor
+  - 输出Metrics至/opt/tsg/olap/node-exporter/prom/druid_metrics.prom文件
+  - 健康状态指标druid_index_task_healthy_flag：1 - healthy，0 - unhealthy
--- a/druid/monitor/druid_exporter.sh
+++ b/druid/monitor/druid_exporter.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+
+druid_ip=127.0.0.1
+druid_port=8089
+druid_monitor_prom_file=/opt/tsg/olap/node-exporter/prom/druid_metrics.prom
+
+task_name=$(find "../topology/tasks" -name "*.json")
+
+# Clear the previous metrics file to avoid appending issues
+> $druid_monitor_prom_file
+
+# Function to retrieve supervisor status information
+# Fetches the supervisor list (with state) once and appends two gauges:
+#   druid_supervisor_healthy_num   - supervisors whose state is RUNNING
+#   druid_supervisor_unhealthy_num - supervisors in any other state
+# Globals:   druid_ip, druid_port, druid_monitor_prom_file (read);
+#            druid_supervisor_healthy_num, druid_supervisor_unhealthy_num (written)
+# A count that is not a non-negative integer (request or parse failure) is
+# skipped with a warning instead of being written as an invalid sample line.
+function getSupervisorStatus() {
+  local supervisors metric label
+  supervisors=$(curl -G -d 'state=true' -s "$druid_ip:$druid_port/druid/indexer/v1/supervisor")
+
+  druid_supervisor_healthy_num=$(jq '[ .[] | select(.state == "RUNNING")] | length' <<< "$supervisors")
+  druid_supervisor_unhealthy_num=$(jq '[ .[] | select(.state != "RUNNING")] | length' <<< "$supervisors")
+
+  for metric in druid_supervisor_healthy_num druid_supervisor_unhealthy_num; do
+    if [[ ! ${!metric} =~ ^[0-9]+$ ]]; then
+      echo "druid_exporter: skipping $metric, invalid value '${!metric}'" >&2
+      continue
+    fi
+    # "healthy" or "unhealthy", derived from the metric name for the HELP text.
+    label=${metric#druid_supervisor_}
+    label=${label%_num}
+    {
+      echo "# HELP $metric Number of $label supervisors in Druid."
+      echo "# TYPE $metric gauge"
+      echo "$metric ${!metric}"
+    } >> "$druid_monitor_prom_file"
+  done
+}
+
+# Query one task-list endpoint, store the number of returned tasks in the
+# global variable named after the metric, and append it as a gauge.
+# Arguments: $1 - task type filter (e.g. index_kafka, compact)
+#            $2 - endpoint under /druid/indexer/v1 (e.g. runningTasks)
+#            $3 - metric name, also used as the global variable name
+#            $4 - HELP text
+# Returns:   1 (nothing written) when the count is not a non-negative integer.
+function _druidTaskGauge() {
+  local task_type=$1 endpoint=$2 metric=$3 help_text=$4
+  local count
+  count=$(curl -G -d "type=${task_type}" -s "$druid_ip:$druid_port/druid/indexer/v1/${endpoint}" | jq '. | length')
+  # Keep the result available under its global name for other functions.
+  printf -v "$metric" '%s' "$count"
+
+  if [[ ! $count =~ ^[0-9]+$ ]]; then
+    echo "druid_exporter: skipping $metric, invalid value '$count'" >&2
+    return 1
+  fi
+  {
+    echo "# HELP $metric $help_text"
+    echo "# TYPE $metric gauge"
+    echo "$metric $count"
+  } >> "$druid_monitor_prom_file"
+}
+
+# Function to retrieve Druid task status information
+# Appends the running/waiting/pending index_kafka task counts and the
+# pending/waiting compact task counts to the metrics file, and leaves each
+# count in a global variable named like its metric.
+function getTaskStatus() {
+  _druidTaskGauge index_kafka runningTasks druid_index_running_task_num \
+    "Number of running index_kafka tasks in Druid."
+  _druidTaskGauge index_kafka waitingTasks druid_index_waiting_task_num \
+    "Number of waiting index_kafka tasks in Druid."
+  _druidTaskGauge index_kafka pendingTasks druid_index_pending_task_num \
+    "Number of pending index_kafka tasks in Druid."
+  _druidTaskGauge compact pendingTasks druid_compact_pending_task_num \
+    "Number of pending compact tasks in Druid."
+  _druidTaskGauge compact waitingTasks druid_compact_waiting_task_num \
+    "Number of waiting compact tasks in Druid."
+}
+
+# Function to check if the running index task count matches the specified count in the configuration
+# If the running count is greater than or equal to the configured count, it is considered healthy (1), otherwise unhealthy (0)
+# Globals:   task_name, druid_index_running_task_num, druid_monitor_prom_file (read);
+#            druid_task_sum, druid_index_task_healthy_flag (written)
+# An unknown running count (empty or non-numeric, e.g. Druid was unreachable)
+# is reported as unhealthy rather than silently passing the comparison.
+function checkIndexStatus() {
+  local task_file task_count
+  druid_task_sum=0
+
+  # task_name may be a newline-separated string or an array; printing it one
+  # entry per line and reading line by line handles both.
+  while IFS= read -r task_file; do
+    [[ -n $task_file ]] || continue
+    # A spec without ioConfig.taskCount is counted as 1.
+    # NOTE(review): 1 is assumed to be Druid's default taskCount - confirm.
+    task_count=$(jq -r '.ioConfig.taskCount // 1' "$task_file")
+    if [[ $task_count =~ ^[0-9]+$ ]]; then
+      druid_task_sum=$((druid_task_sum + task_count))
+    else
+      echo "druid_exporter: ignoring $task_file, invalid taskCount '$task_count'" >&2
+    fi
+  done <<< "$(printf '%s\n' "${task_name[@]}")"
+
+  druid_index_task_healthy_flag=1
+  if [[ ! $druid_index_running_task_num =~ ^[0-9]+$ ]] \
+    || ((druid_index_running_task_num < druid_task_sum)); then
+    druid_index_task_healthy_flag=0
+  fi
+
+  {
+    echo "# HELP druid_index_task_healthy_flag Health flag for index tasks in Druid (1 = healthy, 0 = unhealthy)."
+    echo "# TYPE druid_index_task_healthy_flag gauge"
+    echo "druid_index_task_healthy_flag $druid_index_task_healthy_flag"
+  } >> "$druid_monitor_prom_file"
+}
+
+# Call the functions to collect and write the metrics
+getTaskStatus
+getSupervisorStatus
+checkIndexStatus
+
--- a/druid/monitor/run_druid_monitor
+++ b/druid/monitor/run_druid_monitor
@@ -0,0 +1,2 @@
+#Ansible: Check druid up to node_exporter
+* * * * * root cd /opt/tsg/olap/druid/monitor/ && bash druid_exporter.sh