159 lines
5.3 KiB
C++
159 lines
5.3 KiB
C++
#include <arpa/inet.h>
|
|
#include <netinet/tcp.h>
|
|
#include <event2/bufferevent.h>
|
|
#include <event2/event.h>
|
|
#include <event2/buffer.h>
|
|
#include <unistd.h>
|
|
#include <assert.h>
|
|
#include <sys/prctl.h>
|
|
#include <stdlib.h>
|
|
#include <systemd/sd-daemon.h>
|
|
|
|
#include <proxy.h>
|
|
#include <platform.h>
|
|
#include <tfe_utils.h>
|
|
#include <watchdog_tfe.h>
|
|
#include <MESA/MESA_prof_load.h>
|
|
|
|
/*
 * State of the TFE worker-thread watchdog.
 *
 * One instance is allocated and filled in by watchdog_tfe_create(); the
 * periodic timer callback reads the configuration fields and updates the
 * cur_time_window_* fields.
 */
struct watchdog_tfe
{
    /* Handles supplied by, or created in, watchdog_tfe_create(). */
    struct tfe_proxy *proxy;      /* proxy argument of watchdog_tfe_create() */
    struct event_base *ev_base;   /* event base dispatched by the watchdog thread */
    pthread_t pthread;            /* thread handle filled in by pthread_create() */
    const char *profile;          /* profile argument of watchdog_tfe_create() */
    void *logger;                 /* handle passed to the TFE_LOG_* macros */

    /* Settings loaded from the "watchdog_tfe" profile section. */
    unsigned enable;              /* key "enable"; 0 skips timer/thread setup */
    unsigned timeout_seconds;     /* key "timeout_seconds" */
    unsigned statistics_window;   /* key "statistics_window", added to tv_sec */
    unsigned timeout_cnt_as_fail; /* key "timeout_cnt_as_fail" */
    unsigned timeout_debug;       /* key "timeout_debug"; non-zero => abort() on timeout */

    /* Running statistics, in CLOCK_MONOTONIC seconds. */
    unsigned cur_time_window_fail_cnt; /* timeouts counted in the current window */
    time_t cur_time_window_begin;      /* tv_sec at which the window was opened */
    time_t cur_time_window_end;        /* begin + statistics_window */
};
|
|
|
|
void *watchdog_tfe_thread(void *arg)
|
|
{
|
|
char thread_name[16];
|
|
snprintf(thread_name, sizeof(thread_name), "watchdog:tfe");
|
|
prctl(PR_SET_NAME, (unsigned long long)thread_name, NULL, NULL, NULL);
|
|
|
|
struct watchdog_tfe *__ctx = (struct watchdog_tfe *)arg;
|
|
while (event_base_dispatch(__ctx->ev_base) >= 0)
|
|
{
|
|
}
|
|
DIE("Watchdog TFE thread is terminated.");
|
|
}
|
|
|
|
/*
 * Timer callback executed on the watchdog event base.
 *
 * fd and what are the standard libevent callback arguments and are unused.
 * arg is the struct tfe_proxy * registered with event_new().
 *
 * On every invocation:
 *   1. The statistics window is restarted if the current CLOCK_MONOTONIC
 *      second is past cur_time_window_end.
 *   2. Each worker thread's `lastime` is compared against the current time.
 *      A worker whose lastime + timeout_seconds is older than now counts as
 *      timed out. With timeout_debug set the process abort()s immediately;
 *      otherwise the window fail counter is incremented, and once it reaches
 *      timeout_cnt_as_fail the notification becomes "WATCHDOG=trigger".
 *   3. If systemd expects watchdog keep-alives, the resulting state
 *      ("WATCHDOG=1" or "WATCHDOG=trigger") is sent through sd_notify().
 */
static void watchdog_tfe_thread_handle(evutil_socket_t fd, short what, void *arg)
{
    struct tfe_proxy *proxy = (struct tfe_proxy *)arg;
    struct watchdog_tfe *ctx = proxy->watchdog_tfe;
    const char *check_result = "WATCHDOG=1";
    struct timespec now;

    (void)fd;
    (void)what;

    /*
     * NOTE(review): proxy->watchdog_tfe is presumably assigned by the caller
     * of watchdog_tfe_create() after it returns, while this thread is already
     * running - confirm. Skip the tick instead of dereferencing NULL.
     */
    if (ctx == NULL)
    {
        return;
    }

    clock_gettime(CLOCK_MONOTONIC, &now);

    /* Open a fresh statistics window once the previous one has elapsed. */
    if (now.tv_sec > ctx->cur_time_window_end)
    {
        ctx->cur_time_window_begin = now.tv_sec;
        ctx->cur_time_window_end = now.tv_sec + ctx->statistics_window;
        ctx->cur_time_window_fail_cnt = 0;
    }

    for (unsigned int i = 0; i < proxy->nr_work_threads; i++)
    {
        time_t last_seen = ATOMIC_READ(&(proxy->work_threads[i]->lastime));

        /* Worker refreshed its timestamp recently enough: nothing to report. */
        if (last_seen + ctx->timeout_seconds >= now.tv_sec)
        {
            continue;
        }

        if (ctx->timeout_debug)
        {
            /* Debug mode: die right away on the first stalled worker. */
            TFE_LOG_ERROR(ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, Exit !!!",
                          (long)now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, (long)last_seen);
            abort();
        }

        ctx->cur_time_window_fail_cnt++;
        TFE_LOG_ERROR(ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, fail count %u !!!",
                      (long)now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, (long)last_seen, ctx->cur_time_window_fail_cnt);

        if (ctx->cur_time_window_fail_cnt >= ctx->timeout_cnt_as_fail)
        {
            TFE_LOG_ERROR(ctx->logger, "Frome %ld to %ld, there are %u timeouts of the worker threads, Ready to Exit !!!",
                          (long)ctx->cur_time_window_begin, (long)ctx->cur_time_window_end, ctx->cur_time_window_fail_cnt);
            check_result = "WATCHDOG=trigger";
        }
    }

    /*
     * sd_watchdog_enabled() returns > 0 when keep-alives are expected, 0 when
     * they are not, and a negative errno-style code on failure; only the
     * first case should lead to a notification.
     */
    if (sd_watchdog_enabled(0, NULL) > 0)
    {
        sd_notify(0, check_result);
    }
}
|
|
|
|
struct watchdog_tfe *watchdog_tfe_create(struct tfe_proxy *proxy, const char *profile, void *logger)
|
|
{
|
|
struct watchdog_tfe *__ctx = ALLOC(struct watchdog_tfe, 1);
|
|
int ret = 0;
|
|
struct event *ev = NULL;
|
|
// The worker thread updates the timestamp every two seconds
|
|
// The watchdog thread checks the timestamp every second
|
|
struct timeval timer_delay = {1, 0};
|
|
|
|
__ctx->proxy = proxy;
|
|
__ctx->profile = profile;
|
|
__ctx->logger = logger;
|
|
|
|
MESA_load_profile_uint_def(profile, "watchdog_tfe", "enable", &(__ctx->enable), 1);
|
|
MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_seconds", &(__ctx->timeout_seconds), 5);
|
|
MESA_load_profile_uint_def(profile, "watchdog_tfe", "statistics_window", &(__ctx->statistics_window), 20);
|
|
MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_cnt_as_fail", &(__ctx->timeout_cnt_as_fail), 3);
|
|
MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_debug", &(__ctx->timeout_debug), 0);
|
|
|
|
if (!__ctx->enable)
|
|
{
|
|
return __ctx;
|
|
}
|
|
|
|
struct timespec now;
|
|
clock_gettime(CLOCK_MONOTONIC, &now);
|
|
__ctx->cur_time_window_begin = now.tv_sec;
|
|
__ctx->cur_time_window_end = now.tv_sec + __ctx->statistics_window;
|
|
__ctx->cur_time_window_fail_cnt = 0;
|
|
|
|
__ctx->ev_base = event_base_new();
|
|
if (!__ctx->ev_base)
|
|
{
|
|
TFE_LOG_ERROR(__ctx->logger, "Fail to create event base: %s", strerror(errno));
|
|
errno = 0;
|
|
goto errout;
|
|
}
|
|
|
|
ev = event_new(__ctx->ev_base, -1, EV_PERSIST, watchdog_tfe_thread_handle, proxy);
|
|
if (unlikely(ev == NULL))
|
|
{
|
|
TFE_LOG_ERROR(__ctx->logger, "Fail to create tfe watchdog event");
|
|
errno = 0;
|
|
goto errout;
|
|
}
|
|
evtimer_add(ev, &timer_delay);
|
|
|
|
ret = pthread_create(&__ctx->pthread, NULL, watchdog_tfe_thread, (void *)__ctx);
|
|
if (unlikely(ret < 0))
|
|
{
|
|
TFE_LOG_ERROR(__ctx->logger, "Fail to create tfe watchdog thread: %s", strerror(errno));
|
|
errno = 0;
|
|
goto errout;
|
|
}
|
|
|
|
TFE_LOG_INFO(__ctx->logger, "Watchdog TFE module init successfully.");
|
|
return __ctx;
|
|
|
|
errout:
|
|
return NULL;
|
|
}; |