#include #include #include #include #include #include #include #include #include #include #include #include #include #include struct watchdog_tfe { struct tfe_proxy *proxy; struct event_base *ev_base; pthread_t pthread; const char *profile; void *logger; unsigned int enable; unsigned int timeout_seconds; unsigned int statistics_window; unsigned int timeout_cnt_as_fail; unsigned int timeout_debug; unsigned int cur_time_window_fail_cnt; time_t cur_time_window_begin; time_t cur_time_window_end; }; void *watchdog_tfe_thread(void *arg) { char thread_name[16]; snprintf(thread_name, sizeof(thread_name), "watchdog:tfe"); prctl(PR_SET_NAME, (unsigned long long)thread_name, NULL, NULL, NULL); struct watchdog_tfe *__ctx = (struct watchdog_tfe *)arg; while (event_base_dispatch(__ctx->ev_base) >= 0) { } DIE("Watchdog TFE thread is terminated."); } static void watchdog_tfe_thread_handle(evutil_socket_t fd, short what, void *arg) { struct tfe_proxy *proxy = (struct tfe_proxy *)arg; struct watchdog_tfe *__ctx = proxy->watchdog_tfe; struct timespec now; time_t temp; clock_gettime(CLOCK_MONOTONIC, &now); if (now.tv_sec > __ctx->cur_time_window_end) { __ctx->cur_time_window_begin = now.tv_sec; __ctx->cur_time_window_end = now.tv_sec + __ctx->statistics_window; __ctx->cur_time_window_fail_cnt = 0; } for (unsigned int i = 0; i < proxy->nr_work_threads; i++) { temp = ATOMIC_READ(&(proxy->work_threads[i]->lastime)); if (temp + __ctx->timeout_seconds < now.tv_sec) { if (__ctx->timeout_debug) { TFE_LOG_ERROR(__ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, Exit !!!", now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, temp); abort(); } else { __ctx->cur_time_window_fail_cnt++; TFE_LOG_ERROR(__ctx->logger, "Current timestamp is %ld, Worker thread[%d] tid %d timestamp is %ld, Worker thread timeout, fail count %d !!!", now.tv_sec, proxy->work_threads[i]->thread_id, proxy->work_threads[i]->readable_tid, temp, __ctx->cur_time_window_fail_cnt); if (__ctx->cur_time_window_fail_cnt >= __ctx->timeout_cnt_as_fail) { TFE_LOG_ERROR(__ctx->logger, "Frome %ld to %ld, there are %d timeouts of the worker threads, Exit !!!", __ctx->cur_time_window_begin, __ctx->cur_time_window_end, __ctx->cur_time_window_fail_cnt); exit(-1); } } } } } struct watchdog_tfe *watchdog_tfe_create(struct tfe_proxy *proxy, const char *profile, void *logger) { struct watchdog_tfe *__ctx = ALLOC(struct watchdog_tfe, 1); int ret = 0; struct event *ev = NULL; // The worker thread updates the timestamp every two seconds // The watchdog thread checks the timestamp every second struct timeval timer_delay = {1, 0}; __ctx->proxy = proxy; __ctx->profile = profile; __ctx->logger = logger; MESA_load_profile_uint_def(profile, "watchdog_tfe", "enable", &(__ctx->enable), 1); MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_seconds", &(__ctx->timeout_seconds), 5); MESA_load_profile_uint_def(profile, "watchdog_tfe", "statistics_window", &(__ctx->statistics_window), 20); MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_cnt_as_fail", &(__ctx->timeout_cnt_as_fail), 3); MESA_load_profile_uint_def(profile, "watchdog_tfe", "timeout_debug", &(__ctx->timeout_debug), 0); if (!__ctx->enable) { return __ctx; } struct timespec now; clock_gettime(CLOCK_MONOTONIC, &now); __ctx->cur_time_window_begin = now.tv_sec; __ctx->cur_time_window_end = now.tv_sec + __ctx->statistics_window; __ctx->cur_time_window_fail_cnt = 0; __ctx->ev_base = event_base_new(); if (!__ctx->ev_base) { TFE_LOG_ERROR(__ctx->logger, "Fail to create event base: %s", strerror(errno)); errno = 0; goto errout; } ev = event_new(__ctx->ev_base, -1, EV_PERSIST, watchdog_tfe_thread_handle, proxy); if (unlikely(ev == NULL)) { TFE_LOG_ERROR(__ctx->logger, "Fail to create tfe watchdog event"); errno = 0; goto errout; } evtimer_add(ev, &timer_delay); ret = pthread_create(&__ctx->pthread, NULL, watchdog_tfe_thread, (void *)__ctx); if (unlikely(ret < 0)) { TFE_LOG_ERROR(__ctx->logger, "Fail to create tfe watchdog thread: %s", strerror(errno)); errno = 0; goto errout; } TFE_LOG_INFO(__ctx->logger, "Watchdog TFE module init successfully."); return __ctx; errout: return NULL; };