Line data Source code
1 : /*
2 : * If not stated otherwise in this file or this component's LICENSE file the
3 : * following copyright and licenses apply:
4 : *
5 : * Copyright 2023 Sky UK
6 : *
7 : * Licensed under the Apache License, Version 2.0 (the "License");
8 : * you may not use this file except in compliance with the License.
9 : * You may obtain a copy of the License at
10 : *
11 : * http://www.apache.org/licenses/LICENSE-2.0
12 : *
13 : * Unless required by applicable law or agreed to in writing, software
14 : * distributed under the License is distributed on an "AS IS" BASIS,
15 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 : * See the License for the specific language governing permissions and
17 : * limitations under the License.
18 : */
19 :
20 : #include "HealthcheckService.h"
21 : #include "RialtoServerManagerLogging.h"
22 :
23 : namespace
24 : {
/**
 * @brief Hands out consecutive ping identifiers, starting from 0.
 *
 * The counter is a function-local static, so it is shared by every caller in this
 * translation unit and keeps its value between calls.
 *
 * @retval the identifier to use for the next ping round.
 */
int generatePingId()
{
    static int nextPingId{0};
    const int assignedId{nextPingId};
    ++nextPingId;
    return assignedId;
}
30 : } // namespace
31 :
32 : namespace rialto::servermanager::common
33 : {
34 15 : HealthcheckService::HealthcheckService(ISessionServerAppManager &sessionServerAppManager,
35 : const std::shared_ptr<firebolt::rialto::common::ITimerFactory> &timerFactory,
36 15 : std::chrono::seconds healthcheckInterval, unsigned numOfFailedPingsBeforeRecovery)
37 15 : : m_sessionServerAppManager{sessionServerAppManager},
38 15 : m_kNumOfFailedPingsBeforeRecovery{numOfFailedPingsBeforeRecovery}, m_currentPingId{-1}
39 : {
40 15 : if (std::chrono::seconds{0} != healthcheckInterval)
41 : {
42 42 : m_healthcheckTimer = timerFactory->createTimer(healthcheckInterval,
43 28 : std::bind(&HealthcheckService::sendPing, this),
44 14 : firebolt::rialto::common::TimerType::PERIODIC);
45 : }
46 15 : }
47 :
48 30 : HealthcheckService::~HealthcheckService()
49 : {
50 15 : if (m_healthcheckTimer && m_healthcheckTimer->isActive())
51 : {
52 14 : m_healthcheckTimer->cancel();
53 14 : m_healthcheckTimer.reset();
54 : }
55 30 : }
56 :
57 14 : void HealthcheckService::onPingSent(int serverId, int pingId)
58 : {
59 14 : std::unique_lock<std::mutex> lock{m_mutex};
60 14 : if (pingId != m_currentPingId)
61 : {
62 1 : RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
63 : "ping id: %d, sent pingId: %d",
64 : serverId, m_currentPingId, pingId);
65 1 : return;
66 : }
67 13 : m_remainingPings.insert(serverId);
68 13 : m_failedPings.try_emplace(serverId, std::set<int>{});
69 14 : }
70 :
71 3 : void HealthcheckService::onPingFailed(int serverId, int pingId)
72 : {
73 3 : std::unique_lock<std::mutex> lock{m_mutex};
74 3 : if (pingId != m_currentPingId)
75 : {
76 1 : RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
77 : "ping id: %d, sent pingId: %d",
78 : serverId, m_currentPingId, pingId);
79 1 : return;
80 : }
81 2 : if (m_failedPings.end() != m_failedPings.find(serverId))
82 : {
83 1 : handleError(serverId);
84 : }
85 : else
86 : {
87 1 : m_sessionServerAppManager.onSessionServerStateChanged(serverId,
88 1 : firebolt::rialto::common::SessionServerState::ERROR);
89 3 : m_failedPings.emplace(serverId, std::set<int>{pingId});
90 : }
91 : }
92 :
93 6 : void HealthcheckService::onAckReceived(int serverId, int pingId, bool success)
94 : {
95 6 : std::unique_lock<std::mutex> lock{m_mutex};
96 6 : if (pingId != m_currentPingId)
97 : {
98 2 : if (success && m_failedPings[serverId].find(pingId) != m_failedPings[serverId].end())
99 : {
100 1 : RIALTO_SERVER_MANAGER_LOG_WARN("Late ack received for server id: %d, Current ping id: %d, received ping "
101 : "id: %d. Removing from failed pings list",
102 : serverId, m_currentPingId, pingId);
103 1 : m_failedPings[serverId].erase(pingId);
104 : }
105 : else
106 : {
107 1 : RIALTO_SERVER_MANAGER_LOG_ERROR("Unexpected ack received from server id: %d. Current ping id: %d, received "
108 : "ping "
109 : "id: %d",
110 : serverId, m_currentPingId, pingId);
111 : }
112 2 : return;
113 : }
114 4 : m_remainingPings.erase(serverId);
115 4 : if (success)
116 : {
117 3 : m_failedPings[serverId].clear();
118 : }
119 : else
120 : {
121 1 : RIALTO_SERVER_MANAGER_LOG_WARN("Ack with error received from server id: %d, ping id: %d", serverId, pingId);
122 1 : handleError(serverId);
123 : }
124 6 : }
125 :
126 1 : void HealthcheckService::onServerRemoved(int serverId)
127 : {
128 1 : std::unique_lock<std::mutex> lock{m_mutex};
129 1 : m_remainingPings.erase(serverId);
130 1 : m_failedPings.erase(serverId);
131 : }
132 :
133 28 : void HealthcheckService::sendPing()
134 : {
135 28 : std::unique_lock<std::mutex> lock{m_mutex};
136 37 : for (int serverId : m_remainingPings)
137 : {
138 9 : RIALTO_SERVER_MANAGER_LOG_WARN("Ping (id: %d) timeout for server id: %d", m_currentPingId, serverId);
139 9 : handleError(serverId);
140 : }
141 28 : m_remainingPings.clear();
142 28 : m_currentPingId = generatePingId();
143 28 : RIALTO_SERVER_MANAGER_LOG_DEBUG("Start ping procedure with id: %d", m_currentPingId);
144 28 : m_sessionServerAppManager.sendPingEvents(m_currentPingId);
145 : }
146 :
147 11 : void HealthcheckService::handleError(int serverId)
148 : {
149 11 : m_sessionServerAppManager.onSessionServerStateChanged(serverId, firebolt::rialto::common::SessionServerState::ERROR);
150 11 : auto &failedPings{m_failedPings[serverId]};
151 11 : failedPings.insert(m_currentPingId);
152 11 : if (failedPings.size() >= m_kNumOfFailedPingsBeforeRecovery)
153 : {
154 2 : RIALTO_SERVER_MANAGER_LOG_WARN(
155 : "Max num of failed pings reached for server with id: %d. Starting recovery action", serverId);
156 2 : failedPings.clear();
157 2 : m_sessionServerAppManager.restartServer(serverId);
158 : }
159 11 : }
160 : } // namespace rialto::servermanager::common
|