Line data Source code
1 : /*
2 : * If not stated otherwise in this file or this component's LICENSE file the
3 : * following copyright and licenses apply:
4 : *
5 : * Copyright 2023 Sky UK
6 : *
7 : * Licensed under the Apache License, Version 2.0 (the "License");
8 : * you may not use this file except in compliance with the License.
9 : * You may obtain a copy of the License at
10 : *
11 : * http://www.apache.org/licenses/LICENSE-2.0
12 : *
13 : * Unless required by applicable law or agreed to in writing, software
14 : * distributed under the License is distributed on an "AS IS" BASIS,
15 : * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 : * See the License for the specific language governing permissions and
17 : * limitations under the License.
18 : */
19 :
20 : #include "HealthcheckService.h"
21 : #include "RialtoServerManagerLogging.h"
22 :
23 : namespace
24 : {
/**
 * @brief Hands out consecutive ping identifiers.
 *
 * The first call returns 0 and every following call returns a value greater by one.
 * The counter is a function-local static, so it is shared by every caller in this
 * translation unit.
 *
 * @retval the identifier for the next ping round
 */
int generatePingId()
{
    static int nextId{0};
    const int issuedId{nextId};
    ++nextId;
    return issuedId;
}
30 : } // namespace
31 :
32 : namespace rialto::servermanager::common
33 : {
34 14 : HealthcheckService::HealthcheckService(ISessionServerAppManager &sessionServerAppManager,
35 : const std::shared_ptr<firebolt::rialto::common::ITimerFactory> &timerFactory,
36 14 : std::chrono::seconds healthcheckInterval, unsigned numOfFailedPingsBeforeRecovery)
37 14 : : m_sessionServerAppManager{sessionServerAppManager},
38 14 : m_kNumOfFailedPingsBeforeRecovery{numOfFailedPingsBeforeRecovery}, m_currentPingId{-1}
39 : {
40 14 : if (std::chrono::seconds{0} != healthcheckInterval)
41 : {
42 39 : m_healthcheckTimer = timerFactory->createTimer(healthcheckInterval,
43 26 : std::bind(&HealthcheckService::sendPing, this),
44 13 : firebolt::rialto::common::TimerType::PERIODIC);
45 : }
46 14 : }
47 :
48 28 : HealthcheckService::~HealthcheckService()
49 : {
50 14 : if (m_healthcheckTimer && m_healthcheckTimer->isActive())
51 : {
52 13 : m_healthcheckTimer->cancel();
53 13 : m_healthcheckTimer.reset();
54 : }
55 28 : }
56 :
57 12 : void HealthcheckService::onPingSent(int serverId, int pingId)
58 : {
59 12 : std::unique_lock<std::mutex> lock{m_mutex};
60 12 : if (pingId != m_currentPingId)
61 : {
62 1 : RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
63 : "ping id: %d, sent pingId: %d",
64 : serverId, m_currentPingId, pingId);
65 1 : return;
66 : }
67 11 : m_remainingPings.insert(serverId);
68 11 : m_failedPings.try_emplace(serverId, 0);
69 12 : }
70 :
71 3 : void HealthcheckService::onPingFailed(int serverId, int pingId)
72 : {
73 3 : std::unique_lock<std::mutex> lock{m_mutex};
74 3 : if (pingId != m_currentPingId)
75 : {
76 1 : RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
77 : "ping id: %d, sent pingId: %d",
78 : serverId, m_currentPingId, pingId);
79 1 : return;
80 : }
81 2 : if (m_failedPings.end() != m_failedPings.find(serverId))
82 : {
83 1 : handleError(serverId);
84 : }
85 : else
86 : {
87 1 : m_sessionServerAppManager.onSessionServerStateChanged(serverId,
88 1 : firebolt::rialto::common::SessionServerState::ERROR);
89 1 : m_failedPings.emplace(serverId, 1);
90 : }
91 3 : }
92 :
93 5 : void HealthcheckService::onAckReceived(int serverId, int pingId, bool success)
94 : {
95 5 : std::unique_lock<std::mutex> lock{m_mutex};
96 5 : if (pingId != m_currentPingId)
97 : {
98 1 : RIALTO_SERVER_MANAGER_LOG_WARN("Unexpected ack received from server id: %d. Current ping id: %d, received ping "
99 : "id: %d",
100 : serverId, m_currentPingId, pingId);
101 1 : return;
102 : }
103 4 : m_remainingPings.erase(serverId);
104 4 : if (success)
105 : {
106 3 : m_failedPings[serverId] = 0;
107 : }
108 : else
109 : {
110 1 : RIALTO_SERVER_MANAGER_LOG_WARN("Ack with error received from server id: %d, ping id: %d", serverId, pingId);
111 1 : handleError(serverId);
112 : }
113 5 : }
114 :
115 1 : void HealthcheckService::onServerRemoved(int serverId)
116 : {
117 1 : std::unique_lock<std::mutex> lock{m_mutex};
118 1 : m_remainingPings.erase(serverId);
119 1 : m_failedPings.erase(serverId);
120 : }
121 :
122 25 : void HealthcheckService::sendPing()
123 : {
124 25 : std::unique_lock<std::mutex> lock{m_mutex};
125 32 : for (int serverId : m_remainingPings)
126 : {
127 7 : RIALTO_SERVER_MANAGER_LOG_WARN("Ping (id: %d) timeout for server id: %d", m_currentPingId, serverId);
128 7 : handleError(serverId);
129 : }
130 25 : m_remainingPings.clear();
131 25 : m_currentPingId = generatePingId();
132 25 : RIALTO_SERVER_MANAGER_LOG_DEBUG("Start ping procedure with id: %d", m_currentPingId);
133 25 : m_sessionServerAppManager.sendPingEvents(m_currentPingId);
134 : }
135 :
136 9 : void HealthcheckService::handleError(int serverId)
137 : {
138 9 : m_sessionServerAppManager.onSessionServerStateChanged(serverId, firebolt::rialto::common::SessionServerState::ERROR);
139 9 : unsigned &failedPingsNum{m_failedPings[serverId]};
140 9 : if (++failedPingsNum >= m_kNumOfFailedPingsBeforeRecovery)
141 : {
142 2 : RIALTO_SERVER_MANAGER_LOG_WARN(
143 : "Max num of failed pings reached for server with id: %d. Starting recovery action", serverId);
144 2 : failedPingsNum = 0;
145 2 : m_sessionServerAppManager.restartServer(serverId);
146 : }
147 9 : }
148 : } // namespace rialto::servermanager::common
|