LCOV - code coverage report
Current view: top level - serverManager/common/source - HealthcheckService.cpp (source / functions) Coverage Total Hit
Test: coverage.info Lines: 100.0 % 70 70
Test Date: 2026-05-14 05:59:51 Functions: 100.0 % 9 9

            Line data    Source code
       1              : /*
       2              :  * If not stated otherwise in this file or this component's LICENSE file the
       3              :  * following copyright and licenses apply:
       4              :  *
       5              :  * Copyright 2023 Sky UK
       6              :  *
       7              :  * Licensed under the Apache License, Version 2.0 (the "License");
       8              :  * you may not use this file except in compliance with the License.
       9              :  * You may obtain a copy of the License at
      10              :  *
      11              :  * http://www.apache.org/licenses/LICENSE-2.0
      12              :  *
      13              :  * Unless required by applicable law or agreed to in writing, software
      14              :  * distributed under the License is distributed on an "AS IS" BASIS,
      15              :  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      16              :  * See the License for the specific language governing permissions and
      17              :  * limitations under the License.
      18              :  */
      19              : 
      20              : #include "HealthcheckService.h"
      21              : #include "RialtoServerManagerLogging.h"
      22              : 
      23              : namespace
      24              : {
      25           28 : int generatePingId()
      26              : {
      27              :     static int id{0};
      28           28 :     return id++;
      29              : }
      30              : } // namespace
      31              : 
      32              : namespace rialto::servermanager::common
      33              : {
      34           15 : HealthcheckService::HealthcheckService(ISessionServerAppManager &sessionServerAppManager,
      35              :                                        const std::shared_ptr<firebolt::rialto::common::ITimerFactory> &timerFactory,
      36           15 :                                        std::chrono::seconds healthcheckInterval, unsigned numOfFailedPingsBeforeRecovery)
      37           15 :     : m_sessionServerAppManager{sessionServerAppManager},
      38           15 :       m_kNumOfFailedPingsBeforeRecovery{numOfFailedPingsBeforeRecovery}, m_currentPingId{-1}
      39              : {
      40           15 :     if (std::chrono::seconds{0} != healthcheckInterval)
      41              :     {
      42           42 :         m_healthcheckTimer = timerFactory->createTimer(healthcheckInterval,
      43           28 :                                                        std::bind(&HealthcheckService::sendPing, this),
      44           14 :                                                        firebolt::rialto::common::TimerType::PERIODIC);
      45              :     }
      46           15 : }
      47              : 
      48           30 : HealthcheckService::~HealthcheckService()
      49              : {
      50           15 :     if (m_healthcheckTimer && m_healthcheckTimer->isActive())
      51              :     {
      52           14 :         m_healthcheckTimer->cancel();
      53           14 :         m_healthcheckTimer.reset();
      54              :     }
      55           30 : }
      56              : 
      57           14 : void HealthcheckService::onPingSent(int serverId, int pingId)
      58              : {
      59           14 :     std::unique_lock<std::mutex> lock{m_mutex};
      60           14 :     if (pingId != m_currentPingId)
      61              :     {
      62            1 :         RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
      63              :                                         "ping id: %d, sent pingId: %d",
      64              :                                         serverId, m_currentPingId, pingId);
      65            1 :         return;
      66              :     }
      67           13 :     m_remainingPings.insert(serverId);
      68           13 :     m_failedPings.try_emplace(serverId, std::set<int>{});
      69           14 : }
      70              : 
      71            3 : void HealthcheckService::onPingFailed(int serverId, int pingId)
      72              : {
      73            3 :     std::unique_lock<std::mutex> lock{m_mutex};
      74            3 :     if (pingId != m_currentPingId)
      75              :     {
      76            1 :         RIALTO_SERVER_MANAGER_LOG_ERROR("Something went seriously wrong. Ping sent with wrong id to server: %d, valid "
      77              :                                         "ping id: %d, sent pingId: %d",
      78              :                                         serverId, m_currentPingId, pingId);
      79            1 :         return;
      80              :     }
      81            2 :     if (m_failedPings.end() != m_failedPings.find(serverId))
      82              :     {
      83            1 :         handleError(serverId);
      84              :     }
      85              :     else
      86              :     {
      87            1 :         m_sessionServerAppManager.onSessionServerStateChanged(serverId,
      88            1 :                                                               firebolt::rialto::common::SessionServerState::ERROR);
      89            3 :         m_failedPings.emplace(serverId, std::set<int>{pingId});
      90              :     }
      91              : }
      92              : 
      93            6 : void HealthcheckService::onAckReceived(int serverId, int pingId, bool success)
      94              : {
      95            6 :     std::unique_lock<std::mutex> lock{m_mutex};
      96            6 :     if (pingId != m_currentPingId)
      97              :     {
      98            2 :         if (success && m_failedPings[serverId].find(pingId) != m_failedPings[serverId].end())
      99              :         {
     100            1 :             RIALTO_SERVER_MANAGER_LOG_WARN("Late ack received for server id: %d, Current ping id: %d, received ping "
     101              :                                            "id: %d. Removing from failed pings list",
     102              :                                            serverId, m_currentPingId, pingId);
     103            1 :             m_failedPings[serverId].erase(pingId);
     104              :         }
     105              :         else
     106              :         {
     107            1 :             RIALTO_SERVER_MANAGER_LOG_ERROR("Unexpected ack received from server id: %d. Current ping id: %d, received "
     108              :                                             "ping "
     109              :                                             "id: %d",
     110              :                                             serverId, m_currentPingId, pingId);
     111              :         }
     112            2 :         return;
     113              :     }
     114            4 :     m_remainingPings.erase(serverId);
     115            4 :     if (success)
     116              :     {
     117            3 :         m_failedPings[serverId].clear();
     118              :     }
     119              :     else
     120              :     {
     121            1 :         RIALTO_SERVER_MANAGER_LOG_WARN("Ack with error received from server id: %d, ping id: %d", serverId, pingId);
     122            1 :         handleError(serverId);
     123              :     }
     124            6 : }
     125              : 
     126            1 : void HealthcheckService::onServerRemoved(int serverId)
     127              : {
     128            1 :     std::unique_lock<std::mutex> lock{m_mutex};
     129            1 :     m_remainingPings.erase(serverId);
     130            1 :     m_failedPings.erase(serverId);
     131              : }
     132              : 
     133           28 : void HealthcheckService::sendPing()
     134              : {
     135           28 :     std::unique_lock<std::mutex> lock{m_mutex};
     136           37 :     for (int serverId : m_remainingPings)
     137              :     {
     138            9 :         RIALTO_SERVER_MANAGER_LOG_WARN("Ping (id: %d) timeout for server id: %d", m_currentPingId, serverId);
     139            9 :         handleError(serverId);
     140              :     }
     141           28 :     m_remainingPings.clear();
     142           28 :     m_currentPingId = generatePingId();
     143           28 :     RIALTO_SERVER_MANAGER_LOG_DEBUG("Start ping procedure with id: %d", m_currentPingId);
     144           28 :     m_sessionServerAppManager.sendPingEvents(m_currentPingId);
     145              : }
     146              : 
     147           11 : void HealthcheckService::handleError(int serverId)
     148              : {
     149           11 :     m_sessionServerAppManager.onSessionServerStateChanged(serverId, firebolt::rialto::common::SessionServerState::ERROR);
     150           11 :     auto &failedPings{m_failedPings[serverId]};
     151           11 :     failedPings.insert(m_currentPingId);
     152           11 :     if (failedPings.size() >= m_kNumOfFailedPingsBeforeRecovery)
     153              :     {
     154            2 :         RIALTO_SERVER_MANAGER_LOG_WARN(
     155              :             "Max num of failed pings reached for server with id: %d. Starting recovery action", serverId);
     156            2 :         failedPings.clear();
     157            2 :         m_sessionServerAppManager.restartServer(serverId);
     158              :     }
     159           11 : }
     160              : } // namespace rialto::servermanager::common
        

Generated by: LCOV version 2.0-1