284 lines
11 KiB
C++
284 lines
11 KiB
C++
/*
|
|
* Copyright (C) 2022 The Android Open Source Project
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*/
|
|
#pragma once
|
|
|
|
#include <chrono>
|
|
#include <functional>
|
|
#include <future>
|
|
#include <optional>
|
|
#include <queue>
|
|
#include <stack>
|
|
#include <string>
|
|
#include <type_traits>
|
|
#include <unordered_map>
|
|
#include <unordered_set>
|
|
#include <variant>
|
|
#include <utility>
|
|
|
|
#include "aemu/base/AndroidHealthMonitorConsumer.h"
|
|
#include "aemu/base/synchronization/AndroidConditionVariable.h"
|
|
#include "aemu/base/synchronization/AndroidLock.h"
|
|
#include "aemu/base/threads/AndroidThread.h"
|
|
|
|
#include <log/log.h>
|
|
|
|
// Makes EventHangMetadata visible at global scope for every file that includes this header.
using android::base::guest::EventHangMetadata;
|
|
|
|
// Creates a HealthWatchdogBuilder for the monitor pointed to by `healthMonitorPtr`, recording
// the call site (__FILE__, __func__, __LINE__) and `msg` in the builder's hang metadata.
// The pointee type of `healthMonitorPtr` selects the HealthWatchdogBuilder specialization; the
// pointer is only dereferenced inside decltype, i.e. it is never evaluated there.
// Both macro arguments are parenthesized so arbitrary expressions can be passed safely.
#define WATCHDOG_BUILDER(healthMonitorPtr, msg)                                                 \
    ::android::base::guest::HealthWatchdogBuilder<std::decay_t<decltype(*(healthMonitorPtr))>>( \
        (healthMonitorPtr), __FILE__, __func__, (msg), __LINE__)
|
|
|
|
namespace android {
|
|
namespace base {
|
|
namespace guest {
|
|
|
|
using android::base::guest::ConditionVariable;
|
|
using android::base::guest::Lock;
|
|
using std::chrono::duration;
|
|
using std::chrono::steady_clock;
|
|
using std::chrono::time_point;
|
|
using HangAnnotations = EventHangMetadata::HangAnnotations;
|
|
|
|
// Default heartbeat interval, in milliseconds (default argument of the HealthMonitor
// constructor and of CreateHealthMonitor).
static constexpr uint64_t kDefaultIntervalMs = 1'000;
// Default task timeout, in milliseconds (default argument of startMonitoringTask and of the
// HealthWatchdog constructor).
static constexpr uint64_t kDefaultTimeoutMs = 5'000;
// A one-nanosecond duration. Not used in this header; presumably consumed by the
// implementation file - confirm there.
static constexpr std::chrono::nanoseconds kTimeEpsilon(1);
|
|
|
|
// HealthMonitor provides the ability to register arbitrary start/touch/stop events associated
|
|
// with client defined tasks. At some pre-defined interval, it will periodically consume
|
|
// all logged events to assess whether the system is hanging on any task. Via the
|
|
// HealthMonitorConsumer, it will log hang and unhang events when it detects tasks hanging/resuming.
|
|
// Design doc: http://go/gfxstream-health-monitor
|
|
template <class Clock = steady_clock>
|
|
class HealthMonitor : public android::base::guest::Thread {
|
|
public:
|
|
// Alias for task id.
|
|
using Id = uint64_t;
|
|
|
|
// Constructor
|
|
// `heatbeatIntervalMs` is the interval, in milleseconds, that the thread will sleep for
|
|
// in between health checks.
|
|
HealthMonitor(HealthMonitorConsumer& consumer, uint64_t heartbeatInterval = kDefaultIntervalMs);
|
|
|
|
// Destructor
|
|
// Enqueues an event to end monitoring and waits on thread to process remaining queued events.
|
|
~HealthMonitor();
|
|
|
|
// Start monitoring a task. Returns an id that is used for touch and stop operations.
|
|
// `metadata` is a struct containing info on the task watchdog to be passed through to the
|
|
// metrics logger.
|
|
// `onHangAnnotationsCallback` is an optional containing a callable that will return key-value
|
|
// string pairs to be recorded at the time a hang is detected, which is useful for debugging.
|
|
// `timeout` is the duration in milliseconds a task is allowed to run before it's
|
|
// considered "hung". Because `timeout` must be larger than the monitor's heartbeat
|
|
// interval, as shorter timeout periods would not be detected, this method will set actual
|
|
// timeout to the lesser of `timeout` and twice the heartbeat interval.
|
|
// `parentId` can be the Id of another task. Events in this monitored task will update
|
|
// the parent task recursively.
|
|
Id startMonitoringTask(std::unique_ptr<EventHangMetadata> metadata,
|
|
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
|
|
onHangAnnotationsCallback = std::nullopt,
|
|
uint64_t timeout = kDefaultTimeoutMs,
|
|
std::optional<Id> parentId = std::nullopt);
|
|
|
|
// Touch a monitored task. Resets the timeout countdown for that task.
|
|
void touchMonitoredTask(Id id);
|
|
|
|
// Stop monitoring a task.
|
|
void stopMonitoringTask(Id id);
|
|
|
|
private:
|
|
using Duration = typename Clock::duration; // duration<double>;
|
|
using Timestamp = time_point<Clock, Duration>;
|
|
|
|
// Allow test class access to private functions
|
|
friend class HealthMonitorTest;
|
|
|
|
struct MonitoredEventType {
|
|
struct Start {
|
|
Id id;
|
|
std::unique_ptr<EventHangMetadata> metadata;
|
|
Timestamp timeOccurred;
|
|
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
|
|
onHangAnnotationsCallback;
|
|
Duration timeoutThreshold;
|
|
std::optional<Id> parentId;
|
|
};
|
|
struct Touch {
|
|
Id id;
|
|
Timestamp timeOccurred;
|
|
};
|
|
struct Stop {
|
|
Id id;
|
|
Timestamp timeOccurred;
|
|
};
|
|
struct EndMonitoring {};
|
|
struct Poll {
|
|
std::promise<void> complete;
|
|
};
|
|
};
|
|
|
|
using MonitoredEvent =
|
|
std::variant<std::monostate, typename MonitoredEventType::Start,
|
|
typename MonitoredEventType::Touch, typename MonitoredEventType::Stop,
|
|
typename MonitoredEventType::EndMonitoring, typename MonitoredEventType::Poll>;
|
|
|
|
struct MonitoredTask {
|
|
Id id;
|
|
Timestamp timeoutTimestamp;
|
|
Duration timeoutThreshold;
|
|
std::optional<Timestamp> hungTimestamp;
|
|
std::unique_ptr<EventHangMetadata> metadata;
|
|
std::optional<std::function<std::unique_ptr<HangAnnotations>()>> onHangAnnotationsCallback;
|
|
std::optional<Id> parentId;
|
|
};
|
|
|
|
// Thread's main loop
|
|
intptr_t main() override;
|
|
|
|
// Update the parent task
|
|
void updateTaskParent(std::queue<std::unique_ptr<MonitoredEvent>>& events,
|
|
const MonitoredTask& task, Timestamp eventTime);
|
|
|
|
// Explicitly wake the monitor thread. Returns a future that can be used to wait until the
|
|
// poll event has been processed.
|
|
std::future<void> poll();
|
|
|
|
// Immutable. Multi-thread access is safe.
|
|
const Duration mInterval;
|
|
|
|
// Members accessed only on the worker thread. Not protected by mutex.
|
|
int mHungTasks = 0;
|
|
HealthMonitorConsumer& mConsumer;
|
|
std::unordered_map<Id, MonitoredTask> mMonitoredTasks;
|
|
|
|
// Lock and cv control access to queue and id counter
|
|
ConditionVariable mCv;
|
|
Lock mLock;
|
|
Id mNextId = 0;
|
|
std::queue<std::unique_ptr<MonitoredEvent>> mEventQueue;
|
|
};
|
|
|
|
// This class provides an RAII mechanism for monitoring a task.
|
|
// HealthMonitorT should have the exact same interface as HealthMonitor. Note that HealthWatchdog
|
|
// can be used in performance critical path, so we use a template to dispatch a call here to
|
|
// overcome the performance cost of virtual function dispatch.
|
|
template <class HealthMonitorT = HealthMonitor<>>
|
|
class HealthWatchdog {
|
|
public:
|
|
HealthWatchdog(HealthMonitorT* healthMonitor, std::unique_ptr<EventHangMetadata> metadata,
|
|
std::optional<std::function<std::unique_ptr<HangAnnotations>()>>
|
|
onHangAnnotationsCallback = std::nullopt,
|
|
uint64_t timeout = kDefaultTimeoutMs)
|
|
: mHealthMonitor(healthMonitor), mThreadId(getCurrentThreadId()) {
|
|
if (!mHealthMonitor) {
|
|
mId = std::nullopt;
|
|
return;
|
|
}
|
|
// TODO: willho@ re-enable thread awareness b/253483619
|
|
typename HealthMonitorT::Id id = mHealthMonitor->startMonitoringTask(
|
|
std::move(metadata), std::move(onHangAnnotationsCallback), timeout, std::nullopt);
|
|
mId = id;
|
|
}
|
|
|
|
~HealthWatchdog() {
|
|
if (!mId.has_value()) {
|
|
return;
|
|
}
|
|
mHealthMonitor->stopMonitoringTask(*mId);
|
|
}
|
|
|
|
void touch() {
|
|
if (!mId.has_value()) {
|
|
return;
|
|
}
|
|
mHealthMonitor->touchMonitoredTask(*mId);
|
|
}
|
|
|
|
// Return the underlying Id, and don't issue a stop on destruction.
|
|
std::optional<typename HealthMonitorT::Id> release() {
|
|
return std::exchange(mId, std::nullopt);
|
|
}
|
|
|
|
private:
|
|
using ThreadTasks =
|
|
std::unordered_map<HealthMonitorT*, std::stack<typename HealthMonitorT::Id>>;
|
|
std::optional<typename HealthMonitorT::Id> mId;
|
|
HealthMonitorT* mHealthMonitor;
|
|
const unsigned long mThreadId;
|
|
};
|
|
|
|
// HealthMonitorT should have the exact same interface as HealthMonitor. This template parameter is
|
|
// used for injecting a different type for testing.
|
|
template <class HealthMonitorT>
|
|
class HealthWatchdogBuilder {
|
|
public:
|
|
HealthWatchdogBuilder(HealthMonitorT* healthMonitor, const char* fileName,
|
|
const char* functionName, const char* message, uint32_t line)
|
|
: mHealthMonitor(healthMonitor),
|
|
mMetadata(std::make_unique<EventHangMetadata>(
|
|
fileName, functionName, message, line, EventHangMetadata::HangType::kOther, nullptr)),
|
|
mTimeoutMs(kDefaultTimeoutMs),
|
|
mOnHangCallback(std::nullopt) {}
|
|
|
|
DISALLOW_COPY_ASSIGN_AND_MOVE(HealthWatchdogBuilder);
|
|
|
|
HealthWatchdogBuilder& setHangType(EventHangMetadata::HangType hangType) {
|
|
if (mHealthMonitor) mMetadata->hangType = hangType;
|
|
return *this;
|
|
}
|
|
HealthWatchdogBuilder& setTimeoutMs(uint32_t timeoutMs) {
|
|
if (mHealthMonitor) mTimeoutMs = timeoutMs;
|
|
return *this;
|
|
}
|
|
// F should be a callable that returns a std::unique_ptr<EventHangMetadata::HangAnnotations>. We
|
|
// use template instead of std::function here to avoid extra copy.
|
|
template <class F>
|
|
HealthWatchdogBuilder& setOnHangCallback(F&& callback) {
|
|
if (mHealthMonitor) {
|
|
mOnHangCallback =
|
|
std::function<std::unique_ptr<HangAnnotations>()>(std::forward<F>(callback));
|
|
}
|
|
return *this;
|
|
}
|
|
|
|
HealthWatchdogBuilder& setAnnotations(std::unique_ptr<HangAnnotations> annotations) {
|
|
if (mHealthMonitor) mMetadata->data = std::move(annotations);
|
|
return *this;
|
|
}
|
|
|
|
std::unique_ptr<HealthWatchdog<HealthMonitorT>> build() {
|
|
// We are allocating on the heap, so there is a performance hit. However we also allocate
|
|
// EventHangMetadata on the heap, so this should be Ok. If we see performance issues with
|
|
// these allocations, for HealthWatchdog, we can always use placement new + noop deleter to
|
|
// avoid heap allocation for HealthWatchdog.
|
|
return std::make_unique<HealthWatchdog<HealthMonitorT>>(
|
|
mHealthMonitor, std::move(mMetadata), std::move(mOnHangCallback), mTimeoutMs);
|
|
}
|
|
|
|
private:
|
|
HealthMonitorT* mHealthMonitor;
|
|
std::unique_ptr<EventHangMetadata> mMetadata;
|
|
uint32_t mTimeoutMs;
|
|
std::optional<std::function<std::unique_ptr<HangAnnotations>()>> mOnHangCallback;
|
|
};
|
|
|
|
// Factory for a HealthMonitor instantiated with the default clock. Only declared here; the
// definition lives outside this header. The parameters mirror the HealthMonitor constructor:
// `consumer` is the HealthMonitorConsumer to report to and `heartbeatInterval` is the
// interval in milliseconds between health checks.
std::unique_ptr<HealthMonitor<>> CreateHealthMonitor(
    HealthMonitorConsumer& consumer,
    uint64_t heartbeatInterval = kDefaultIntervalMs);
|
|
|
|
} // namespace guest
|
|
} // namespace base
|
|
} // namespace android
|