import asyncio
import logging
import platform
import socket
import time
from datetime import datetime, timezone

from sqlalchemy import insert
from sqlalchemy.ext.asyncio import async_sessionmaker

import psutil

from backend.database.schema import server_metrics
from backend.services.broadcaster import Broadcaster

# Module-level logger for the server monitoring service.
logger = logging.getLogger("backend.monitor")
class ServerMonitorService:
    """Background service that samples host CPU/memory at 50 Hz,
    downsamples to 10 Hz reports, then broadcasts each report over
    WebSocket and persists it to the ``server_metrics`` table.

    Lifecycle: ``await start()`` spawns the sampling task;
    ``await stop()`` cancels it and waits for clean shutdown.
    """

    def __init__(self, session_factory: "async_sessionmaker", broadcaster: "Broadcaster"):
        """
        Args:
            session_factory: Async SQLAlchemy session factory used for inserts.
            broadcaster: WebSocket broadcaster that fans out metric payloads.
        """
        self._session_factory = session_factory
        self._broadcaster = broadcaster
        self._host_name = socket.gethostname()
        self._running = False
        self._task = None  # asyncio.Task while running, else None
        self._sample_interval = 1.0 / 50.0  # 50 Hz (20 ms)
        self._report_interval = 1.0 / 10.0  # 10 Hz (100 ms)
        self._last_report_time = 0.0

        # Buffers accumulate 50 Hz samples between 10 Hz reports.
        self._buffer_cpu = []
        self._buffer_mem = []

    async def start(self):
        """Start the background sampling loop. Idempotent: calling start
        while already running is a no-op."""
        if self._running:
            return
        self._running = True
        self._task = asyncio.create_task(self._run_loop())
        logger.info("ServerMonitorService started")

    async def stop(self):
        """Stop the sampling loop and wait for the task to finish.

        Fix: the original awaited the task without cancelling it, so a
        loop blocked in an await (e.g. a slow broadcast or DB insert)
        could delay shutdown indefinitely; it also never reset
        ``self._task``, so a subsequent stop() re-awaited a dead task.
        """
        self._running = False
        if self._task is not None:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        logger.info("ServerMonitorService stopped")

    async def _run_loop(self):
        """50 Hz sampling loop with drift-free fixed-step scheduling."""
        # Fix: the first cpu_percent(interval=None) call always returns a
        # meaningless 0.0 (psutil measures since the previous call), so
        # discard one priming reading before entering the loop.
        psutil.cpu_percent(interval=None)

        loop = asyncio.get_running_loop()
        next_time = loop.time()

        while self._running:
            try:
                # High frequency sampling (50 Hz).
                # psutil.cpu_percent(interval=None) is non-blocking.
                cpu_percent = psutil.cpu_percent(interval=None)
                mem = psutil.virtual_memory()

                self._buffer_cpu.append(cpu_percent)
                self._buffer_mem.append(mem)

                current_time = loop.time()

                # Time to downsample and report? (10 Hz)
                if current_time - self._last_report_time >= self._report_interval:
                    await self._process_and_report()
                    self._last_report_time = current_time
                    self._buffer_cpu.clear()
                    self._buffer_mem.clear()
            except asyncio.CancelledError:
                # Let stop() cancel us promptly.
                raise
            except Exception:
                # Fix: an unexpected error (e.g. a broadcaster failure)
                # must not silently kill the background task.
                logger.exception("Server monitor iteration failed")

            # Precise timing: advance the deadline by a fixed step instead
            # of sleeping a fixed amount, so scheduling drift cannot
            # accumulate over time.
            next_time += self._sample_interval
            sleep_time = next_time - loop.time()
            if sleep_time > 0:
                await asyncio.sleep(sleep_time)
            else:
                # Lagging behind schedule: yield control but don't sleep.
                await asyncio.sleep(0)
                # Fix: if we are badly behind (more than one report
                # interval), resynchronize instead of trying to "catch up"
                # with a burst of zero-sleep iterations forever.
                if sleep_time < -self._report_interval:
                    next_time = loop.time()

    async def _process_and_report(self):
        """Downsample buffered samples into one payload, broadcast it,
        and persist it. Persistence errors are logged, never raised."""
        if not self._buffer_cpu:
            return

        # Downsampling: average the buffered CPU samples.
        avg_cpu = sum(self._buffer_cpu) / len(self._buffer_cpu)

        # Take the latest memory reading (memory doesn't fluctuate as fast as CPU).
        last_mem = self._buffer_mem[-1]

        # Fix: keep the datetime object so the DB insert below doesn't
        # round-trip through fromisoformat() on a string we just built.
        now = datetime.now(timezone.utc)

        payload = {
            "ts": now.isoformat(),
            "host_name": self._host_name,
            "cpu_usage_percent": {
                "total": round(avg_cpu, 2),
                # Note: per-core usage is expensive to query at 50Hz, so we only track total here
                # or we could sample per-core at lower frequency
            },
            "memory_usage_bytes": {
                "total": last_mem.total,
                "available": last_mem.available,
                "used": last_mem.used,
                "percent": last_mem.percent
            },
            "disk_usage_bytes": {}  # Optional: disk usage changes slowly, maybe check every 1s
        }

        # 1. Broadcast via WebSocket (10Hz)
        await self._broadcaster.broadcast_json({
            "type": "server.metrics",
            "payload": payload
        })

        # 2. Persist to Database (10Hz)
        # Note: In production, consider batching inserts further (e.g., every 1s)
        # to reduce DB load, but 10Hz single insert is manageable for TimescaleDB.
        async with self._session_factory() as session:
            try:
                stmt = insert(server_metrics).values(
                    ts=now,
                    host_name=payload["host_name"],
                    cpu_usage_percent=payload["cpu_usage_percent"],
                    memory_usage_bytes=payload["memory_usage_bytes"],
                    disk_usage_bytes=payload["disk_usage_bytes"]
                )
                await session.execute(stmt)
                await session.commit()
            except Exception as e:
                # Don't raise; keep the monitoring loop alive.
                logger.warning("Failed to persist server metrics: %s", e)