# SmartEDT/backend/services/server_monitor.py

import asyncio
import time
import logging
import psutil
import socket
import platform
from datetime import datetime, timezone
from sqlalchemy import insert
from sqlalchemy.ext.asyncio import async_sessionmaker
from backend.database.schema import server_metrics
from backend.services.broadcaster import Broadcaster
# Module-level logger; note it uses the fixed name "backend.monitor"
# rather than __name__ ("backend.services.server_monitor") — presumably
# so it matches an existing logging config entry; confirm before renaming.
logger = logging.getLogger("backend.monitor")
class ServerMonitorService:
    """Background host-metrics monitor.

    Samples CPU and memory via psutil at 50 Hz, downsamples to 10 Hz
    reports, and for each report (1) broadcasts it over the WebSocket
    broadcaster and (2) persists it to the ``server_metrics`` table.
    """

    def __init__(self, session_factory: async_sessionmaker, broadcaster: Broadcaster):
        """
        Args:
            session_factory: async SQLAlchemy session factory used for
                per-report inserts into ``server_metrics``.
            broadcaster: WebSocket fan-out used to push live metrics.
        """
        self._session_factory = session_factory
        self._broadcaster = broadcaster
        self._host_name = socket.gethostname()
        self._running = False
        self._task: asyncio.Task | None = None
        self._sample_interval = 1.0 / 50.0  # 50Hz (20ms)
        self._report_interval = 1.0 / 10.0  # 10Hz (100ms)
        self._last_report_time = 0.0
        # Buffers for downsampling; cleared after every report.
        self._buffer_cpu: list[float] = []
        self._buffer_mem: list = []

    async def start(self) -> None:
        """Start the sampling loop (idempotent: no-op if already running)."""
        if self._running:
            return
        self._running = True
        self._task = asyncio.create_task(self._run_loop())
        logger.info("ServerMonitorService started")

    async def stop(self) -> None:
        """Stop the sampling loop and wait for the task to finish.

        Fix: the original awaited the task without cancelling it, so a
        loop blocked inside an await (slow broadcast, slow DB commit)
        could stall shutdown indefinitely; cancel first, then await.
        """
        self._running = False
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        logger.info("ServerMonitorService stopped")

    async def _run_loop(self) -> None:
        """50 Hz sampling loop with drift-free deadline scheduling."""
        loop = asyncio.get_running_loop()
        # Prime psutil: the very first cpu_percent(interval=None) call
        # returns a meaningless 0.0 baseline, which would otherwise skew
        # the first reported average toward zero.
        psutil.cpu_percent(interval=None)
        next_time = loop.time()
        # Start the report clock now so the first report covers a full
        # report interval instead of firing immediately on one sample.
        self._last_report_time = loop.time()
        while self._running:
            # High frequency sampling (50Hz).
            # psutil.cpu_percent(interval=None) is non-blocking.
            self._buffer_cpu.append(psutil.cpu_percent(interval=None))
            self._buffer_mem.append(psutil.virtual_memory())
            current_time = loop.time()
            # Check if it's time to report (10Hz).
            if current_time - self._last_report_time >= self._report_interval:
                try:
                    await self._process_and_report()
                except Exception:
                    # A failed broadcast must not kill the monitor task;
                    # DB failures are already handled inside.
                    logger.exception("Failed to report server metrics")
                self._last_report_time = current_time
                self._buffer_cpu.clear()
                self._buffer_mem.clear()
            # Precise timing control: advance a fixed deadline rather
            # than sleeping a fixed duration, so scheduling jitter does
            # not accumulate as drift.
            next_time += self._sample_interval
            sleep_time = next_time - loop.time()
            if sleep_time > 0:
                await asyncio.sleep(sleep_time)
            else:
                # If we are lagging, yield execution but don't sleep.
                await asyncio.sleep(0)

    async def _process_and_report(self) -> None:
        """Downsample the buffered samples into one report, broadcast
        it, and persist it. No-op when the buffer is empty."""
        if not self._buffer_cpu:
            return
        # Downsampling: average of the buffered CPU samples.
        avg_cpu = sum(self._buffer_cpu) / len(self._buffer_cpu)
        # Take the latest memory reading (memory doesn't fluctuate as
        # fast as CPU).
        last_mem = self._buffer_mem[-1]
        now = datetime.now(timezone.utc)
        payload = {
            "ts": now.isoformat(),
            "host_name": self._host_name,
            "cpu_usage_percent": {
                "total": round(avg_cpu, 2),
                # Note: per-core usage is expensive to query at 50Hz, so
                # we only track the total here (per-core could be
                # sampled at a lower frequency).
            },
            "memory_usage_bytes": {
                "total": last_mem.total,
                "available": last_mem.available,
                "used": last_mem.used,
                "percent": last_mem.percent
            },
            "disk_usage_bytes": {}  # Disk usage changes slowly; could be polled ~every 1s.
        }
        # 1. Broadcast via WebSocket (10Hz)
        await self._broadcaster.broadcast_json({
            "type": "server.metrics",
            "payload": payload
        })
        # 2. Persist to Database (10Hz)
        # Note: In production, consider batching inserts further (e.g.
        # every 1s) to reduce DB load, but 10Hz single inserts are
        # manageable for TimescaleDB.
        async with self._session_factory() as session:
            try:
                stmt = insert(server_metrics).values(
                    # Reuse the datetime directly instead of re-parsing
                    # the ISO string we just produced (original used
                    # datetime.fromisoformat(payload["ts"])).
                    ts=now,
                    host_name=payload["host_name"],
                    cpu_usage_percent=payload["cpu_usage_percent"],
                    memory_usage_bytes=payload["memory_usage_bytes"],
                    disk_usage_bytes=payload["disk_usage_bytes"]
                )
                await session.execute(stmt)
                await session.commit()
            except Exception as e:
                logger.warning("Failed to persist server metrics: %s", e)
                # Don't raise, keep monitoring running