mirvmon/agent.py

401 lines
15 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
import time
import json
import psutil
import requests
import subprocess
import os
from datetime import datetime
# Skip virtual and service interfaces: any interface whose name starts with
# one of these prefixes is ignored by _is_real_interface().
SKIP_INTERFACE_PREFIXES = ('lo', 'docker', 'veth', 'br-', 'tun', 'tap', 'wg', 'virbr', 'vmnet', 'vmxnet')
# Previous net_io counter values, kept between calls to compute deltas.
# Keyed by interface name; each value is a dict with 'rx', 'tx' and 'time'.
_prev_net_io = {}
def _is_real_interface(name, stats):
    """Return True when the interface is up and not on the skip list.

    `name` is the interface name. `stats` is an object exposing an `isup`
    attribute (callers pass the per-interface entry of
    `psutil.net_if_stats()`). `isup` is only consulted when the name does
    not match any prefix in SKIP_INTERFACE_PREFIXES.
    """
    # str.startswith accepts a tuple and succeeds if any prefix matches.
    on_skip_list = name.startswith(SKIP_INTERFACE_PREFIXES)
    return not on_skip_list and bool(stats.isup)
def get_network_metrics(interval=60):
    """Return per-interface network utilisation as a percentage of link speed.

    For every real interface (see `_is_real_interface`) the byte counters are
    compared with the sample stored by the previous call, converted to a rate
    using the actually elapsed time, and expressed as a percentage of the
    reported link speed (1000 Mbit/s is assumed when the speed is unknown).

    Args:
        interval: Accepted for backward compatibility; not used. The rate is
            derived from the measured time between two calls.

    Returns:
        dict mapping `net_in_<iface>` / `net_out_<iface>` to a percentage in
        the range 0..100 rounded to two decimals ('-' in the interface name
        is replaced by '_'). The first call for an interface only stores a
        sample, so it yields no entry for it. On any error a message is
        printed and whatever was collected so far is returned.
    """
    metrics = {}
    try:
        counters = psutil.net_io_counters(pernic=True)
        stats = psutil.net_if_stats()
        # Monotonic clock: the elapsed time must not be distorted by
        # wall-clock adjustments (NTP steps, manual changes).
        now = time.monotonic()
        for name, counter in counters.items():
            iface_stats = stats.get(name)
            if iface_stats is None or not _is_real_interface(name, iface_stats):
                continue

            prev = _prev_net_io.get(name)
            # Always remember the current sample for the next call.
            _prev_net_io[name] = {'rx': counter.bytes_recv, 'tx': counter.bytes_sent, 'time': now}
            if prev is None:
                continue

            elapsed = now - prev['time']
            rx_delta = counter.bytes_recv - prev['rx']
            tx_delta = counter.bytes_sent - prev['tx']
            # A negative delta means the counters were reset or wrapped
            # (e.g. the interface was re-created); the sample is unusable.
            if elapsed <= 0 or rx_delta < 0 or tx_delta < 0:
                continue

            link_mbps = iface_stats.speed if iface_stats.speed > 0 else 1000
            # Counters are in bytes, so convert Mbit/s to bytes per second.
            link_bytes_per_sec = link_mbps * 1000000 / 8
            rx_pct = min((rx_delta / elapsed) / link_bytes_per_sec * 100, 100.0)
            tx_pct = min((tx_delta / elapsed) / link_bytes_per_sec * 100, 100.0)

            iface_key = name.replace('-', '_')
            metrics[f'net_in_{iface_key}'] = round(rx_pct, 2)
            metrics[f'net_out_{iface_key}'] = round(tx_pct, 2)
    except Exception as e:
        # Best effort: network metrics must never break the whole collection.
        print(f'Ошибка сбора сетевых метрик: {e}')
    return metrics
def _is_real_partition(mountpoint, fstype):
"""Проверяем что раздел реальный (не tmpfs, docker, snap и т.д.)"""
skip_fstypes = {'tmpfs', 'devtmpfs', 'overlay', 'squashfs', 'snap',
'devpts', 'proc', 'sysfs', 'cgroup', 'cgroup2',
'pstore', 'hugetlbfs', 'mqueue', 'debugfs',
'tracefs', 'bpf', 'fusectl', 'configfs',
'securityfs', 'ramfs'}
skip_mounts = {'/run', '/run/lock', '/sys', '/proc', '/dev',
'/dev/shm', '/dev/pts', '/sys/fs/cgroup'}
if fstype in skip_fstypes:
return False
if mountpoint in skip_mounts:
return False
# Пропускаем EFI — слишком маленький, не информативен
if mountpoint == '/boot/efi':
return False
return True
def get_disk_metrics():
    """Collect disk usage metrics for real partitions.

    Partitions are filtered with `_is_real_partition`, so the filtering rules
    live in a single place. Partitions whose usage cannot be read are skipped.

    Returns:
        dict of metrics:
        * when all real partitions sit on one device, only the root mount is
          reported: `disk_used_root`, `disk_total_gb_root` and `disk_used`
          (nothing is reported if there is no '/' mount);
        * otherwise every partition yields `disk_used_<name>` and
          `disk_total_gb_<name>`, where `<name>` is the mount point with
          slashes turned into underscores ('root' for '/'); `disk_used` is
          additionally set from the root mount.
        Percentages and sizes (GiB) are rounded to one decimal.
    """
    partitions = []
    for part in psutil.disk_partitions(all=False):
        if not _is_real_partition(part.mountpoint, part.fstype):
            continue
        try:
            usage = psutil.disk_usage(part.mountpoint)
        except OSError:
            # Unreadable or vanished mount point (PermissionError is a
            # subclass of OSError): skip it, keep the others.
            continue
        partitions.append({
            'mountpoint': part.mountpoint,
            'device': part.device,
            'usage': usage,
        })

    # Group partitions by the device they live on (insertion-ordered).
    devices = {}
    for p in partitions:
        devices.setdefault(p['device'], []).append(p)

    metrics = {}
    if len(devices) == 1:
        # Single disk: only the root filesystem is reported.
        root = next((p for p in partitions if p['mountpoint'] == '/'), None)
        if root is not None:
            used = round(root['usage'].percent, 1)
            metrics['disk_used_root'] = used
            metrics['disk_total_gb_root'] = round(root['usage'].total / (1024**3), 1)
            metrics['disk_used'] = used
    else:
        # Several devices: report every partition of every device.
        for parts in devices.values():
            for p in parts:
                mp = p['mountpoint']
                name = mp.strip('/').replace('/', '_') or 'root'
                metrics[f'disk_used_{name}'] = round(p['usage'].percent, 1)
                metrics[f'disk_total_gb_{name}'] = round(p['usage'].total / (1024**3), 1)
                if mp == '/':
                    metrics['disk_used'] = round(p['usage'].percent, 1)
    return metrics
def get_metrics():
    """Gather system metrics into a single flat dict.

    The result holds `cpu_load` (sampled over one second), `ram_used`
    (percent), every key produced by `get_disk_metrics()` and
    `get_network_metrics()`, and `ram_total_gb`. Non-empty network metrics
    are also printed.
    """
    cpu_load = psutil.cpu_percent(interval=1)
    vmem = psutil.virtual_memory()

    # Disk metrics for all real partitions are merged right after CPU/RAM.
    disk = get_disk_metrics()
    collected = {'cpu_load': cpu_load, 'ram_used': vmem.percent}
    collected.update(disk)

    # Network utilisation metrics.
    network = get_network_metrics()
    collected.update(network)

    # Total RAM in GiB.
    collected["ram_total_gb"] = round(vmem.total / (1024**3), 1)

    if network:
        print(f" Сетевые метрики: {network}")
    return collected
def get_top_processes(process_type='cpu'):
    """Return the top 5 processes by CPU or RAM usage.

    Args:
        process_type: 'cpu' or 'memory'; selects the psutil field
            `<process_type>_percent` used for ranking.

    Returns:
        A list of at most five dicts with keys `pid`, `name`, `cmdline`
        (full command line truncated to 120 characters plus '...', or the
        process name when no command line is available) and `value` (the
        percentage rounded to one decimal), sorted by `value` descending.
        On any error (including an unknown `process_type`) a message is
        printed and an empty list is returned.
    """
    metric_field = process_type + '_percent'
    processes = []
    try:
        for proc in psutil.process_iter(['pid', 'name', 'cpu_percent', 'memory_percent', 'cmdline']):
            try:
                info = proc.info
                value = info[metric_field]
                # Only the requested metric has to be available; a process
                # is not dropped because the other metric is unreadable.
                if value is None:
                    continue

                cmdline = info.get('cmdline') or []
                if cmdline:
                    full_cmd = ' '.join(cmdline)
                    cmd_display = full_cmd[:120] + ('...' if len(full_cmd) > 120 else '')
                else:
                    cmd_display = info.get('name', '')

                processes.append({
                    'pid': info['pid'],
                    'name': info['name'],
                    'cmdline': cmd_display,
                    'value': round(value, 1),
                })
            except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                # The process vanished or is not inspectable: ignore it.
                continue

        # Highest consumers first, keep the top 5.
        processes.sort(key=lambda item: item['value'], reverse=True)
        return processes[:5]
    except Exception as e:
        print(f"Ошибка получения топ-процессов ({process_type}): {e}")
        return []
def get_services():
    """Collect the list of systemd services with their current state.

    `systemctl list-unit-files` provides the full set of service units
    (including ones that are not loaded); `systemctl list-units --all`
    provides the load/active/sub state of the loaded ones. Units present only
    in the first list are reported as `loaded` / `inactive` / `dead`.

    Returns:
        A list of dicts with keys `name`, `status` ('running', 'stopped' or
        'unknown'), `load_state`, `active_state` and `sub_state`. On any
        error (systemctl missing, timeout, ...) a message is printed and an
        empty list is returned.
    """
    # Mapping from systemd's ACTIVE column to the reported status.
    status_by_active = {
        'active': 'running',
        'inactive': 'stopped',
        'failed': 'stopped',
        'deactivating': 'stopped',
    }
    try:
        # 1. Full list of service unit files (including unloaded ones).
        res_files = subprocess.run(['systemctl', 'list-unit-files', '--type=service', '--no-pager'],
                                   capture_output=True, text=True, timeout=10)
        # 2. Current state of loaded services.
        res_units = subprocess.run(['systemctl', 'list-units', '--type=service', '--all', '--no-pager'],
                                   capture_output=True, text=True, timeout=10)

        # Unit names from list-unit-files; header/footer lines do not start
        # with a "*.service" token and are ignored.
        unit_names = []
        for line in res_files.stdout.splitlines():
            fields = line.split()
            if fields and fields[0].endswith('.service'):
                unit_names.append(fields[0])
        unit_names = list(dict.fromkeys(unit_names))  # de-duplicate, keep order

        # Current states from list-units.
        running_states = {}
        for line in res_units.stdout.splitlines():
            fields = line.split()
            # systemctl prefixes failed / not-found units with a status
            # bullet (e.g. "●"); drop it so the unit name is the first field.
            if fields and not fields[0].endswith('.service'):
                fields = fields[1:]
            if len(fields) >= 4 and fields[0].endswith('.service'):
                running_states[fields[0]] = (fields[1], fields[2], fields[3])

        services = []
        for svc_name in unit_names:
            # A unit file that is not loaded is reported as loaded/inactive/dead.
            load, active, sub = running_states.get(svc_name, ('loaded', 'inactive', 'dead'))
            services.append({
                'name': svc_name,
                'status': status_by_active.get(active, 'unknown'),
                'load_state': load,
                'active_state': active,
                'sub_state': sub,
            })
        return services
    except Exception as e:
        print(f"Ошибка получения списка сервисов: {e}")
        return []
def get_temperatures():
    """Collect temperatures (CPU, disks, GPU) on a best-effort basis.

    Returns:
        dict that may contain `temp_cpu`, `temp_disk_<device>` (one per disk
        that answered) and `temp_gpu` (single GPU) or `temp_gpu_<index>`
        (several GPUs). Sources that are unavailable simply contribute
        nothing; this function does not raise.
    """
    temps = {}
    for reader in (_read_cpu_temperature, _read_disk_temperatures, _read_gpu_temperatures):
        try:
            temps.update(reader())
        except Exception:
            # Best effort by design: a broken temperature source must not
            # prevent the remaining metrics from being collected and sent.
            continue
    return temps


def _read_cpu_temperature():
    """Return {'temp_cpu': hottest reading} from psutil sensors, or {}."""
    sensors = psutil.sensors_temperatures()
    if not sensors:
        return {}
    cpu_chips = ('coretemp', 'k10temp', 'zenpower')
    readings = [entry.current
                for chip, entries in sensors.items() if chip.lower() in cpu_chips
                for entry in entries if entry.current]
    if not readings:
        # No known CPU sensor chip: fall back to the hottest of all sensors.
        readings = [entry.current
                    for entries in sensors.values()
                    for entry in entries if entry.current]
    return {'temp_cpu': max(readings)} if readings else {}


def _parse_smart_temperature(output):
    """Return the first plausible temperature found in `smartctl -A` output.

    Lines containing 'Temperature' are scanned in order; within a line the
    tokens are examined right to left and the first integer strictly between
    10 and 100 is taken. The first matching line wins, so on NVMe output the
    leading "Temperature:" line is used rather than a later per-sensor line.
    Returns None when nothing plausible is found.
    """
    for line in output.split('\n'):
        if 'Temperature' not in line:
            continue
        for token in reversed(line.split()):
            try:
                value = int(token)
            except ValueError:
                continue
            if 10 < value < 100:
                return float(value)
    return None


def _read_disk_temperatures():
    """Return {'temp_disk_<name>': value} for disks queried via smartctl."""
    import glob

    temps = {}
    disks = glob.glob('/dev/sd[a-z]') + glob.glob('/dev/nvme[0-9]n1')
    for disk in disks:
        try:
            # "-n standby": do not spin up a disk that is in standby.
            res = subprocess.run(['smartctl', '-n', 'standby', '-A', disk],
                                 capture_output=True, text=True, timeout=5)
        except subprocess.TimeoutExpired:
            # One unresponsive disk must not hide the others.
            continue
        except OSError:
            # smartctl is missing or not executable: nothing can be queried.
            break
        # smartctl's exit status is a bitmask: bits 0-2 signal a command-line,
        # device-open/low-power or command failure; higher bits only flag
        # disk health findings and still come with usable output.
        if res.returncode & 0b111 or 'STANDBY' in res.stdout.upper():
            continue
        value = _parse_smart_temperature(res.stdout)
        if value is not None:
            disk_name = disk.split('/')[-1]
            temps[f'temp_disk_{disk_name}'] = value
    return temps


def _read_gpu_temperatures():
    """Return GPU temperatures reported by nvidia-smi, or {}."""
    try:
        res = subprocess.run(['nvidia-smi', '--query-gpu=temperature.gpu', '--format=csv,noheader'],
                             capture_output=True, text=True, timeout=5)
    except (OSError, subprocess.TimeoutExpired):
        # nvidia-smi not installed or hanging: no GPU data.
        return {}
    if res.returncode != 0:
        return {}

    lines = res.stdout.strip().split('\n')
    single_gpu = len(lines) == 1
    temps = {}
    for i, line in enumerate(lines):
        try:
            value = float(line)
        except ValueError:
            # Non-numeric output for this GPU: skip it.
            continue
        temps['temp_gpu' if single_gpu else f'temp_gpu_{i}'] = value
    return temps
def send_metrics():
    """Collect all metrics and POST them to the monitoring server.

    Reads `token` and `api_url` from /opt/server-monitor-agent/config.json,
    gathers system metrics, temperatures, the top CPU/RAM processes (stored
    as JSON strings under `top_cpu_proc` / `top_ram_proc` when non-empty) and
    the service list, then sends everything as one JSON document. The outcome
    of the request is printed; request errors are caught and printed.
    """
    with open('/opt/server-monitor-agent/config.json', 'r') as cfg_file:
        settings = json.load(cfg_file)
    token, api_url = settings['token'], settings['api_url']

    # System metrics plus temperatures.
    metrics = get_metrics()
    metrics.update(get_temperatures())

    # Top processes are attached as JSON-encoded metrics when available.
    cpu_hogs = get_top_processes('cpu')
    ram_hogs = get_top_processes('memory')
    for metric_name, top_list in (('top_cpu_proc', cpu_hogs), ('top_ram_proc', ram_hogs)):
        if top_list:
            metrics[metric_name] = json.dumps(top_list)

    # Payload: token, metrics and the service list.
    payload = {
        'token': token,
        'metrics': metrics,
        'services': get_services(),
    }

    # Send to the server; any failure is reported, never raised.
    try:
        response = requests.post(api_url, json=payload, timeout=5)
        if response.status_code != 200:
            print(f"{datetime.now()} - Ошибка отправки: {response.status_code}")
        else:
            print(f"{datetime.now()} - Метрики успешно отправлены")
    except Exception as e:
        print(f"{datetime.now()} - Ошибка соединения: {e}")
def main():
    """Run the agent loop: send metrics, then sleep for the configured interval.

    The interval is re-read from /opt/server-monitor-agent/config.json
    (`interval_seconds`, default 60) after every cycle, so it can be changed
    without restarting the agent. A failing cycle or an unreadable/invalid
    configuration is logged and the loop continues with the default interval
    instead of terminating the agent. Never returns normally.
    """
    config_path = '/opt/server-monitor-agent/config.json'
    default_interval = 60

    print(f"{datetime.now()} - Агент мониторинга запущен")
    while True:
        try:
            send_metrics()
        except Exception as e:
            # Top-level boundary of a long-running daemon: one failed cycle
            # (bad config, sensor error, ...) must not stop monitoring.
            print(f"{datetime.now()} - Ошибка цикла сбора метрик: {e}")

        # Wait for the configured interval before the next cycle.
        interval = default_interval
        try:
            with open(config_path, 'r') as f:
                config = json.load(f)
            configured = config.get('interval_seconds', default_interval) if isinstance(config, dict) else None
            # time.sleep() rejects non-numbers, negatives, NaN and infinity.
            if isinstance(configured, (int, float)) and 0 <= configured < float('inf'):
                interval = configured
            else:
                print(f"{datetime.now()} - Некорректный interval_seconds, используется {default_interval}")
        except (OSError, ValueError) as e:
            # Missing/unreadable file or malformed JSON (JSONDecodeError is a ValueError).
            print(f"{datetime.now()} - Ошибка чтения конфигурации: {e}")
        time.sleep(interval)
# Start the agent loop only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()