telegram-cli-bot/bot/tools/rss_tool.py

368 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
RSS Reader Tool - инструмент для чтения RSS/Atom лент.
Бот может использовать этот инструмент автономно для получения новостей
из подписанных лент пользователя.
"""
import sys
import json
import logging
import sqlite3
import subprocess
import os
import re
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional
from email.utils import parsedate_to_datetime
from bot.tools import BaseTool, ToolResult, register_tool
logger = logging.getLogger(__name__)
class RSSTool(BaseTool):
"""Инструмент для работы с RSS лентами."""
name = "rss_tool"
description = "Чтение RSS/Atom новостных лент. Управление подписками, получение новостей, дайджесты."
category = "news"
def __init__(self, db_path: str = None):
self.db_path = Path(db_path) if db_path else Path(__file__).parent.parent.parent / "rss.db"
self.lock_file = Path("/tmp/rss_fetch.lock")
self.fetch_interval_minutes = 5
def _init_db(self):
"""Инициализировать БД."""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
CREATE TABLE IF NOT EXISTS feeds (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT NOT NULL UNIQUE,
title TEXT,
last_fetched DATETIME,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP
)
''')
c.execute('''
CREATE TABLE IF NOT EXISTS news (
id INTEGER PRIMARY KEY AUTOINCREMENT,
feed_id INTEGER NOT NULL,
guid TEXT NOT NULL,
pub_date DATETIME,
title TEXT,
description TEXT,
content TEXT,
link TEXT,
digest_flag INTEGER DEFAULT 0,
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
FOREIGN KEY (feed_id) REFERENCES feeds(id),
UNIQUE(feed_id, guid)
)
''')
c.execute('CREATE INDEX IF NOT EXISTS idx_news_feed ON news(feed_id)')
c.execute('CREATE INDEX IF NOT EXISTS idx_news_date ON news(pub_date)')
conn.commit()
conn.close()
def _parse_feed(self, xml_content: str) -> List[Dict]:
"""Парсить RSS/Atom XML."""
items = []
# Remove CDATA markers
xml = re.sub(r'<!\[CDATA\[', '', xml_content)
xml = re.sub(r'\]\]>', '', xml)
# Find all items
xml_items = re.findall(r'<item[^>]*>(.*?)</item>', xml, re.DOTALL)
for item in xml_items:
# Title
title_match = re.search(r'<title>(.*?)</title>', item, re.DOTALL)
title = title_match.group(1).strip()[:500] if title_match else ""
# GUID
guid_match = re.search(r'<guid[^>]*>(.*?)</guid>', item, re.DOTALL)
guid = guid_match.group(1).strip() if guid_match else ""
# Link
link_match = re.search(r'<link>(.*?)</link>', item, re.DOTALL)
link = link_match.group(1).strip() if link_match else ""
# PubDate
pub_match = re.search(r'<pubDate>(.*?)</pubDate>', item, re.DOTALL)
pub = pub_match.group(1).strip() if pub_match else ""
if not guid and link:
guid = link
if title and guid:
items.append({'title': title, 'link': link, 'guid': guid, 'pub': pub})
return items
def _insert_news(self, feed_id: int, title: str, link: str, guid: str, pub: str):
"""Вставить новость в БД."""
pdate = None
if pub:
try:
dt = parsedate_to_datetime(pub)
pdate = dt.strftime('%Y-%m-%d %H:%M:%S')
except:
pass
if not pdate:
pdate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
if not link:
link = guid
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute('''
INSERT OR IGNORE INTO news (feed_id, guid, pub_date, title, link)
VALUES (?, ?, ?, ?, ?)
''', (feed_id, guid, pdate, title, link))
conn.commit()
conn.close()
async def fetch(self) -> ToolResult:
"""Получить свежие новости из всех лент."""
self._init_db()
# Lock
if self.lock_file.exists():
logger.warning("Другой fetch уже выполняется")
return ToolResult(
success=False,
error="Другой процесс fetch уже выполняется",
metadata={'status': 'locked'}
)
with open(self.lock_file, 'w') as f:
f.write(str(os.getpid()))
try:
total = 0
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("SELECT id, url FROM feeds")
feeds = c.fetchall()
conn.close()
for feed_id, url in feeds:
# Check last fetch
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("SELECT last_fetched FROM feeds WHERE id = ?", (feed_id,))
row = c.fetchone()
conn.close()
if row and row[0]:
last = datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
mins = (datetime.now() - last).total_seconds() / 60
if mins < self.fetch_interval_minutes:
logger.info(f"Пропуск {url} ({int(mins)} мин назад)")
continue
logger.info(f"Получение: {url}")
result = subprocess.run(
['curl', '-sL', '-m', '30', '-A', 'Mozilla/5.0', url],
capture_output=True
)
if result.returncode == 0 and result.stdout:
# Декодируем с обработкой ошибок кодировки
try:
content = result.stdout.decode('utf-8', errors='ignore')
except Exception:
content = result.stdout.decode('latin-1', errors='ignore')
count = 0
for item in self._parse_feed(content):
self._insert_news(feed_id, item['title'], item['link'], item['guid'], item['pub'])
count += 1
if count > 0:
logger.info(f"Добавлено {count} элементов")
total += count
# Update last_fetched
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("UPDATE feeds SET last_fetched = datetime('now') WHERE id = ?", (feed_id,))
conn.commit()
conn.close()
return ToolResult(
success=True,
data={'total_new_items': total},
metadata={'status': 'completed', 'action': 'fetch'}
)
finally:
self.lock_file.unlink(missing_ok=True)
async def list_news(self, limit: int = 20, feed_id: Optional[int] = None,
search: Optional[str] = None, undigested_only: bool = False) -> ToolResult:
"""Получить список новостей."""
self._init_db()
conditions = ["1=1"]
params = []
if feed_id:
conditions.append(f"feed_id = ?")
params.append(feed_id)
if search:
conditions.append(f"title LIKE ?")
params.append(f"%{search}%")
if undigested_only:
conditions.append("digest_flag = 0")
query = f"""
SELECT id, feed_id, title, pub_date, link, digest_flag
FROM news WHERE {' AND '.join(conditions)}
ORDER BY created_at DESC, id DESC LIMIT ?
"""
params.append(limit)
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute(query, params)
rows = c.fetchall()
conn.close()
news_list = []
for row in rows:
news_list.append({
'id': row[0],
'feed_id': row[1],
'title': row[2],
'pub_date': row[3],
'link': row[4],
'digest_flag': bool(row[5])
})
return ToolResult(
success=True,
data=news_list,
metadata={'count': len(news_list), 'limit': limit, 'action': 'list'}
)
async def add_feed(self, url: str, title: Optional[str] = None) -> ToolResult:
"""Добавить RSS ленту."""
self._init_db()
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
try:
c.execute("INSERT INTO feeds (url, title) VALUES (?, ?)", (url, title or url))
conn.commit()
return ToolResult(
success=True,
data={'url': url, 'title': title},
metadata={'status': 'added', 'action': 'add_feed'}
)
except sqlite3.IntegrityError:
return ToolResult(
success=False,
error=f"Лента уже существует: {url}"
)
finally:
conn.close()
async def list_feeds(self) -> ToolResult:
"""Получить список всех лент."""
self._init_db()
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("SELECT id, url, title, last_fetched, created_at FROM feeds ORDER BY id")
rows = c.fetchall()
conn.close()
feeds = []
for row in rows:
feeds.append({
'id': row[0],
'url': row[1],
'title': row[2],
'last_fetched': row[3],
'created_at': row[4]
})
return ToolResult(
success=True,
data=feeds,
metadata={'count': len(feeds), 'action': 'list_feeds'}
)
async def mark_digest(self, news_id: int) -> ToolResult:
"""Отметить новость как прочитанную (в дайджесте)."""
self._init_db()
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("UPDATE news SET digest_flag=1 WHERE id=?", (news_id,))
if c.rowcount == 0:
conn.close()
return ToolResult(
success=False,
error=f"Новость не найдена: {news_id}"
)
conn.commit()
conn.close()
return ToolResult(
success=True,
data={'id': news_id},
metadata={'status': 'marked'}
)
async def execute(self, action: str = "list", **kwargs) -> ToolResult:
"""
Выполнить действие с RSS.
Args:
action: Действие - fetch, list, add_feed, list_feeds, mark_digest
kwargs: Дополнительные аргументы для действия
"""
actions = {
'fetch': self.fetch,
'list': lambda: self.list_news(
limit=kwargs.get('limit', 20),
feed_id=kwargs.get('feed_id'),
search=kwargs.get('search'),
undigested_only=kwargs.get('undigested_only', False)
),
'add_feed': lambda: self.add_feed(
url=kwargs.get('url'),
title=kwargs.get('title')
),
'list_feeds': self.list_feeds,
'mark_digest': lambda: self.mark_digest(news_id=kwargs.get('news_id'))
}
if action not in actions:
return ToolResult(
success=False,
error=f"Неизвестное действие: {action}. Доступные: {list(actions.keys())}"
)
logger.info(f"RSS действие: {action} с аргументами: {kwargs}")
return await actions[action]()
# Автоматическая регистрация при импорте
@register_tool
class RSSToolAuto(RSSTool):
"""Авто-регистрируемая версия RSSTool."""
pass