# telegram-cli-bot/tools/rss_reader.py
#!/usr/bin/env python3
"""
RSS Reader - Based on sebsu/RSS-reader-in-bash parsing logic
"""
import os
import re
import shutil
import sqlite3
import subprocess
import sys
from datetime import datetime
# SQLite database file, created in the current working directory.
DB_FILE = "rss.db"
# Presence of this file marks a fetch in progress (written/removed by cmd_fetch).
LOCK_FILE = "fetch.lock"
# Minimum minutes between two fetches of the same feed.
FETCH_INTERVAL_MINUTES = 5
def log_info(msg):
    """Write *msg* to stdout with a green ``[INFO]`` prefix."""
    sys.stdout.write(f"\033[0;32m[INFO]\033[0m {msg}\n")
def log_warn(msg):
    """Write *msg* to stderr with a yellow ``[WARN]`` prefix."""
    sys.stderr.write(f"\033[1;33m[WARN]\033[0m {msg}\n")
def log_error(msg):
    """Write *msg* to stderr with a red ``[ERROR]`` prefix."""
    sys.stderr.write(f"\033[0;31m[ERROR]\033[0m {msg}\n")
def check_deps():
    """Exit with status 1 if a required external command is missing.

    Uses shutil.which instead of shelling out to ``which`` via subprocess:
    ``which`` is not available on all platforms (e.g. Windows) and spawning
    a process per check is wasteful.

    NOTE(review): the sqlite3 CLI binary is never invoked anywhere in this
    script (all DB access goes through the sqlite3 Python module), so that
    check is likely unnecessary -- kept for compatibility with the original.
    """
    for cmd in ('sqlite3', 'curl'):
        if shutil.which(cmd) is None:
            log_error(f"Missing: {cmd}")
            sys.exit(1)
def init_db():
    """Create the feeds/news tables and their indexes if they do not exist."""
    schema = '''
        CREATE TABLE IF NOT EXISTS feeds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL UNIQUE,
            title TEXT,
            last_fetched DATETIME,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        );
        CREATE TABLE IF NOT EXISTS news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feed_id INTEGER NOT NULL,
            guid TEXT NOT NULL,
            pub_date DATETIME,
            title TEXT,
            description TEXT,
            content TEXT,
            link TEXT,
            digest_flag INTEGER DEFAULT 0,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (feed_id) REFERENCES feeds(id),
            UNIQUE(feed_id, guid)
        );
        CREATE INDEX IF NOT EXISTS idx_news_feed ON news(feed_id);
        CREATE INDEX IF NOT EXISTS idx_news_date ON news(pub_date);
    '''
    conn = sqlite3.connect(DB_FILE)
    # executescript runs all DDL statements in one pass.
    conn.executescript(schema)
    conn.commit()
    conn.close()
def _extract_tag(pattern, text):
    """Return the stripped first capture group of *pattern* in *text*, or ''."""
    m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
    return m.group(1).strip() if m else ''


def parse_feed(xml_content):
    """Parse RSS 2.0 (``<item>``) and Atom (``<entry>``) documents.

    The original docstring promised RSS/Atom support but only RSS ``<item>``
    elements were handled; Atom ``<entry>``/``<id>``/``<link href>`` /
    ``<updated>``/``<published>`` are now recognised as well.

    Yields one dict per entry: ``{'title', 'link', 'guid', 'pub'}``.
    Entries without both a title and a usable guid (or link fallback)
    are skipped. Titles are truncated to 500 characters.
    """
    # Strip CDATA wrappers so the inner text is matched directly.
    xml = re.sub(r'<!\[CDATA\[', '', xml_content)
    xml = re.sub(r'\]\]>', '', xml)
    flags = re.DOTALL | re.IGNORECASE
    elements = re.findall(r'<item[^>]*>(.*?)</item>', xml, flags)
    elements += re.findall(r'<entry[^>]*>(.*?)</entry>', xml, flags)
    for elem in elements:
        # <title> may carry attributes (e.g. Atom type="html").
        title = _extract_tag(r'<title[^>]*>(.*?)</title>', elem)[:500]
        guid = (_extract_tag(r'<guid[^>]*>(.*?)</guid>', elem)
                or _extract_tag(r'<id[^>]*>(.*?)</id>', elem))
        link = _extract_tag(r'<link>(.*?)</link>', elem)
        if not link:
            # Atom links carry the URL in an href attribute: <link href="..."/>
            m = re.search(r'<link[^>]*href=["\']([^"\']*)["\']', elem, re.IGNORECASE)
            link = m.group(1).strip() if m else ''
        pub = (_extract_tag(r'<pubDate>(.*?)</pubDate>', elem)
               or _extract_tag(r'<updated>(.*?)</updated>', elem)
               or _extract_tag(r'<published>(.*?)</published>', elem))
        if not guid and link:
            guid = link  # fall back to the link as a stable identifier
        if title and guid:
            yield {'title': title, 'link': link, 'guid': guid, 'pub': pub}
def insert_news(feed_id, title, link, guid, pub):
    """Insert one news item; duplicates on (feed_id, guid) are ignored.

    *pub* is parsed as an RFC 2822 date; a missing or unparseable date
    falls back to the current local time. An empty link falls back to guid.
    """
    from email.utils import parsedate_to_datetime
    pdate = None
    if pub:
        try:
            pdate = parsedate_to_datetime(pub).strftime('%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            # Narrowed from a bare `except:` which also hid real bugs
            # (and would even swallow KeyboardInterrupt).
            pass
    if not pdate:
        pdate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    if not link:
        link = guid
    conn = sqlite3.connect(DB_FILE)
    try:
        # `with conn` commits on success and rolls back on error.
        with conn:
            conn.execute('''
                INSERT OR IGNORE INTO news (feed_id, guid, pub_date, title, link)
                VALUES (?, ?, ?, ?, ?)
            ''', (feed_id, guid, pdate, title, link))
    finally:
        conn.close()  # previously leaked if the insert raised
def cmd_fetch():
    """Fetch every registered feed via curl and store new items.

    A lock file guards against concurrent runs and is always removed on
    exit. Feeds fetched within the last FETCH_INTERVAL_MINUTES are skipped.

    Fixes vs. original:
    - last_fetched is written with sqlite's datetime('now'), which is UTC;
      the freshness comparison previously used local datetime.now(), so the
      interval check was off by the UTC offset. Now compared in UTC.
    - last_fetched is selected together with id/url instead of re-querying
      the DB once per feed.
    - a malformed last_fetched timestamp no longer crashes the whole run.
    - the redundant `import subprocess` inside the loop was removed (already
      imported at module level).
    """
    check_deps()
    init_db()
    if os.path.exists(LOCK_FILE):
        log_warn("Another fetch running, skipping...")
        return
    with open(LOCK_FILE, 'w') as f:
        f.write(str(os.getpid()))
    try:
        log_info("Fetching feeds...")
        total = 0
        conn = sqlite3.connect(DB_FILE)
        c = conn.cursor()
        c.execute("SELECT id, url, last_fetched FROM feeds")
        feeds = c.fetchall()
        conn.close()
        for feed_id, url, last_fetched in feeds:
            if last_fetched:
                try:
                    last = datetime.strptime(last_fetched, '%Y-%m-%d %H:%M:%S')
                except ValueError:
                    last = None  # unreadable timestamp -> treat feed as stale
                if last is not None:
                    mins = (datetime.utcnow() - last).total_seconds() / 60
                    if mins < FETCH_INTERVAL_MINUTES:
                        log_info(f"Skipping {url} ({int(mins)} min ago)")
                        continue
            log_info(f"Fetching: {url}")
            result = subprocess.run(
                ['curl', '-sL', '-m', '30', '-A', 'Mozilla/5.0', url],
                capture_output=True, text=True
            )
            if result.returncode == 0 and result.stdout:
                count = 0
                for item in parse_feed(result.stdout):
                    insert_news(feed_id, item['title'], item['link'],
                                item['guid'], item['pub'])
                    count += 1
                if count > 0:
                    log_info(f"Added {count} items")
                    total += count
            # Record the fetch attempt even when nothing new was found.
            conn = sqlite3.connect(DB_FILE)
            c = conn.cursor()
            c.execute("UPDATE feeds SET last_fetched = datetime('now') WHERE id = ?",
                      (feed_id,))
            conn.commit()
            conn.close()
        log_info(f"Total new items: {total}")
    finally:
        # Always release the lock; guard against it having been removed.
        if os.path.exists(LOCK_FILE):
            os.remove(LOCK_FILE)
def cmd_list(args):
    """List stored news items in plain, json or csv format.

    Fixes vs. original:
    - every user-supplied value (--feed/--from/--to/--search/--limit) was
      interpolated directly into the SQL string -> SQL injection. All values
      are now bound as parameters; sort column and direction go through a
      whitelist.
    - CSV output is written with the csv module so titles/links containing
      commas or quotes are properly quoted.
    - plain output no longer crashes on NULL title/link.
    """
    check_deps()
    init_db()
    limit = 20
    sort = "pub_date"
    order = "DESC"
    format_type = "plain"
    conditions = []
    params = []
    i = 0
    while i < len(args):
        arg = args[i]
        if arg in ('-n', '--limit'):
            try:
                limit = int(args[i + 1])
            except ValueError:
                log_error(f"Invalid limit: {args[i + 1]}")
                return
            i += 2
        elif arg == '--sort':
            sort = args[i + 1]
            i += 2
        elif arg == '--order':
            order = args[i + 1]
            i += 2
        elif arg == '--feed':
            conditions.append("n.feed_id = ?")
            params.append(args[i + 1])
            i += 2
        elif arg == '--from':
            conditions.append("n.pub_date >= ?")
            params.append(args[i + 1])
            i += 2
        elif arg == '--to':
            conditions.append("n.pub_date <= ?")
            params.append(args[i + 1])
            i += 2
        elif arg == '--search':
            conditions.append("n.title LIKE ?")
            params.append(f"%{args[i + 1]}%")
            i += 2
        elif arg == '--digested':
            conditions.append("n.digest_flag = 1")
            i += 1
        elif arg == '--undigested':
            conditions.append("n.digest_flag = 0")
            i += 1
        elif arg == '--format':
            format_type = args[i + 1]
            i += 2
        else:
            i += 1
    # Whitelist the ORDER BY clause -- never interpolate raw user input.
    # As in the original, --order only takes effect together with --sort title.
    direction = "ASC" if order.upper() == "ASC" else "DESC"
    if sort == "title":
        order_by = f"ORDER BY n.title {direction}"
    else:
        order_by = "ORDER BY n.pub_date DESC"
    where = " AND ".join(conditions) if conditions else "1=1"
    query = f"""
        SELECT n.id, n.feed_id, n.title, n.pub_date, n.link, n.digest_flag
        FROM news n WHERE {where}
        {order_by} LIMIT ?
    """
    params.append(limit)
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute(query, params)
    rows = c.fetchall()
    conn.close()
    if format_type == "json":
        import json
        result = [
            {'id': r[0], 'feed_id': r[1], 'title': r[2],
             'pub_date': r[3], 'link': r[4], 'digest_flag': r[5]}
            for r in rows
        ]
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif format_type == "csv":
        import csv
        writer = csv.writer(sys.stdout)
        writer.writerow(["id", "feed_id", "title", "pub_date", "link", "digest_flag"])
        writer.writerows(rows)
    else:
        for r in rows:
            # NULL title/link would previously raise TypeError on slicing.
            title = (r[2] or '')[:50]
            link = (r[4] or '')[:50]
            print(f"{r[0]}\t{r[1]}\t{title}\t{r[3]}\t{link}\t{r[5]}")
def cmd_digest(news_id):
    """Mark one news row as digested (digest_flag = 1)."""
    check_deps()
    init_db()
    if not news_id:
        log_error("Missing ID")
        return
    conn = sqlite3.connect(DB_FILE)
    cur = conn.execute("UPDATE news SET digest_flag=1 WHERE id=?", (news_id,))
    if cur.rowcount:
        log_info(f"Marked {news_id} as digested")
    else:
        log_error(f"Not found: {news_id}")
    conn.commit()
    conn.close()
def cmd_clean(days):
    """Delete news items with pub_date older than *days* days.

    Fixes vs. original: *days* was spliced into the SQL string with
    ``.format()`` -> SQL injection. It is now validated as an integer and
    the datetime modifier is bound as a parameter.
    """
    check_deps()
    init_db()
    if not days:
        log_error("Missing days")
        return
    try:
        days = int(days)
    except (TypeError, ValueError):
        log_error(f"Invalid days: {days}")
        return
    conn = sqlite3.connect(DB_FILE)
    try:
        c = conn.cursor()
        c.execute("DELETE FROM news WHERE pub_date < datetime('now', ?)",
                  (f'-{days} days',))
        deleted = c.rowcount
        conn.commit()
    finally:
        conn.close()
    log_info(f"Deleted {deleted} items")
def show_help():
    """Print the CLI usage text (commands, options, examples) to stdout."""
    print("""
RSS Reader - Based on sebsu/RSS-reader-in-bash
Usage: rss_reader.py [CMD] [OPTIONS]
Commands:
(default) Run --fetch
--fetch Fetch all feeds
--list List news
--digest ID Mark as digested
--clean N Delete older than N days
Options:
-n, --limit N Limit (default: 20)
--sort FIELD date|title
--order DIR asc|desc
--feed ID Filter by feed ID
--from DATE From date
--to DATE To date
--search WORD Search in title
--digested Only digested
--undigested Only undigested
--format FMT plain|json|csv
Examples:
rss_reader.py # Fetch all
rss_reader.py --list -n 10 # List 10 items
rss_reader.py --digest 123 # Mark 123
rss_reader.py --clean 30 # Delete old
DB: rss.db
""")
if __name__ == "__main__":
    # CLI entry point: first argument selects the command.
    argv = sys.argv
    if len(argv) == 1 or argv[1] in ('-h', '--help'):
        show_help()
    else:
        cmd = argv[1]
        rest = argv[2:]
        if cmd == '--fetch':
            cmd_fetch()
        elif cmd == '--list':
            cmd_list(rest)
        elif cmd == '--digest':
            cmd_digest(rest[0] if rest else None)
        elif cmd == '--clean':
            cmd_clean(rest[0] if rest else None)
        else:
            log_error(f"Unknown: {cmd}")
            show_help()