#!/usr/bin/env python3
"""
RSS Reader - Based on sebsu/RSS-reader-in-bash parsing logic
"""

import os
import re
import sys
import shutil
import sqlite3
import subprocess
from datetime import datetime
from email.utils import parsedate_to_datetime

DB_FILE = "rss.db"
LOCK_FILE = "fetch.lock"
FETCH_INTERVAL_MINUTES = 5


def log_info(msg):
    print(f"\033[0;32m[INFO]\033[0m {msg}")


def log_warn(msg):
    print(f"\033[1;33m[WARN]\033[0m {msg}", file=sys.stderr)


def log_error(msg):
    print(f"\033[0;31m[ERROR]\033[0m {msg}", file=sys.stderr)


def check_deps():
    # curl is the only external binary this script shells out to; database
    # access goes through the stdlib sqlite3 module, so the sqlite3 CLI check
    # inherited from the bash original is unnecessary.
    if shutil.which('curl') is None:
        log_error("Missing: curl")
        sys.exit(1)


def init_db():
    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS feeds (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT NOT NULL UNIQUE,
            title TEXT,
            last_fetched DATETIME,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP
        )
    ''')
    c.execute('''
        CREATE TABLE IF NOT EXISTS news (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            feed_id INTEGER NOT NULL,
            guid TEXT NOT NULL,
            pub_date DATETIME,
            title TEXT,
            description TEXT,
            content TEXT,
            link TEXT,
            digest_flag INTEGER DEFAULT 0,
            created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
            FOREIGN KEY (feed_id) REFERENCES feeds(id),
            UNIQUE(feed_id, guid)
        )
    ''')
    c.execute('CREATE INDEX IF NOT EXISTS idx_news_feed ON news(feed_id)')
    c.execute('CREATE INDEX IF NOT EXISTS idx_news_date ON news(pub_date)')
    conn.commit()
    conn.close()


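# The feeds table is only ever read below; no --add command exists, so feed
# rows must be seeded externally. A minimal helper could look like this
# (hypothetical sketch, not part of the original command set):
def add_feed(url):
    """Hypothetical helper: register a feed URL; duplicates are ignored."""
    conn = sqlite3.connect(DB_FILE)
    conn.execute("INSERT OR IGNORE INTO feeds (url) VALUES (?)", (url,))
    conn.commit()
    conn.close()

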
def parse_feed(xml_content):
    """Parse RSS <item> blocks with regexes (as in the bash original) and yield dicts.

    Note: only RSS is handled; Atom <entry> elements are not matched."""
    # Strip CDATA markers so inner text can be matched directly
    xml = re.sub(r'<!\[CDATA\[', '', xml_content)
    xml = re.sub(r'\]\]>', '', xml)

    # Find all <item> blocks
    items = re.findall(r'<item[^>]*>(.*?)</item>', xml, re.DOTALL)

    for item in items:
        title_match = re.search(r'<title>(.*?)</title>', item, re.DOTALL)
        title = title_match.group(1).strip()[:500] if title_match else ""

        guid_match = re.search(r'<guid[^>]*>(.*?)</guid>', item, re.DOTALL)
        guid = guid_match.group(1).strip() if guid_match else ""

        link_match = re.search(r'<link>(.*?)</link>', item, re.DOTALL)
        link = link_match.group(1).strip() if link_match else ""

        pub_match = re.search(r'<pubDate>(.*?)</pubDate>', item, re.DOTALL)
        pub = pub_match.group(1).strip() if pub_match else ""

        # Fall back to the link as a stable identifier when <guid> is absent
        if not guid and link:
            guid = link

        if title and guid:
            yield {'title': title, 'link': link, 'guid': guid, 'pub': pub}


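# Example (illustrative input, not from the original): the guid falls back to
# the link when a feed omits <guid>.
#
#   sample = "<item><title>Hello</title><link>https://e.example/1</link></item>"
#   next(parse_feed(sample))
#   # -> {'title': 'Hello', 'link': 'https://e.example/1',
#   #     'guid': 'https://e.example/1', 'pub': ''}

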
def insert_news(feed_id, title, link, guid, pub):
    """Insert a news item; duplicates are skipped via UNIQUE(feed_id, guid).

    Returns 1 if a row was inserted, 0 if it already existed."""
    # Parse the RFC 2822 pubDate; fall back to "now" when missing or malformed
    pdate = None
    if pub:
        try:
            pdate = parsedate_to_datetime(pub).strftime('%Y-%m-%d %H:%M:%S')
        except (TypeError, ValueError):
            pass

    if not pdate:
        pdate = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    if not link:
        link = guid

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute('''
        INSERT OR IGNORE INTO news (feed_id, guid, pub_date, title, link)
        VALUES (?, ?, ?, ?, ?)
    ''', (feed_id, guid, pdate, title, link))
    inserted = c.rowcount  # 0 when the row was ignored as a duplicate
    conn.commit()
    conn.close()
    return inserted


def cmd_fetch():
    check_deps()
    init_db()

    # Crude PID lock so overlapping runs do not fetch concurrently; a stale
    # lock left by a killed process must be removed by hand.
    if os.path.exists(LOCK_FILE):
        log_warn("Another fetch running, skipping...")
        return

    with open(LOCK_FILE, 'w') as f:
        f.write(str(os.getpid()))

    try:
        log_info("Fetching feeds...")
        total = 0

        conn = sqlite3.connect(DB_FILE)
        c = conn.cursor()
        c.execute("SELECT id, url, last_fetched FROM feeds")
        feeds = c.fetchall()
        conn.close()

        for feed_id, url, last_fetched in feeds:
            # Honor the per-feed fetch interval
            if last_fetched:
                last = datetime.strptime(last_fetched, '%Y-%m-%d %H:%M:%S')
                mins = (datetime.now() - last).total_seconds() / 60
                if mins < FETCH_INTERVAL_MINUTES:
                    log_info(f"Skipping {url} ({int(mins)} min ago)")
                    continue

            log_info(f"Fetching: {url}")

            result = subprocess.run(
                ['curl', '-sL', '-m', '30', '-A', 'Mozilla/5.0', url],
                capture_output=True, text=True
            )

            if result.returncode == 0 and result.stdout:
                count = 0
                for item in parse_feed(result.stdout):
                    count += insert_news(feed_id, item['title'], item['link'],
                                         item['guid'], item['pub'])

                if count > 0:
                    log_info(f"Added {count} items")
                    total += count

            # Record the fetch time in localtime so it compares correctly with
            # datetime.now() above; plain datetime('now') in SQLite is UTC.
            conn = sqlite3.connect(DB_FILE)
            c = conn.cursor()
            c.execute("UPDATE feeds SET last_fetched = datetime('now', 'localtime') WHERE id = ?",
                      (feed_id,))
            conn.commit()
            conn.close()

        log_info(f"Total new items: {total}")
    finally:
        os.remove(LOCK_FILE)


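# A typical deployment (an assumption; the script does not mandate this) is a
# cron entry that runs fetch periodically, e.g.:
#   */5 * * * * cd /path/to/reader && ./rss_reader.py --fetch
# FETCH_INTERVAL_MINUTES then rate-limits each feed across overlapping runs.

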
def cmd_list(args):
    check_deps()
    init_db()

    limit = 20
    sort = "pub_date"
    order = "DESC"
    format_type = "plain"
    conditions = []
    params = []

    i = 0
    while i < len(args):
        if args[i] in ('-n', '--limit'):
            limit = int(args[i + 1])
            i += 2
        elif args[i] == '--sort':
            sort = args[i + 1]
            i += 2
        elif args[i] == '--order':
            order = args[i + 1].upper()
            i += 2
        elif args[i] == '--feed':
            conditions.append("n.feed_id = ?")
            params.append(int(args[i + 1]))
            i += 2
        elif args[i] == '--from':
            conditions.append("n.pub_date >= ?")
            params.append(args[i + 1])
            i += 2
        elif args[i] == '--to':
            conditions.append("n.pub_date <= ?")
            params.append(args[i + 1])
            i += 2
        elif args[i] == '--search':
            conditions.append("n.title LIKE ?")
            params.append(f"%{args[i + 1]}%")
            i += 2
        elif args[i] == '--digested':
            conditions.append("n.digest_flag = 1")
            i += 1
        elif args[i] == '--undigested':
            conditions.append("n.digest_flag = 0")
            i += 1
        elif args[i] == '--format':
            format_type = args[i + 1]
            i += 2
        else:
            i += 1

    # Whitelist the ORDER BY parts and bind everything user-supplied as a
    # parameter; no argument is ever interpolated into the SQL text.
    column = "n.title" if sort == "title" else "n.pub_date"
    direction = "ASC" if order == "ASC" else "DESC"
    where = " AND ".join(conditions) if conditions else "1=1"

    query = f"""
        SELECT n.id, n.feed_id, n.title, n.pub_date, n.link, n.digest_flag
        FROM news n WHERE {where}
        ORDER BY {column} {direction} LIMIT ?
    """
    params.append(limit)

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute(query, params)
    rows = c.fetchall()
    conn.close()

    if format_type == "json":
        import json
        result = [
            {'id': r[0], 'feed_id': r[1], 'title': r[2],
             'pub_date': r[3], 'link': r[4], 'digest_flag': r[5]}
            for r in rows
        ]
        print(json.dumps(result, ensure_ascii=False, indent=2))
    elif format_type == "csv":
        # csv.writer handles quoting; a plain join() breaks on commas in titles
        import csv
        writer = csv.writer(sys.stdout)
        writer.writerow(["id", "feed_id", "title", "pub_date", "link", "digest_flag"])
        writer.writerows(rows)
    else:
        for row in rows:
            print(f"{row[0]}\t{row[1]}\t{row[2][:50]}\t{row[3]}\t{row[4][:50]}\t{row[5]}")


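# Example invocations (illustrative; pub_date values compare lexically against
# 'YYYY-MM-DD HH:MM:SS' strings, so plain ISO dates work for --from/--to):
#   ./rss_reader.py --list -n 5 --search python --format json
#   ./rss_reader.py --list --from 2024-01-01 --undigested

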
def cmd_digest(news_id):
    check_deps()
    init_db()

    if not news_id:
        log_error("Missing ID")
        return

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    c.execute("UPDATE news SET digest_flag=1 WHERE id=?", (news_id,))
    if c.rowcount == 0:
        log_error(f"Not found: {news_id}")
    else:
        log_info(f"Marked {news_id} as digested")
    conn.commit()
    conn.close()


def cmd_clean(days):
    check_deps()
    init_db()

    if not days:
        log_error("Missing days")
        return

    try:
        days = int(days)
    except ValueError:
        log_error(f"Invalid days: {days}")
        return

    conn = sqlite3.connect(DB_FILE)
    c = conn.cursor()
    # Bind the datetime modifier as a parameter instead of format()-ing the
    # raw argument into the SQL text
    c.execute("DELETE FROM news WHERE pub_date < datetime('now', ?)",
              (f'-{days} days',))
    deleted = c.rowcount
    conn.commit()
    conn.close()
    log_info(f"Deleted {deleted} items")


def show_help():
    print("""
RSS Reader - Based on sebsu/RSS-reader-in-bash

Usage: rss_reader.py [CMD] [OPTIONS]

Commands:
  (default)        Run --fetch
  --fetch          Fetch all feeds
  --list           List news
  --digest ID      Mark as digested
  --clean N        Delete older than N days

Options:
  -n, --limit N    Limit (default: 20)
  --sort FIELD     date|title
  --order DIR      asc|desc
  --feed ID        Filter by feed ID
  --from DATE      From date
  --to DATE        To date
  --search WORD    Search in title
  --digested       Only digested
  --undigested     Only undigested
  --format FMT     plain|json|csv

Examples:
  rss_reader.py                  # Fetch all
  rss_reader.py --list -n 10     # List 10 items
  rss_reader.py --digest 123     # Mark 123
  rss_reader.py --clean 30       # Delete old

DB: rss.db
""")


if __name__ == "__main__":
    if len(sys.argv) == 1:
        # No arguments: fetch, as the help text documents for (default)
        cmd_fetch()
    elif sys.argv[1] in ('-h', '--help'):
        show_help()
    elif sys.argv[1] == '--fetch':
        cmd_fetch()
    elif sys.argv[1] == '--list':
        cmd_list(sys.argv[2:])
    elif sys.argv[1] == '--digest':
        cmd_digest(sys.argv[2] if len(sys.argv) > 2 else None)
    elif sys.argv[1] == '--clean':
        cmd_clean(sys.argv[2] if len(sys.argv) > 2 else None)
    else:
        log_error(f"Unknown: {sys.argv[1]}")
        show_help()