From 0a294e89fcb69cc51f7c1bc25e6c5654e1651b03 Mon Sep 17 00:00:00 2001
From: mirivlad
Date: Tue, 17 Mar 2026 03:29:11 +0800
Subject: [PATCH] =?UTF-8?q?=D0=AD=D1=82=D0=B0=D0=BF=207:=20=D0=9B=D0=BE?=
 =?UTF-8?q?=D0=BA=D0=B0=D0=BB=D1=8C=D0=BD=D0=BE=D0=B5=20=D1=80=D0=B0=D1=81?=
 =?UTF-8?q?=D0=BF=D0=BE=D0=B7=D0=BD=D0=B0=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20?=
 =?UTF-8?q?=D1=80=D0=B5=D1=87=D0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/bot/main.py        |  62 +++++++++++++++++++---
 src/speech/__init__.py |   0
 src/speech/speech.py   | 118 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+), 6 deletions(-)
 create mode 100644 src/speech/__init__.py
 create mode 100644 src/speech/speech.py

diff --git a/src/bot/main.py b/src/bot/main.py
index 6d71f48..cd477ce 100644
--- a/src/bot/main.py
+++ b/src/bot/main.py
@@ -1,15 +1,18 @@
 import asyncio
 import logging
+import os
+import tempfile
 from telegram import Update
 from telegram.ext import (
     Application, CommandHandler, MessageHandler, filters,
-    ContextTypes, CallbackQueryHandler
+    ContextTypes, CallbackQueryHandler
 )
 from telegram import InlineKeyboardButton, InlineKeyboardMarkup
 from config.config import get_settings
 from src.tools.orchestrator import Orchestrator
 from src.bot.states import chat_state, ChatMode
 from src.scheduler.scheduler import SchedulerManager
+from src.speech.speech import SpeechRecognizer
 
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -20,6 +23,7 @@ logger = logging.getLogger(__name__)
 settings = get_settings()
 orchestrator = Orchestrator()
 scheduler_manager = None
+speech_recognizer = SpeechRecognizer()
 
 
 async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
@@ -32,6 +36,7 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
 async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
     mode = chat_state.get_mode(update.effective_chat.id)
current_tool = orchestrator.get_default_tool() + stt_status = "включено" if speech_recognizer.is_enabled() else "отключено" help_text = ( f"Я {settings.bot_name}, ваш ИИ-ассистент.\n\n" "Доступные команды:\n" @@ -44,9 +49,11 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE): "/qwen <текст> - Задать вопрос qwen-code\n" "/open <текст> - Задать вопрос opencode\n" "/forget - Очистить историю чата\n" - "/remind <текст> <время> - Создать напоминание\n\n" + "/remind <текст> <время> - Создать напоминание\n" + "/stt on|off - Включить/выключить распознавание речи\n\n" f"Текущий режим: {'с подтверждением' if mode == ChatMode.CONFIRM else 'автономный'}\n" - f"Инструмент по умолчанию: {current_tool}" + f"Инструмент по умолчанию: {current_tool}\n" + f"Распознавание речи: {stt_status}" ) await update.message.reply_text(help_text) @@ -99,6 +106,23 @@ async def cancel_command(update: Update, context: ContextTypes.DEFAULT_TYPE): await update.message.reply_text("Нет активных задач для отмены.") +async def stt_command(update: Update, context: ContextTypes.DEFAULT_TYPE): + if not context.args: + status = "включено" if speech_recognizer.is_enabled() else "отключено" + await update.message.reply_text(f"Распознавание речи: {status}") + return + + arg = context.args[0].lower() + if arg == "on": + speech_recognizer.toggle(True) + await update.message.reply_text("Распознавание речи включено.") + elif arg == "off": + speech_recognizer.toggle(False) + await update.message.reply_text("Распознавание речи отключено.") + else: + await update.message.reply_text("Использование: /stt on | off") + + async def confirm_callback(update: Update, context: ContextTypes.DEFAULT_TYPE): query = update.callback_query await query.answer() @@ -239,7 +263,6 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE): minutes = int(match.group(1)) * value break - from datetime import datetime run_at = datetime.now() + timedelta(minutes=minutes) 
     scheduler_manager.add_reminder(chat_id, text, run_at)
@@ -248,8 +271,31 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
     )
 
 
-async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    prompt = update.message.text
+async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    if not speech_recognizer.is_enabled():
+        await update.message.reply_text("Распознавание речи отключено.")
+        return
+
+    await update.message.reply_text("Распознаю голос...")
+
+    voice = update.message.voice
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".ogg") as tmp:
+        ogg_path = tmp.name
+
+    audio_path = await speech_recognizer.download_and_convert(context.bot, voice.file_id, ogg_path)
+
+    text = await speech_recognizer.recognize(audio_path) if audio_path else None
+
+    if text:
+        await update.message.reply_text(f"Распознано: {text}")
+        await handle_message(update, context, text)
+    else:
+        await update.message.reply_text("Не удалось распознать речь.")
+
+
+async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE, override_text: str = None):
+    prompt = override_text or update.message.text
     chat_id = update.effective_chat.id
     mode = chat_state.get_mode(chat_id)
@@ -281,6 +327,8 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
 def main():
     global scheduler_manager
 
+    speech_recognizer.load_model()
+
     builder = Application.builder()
     builder.token(settings.telegram_bot_token)
@@ -298,11 +346,13 @@ def main():
     application.add_handler(CommandHandler("mode", mode_command))
     application.add_handler(CommandHandler("use", use_command))
     application.add_handler(CommandHandler("cancel", cancel_command))
+    application.add_handler(CommandHandler("stt", stt_command))
     application.add_handler(CommandHandler("qwen", qwen_command))
     application.add_handler(CommandHandler("open", open_command))
     application.add_handler(CommandHandler("forget", forget_command))
     application.add_handler(CommandHandler("remind", remind_command))
     application.add_handler(CallbackQueryHandler(confirm_callback))
+    application.add_handler(MessageHandler(filters.VOICE, handle_voice))
     application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
 
     logger.info("Бот запущен")
diff --git a/src/speech/__init__.py b/src/speech/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/speech/speech.py b/src/speech/speech.py
new file mode 100644
index 0000000..71401b2
--- /dev/null
+++ b/src/speech/speech.py
@@ -0,0 +1,118 @@
+import asyncio
+import logging
+import os
+import aiofiles
+from typing import Optional
+from pathlib import Path
+from config.config import get_settings
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+
+class SpeechRecognizer:
+    def __init__(self):
+        self.enabled = settings.stt_enabled
+        self.model_name = settings.stt_model
+        self.model = None
+        self.recognizer = None
+
+    def load_model(self):
+        if not self.enabled:
+            logger.info("Распознавание речи отключено")
+            return
+
+        try:
+            if self.model_name == "vosk":
+                from vosk import Model, KaldiRecognizer
+                import json
+
+                model_path = os.path.expanduser("~/.vosk/models/vosk-model-ru")
+                if not os.path.exists(model_path):
+                    logger.warning(f"Модель Vosk не найдена по пути {model_path}")
+                    return
+
+                self.model = Model(model_path)
+                logger.info("Модель Vosk загружена")
+
+            elif self.model_name == "whisper":
+                from faster_whisper import WhisperModel
+
+                self.model = WhisperModel("small", device="cpu", compute_type="int8")
+                logger.info("Модель Whisper загружена")
+
+        except Exception as e:
+            logger.error(f"Ошибка загрузки модели распознавания: {e}")
+            self.enabled = False
+
+    async def download_and_convert(self, bot, file_id: str, output_path: str) -> Optional[str]:
+        try:
+            file = await bot.get_file(file_id)
+            await file.download_to_drive(output_path)
+
+            if output_path.endswith(".ogg"):
+                wav_path = output_path.replace(".ogg", ".wav")
+
+                process = await asyncio.create_subprocess_exec(
+                    "ffmpeg", "-i",
+                    output_path, "-ar", "16000", "-ac", "1", wav_path,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.PIPE
+                )
+                await process.communicate()
+
+                os.remove(output_path)
+                return wav_path
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Ошибка скачивания/конвертации: {e}")
+            return None
+
+    async def recognize(self, audio_path: str) -> Optional[str]:
+        if not self.enabled or not self.model:
+            return None
+
+        try:
+            if self.model_name == "vosk":
+                return await self._recognize_vosk(audio_path)
+            elif self.model_name == "whisper":
+                return await self._recognize_whisper(audio_path)
+        except Exception as e:
+            logger.error(f"Ошибка распознавания: {e}")
+            return None
+        finally:
+            if os.path.exists(audio_path):
+                os.remove(audio_path)
+
+    async def _recognize_vosk(self, audio_path: str) -> Optional[str]:
+        import json
+        from vosk import KaldiRecognizer
+
+        rec = KaldiRecognizer(self.model, 16000)
+
+        async with aiofiles.open(audio_path, "rb") as f:
+            while True:
+                data = await f.read(4000)
+                if not data:
+                    break
+                rec.AcceptWaveform(data)
+
+        result = json.loads(rec.FinalResult())
+        return result.get("text", "")
+
+    async def _recognize_whisper(self, audio_path: str) -> Optional[str]:
+        segments, info = self.model.transcribe(audio_path, language="ru")
+
+        text_parts = []
+        for segment in segments:
+            text_parts.append(segment.text)
+
+        return " ".join(text_parts)
+
+    def toggle(self, enabled: bool):
+        self.enabled = enabled
+        logger.info(f"Распознавание речи: {'включено' if enabled else 'отключено'}")
+
+    def is_enabled(self) -> bool:
+        return self.enabled