Add: распознавание голосовых сообщений

- Добавлен обработчик голосовых сообщений (filters.VOICE)
- Команда /stt on|off для включения/выключения распознавания
- Голосовые конвертируются в текст через Vosk/Whisper
- Распознанный текст обрабатывается как обычное сообщение
- Модель загружается при старте бота

Co-authored-by: Qwen-Coder <qwen-coder@alibabacloud.com>
2026-03-18 20:01:34 +08:00 · 2026-03-18 20:01:34 +08:00 · 9d91a9eed4
parent 5779dd7b14
commit 9d91a9eed4
1 changed files with 116 additions and 3 deletions
--- a/src/bot/main.py
+++ b/src/bot/main.py
@ -13,6 +13,7 @@ from telegram import InlineKeyboardButton, InlineKeyboardMarkup
 from config.config import get_settings
 from src.tools.orchestrator import Orchestrator
 from src.tools.xray import get_xray_client
+from src.speech.speech import SpeechRecognizer
 from src.bot.states import chat_state, ChatMode, XRayState
 from src.bot.config_manager import get_selected_tool, get_selected_model, set_tool

@ -27,6 +28,7 @@ logging.getLogger("asyncssh").setLevel(logging.ERROR)

 settings = get_settings()
 orchestrator = Orchestrator()
+speech_recognizer = SpeechRecognizer()

 DANGEROUS_PATTERNS = [
    r'\bwrite\b', r'\bedit\b', r'\bcopy\b', r'\bmove\b', r'\bdelete\b',
@ -124,7 +126,8 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
        "/open - Выбрать модель OpenCode\n"
        "/mode confirm/auto - Режим подтверждения\n"
        "/forget - Очистить историю\n"
-        "/xray [email] - Добавить пользователя XRay\n\n"
+        "/xray [email] - Добавить пользователя XRay\n"
+        "/stt on|off - Распознавание речи\n\n"
        f"🔧 Текущая модель: {current_tool}"
    )
    if model:
@ -297,6 +300,24 @@ async def forget_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await update.message.reply_text("🗑️ История чата очищена.")


+async def stt_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """Handle /stt: report or switch voice recognition.
+
+    Without an argument, replies with the current state and a usage hint.
+    ``on`` / ``off`` (case-insensitive) is forwarded to
+    ``speech_recognizer.toggle`` as True / False and confirmed in the chat.
+    Any other argument only gets the usage hint.
+    """
+    message = update.message
+    args = context.args
+
+    if not args:
+        if speech_recognizer.is_enabled():
+            status = "включено"
+        else:
+            status = "отключено"
+        await message.reply_text(f"🎤 Распознавание речи: {status}\n\nИспользование: /stt on | off")
+        return
+
+    # Each accepted argument maps to (new state, confirmation text).
+    switches = {
+        "on": (True, "🎤 Распознавание речи включено"),
+        "off": (False, "🎤 Распознавание речи отключено"),
+    }
+    choice = switches.get(args[0].lower())
+    if choice is None:
+        await message.reply_text("Использование: /stt on | off")
+        return
+
+    enabled, confirmation = choice
+    speech_recognizer.toggle(enabled)
+    await message.reply_text(confirmation)
+
+
 async def xray_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """
    Команда /xray - добавление пользователя XRay
@ -406,6 +427,93 @@ async def handle_xray_email(update: Update, context: ContextTypes.DEFAULT_TYPE):
    await process_xray_email(update, context, email)


+async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    """Transcribe a voice message and process the transcript as text.
+
+    If recognition is disabled, replies with a hint to run /stt on and stops.
+    Otherwise the voice file is downloaded as .ogg, converted by ffmpeg to a
+    16 kHz mono .wav, and passed to ``speech_recognizer.recognize``. The
+    transcript is echoed to the chat and then forwarded to
+    ``process_text_as_message``.
+
+    Any exception raised along the way is logged with its traceback and
+    reported to the chat instead of propagating. Both temporary files are
+    removed as soon as recognition finishes, whether it succeeded or not.
+    """
+    if not speech_recognizer.is_enabled():
+        await update.message.reply_text(
+            "🎤 Распознавание речи отключено.\n"
+            "Включите командой /stt on"
+        )
+        return
+
+    chat_id = update.effective_chat.id
+
+    # Handle to the voice file on Telegram's side.
+    file = await update.message.voice.get_file()
+
+    temp_dir = "/tmp/valera_voice"
+    os.makedirs(temp_dir, exist_ok=True)
+    temp_ogg = os.path.join(temp_dir, f"{chat_id}_{file.file_id}.ogg")
+    # Swap only the extension; str.replace would also rewrite any ".ogg"
+    # that happened to occur earlier in the path.
+    wav_path = os.path.splitext(temp_ogg)[0] + ".wav"
+
+    try:
+        try:
+            await file.download_to_drive(temp_ogg)
+
+            progress_msg = await update.message.reply_text("🎤 Распознаю голосовое сообщение...")
+
+            # -y overwrites a stale .wav and stdin is detached, so ffmpeg can
+            # never sit waiting on an interactive "Overwrite?" prompt.
+            process = await asyncio.create_subprocess_exec(
+                "ffmpeg", "-y", "-i", temp_ogg, "-ar", "16000", "-ac", "1", wav_path,
+                stdin=asyncio.subprocess.DEVNULL,
+                stdout=asyncio.subprocess.DEVNULL,
+                stderr=asyncio.subprocess.PIPE,
+            )
+            _, ffmpeg_stderr = await process.communicate()
+            if process.returncode != 0:
+                # Keep the (possibly long) ffmpeg output in the log only.
+                logger.error(
+                    "ffmpeg завершился с кодом %s: %s",
+                    process.returncode,
+                    ffmpeg_stderr.decode(errors="replace")[-500:],
+                )
+                raise RuntimeError(f"ffmpeg завершился с кодом {process.returncode}")
+
+            text = await speech_recognizer.recognize(wav_path)
+        finally:
+            # Remove both the downloaded .ogg and the converted .wav.
+            for path in (temp_ogg, wav_path):
+                try:
+                    os.remove(path)
+                except FileNotFoundError:
+                    pass
+                except OSError as cleanup_error:
+                    logger.warning("Не удалось удалить %s: %s", path, cleanup_error)
+
+        if not text:
+            await progress_msg.edit_text("❌ Не удалось распознать голосовое сообщение")
+            return
+
+        await progress_msg.delete()
+
+        # Echo the transcript so the user can see what was understood.
+        await update.message.reply_text(f"🎤 Вы сказали:\n{text}")
+
+        # From here on the transcript is handled like a typed message.
+        await process_text_as_message(update, context, text)
+
+    except Exception as e:
+        # Top-level handler boundary: log with traceback, tell the user.
+        logger.exception("Ошибка распознавания голоса: %s", e)
+        await update.message.reply_text(f"❌ Ошибка распознавания: {e}")
+
+
+async def process_text_as_message(update: Update, context: ContextTypes.DEFAULT_TYPE, text: str):
+    """Process recognised text the way a typed message would be processed.
+
+    If the chat is waiting for an XRay e-mail, the stripped text is passed to
+    ``process_xray_email`` and nothing else happens. Otherwise the text is
+    sent to ``orchestrator.ask`` with the currently selected tool (and model,
+    when the tool is "opencode"), and the answer replaces a temporary
+    "thinking" message. The answer is cut to 4096 characters before sending.
+
+    The answer is first sent with Markdown parsing; if Telegram rejects it
+    with ``BadRequest`` it is re-sent as plain text.
+    """
+    # Local import: BadRequest is needed only for the Markdown fallback below.
+    from telegram.error import BadRequest
+
+    chat_id = update.effective_chat.id
+    tool = get_selected_tool()
+    model = get_selected_model() if tool == "opencode" else None
+
+    # A pending XRay dialogue takes priority over the model.
+    if chat_state.get_xray_state(chat_id) == XRayState.WAITING_EMAIL:
+        await process_xray_email(update, context, text.strip())
+        return
+
+    thinking_msg = await update.message.reply_text("🤔 Думаю...")
+
+    # NOTE(review): yolo=True is passed unconditionally and the chat's
+    # confirm/auto mode is not consulted here - confirm that voice input is
+    # meant to skip the confirmation flow.
+    result, _success = await orchestrator.ask(text, chat_id, tool, model, yolo=True)
+
+    text_result = result.strip() if result else ""
+    if not text_result:
+        text_result = "⚠️ Пустой ответ от модели."
+
+    text_result = text_result[:4096]
+    try:
+        await thinking_msg.edit_text(text_result, parse_mode="Markdown")
+    except BadRequest:
+        # Model output is not guaranteed to be valid Markdown (and truncation
+        # can split an entity); fall back to plain text so the answer is
+        # still delivered. A BadRequest with another cause re-raises here.
+        await thinking_msg.edit_text(text_result)
+
+
 async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
    prompt = update.message.text
    chat_id = update.effective_chat.id
@ -439,6 +547,9 @@ def main():

    application = builder.build()
    
+    # Загружаем модель распознавания речи
+    speech_recognizer.load_model()
+
    application.add_handler(CommandHandler("start", start))
    application.add_handler(CommandHandler("help", help_command))
    application.add_handler(CommandHandler("mode", mode_command))
@ -446,7 +557,9 @@ def main():
    application.add_handler(CommandHandler("open", open_command))
    application.add_handler(CommandHandler("forget", forget_command))
    application.add_handler(CommandHandler("xray", xray_command))
+    application.add_handler(CommandHandler("stt", stt_command))
    application.add_handler(CallbackQueryHandler(confirm_callback))
+    application.add_handler(MessageHandler(filters.VOICE, handle_voice))
    application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))

    application.run_polling(allowed_updates=Update.ALL_TYPES)