From 0a294e89fcb69cc51f7c1bc25e6c5654e1651b03 Mon Sep 17 00:00:00 2001
From: mirivlad
Date: Tue, 17 Mar 2026 03:29:11 +0800
Subject: [PATCH] =?UTF-8?q?=D0=AD=D1=82=D0=B0=D0=BF=207:=20=D0=9B=D0=BE?=
 =?UTF-8?q?=D0=BA=D0=B0=D0=BB=D1=8C=D0=BD=D0=BE=D0=B5=20=D1=80=D0=B0=D1=81?=
 =?UTF-8?q?=D0=BF=D0=BE=D0=B7=D0=BD=D0=B0=D0=B2=D0=B0=D0=BD=D0=B8=D0=B5=20?=
 =?UTF-8?q?=D1=80=D0=B5=D1=87=D0=B8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/bot/main.py        |  62 +++++++++++++++++++---
 src/speech/__init__.py |   0
 src/speech/speech.py   | 118 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 174 insertions(+), 6 deletions(-)
 create mode 100644 src/speech/__init__.py
 create mode 100644 src/speech/speech.py

diff --git a/src/bot/main.py b/src/bot/main.py
index 6d71f48..cd477ce 100644
--- a/src/bot/main.py
+++ b/src/bot/main.py
@@ -1,15 +1,18 @@
 import asyncio
 import logging
+import os
+import tempfile
 from telegram import Update
 from telegram.ext import (
     Application, CommandHandler, MessageHandler, filters,
-    ContextTypes, CallbackQueryHandler
+    ContextTypes, CallbackQueryHandler
 )
 from telegram import InlineKeyboardButton, InlineKeyboardMarkup
 from config.config import get_settings
 from src.tools.orchestrator import Orchestrator
 from src.bot.states import chat_state, ChatMode
 from src.scheduler.scheduler import SchedulerManager
+from src.speech.speech import SpeechRecognizer
 
 logging.basicConfig(
     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -20,6 +23,7 @@ logger = logging.getLogger(__name__)
 settings = get_settings()
 orchestrator = Orchestrator()
 scheduler_manager = None
+speech_recognizer = SpeechRecognizer()
 
 
 async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
@@ -32,6 +36,7 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
 async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
     mode = chat_state.get_mode(update.effective_chat.id)
current_tool = orchestrator.get_default_tool() + stt_status = "включено" if speech_recognizer.is_enabled() else "отключено" help_text = ( f"Я {settings.bot_name}, ваш ИИ-ассистент.\n\n" "Доступные команды:\n" @@ -44,9 +49,11 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE): "/qwen <текст> - Задать вопрос qwen-code\n" "/open <текст> - Задать вопрос opencode\n" "/forget - Очистить историю чата\n" - "/remind <текст> <время> - Создать напоминание\n\n" + "/remind <текст> <время> - Создать напоминание\n" + "/stt on|off - Включить/выключить распознавание речи\n\n" f"Текущий режим: {'с подтверждением' if mode == ChatMode.CONFIRM else 'автономный'}\n" - f"Инструмент по умолчанию: {current_tool}" + f"Инструмент по умолчанию: {current_tool}\n" + f"Распознавание речи: {stt_status}" ) await update.message.reply_text(help_text) @@ -99,6 +106,23 @@ async def cancel_command(update: Update, context: ContextTypes.DEFAULT_TYPE): await update.message.reply_text("Нет активных задач для отмены.") +async def stt_command(update: Update, context: ContextTypes.DEFAULT_TYPE): + if not context.args: + status = "включено" if speech_recognizer.is_enabled() else "отключено" + await update.message.reply_text(f"Распознавание речи: {status}") + return + + arg = context.args[0].lower() + if arg == "on": + speech_recognizer.toggle(True) + await update.message.reply_text("Распознавание речи включено.") + elif arg == "off": + speech_recognizer.toggle(False) + await update.message.reply_text("Распознавание речи отключено.") + else: + await update.message.reply_text("Использование: /stt on | off") + + async def confirm_callback(update: Update, context: ContextTypes.DEFAULT_TYPE): query = update.callback_query await query.answer() @@ -239,7 +263,6 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE): minutes = int(match.group(1)) * value break - from datetime import datetime run_at = datetime.now() + timedelta(minutes=minutes) 
     scheduler_manager.add_reminder(chat_id, text, run_at)
@@ -248,8 +271,31 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
     )
 
 
-async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
-    prompt = update.message.text
+async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
+    if not speech_recognizer.is_enabled():
+        await update.message.reply_text("Распознавание речи отключено.")
+        return
+
+    await update.message.reply_text("Распознаю голос...")
+
+    voice = update.message.voice
+
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".ogg") as tmp:
+        ogg_path = tmp.name
+
+    audio_path = await speech_recognizer.download_and_convert(context.bot, voice.file_id, ogg_path)
+
+    text = await speech_recognizer.recognize(audio_path) if audio_path else None
+
+    if text:
+        await update.message.reply_text(f"Распознано: {text}")
+        await handle_message(update, context, text)
+    else:
+        await update.message.reply_text("Не удалось распознать речь.")
+
+
+async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE, override_text: str = None):
+    prompt = override_text or update.message.text
     chat_id = update.effective_chat.id
     mode = chat_state.get_mode(chat_id)
@@ -281,6 +327,8 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
 def main():
     global scheduler_manager
 
+    speech_recognizer.load_model()
+
     builder = Application.builder()
     builder.token(settings.telegram_bot_token)
@@ -298,11 +346,13 @@ def main():
     application.add_handler(CommandHandler("mode", mode_command))
     application.add_handler(CommandHandler("use", use_command))
     application.add_handler(CommandHandler("cancel", cancel_command))
+    application.add_handler(CommandHandler("stt", stt_command))
     application.add_handler(CommandHandler("qwen", qwen_command))
     application.add_handler(CommandHandler("open", open_command))
     application.add_handler(CommandHandler("forget", forget_command))
     application.add_handler(CommandHandler("remind", remind_command))
     application.add_handler(CallbackQueryHandler(confirm_callback))
+    application.add_handler(MessageHandler(filters.VOICE, handle_voice))
     application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
 
     logger.info("Бот запущен")
diff --git a/src/speech/__init__.py b/src/speech/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/speech/speech.py b/src/speech/speech.py
new file mode 100644
index 0000000..71401b2
--- /dev/null
+++ b/src/speech/speech.py
@@ -0,0 +1,118 @@
+import asyncio
+import logging
+import os
+import aiofiles
+from typing import Optional
+from pathlib import Path
+from config.config import get_settings
+
+logger = logging.getLogger(__name__)
+settings = get_settings()
+
+
+class SpeechRecognizer:
+    def __init__(self):
+        self.enabled = settings.stt_enabled
+        self.model_name = settings.stt_model
+        self.model = None
+        self.recognizer = None
+
+    def load_model(self):
+        if not self.enabled:
+            logger.info("Распознавание речи отключено")
+            return
+
+        try:
+            if self.model_name == "vosk":
+                from vosk import Model, KaldiRecognizer
+                import json
+
+                model_path = os.path.expanduser("~/.vosk/models/vosk-model-ru")
+                if not os.path.exists(model_path):
+                    logger.warning(f"Модель Vosk не найдена по пути {model_path}")
+                    return
+
+                self.model = Model(model_path)
+                logger.info("Модель Vosk загружена")
+
+            elif self.model_name == "whisper":
+                from faster_whisper import WhisperModel
+
+                self.model = WhisperModel("small", device="cpu", compute_type="int8")
+                logger.info("Модель Whisper загружена")
+
+        except Exception as e:
+            logger.error(f"Ошибка загрузки модели распознавания: {e}")
+            self.enabled = False
+
+    async def download_and_convert(self, bot, file_id: str, output_path: str) -> Optional[str]:
+        try:
+            file = await bot.get_file(file_id)
+            await file.download_to_drive(output_path)
+
+            if output_path.endswith(".ogg"):
+                wav_path = output_path.replace(".ogg", ".wav")
+
+                process = await asyncio.create_subprocess_exec(
+                    "ffmpeg", "-i",
+                    output_path, "-ar", "16000", "-ac", "1", wav_path,
+                    stdout=asyncio.subprocess.PIPE,
+                    stderr=asyncio.subprocess.PIPE
+                )
+                await process.communicate()
+
+                os.remove(output_path)
+                return wav_path
+
+            return output_path
+
+        except Exception as e:
+            logger.error(f"Ошибка скачивания/конвертации: {e}")
+            return None
+
+    async def recognize(self, audio_path: str) -> Optional[str]:
+        if not self.enabled or not self.model:
+            return None
+
+        try:
+            if self.model_name == "vosk":
+                return await self._recognize_vosk(audio_path)
+            elif self.model_name == "whisper":
+                return await self._recognize_whisper(audio_path)
+        except Exception as e:
+            logger.error(f"Ошибка распознавания: {e}")
+            return None
+        finally:
+            if os.path.exists(audio_path):
+                os.remove(audio_path)
+
+    async def _recognize_vosk(self, audio_path: str) -> Optional[str]:
+        import json
+        from vosk import KaldiRecognizer
+
+        rec = KaldiRecognizer(self.model, 16000)
+
+        async with aiofiles.open(audio_path, "rb") as f:
+            while True:
+                data = await f.read(4000)
+                if not data:
+                    break
+                rec.AcceptWaveform(data)
+
+        result = json.loads(rec.FinalResult())
+        return result.get("text", "")
+
+    async def _recognize_whisper(self, audio_path: str) -> Optional[str]:
+        segments, info = self.model.transcribe(audio_path, language="ru")
+
+        text_parts = []
+        for segment in segments:
+            text_parts.append(segment.text)
+
+        return " ".join(text_parts)
+
+    def toggle(self, enabled: bool):
+        self.enabled = enabled
+        logger.info(f"Распознавание речи: {'включено' if enabled else 'отключено'}")
+
+    def is_enabled(self) -> bool:
+        return self.enabled