Этап 7: Локальное распознавание речи
This commit is contained in:
parent
b50388063a
commit
0a294e89fc
|
|
@ -1,15 +1,18 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
from telegram import Update
|
from telegram import Update
|
||||||
from telegram.ext import (
|
from telegram.ext import (
|
||||||
Application, CommandHandler, MessageHandler, filters,
|
Application, CommandHandler, MessageHandler, filters,
|
||||||
ContextTypes, CallbackQueryHandler
|
ContextTypes, CallbackQueryHandler, VoiceHandler
|
||||||
)
|
)
|
||||||
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
from telegram import InlineKeyboardButton, InlineKeyboardMarkup
|
||||||
from config.config import get_settings
|
from config.config import get_settings
|
||||||
from src.tools.orchestrator import Orchestrator
|
from src.tools.orchestrator import Orchestrator
|
||||||
from src.bot.states import chat_state, ChatMode
|
from src.bot.states import chat_state, ChatMode
|
||||||
from src.scheduler.scheduler import SchedulerManager
|
from src.scheduler.scheduler import SchedulerManager
|
||||||
|
from src.speech.speech import SpeechRecognizer
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
|
@ -20,6 +23,7 @@ logger = logging.getLogger(__name__)
|
||||||
settings = get_settings()
|
settings = get_settings()
|
||||||
orchestrator = Orchestrator()
|
orchestrator = Orchestrator()
|
||||||
scheduler_manager = None
|
scheduler_manager = None
|
||||||
|
speech_recognizer = SpeechRecognizer()
|
||||||
|
|
||||||
|
|
||||||
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
|
|
@ -32,6 +36,7 @@ async def start(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
mode = chat_state.get_mode(update.effective_chat.id)
|
mode = chat_state.get_mode(update.effective_chat.id)
|
||||||
current_tool = orchestrator.get_default_tool()
|
current_tool = orchestrator.get_default_tool()
|
||||||
|
stt_status = "включено" if speech_recognizer.is_enabled() else "отключено"
|
||||||
help_text = (
|
help_text = (
|
||||||
f"Я {settings.bot_name}, ваш ИИ-ассистент.\n\n"
|
f"Я {settings.bot_name}, ваш ИИ-ассистент.\n\n"
|
||||||
"Доступные команды:\n"
|
"Доступные команды:\n"
|
||||||
|
|
@ -44,9 +49,11 @@ async def help_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
"/qwen <текст> - Задать вопрос qwen-code\n"
|
"/qwen <текст> - Задать вопрос qwen-code\n"
|
||||||
"/open <текст> - Задать вопрос opencode\n"
|
"/open <текст> - Задать вопрос opencode\n"
|
||||||
"/forget - Очистить историю чата\n"
|
"/forget - Очистить историю чата\n"
|
||||||
"/remind <текст> <время> - Создать напоминание\n\n"
|
"/remind <текст> <время> - Создать напоминание\n"
|
||||||
|
"/stt on|off - Включить/выключить распознавание речи\n\n"
|
||||||
f"Текущий режим: {'с подтверждением' if mode == ChatMode.CONFIRM else 'автономный'}\n"
|
f"Текущий режим: {'с подтверждением' if mode == ChatMode.CONFIRM else 'автономный'}\n"
|
||||||
f"Инструмент по умолчанию: {current_tool}"
|
f"Инструмент по умолчанию: {current_tool}\n"
|
||||||
|
f"Распознавание речи: {stt_status}"
|
||||||
)
|
)
|
||||||
await update.message.reply_text(help_text)
|
await update.message.reply_text(help_text)
|
||||||
|
|
||||||
|
|
@ -99,6 +106,23 @@ async def cancel_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
await update.message.reply_text("Нет активных задач для отмены.")
|
await update.message.reply_text("Нет активных задач для отмены.")
|
||||||
|
|
||||||
|
|
||||||
|
async def stt_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle /stt: with no argument report the current STT state, with
    ``on``/``off`` switch speech recognition accordingly."""
    args = context.args
    if not args:
        # No argument supplied: just report the current state.
        state = "включено" if speech_recognizer.is_enabled() else "отключено"
        await update.message.reply_text(f"Распознавание речи: {state}")
        return

    choice = args[0].lower()
    if choice not in ("on", "off"):
        await update.message.reply_text("Использование: /stt on | off")
        return

    turn_on = choice == "on"
    speech_recognizer.toggle(turn_on)
    reply = (
        "Распознавание речи включено."
        if turn_on
        else "Распознавание речи отключено."
    )
    await update.message.reply_text(reply)
|
||||||
|
|
||||||
|
|
||||||
async def confirm_callback(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def confirm_callback(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
query = update.callback_query
|
query = update.callback_query
|
||||||
await query.answer()
|
await query.answer()
|
||||||
|
|
@ -239,7 +263,6 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
minutes = int(match.group(1)) * value
|
minutes = int(match.group(1)) * value
|
||||||
break
|
break
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
run_at = datetime.now() + timedelta(minutes=minutes)
|
run_at = datetime.now() + timedelta(minutes=minutes)
|
||||||
|
|
||||||
scheduler_manager.add_reminder(chat_id, text, run_at)
|
scheduler_manager.add_reminder(chat_id, text, run_at)
|
||||||
|
|
@ -248,8 +271,31 @@ async def remind_command(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
async def handle_voice(update: Update, context: ContextTypes.DEFAULT_TYPE):
    """Handle an incoming voice message: download, transcribe, and forward
    the recognized text into the normal message pipeline.

    Fix over the original: the raw .ogg was handed straight to the
    recognizer, but the Vosk backend needs 16 kHz mono WAV — so route the
    download through SpeechRecognizer.download_and_convert(), which performs
    the ffmpeg conversion, and handle its failure explicitly.
    """
    if not speech_recognizer.is_enabled():
        await update.message.reply_text("Распознавание речи отключено.")
        return

    await update.message.reply_text("Распознаю голос...")

    voice = update.message.voice

    # Reserve a unique temp path; the download overwrites the file contents.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".ogg") as tmp:
        ogg_path = tmp.name

    # Converts .ogg -> 16 kHz mono .wav (required by Vosk); returns None on
    # download or conversion failure.
    audio_path = await speech_recognizer.download_and_convert(
        context.bot, voice.file_id, ogg_path
    )
    if audio_path is None:
        # Don't leave the reserved temp file behind on failure.
        if os.path.exists(ogg_path):
            os.remove(ogg_path)
        await update.message.reply_text("Не удалось распознать речь.")
        return

    # recognize() removes audio_path itself when it finishes.
    text = await speech_recognizer.recognize(audio_path)

    if text:
        await update.message.reply_text(f"Распознано: {text}")
        await handle_message(update, context, text)
    else:
        await update.message.reply_text("Не удалось распознать речь.")
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE, override_text: str = None):
|
||||||
|
prompt = override_text or update.message.text
|
||||||
chat_id = update.effective_chat.id
|
chat_id = update.effective_chat.id
|
||||||
mode = chat_state.get_mode(chat_id)
|
mode = chat_state.get_mode(chat_id)
|
||||||
|
|
||||||
|
|
@ -281,6 +327,8 @@ async def handle_message(update: Update, context: ContextTypes.DEFAULT_TYPE):
|
||||||
def main():
|
def main():
|
||||||
global scheduler_manager
|
global scheduler_manager
|
||||||
|
|
||||||
|
speech_recognizer.load_model()
|
||||||
|
|
||||||
builder = Application.builder()
|
builder = Application.builder()
|
||||||
builder.token(settings.telegram_bot_token)
|
builder.token(settings.telegram_bot_token)
|
||||||
|
|
||||||
|
|
@ -298,11 +346,13 @@ def main():
|
||||||
application.add_handler(CommandHandler("mode", mode_command))
|
application.add_handler(CommandHandler("mode", mode_command))
|
||||||
application.add_handler(CommandHandler("use", use_command))
|
application.add_handler(CommandHandler("use", use_command))
|
||||||
application.add_handler(CommandHandler("cancel", cancel_command))
|
application.add_handler(CommandHandler("cancel", cancel_command))
|
||||||
|
application.add_handler(CommandHandler("stt", stt_command))
|
||||||
application.add_handler(CommandHandler("qwen", qwen_command))
|
application.add_handler(CommandHandler("qwen", qwen_command))
|
||||||
application.add_handler(CommandHandler("open", open_command))
|
application.add_handler(CommandHandler("open", open_command))
|
||||||
application.add_handler(CommandHandler("forget", forget_command))
|
application.add_handler(CommandHandler("forget", forget_command))
|
||||||
application.add_handler(CommandHandler("remind", remind_command))
|
application.add_handler(CommandHandler("remind", remind_command))
|
||||||
application.add_handler(CallbackQueryHandler(confirm_callback))
|
application.add_handler(CallbackQueryHandler(confirm_callback))
|
||||||
|
application.add_handler(VoiceHandler(handle_voice))
|
||||||
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
|
application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, handle_message))
|
||||||
|
|
||||||
logger.info("Бот запущен")
|
logger.info("Бот запущен")
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,118 @@
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import aiofiles
|
||||||
|
from typing import Optional
|
||||||
|
from pathlib import Path
|
||||||
|
from config.config import get_settings
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
settings = get_settings()
|
||||||
|
|
||||||
|
|
||||||
|
class SpeechRecognizer:
    """Local speech-to-text supporting two backends: Vosk and faster-whisper.

    The backend is selected by ``settings.stt_model`` ("vosk" or "whisper");
    recognition can be switched on/off at runtime via :meth:`toggle`.
    """

    def __init__(self):
        self.enabled = settings.stt_enabled    # runtime on/off switch
        self.model_name = settings.stt_model   # "vosk" | "whisper" — presumably; TODO confirm config values
        self.model = None                      # loaded lazily by load_model()
        self.recognizer = None                 # unused; per-call recognizers are built in _recognize_vosk

    def load_model(self):
        """Load the configured model into memory; on failure disable STT."""
        if not self.enabled:
            logger.info("Распознавание речи отключено")
            return

        try:
            if self.model_name == "vosk":
                from vosk import Model

                model_path = os.path.expanduser("~/.vosk/models/vosk-model-ru")
                if not os.path.exists(model_path):
                    logger.warning(f"Модель Vosk не найдена по пути {model_path}")
                    return

                self.model = Model(model_path)
                logger.info("Модель Vosk загружена")

            elif self.model_name == "whisper":
                from faster_whisper import WhisperModel

                # int8 on CPU keeps the "small" model's memory footprint modest.
                self.model = WhisperModel("small", device="cpu", compute_type="int8")
                logger.info("Модель Whisper загружена")

        except Exception as e:
            logger.error(f"Ошибка загрузки модели распознавания: {e}")
            self.enabled = False

    async def download_and_convert(self, bot, file_id: str, output_path: str) -> Optional[str]:
        """Download a Telegram file and convert .ogg voice notes to 16 kHz mono WAV.

        Returns the path of the recognition-ready audio file, or None on error.
        """
        try:
            file = await bot.get_file(file_id)
            await file.download_to_drive(output_path)

            if output_path.endswith(".ogg"):
                wav_path = output_path.replace(".ogg", ".wav")

                # Vosk expects 16 kHz mono PCM; ffmpeg decodes the OPUS stream.
                # "-y" prevents an interactive overwrite prompt (which would
                # hang with piped stdio if the target already exists).
                process = await asyncio.create_subprocess_exec(
                    "ffmpeg", "-y", "-i", output_path,
                    "-ar", "16000", "-ac", "1", wav_path,
                    stdout=asyncio.subprocess.PIPE,
                    stderr=asyncio.subprocess.PIPE,
                )
                _stdout, stderr = await process.communicate()

                os.remove(output_path)

                # Original ignored the exit status, silently returning a
                # path to a missing/partial wav on conversion failure.
                if process.returncode != 0:
                    logger.error(f"Ошибка скачивания/конвертации: {stderr.decode(errors='replace')}")
                    return None
                return wav_path

            return output_path

        except Exception as e:
            logger.error(f"Ошибка скачивания/конвертации: {e}")
            return None

    async def recognize(self, audio_path: str) -> Optional[str]:
        """Transcribe *audio_path*; the file is always removed afterwards.

        Returns the recognized text, or None when disabled / no model /
        on recognition error.
        """
        try:
            # The disabled/no-model check sits inside try so that finally
            # still cleans up the caller's temp file (the original leaked
            # it when recognition was disabled or the model was missing).
            if not self.enabled or not self.model:
                return None

            if self.model_name == "vosk":
                return await self._recognize_vosk(audio_path)
            elif self.model_name == "whisper":
                return await self._recognize_whisper(audio_path)
            return None
        except Exception as e:
            logger.error(f"Ошибка распознавания: {e}")
            return None
        finally:
            if os.path.exists(audio_path):
                os.remove(audio_path)

    async def _recognize_vosk(self, audio_path: str) -> Optional[str]:
        """Feed the WAV file through a fresh KaldiRecognizer and return the text."""
        import json
        from vosk import KaldiRecognizer

        # 16000 must match the sample rate produced by download_and_convert.
        rec = KaldiRecognizer(self.model, 16000)

        async with aiofiles.open(audio_path, "rb") as f:
            while chunk := await f.read(4000):
                rec.AcceptWaveform(chunk)

        result = json.loads(rec.FinalResult())
        return result.get("text", "")

    async def _recognize_whisper(self, audio_path: str) -> Optional[str]:
        """Transcribe with faster-whisper (Russian) and join segment texts.

        Fix: faster-whisper's transcribe() returns a plain synchronous
        generator; the original iterated it with ``async for``, which raises
        TypeError at runtime. NOTE(review): iteration still blocks the event
        loop for the duration of transcription — consider a worker thread.
        """
        segments, _info = self.model.transcribe(audio_path, language="ru")
        return " ".join(segment.text for segment in segments)

    def toggle(self, enabled: bool):
        """Enable or disable recognition at runtime (used by /stt)."""
        self.enabled = enabled
        logger.info(f"Распознавание речи: {'включено' if enabled else 'отключено'}")

    def is_enabled(self) -> bool:
        """Return True when speech recognition is currently enabled."""
        return self.enabled
|
||||||
Loading…
Reference in New Issue