add tts support

turn_wind 2025-03-07 22:17:52 +08:00
parent 3e7a411132
commit 7567f882a2
7 changed files with 539 additions and 254 deletions

index.html

@@ -153,6 +153,25 @@
     color: #721c24;
     border: 1px solid #f5c6cb;
 }
+.audio-controls {
+    margin-top: 15px;
+    display: flex;
+    align-items: center;
+    gap: 10px;
+}
+.voice-selector {
+    display: flex;
+    align-items: center;
+    gap: 10px;
+    margin-bottom: 10px;
+}
+#audio-player {
+    width: 100%;
+    margin-top: 10px;
+}
 </style>
 </head>
 <body>
@@ -204,11 +223,35 @@
 <div class="explanation-container">
     <div class="explanation-header">
         <h5>AI讲解</h5>
+        <div class="d-flex gap-2">
         <button id="explain-btn" class="btn btn-sm btn-primary">生成讲解</button>
+        <button id="play-btn" class="btn btn-sm btn-success" disabled>
+            <i class="bi bi-play-fill"></i> 播放
+        </button>
         </div>
+    </div>
+    <div class="voice-selector">
+        <label for="voice-select" class="form-label mb-0">语音:</label>
+        <select id="voice-select" class="form-select form-select-sm">
+            <option value="zf_xiaoxiao">小小 (女)</option>
+            <option value="zf_xiaoni">小妮 (女)</option>
+            <option value="zf_xiaoyi">小怡 (女)</option>
+            <option value="zf_xiaobei">小贝 (女)</option>
+            <option value="zm_yunxi">云熙 (男)</option>
+            <option value="zm_yunyang">云扬 (男)</option>
+        </select>
+        <label for="speed-range" class="form-label mb-0 ms-2">语速:</label>
+        <input type="range" class="form-range" id="speed-range" min="0.5" max="2.0" step="0.1" value="1.0" style="width: 80px;">
+        <span id="speed-value">1.0</span>
+    </div>
     <div id="explanation-text" class="p-3">
         点击"生成讲解"按钮AI将为您讲解当前页面的内容。
     </div>
+    <audio id="audio-player" controls style="display: none;"></audio>
 </div>
 </div>
 </div>
@@ -216,8 +259,9 @@
 <div id="status-message"></div>
-<!-- Load Bootstrap JS -->
+<!-- Load Bootstrap JS and Icons -->
 <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/bootstrap-icons@1.10.0/font/bootstrap-icons.css">
 <script type="module" src="js/main.js"></script>
 </body>
 </html>

js/main.js

@@ -13,10 +13,6 @@ class AITeacherApp {
     this.ctx = this.canvas.getContext('2d');
     this.messageTimeout = null;
-    // Explanation state
-    this.explanationsGenerated = false;
-    this.explanationsGenerating = false;
     // Live2D controller
     this.live2dController = null;
@@ -27,6 +23,12 @@ class AITeacherApp {
     this.global_setting = null;
     this.api_host = null;
+    // Audio-related state
+    this.audioPlayer = null;
+    this.currentAudioBase64 = null;
+    this.selectedVoice = 'zf_xiaoxiao';
+    this.speechSpeed = 1.0;
     this.init();
 }
@@ -48,6 +50,12 @@
     console.error('初始化Live2D控制器时出错:', error);
 }
+// Initialize the audio player
+this.audioPlayer = document.getElementById('audio-player');
+// Initialize the voice and speed controls
+this.initVoiceControls();
 await this.loadDefaultPDF();
 this.setupEventListeners();
@@ -60,24 +68,22 @@
 async loadDefaultPDF() {
     try {
-        const defaultPdfPath = 'pdf/VLA4RM-仿生智能.pdf';
+        const defaultPdfPath = './public/pdf/test.pdf';
         const loadingTask = pdfjsLib.getDocument(defaultPdfPath);
         this.pdfDoc = await loadingTask.promise;
         document.getElementById('page-count').textContent = this.pdfDoc.numPages;
         this.renderPage(this.pageNum);
-        // Trigger server-side PDF loading and explanation generation
-        this.triggerServerPdfLoad(defaultPdfPath);
+        // Notify the server to load the PDF
+        this.notifyServerPdfLoad(defaultPdfPath);
     } catch (error) {
         console.error('加载PDF时出错:', error);
         this.showMessage('PDF加载失败: ' + error.message, true);
     }
 }

-async triggerServerPdfLoad(pdfPath) {
+async notifyServerPdfLoad(pdfPath) {
     try {
-        this.explanationsGenerating = true;
-        this.showMessage('正在生成所有页面的讲解,请稍候...', false);
         const response = await fetch(`http://${this.api_host}/api/load_pdf`, {
             method: 'POST',
             headers: {
@@ -95,49 +101,12 @@
     if (data.success) {
         this.showMessage(data.message, false);
-        // Start polling the explanation generation status
-        this.pollExplanationStatus();
     } else {
         this.showMessage(data.message, true);
     }
 } catch (error) {
-    console.error('触发服务器PDF加载时出错:', error);
-    this.showMessage('触发服务器PDF加载时出错: ' + error.message, true);
-    this.explanationsGenerating = false;
+    console.error('通知服务器加载PDF时出错:', error);
+    this.showMessage('通知服务器加载PDF时出错: ' + error.message, true);
 }
 }

-async pollExplanationStatus() {
-    // Stop polling if generation has finished or is not in progress
-    if (!this.explanationsGenerating) return;
-    try {
-        const response = await fetch('/api/explanation_status');
-        if (!response.ok) {
-            throw new Error('服务器响应错误');
-        }
-        const data = await response.json();
-        console.log('讲解状态:', data);
-        if (data.is_complete) {
-            this.explanationsGenerated = true;
-            this.explanationsGenerating = false;
-            this.showMessage(`所有 ${data.total_pages} 页的讲解已生成完毕`, false);
-            // Fetch the explanation for the current page
-            this.fetchExplanationForCurrentPage();
-        } else {
-            // Update the generation progress
-            const progress = Math.round((data.explanations_generated / data.total_pages) * 100);
-            this.showMessage(`讲解生成中: ${progress}% (${data.explanations_generated}/${data.total_pages})`, false);
-            // Keep polling
-            setTimeout(() => this.pollExplanationStatus(), 2000);
-        }
-    } catch (error) {
-        console.error('轮询讲解状态时出错:', error);
-        this.explanationsGenerating = false;
-    }
-}
@@ -162,59 +131,16 @@
     this.pageNumPending = null;
 }
-// After the page finishes rendering, fetch its explanation
-this.fetchExplanationForCurrentPage();
+// Clear the explanation area and stop audio playback
+document.getElementById('explanation-text').textContent = '点击"生成讲解"按钮获取AI讲解';
+this.stopAudio();
+document.getElementById('play-btn').disabled = true;
 });
 });
 document.getElementById('page-num').value = num;
 }

-async fetchExplanationForCurrentPage() {
-    // If explanations have not finished generating, fall back to on-demand generation
-    if (!this.explanationsGenerated && !this.explanationsGenerating) {
-        this.onExplain();
-        return;
-    }
-    // If explanations are still being generated, show a waiting message
-    if (this.explanationsGenerating) {
-        document.getElementById('explanation-text').textContent = '正在生成讲解,请稍候...';
-        return;
-    }
-    try {
-        // Show a loading message
-        document.getElementById('explanation-text').textContent = '正在获取讲解...';
-        // Fetch the pre-generated explanation from the server
-        const response = await fetch(`/api/get_explanation/${this.pageNum}`);
-        if (!response.ok) {
-            throw new Error('服务器响应错误');
-        }
-        const data = await response.json();
-        if (data.success) {
-            document.getElementById('explanation-text').textContent = data.explanation;
-            // If the Live2D controller is initialized, play the talking motion
-            if (this.live2dController && this.live2dController.initialized) {
-                this.live2dController.playMotion('Talk', 0);
-            }
-        } else {
-            // If no pre-generated explanation exists, fall back to on-demand generation
-            this.onExplain();
-        }
-    } catch (error) {
-        console.error('获取预生成讲解时出错:', error);
-        document.getElementById('explanation-text').textContent = '获取讲解时出错: ' + error.message;
-        // Try on-demand generation instead
-        this.onExplain();
-    }
-}

 queueRenderPage(num) {
     if (this.pageRendering) {
         this.pageNumPending = num;
@@ -277,13 +203,6 @@
     document.getElementById('page-count').textContent = this.pdfDoc.numPages;
     this.renderPage(this.pageNum);
     this.showMessage('PDF加载成功', false);
-    // Reset the explanation state
-    this.explanationsGenerated = false;
-    this.explanationsGenerating = false;
-    // For uploaded files we do not trigger server-side explanation generation for now,
-    // since the server needs file access and uploads exist only on the client
 } catch (error) {
     console.error('加载PDF时出错:', error);
     this.showMessage('PDF加载失败: ' + error.message, true);
@@ -298,23 +217,25 @@
 async onExplain() {
     try {
-        // Get the text content of the current page
-        const page = await this.pdfDoc.getPage(this.pageNum);
-        const textContent = await page.getTextContent();
-        const pageText = textContent.items.map(item => item.str).join(' ');
         // Show a loading message
         document.getElementById('explanation-text').textContent = '正在生成AI讲解...';
+        document.getElementById('play-btn').disabled = true;
+        this.stopAudio();

-        // Send to the server to get the AI explanation
-        const response = await fetch('/api/explain', {
+        // Get the currently selected voice and speed
+        const voice = this.selectedVoice;
+        const speed = this.speechSpeed;
+
+        // Send to the server to get the AI explanation and audio
+        const response = await fetch(`http://${this.api_host}/api/explain_with_audio`, {
             method: 'POST',
             headers: {
                 'Content-Type': 'application/json'
             },
             body: JSON.stringify({
-                text: pageText,
-                page: this.pageNum
+                page: this.pageNum,
+                voice: voice,
+                speed: speed
             })
         });
@@ -323,12 +244,28 @@
 }
 const data = await response.json();
+if (data.success) {
     document.getElementById('explanation-text').textContent = data.explanation;
+    // If audio data is present, enable the play button and start playback
+    if (data.audio_base64) {
+        this.currentAudioBase64 = data.audio_base64;
+        document.getElementById('play-btn').disabled = false;
+        this.playAudio();
+    } else if (data.tts_error) {
+        console.error('TTS生成失败:', data.tts_error);
+        this.showMessage('语音生成失败,但文本讲解已生成', true);
+    }
     // If the Live2D controller is initialized, play the talking motion
     if (this.live2dController && this.live2dController.initialized) {
         this.live2dController.playMotion('Talk', 0);
     }
+} else {
+    document.getElementById('explanation-text').textContent = data.explanation || '生成讲解失败';
+    this.showMessage('生成讲解失败', true);
+}
 } catch (error) {
     console.error('获取AI讲解时出错:', error);
     document.getElementById('explanation-text').textContent = '获取AI讲解时出错: ' + error.message;
@@ -336,6 +273,127 @@ class AITeacherApp {
 }
 }

+playAudio() {
+    if (!this.currentAudioBase64) {
+        this.showMessage('没有可播放的音频', true);
+        return;
+    }
+    try {
+        // Convert the base64 string to a Blob
+        const byteCharacters = atob(this.currentAudioBase64);
+        const byteNumbers = new Array(byteCharacters.length);
+        for (let i = 0; i < byteCharacters.length; i++) {
+            byteNumbers[i] = byteCharacters.charCodeAt(i);
+        }
+        const byteArray = new Uint8Array(byteNumbers);
+        const blob = new Blob([byteArray], { type: 'audio/wav' });
+
+        // Create an object URL and hand it to the audio player
+        const audioUrl = URL.createObjectURL(blob);
+        this.audioPlayer.src = audioUrl;
+        this.audioPlayer.style.display = 'block';
+
+        // Start playback
+        this.audioPlayer.play();
+
+        // Update the play button state
+        const playBtn = document.getElementById('play-btn');
+        playBtn.innerHTML = '<i class="bi bi-pause-fill"></i> 暂停';
+        playBtn.classList.remove('btn-success');
+        playBtn.classList.add('btn-warning');
+
+        // Reset the button when playback ends
+        this.audioPlayer.onended = () => {
+            playBtn.innerHTML = '<i class="bi bi-play-fill"></i> 播放';
+            playBtn.classList.remove('btn-warning');
+            playBtn.classList.add('btn-success');
+        };
+    } catch (error) {
+        console.error('播放音频时出错:', error);
+        this.showMessage('播放音频时出错: ' + error.message, true);
+    }
+}
+
+stopAudio() {
+    if (this.audioPlayer) {
+        this.audioPlayer.pause();
+        this.audioPlayer.currentTime = 0;
+        this.audioPlayer.style.display = 'none';
+
+        // Update the play button state
+        const playBtn = document.getElementById('play-btn');
+        playBtn.innerHTML = '<i class="bi bi-play-fill"></i> 播放';
+        playBtn.classList.remove('btn-warning');
+        playBtn.classList.add('btn-success');
+    }
+}
+
+toggleAudio() {
+    if (this.audioPlayer.paused) {
+        this.audioPlayer.play();
+        document.getElementById('play-btn').innerHTML = '<i class="bi bi-pause-fill"></i> 暂停';
+        document.getElementById('play-btn').classList.remove('btn-success');
+        document.getElementById('play-btn').classList.add('btn-warning');
+    } else {
+        this.audioPlayer.pause();
+        document.getElementById('play-btn').innerHTML = '<i class="bi bi-play-fill"></i> 播放';
+        document.getElementById('play-btn').classList.remove('btn-warning');
+        document.getElementById('play-btn').classList.add('btn-success');
+    }
+}
+
+initVoiceControls() {
+    // Initialize the voice selector
+    const voiceSelect = document.getElementById('voice-select');
+    voiceSelect.addEventListener('change', () => {
+        this.selectedVoice = voiceSelect.value;
+    });
+
+    // Initialize the speed control
+    const speedRange = document.getElementById('speed-range');
+    const speedValue = document.getElementById('speed-value');
+    speedRange.addEventListener('input', () => {
+        this.speechSpeed = parseFloat(speedRange.value);
+        speedValue.textContent = this.speechSpeed.toFixed(1);
+    });
+
+    // Set the initial values
+    this.selectedVoice = voiceSelect.value;
+    this.speechSpeed = parseFloat(speedRange.value);
+    speedValue.textContent = this.speechSpeed.toFixed(1);
+}
+
+async loadVoices() {
+    try {
+        const response = await fetch(`http://${this.api_host}/api/voices`);
+        if (!response.ok) {
+            throw new Error('获取语音列表失败');
+        }
+        const data = await response.json();
+        if (data.success && data.voices && data.voices.length > 0) {
+            const voiceSelect = document.getElementById('voice-select');
+            // Clear the existing options
+            voiceSelect.innerHTML = '';
+            // Add the new options
+            data.voices.forEach(voice => {
+                const option = document.createElement('option');
+                option.value = voice.id;
+                option.textContent = `${voice.name} (${voice.gender === 'female' ? '女' : '男'})`;
+                voiceSelect.appendChild(option);
+            });
+            // Update the selected voice
+            this.selectedVoice = voiceSelect.value;
+        }
+    } catch (error) {
+        console.error('加载语音列表时出错:', error);
+    }
+}

 showMessage(message, isError = false) {
     const statusMessage = document.getElementById('status-message');
@@ -363,10 +421,14 @@
 document.getElementById('zoom-reset').addEventListener('click', () => this.onZoomReset());
 document.getElementById('pdf-upload').addEventListener('change', (e) => this.onFileUpload(e));
 document.getElementById('explain-btn').addEventListener('click', () => this.onExplain());
+document.getElementById('play-btn').addEventListener('click', () => this.toggleAudio());
 document.getElementById('model-select').addEventListener('change', () => {
     const modelName = document.getElementById('model-select').value;
     this.live2dController.loadModel(modelName);
 });
+// Try to load the list of available voices
+this.loadVoices();
 }
 }
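The client half of this flow reduces to one POST and a base64 decode. A minimal sketch of the same exchange from Python, handy for testing the endpoint without the browser UI; the host and port are assumptions, adjust them to wherever server.py is running:

import base64
import requests

API_HOST = "localhost:5000"  # assumption: not specified in the commit

# Mirrors the body that onExplain() sends to /api/explain_with_audio
resp = requests.post(
    f"http://{API_HOST}/api/explain_with_audio",
    json={"page": 1, "voice": "zf_xiaoxiao", "speed": 1.0},
)
data = resp.json()
print(data["explanation"])

# audio_base64 is None when TTS failed; tts_error then carries the reason
if data.get("audio_base64"):
    with open("page1.wav", "wb") as f:
        f.write(base64.b64decode(data["audio_base64"]))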

public/pdf/test.pdf (new binary file, not shown)

received_audio.wav (new binary file, not shown)

requirements.txt

@@ -30,7 +30,7 @@ networkx==3.4.2
 nibabel==5.3.2
 nipype==1.9.2
 numpy==2.2.3
-openai==1.3.0
+openai
 packaging==24.2
 pandas==2.2.3
 pathlib==1.0.1
@@ -64,3 +64,5 @@ uvicorn==0.34.0
 websockets==10.4
 Werkzeug==2.2.3
 wheel==0.45.1
+soundfile
+IPython

server.py

@@ -1,11 +1,12 @@
 import os
 import json
 import logging
-import asyncio
+import requests
+import base64
 from flask import Flask, request, jsonify, send_from_directory
 from flask_cors import CORS
 import openai
-import fitz  # PyMuPDF
+import PyPDF2
 from dotenv import load_dotenv

 # Load environment variables
@@ -26,6 +27,9 @@ logger = logging.getLogger(__name__)
 openai_api_key = "sk-95ab48a1e0754ad39c13e2987f73fe37"
 openai_base_url = "https://api.deepseek.com"

+# TTS API endpoint
+TTS_BASE_URL = "http://feng-arch.cn:31006"

 if not openai_api_key:
     logger.warning("OpenAI API key not found. AI explanation will use fallback mode.")
@@ -41,31 +45,30 @@
 app = Flask(__name__, static_url_path='')
 CORS(app)

-# Store the PDF document content and the generated explanations
-pdf_content = {
-    "full_text": "",
-    "pages": [],
-    "explanations": []
-}
+# Store the path of the currently loaded PDF
+current_pdf_path = None

-def extract_pdf_text(pdf_path):
-    """Extract the full text of a PDF document, plus the text of each page"""
+def extract_page_text(pdf_path, page_num):
+    """Extract the text content of a given page of a PDF document"""
     try:
-        doc = fitz.open(pdf_path)
-        full_text = ""
-        pages = []
-        for page_num in range(len(doc)):
-            page = doc.load_page(page_num)
-            page_text = page.get_text()
-            pages.append(page_text)
-            full_text += f"\n--- 第{page_num+1}页 ---\n{page_text}"
+        with open(pdf_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+
+            # Check that the page number is valid
+            if page_num < 1 or page_num > len(reader.pages):
+                return {
+                    "success": False,
+                    "error": f"无效的页码: {page_num}PDF共有 {len(reader.pages)}"
+                }
+
+            # Extract the text of the requested page
+            page = reader.pages[page_num - 1]  # page numbers are 1-based, indices 0-based
+            page_text = page.extract_text()

         return {
             "success": True,
-            "full_text": full_text,
-            "pages": pages,
-            "page_count": len(doc)
+            "page_text": page_text,
+            "page_count": len(reader.pages)
         }
     except Exception as e:
         logger.error(f"Error extracting PDF text: {e}")
@@ -74,63 +77,18 @@
             "error": str(e)
         }

-async def generate_explanations_for_all_pages(full_text, pages):
-    """Generate explanations for every page"""
-    explanations = []
-    client = openai.OpenAI(api_key=openai_api_key, base_url=openai_base_url)
-
-    # First let the LLM digest the whole document
-    try:
-        logger.info("Generating context understanding from full document...")
-        context_response = client.chat.completions.create(
-            model="deepseek-chat",
-            messages=[
-                {"role": "system", "content": "你是一位专业的教师需要理解整个PDF文档的内容以便后续为每一页生成讲解。"},
-                {"role": "user", "content": f"请阅读并理解以下PDF文档的全部内容不需要回复具体内容只需要理解\n\n{full_text}"}
-            ]
-        )
-        context_understanding = context_response.choices[0].message.content.strip()
-        logger.info("Context understanding generated successfully")
-    except Exception as e:
-        logger.error(f"Error generating context understanding: {e}")
-        context_understanding = "无法生成文档理解,将基于单页内容生成讲解。"
-
-    # Generate an explanation for each page
-    for i, page_text in enumerate(pages):
-        try:
-            logger.info(f"Generating explanation for page {i+1}...")
-            response = client.chat.completions.create(
-                model="deepseek-chat",
-                messages=[
-                    {"role": "system", "content": f"你是一位专业的教师正在为学生讲解PDF文档内容。你已经理解了整个文档的内容现在需要为第{i+1}页生成简洁的讲解。请提供清晰、简洁的解释,重点突出关键概念。你的讲解应该考虑到整个文档的上下文,而不仅仅是孤立地解释当前页面。"},
-                    {"role": "user", "content": f"基于你对整个文档的理解,请为第{i+1}页生成简洁的讲解:\n\n{page_text}"}
-                ]
-            )
-            explanation = response.choices[0].message.content.strip()
-            explanations.append(explanation)
-            logger.info(f"Explanation for page {i+1} generated successfully")
-        except Exception as e:
-            logger.error(f"Error generating explanation for page {i+1}: {e}")
-            explanations.append(f"生成第{i+1}页讲解时出错: {str(e)}")
-
-    return explanations

-def generate_explanation(page_text, page_num=None):
+def generate_explanation(page_text):
     """Generate an explanation for a single page"""
     if not openai_api_key:
         return "这是一个示例讲解。请设置OpenAI API密钥以获取真实的AI讲解内容。"

-    # If a pre-generated explanation already exists, return it directly
-    if pdf_content["explanations"] and page_num is not None and 0 <= page_num-1 < len(pdf_content["explanations"]):
-        return pdf_content["explanations"][page_num-1]

     try:
         client = openai.OpenAI(api_key=openai_api_key, base_url=openai_base_url)
         response = client.chat.completions.create(
             model="deepseek-chat",
             messages=[
                 {"role": "system", "content": "你是一位专业的教师正在为学生讲解PDF文档内容。请提供清晰、简洁的解释重点突出关键概念。"},
-                {"role": "user", "content": f"请讲解以下内容:\n\n{page_text}"}
+                {"role": "user", "content": f"请讲解以下内容:\n\n{page_text},你的输出应符合讲稿的风格,句子间连贯。"}
             ]
         )
         return response.choices[0].message.content.strip()
@@ -138,6 +96,40 @@ def generate_explanation(page_text):
         logger.error(f"Error generating explanation: {e}")
         return f"生成讲解时出错: {str(e)}"

+def text_to_speech(text, voice="zf_xiaoxiao", speed=1.0):
+    """Convert text to speech; returns base64-encoded audio data"""
+    try:
+        url = f"{TTS_BASE_URL}/tts"
+        payload = {
+            "text": text,
+            "voice": voice,
+            "speed": speed,
+            "return_type": "base64"
+        }
+        response = requests.post(url, json=payload)
+
+        if response.status_code == 200:
+            data = response.json()
+            # Get the base64-encoded audio
+            audio_base64 = data.get("audio_base64")
+            return {
+                "success": True,
+                "audio_base64": audio_base64
+            }
+        else:
+            logger.error(f"TTS API error: {response.status_code} - {response.text}")
+            return {
+                "success": False,
+                "error": f"TTS API error: {response.status_code}"
+            }
+    except Exception as e:
+        logger.error(f"Error in text_to_speech: {e}")
+        return {
+            "success": False,
+            "error": str(e)
+        }

 @app.route('/')
 def index():
     return send_from_directory('', 'index.html')
@@ -152,86 +144,153 @@ def explain():
     text = data.get('text', '')
     page_num = data.get('page', None)

-    explanation = generate_explanation(text, page_num)
-    return jsonify({'explanation': explanation})
+    # If a page number is given without text, try extracting it from the PDF
+    if page_num and not text and current_pdf_path:
+        result = extract_page_text(current_pdf_path, page_num)
+        if result["success"]:
+            text = result["page_text"]
+        else:
+            return jsonify({
+                'success': False,
+                'explanation': f"无法提取页面文本: {result['error']}"
+            })
+
+    explanation = generate_explanation(text)
+    return jsonify({
+        'success': True,
+        'explanation': explanation
+    })
+@app.route('/api/tts', methods=['POST'])
+def tts():
+    data = request.json
+    text = data.get('text', '')
+    voice = data.get('voice', 'zf_xiaoxiao')
+    speed = data.get('speed', 1.0)
+
+    if not text:
+        return jsonify({
+            'success': False,
+            'error': '文本不能为空'
+        })
+
+    # Convert the text to speech
+    result = text_to_speech(text, voice, speed)
+
+    if result["success"]:
+        return jsonify({
+            'success': True,
+            'audio_base64': result["audio_base64"]
+        })
+    else:
+        return jsonify({
+            'success': False,
+            'error': result["error"]
+        })
+
+@app.route('/api/explain_with_audio', methods=['POST'])
+def explain_with_audio():
+    data = request.json
+    text = data.get('text', '')
+    page_num = data.get('page', None)
+    voice = data.get('voice', 'zf_xiaoxiao')
+    speed = data.get('speed', 1.0)
+
+    # If a page number is given without text, try extracting it from the PDF
+    if page_num and not text and current_pdf_path:
+        result = extract_page_text(current_pdf_path, page_num)
+        if result["success"]:
+            text = result["page_text"]
+        else:
+            return jsonify({
+                'success': False,
+                'explanation': f"无法提取页面文本: {result['error']}",
+                'error': result["error"]
+            })
+
+    # Generate the explanation
+    explanation = generate_explanation(text)
+
+    # Convert the explanation to speech
+    tts_result = text_to_speech(explanation, voice, speed)
+
+    if tts_result["success"]:
+        return jsonify({
+            'success': True,
+            'explanation': explanation,
+            'audio_base64': tts_result["audio_base64"]
+        })
+    else:
+        return jsonify({
+            'success': True,
+            'explanation': explanation,
+            'audio_base64': None,
+            'tts_error': tts_result["error"]
+        })
 @app.route('/api/load_pdf', methods=['POST'])
 def load_pdf():
+    global current_pdf_path
     data = request.json
-    pdf_path = data.get('path', './public/pdf/VLA4RM-仿生智能.pdf')
+    pdf_path = data.get('path', './public/pdf/test.pdf')

-    # Extract the PDF text
-    result = extract_pdf_text(pdf_path)
+    try:
+        # Check that the PDF exists
+        if not os.path.exists(pdf_path):
+            return jsonify({
+                'success': False,
+                'message': f'PDF文件不存在: {pdf_path}'
+            })

-    if result["success"]:
-        # Update the global PDF content
-        pdf_content["full_text"] = result["full_text"]
-        pdf_content["pages"] = result["pages"]
-
-        # Generate explanations for all pages asynchronously
-        async def process_explanations():
-            explanations = await generate_explanations_for_all_pages(
-                result["full_text"],
-                result["pages"]
-            )
-            pdf_content["explanations"] = explanations
-            logger.info(f"Generated explanations for all {len(explanations)} pages")
-
-        # Kick off the async task
-        asyncio.run(process_explanations())
+        # Try opening the PDF to verify that it is valid
+        with open(pdf_path, 'rb') as file:
+            reader = PyPDF2.PdfReader(file)
+            page_count = len(reader.pages)
+
+        # Update the current PDF path
+        current_pdf_path = pdf_path

         return jsonify({
             'success': True,
-            'message': '已加载PDF并开始生成讲解',
-            'page_count': result["page_count"]
+            'message': '已成功加载PDF',
+            'page_count': page_count
         })
-    else:
+    except Exception as e:
+        logger.error(f"Error loading PDF: {e}")
         return jsonify({
             'success': False,
-            'message': f'加载PDF失败: {result["error"]}'
+            'message': f'加载PDF失败: {str(e)}'
         })
-@app.route('/api/get_explanation/<int:page_num>', methods=['GET'])
-def get_explanation(page_num):
-    if 0 <= page_num-1 < len(pdf_content["explanations"]):
-        return jsonify({
-            'success': True,
-            'explanation': pdf_content["explanations"][page_num-1]
-        })
-    else:
-        return jsonify({
-            'success': False,
-            'message': f'页码 {page_num} 的讲解不存在'
-        })
-
-@app.route('/api/explanation_status', methods=['GET'])
-def explanation_status():
-    return jsonify({
-        'total_pages': len(pdf_content["pages"]),
-        'explanations_generated': len(pdf_content["explanations"]),
-        'is_complete': len(pdf_content["pages"]) > 0 and len(pdf_content["pages"]) == len(pdf_content["explanations"])
-    })
+@app.route('/api/voices', methods=['GET'])
+def get_voices():
+    """Return the list of available TTS voices"""
+    voices = [
+        {"id": "zf_xiaoxiao", "name": "小小", "gender": "female", "lang": "zh"},
+        {"id": "zf_xiaoni", "name": "小妮", "gender": "female", "lang": "zh"},
+        {"id": "zf_xiaoyi", "name": "小怡", "gender": "female", "lang": "zh"},
+        {"id": "zf_xiaobei", "name": "小贝", "gender": "female", "lang": "zh"},
+        {"id": "zm_yunxi", "name": "云熙", "gender": "male", "lang": "zh"},
+        {"id": "zm_yunyang", "name": "云扬", "gender": "male", "lang": "zh"},
+        {"id": "zm_yunxia", "name": "云夏", "gender": "male", "lang": "zh"},
+        {"id": "zm_yunjian", "name": "云健", "gender": "male", "lang": "zh"},
+        {"id": "af_heart", "name": "Heart", "gender": "female", "lang": "en"},
+        {"id": "af_bella", "name": "Bella", "gender": "female", "lang": "en"},
+        {"id": "am_michael", "name": "Michael", "gender": "male", "lang": "en"},
+        {"id": "am_puck", "name": "Puck", "gender": "male", "lang": "en"}
+    ]
+    return jsonify({
+        'success': True,
+        'voices': voices
+    })
 if __name__ == '__main__':
-    # Pre-load the default PDF at startup
-    default_pdf_path = './VLA4RM-仿生智能.pdf'
+    # Set the default PDF path
+    default_pdf_path = './public/pdf/test.pdf'
     if os.path.exists(default_pdf_path):
-        logger.info(f"Pre-loading default PDF: {default_pdf_path}")
-        result = extract_pdf_text(default_pdf_path)
-        if result["success"]:
-            pdf_content["full_text"] = result["full_text"]
-            pdf_content["pages"] = result["pages"]
-
-            # Generate explanations for all pages asynchronously
-            async def process_explanations():
-                explanations = await generate_explanations_for_all_pages(
-                    result["full_text"],
-                    result["pages"]
-                )
-                pdf_content["explanations"] = explanations
-                logger.info(f"Generated explanations for all {len(explanations)} pages")
-
-            # Kick off the async task
-            asyncio.run(process_explanations())
+        current_pdf_path = default_pdf_path
+        logger.info(f"默认PDF已设置: {default_pdf_path}")

     app.run(host='0.0.0.0', port=port, debug=True)
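Beyond /api/explain_with_audio, the commit adds two smaller routes: /api/voices (a static voice catalogue) and /api/tts (a thin proxy over the external TTS service). A quick sketch of how they compose, assuming the Flask app is reachable at localhost:5000 (the address is an assumption, not part of the commit):

import base64
import requests

API_HOST = "localhost:5000"  # assumption: wherever server.py is running

# List the voices the server advertises
voices = requests.get(f"http://{API_HOST}/api/voices").json()["voices"]
print([v["id"] for v in voices])

# Synthesize a short phrase through the proxying /api/tts route
resp = requests.post(
    f"http://{API_HOST}/api/tts",
    json={"text": "你好", "voice": "zm_yunxi", "speed": 1.2},
).json()
if resp["success"]:
    with open("hello.wav", "wb") as f:
        f.write(base64.b64decode(resp["audio_base64"]))
else:
    print("TTS failed:", resp["error"])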

tts_test.py (new file)

@@ -0,0 +1,118 @@
+import requests
+import base64
+import json
+import io
+from IPython.display import Audio
+import soundfile as sf
+
+# Base URL of the TTS FastAPI service
+BASE_URL = "http://feng-arch.cn:31006"
+
+def tts_file_response(text, voice="af_heart", speed=1.0, lang_code="z"):
+    """
+    Send a request and receive the audio file directly
+    """
+    url = f"{BASE_URL}/tts"
+    payload = {
+        "text": text,
+        "voice": voice,
+        "speed": speed,
+        "return_type": "file"
+    }
+
+    response = requests.post(url, json=payload)
+
+    if response.status_code == 200:
+        # Save the audio file
+        with open("received_audio.wav", "wb") as f:
+            f.write(response.content)
+        print("音频已保存为 received_audio.wav")
+
+        # When running in a Jupyter notebook, this plays the audio inline
+        return Audio(data=response.content, rate=24000)
+    else:
+        print(f"错误: {response.status_code}")
+        print(response.text)
+        return None
+
+def tts_base64_response(text, voice="af_heart", speed=1.0, lang_code="z"):
+    """
+    Send a request and receive base64-encoded audio data
+    """
+    url = f"{BASE_URL}/tts"
+    payload = {
+        "text": text,
+        "voice": voice,
+        "speed": speed,
+        "return_type": "base64"
+    }
+
+    response = requests.post(url, json=payload)
+
+    if response.status_code == 200:
+        data = response.json()
+        # Get the base64-encoded audio
+        audio_base64 = data.get("audio_base64")
+        # Decode the base64 data
+        audio_data = base64.b64decode(audio_base64)
+
+        # Save the audio file
+        with open("received_audio.wav", "wb") as f:
+            f.write(audio_data)
+        print("音频已保存为 received_audio.wav")
+
+        # When running in a Jupyter notebook, this plays the audio inline
+        return Audio(data=audio_data, rate=24000)
+    else:
+        print(f"错误: {response.status_code}")
+        print(response.text)
+        return None
def get_available_voices(lang_code="z"):
"""
获取指定语言的可用声音列表
Name Traits Target Quality Training Duration Overall Grade SHA256
af_heart 🚺 A 0ab5709b
af_alloy 🚺 B MM minutes C 6d877149
af_aoede 🚺 B H hours C+ c03bd1a4
af_bella 🚺🔥 A HH hours A- 8cb64e02
af_jessica 🚺 C MM minutes D cdfdccb8
af_kore 🚺 B H hours C+ 8bfbc512
af_nicole 🚺🎧 B HH hours B- c5561808
af_nova 🚺 B MM minutes C e0233676
af_river 🚺 C MM minutes D e149459b
af_sarah 🚺 B H hours C+ 49bd364e
af_sky 🚺 B M minutes 🤏 C- c799548a
am_adam 🚹 D H hours F+ ced7e284
am_echo 🚹 C MM minutes D 8bcfdc85
am_eric 🚹 C MM minutes D ada66f0e
am_fenrir 🚹 B H hours C+ 98e507ec
am_liam 🚹 C MM minutes D c8255075
am_michael 🚹 B H hours C+ 9a443b79
am_onyx 🚹 C MM minutes D e8452be1
am_puck 🚹 B H hours C+ dd1d8973
am_santa 🚹 C M minutes 🤏 D- 7f2f7582
Name Traits Target Quality Training Duration Overall Grade SHA256
zf_xiaobei 🚺 C MM minutes D 9b76be63
zf_xiaoni 🚺 C MM minutes D 95b49f16
zf_xiaoxiao 🚺 C MM minutes D cfaf6f2d
zf_xiaoyi 🚺 C MM minutes D b5235dba
zm_yunjian 🚹 C MM minutes D 76cbf8ba
zm_yunxi 🚹 C MM minutes D dbe6e1ce
zm_yunxia 🚹 C MM minutes D bb2b03b0
zm_yunyang 🚹 C MM minutes D 5238ac22
"""
+# Example usage
+if __name__ == "__main__":
+    text = "你能解决什么问题"
+
+    # Fetch the audio as a file
+    audio = tts_file_response(text, voice="zf_xiaoxiao")
+
+    # Or fetch base64-encoded audio
+    # audio = tts_base64_response(text)
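tts_test.py imports soundfile but never calls it. One plausible use, sketched here under the script's own 24 kHz assumption, is a quick sanity check of the saved WAV:

import soundfile as sf

# Inspect the clip written by tts_file_response / tts_base64_response
audio, rate = sf.read("received_audio.wav")
print(f"{len(audio) / rate:.2f} s of audio at {rate} Hz")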