【Python实战】搭建AI数字人对话系统:从语音识别到虚拟形象的全流程实现

张开发
2026/4/3 16:35:37 15 分钟阅读
【Python实战】搭建AI数字人对话系统:从语音识别到虚拟形象的全流程实现
## 一、项目背景

最近接了一个数字人项目,需求是做一个能对话的虚拟形象:用户说话,数字人回复,还要有口型动画。听起来简单,实际踩了不少坑:ASR接口从WebSocket改成HTTP(404报错搞了半天)、Wav2Lip的PyTorch DLL加载失败、C盘空间不足导致项目文件迁移到D盘……这篇文章记录完整实现过程,帮你避开这些坑。

## 二、系统架构

整个系统分为四个模块:

用户语音输入 → ASR识别 → LLM生成回复 → TTS语音合成 → Wav2Lip生成视频

## 三、核心代码实现

### 3.1 ASR语音识别模块(FunASR)

原来的WebSocket接口经常断开,改成HTTP接口后稳定多了。

```python
import os

import aiohttp
import asyncio


class ASRService:
    """语音识别服务 - HTTP接口版"""

    def __init__(self):
        self.http_url = "http://your-asr-server:31211/funasr/file"
        self.timeout = 30

    async def recognize_file(self, audio_path: str) -> str:
        """识别音频文件"""
        try:
            async with aiohttp.ClientSession() as session:
                with open(audio_path, "rb") as f:
                    data = aiohttp.FormData()
                    data.add_field("file", f, filename=os.path.basename(audio_path))
                    async with session.post(
                        self.http_url,
                        data=data,
                        timeout=aiohttp.ClientTimeout(total=self.timeout)
                    ) as response:
                        if response.status != 200:
                            print(f"ASR HTTP错误: {response.status}")
                            return ""
                        result = await response.json()
                        if result.get("status") == "success":
                            return result.get("text", "")
                        else:
                            print(f"ASR识别失败: {result}")
                            return ""
        except Exception as e:
            print(f"ASR识别异常: {e}")
            return ""
```

> 踩坑记录:接口路径是`/funasr/file`,不是`/api/funasr`。先用curl测试连通性,确认路径正确再写代码。

### 3.2 LLM大语言模型模块

用的是Qwen3 72B,通过HTTP接口调用。

```python
import json

import httpx


class LLMService:
    """大语言模型服务"""

    def __init__(self):
        self.api_url = "http://your-llm-server:1025/v1/chat/completions"
        self.model = "qwen3-72b"

    async def chat(self, message: str, history: list = None) -> str:
        """对话生成"""
        if history is None:
            history = []
        messages = [
            {"role": "system", "content": "你是一个 helpful 的AI助手,回答简洁明了。"}
        ]
        # 添加历史对话
        for h in history:
            messages.append({"role": "user", "content": h["user"]})
            messages.append({"role": "assistant", "content": h["assistant"]})
        messages.append({"role": "user", "content": message})
        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    self.api_url,
                    json={
                        "model": self.model,
                        "messages": messages,
                        "temperature": 0.7,
                        "max_tokens": 512
                    },
                    timeout=30.0
                )
                result = response.json()
                return result["choices"][0]["message"]["content"]
        except Exception as e:
            print(f"LLM调用失败: {e}")
            return "抱歉,我暂时无法回答。"
```

### 3.3 TTS语音合成模块

pyttsx3是离线方案,不需要联网,响应速度快。

```python
import os
import tempfile
import time

import pyttsx3


class TTSService:
    """语音合成服务"""

    def __init__(self):
        self.engine = pyttsx3.init()
        # 设置语速
        self.engine.setProperty("rate", 150)
        # 设置音量
        self.engine.setProperty("volume", 0.9)

    def text_to_speech(self, text: str, output_path: str = None) -> str:
        """文字转语音"""
        if output_path is None:
            # 创建临时文件
            temp_dir = tempfile.mkdtemp()
            output_path = os.path.join(temp_dir, f"tts_{int(time.time())}.wav")
        try:
            self.engine.save_to_file(text, output_path)
            self.engine.runAndWait()
            return output_path
        except Exception as e:
            print(f"TTS合成失败: {e}")
            return None
```

### 3.4 数字人视频生成模块(Wav2Lip)

根据语音生成口型动画。需要安装VC++运行库,否则PyTorch会报DLL错误。

```python
import os
import subprocess


class Wav2LipService:
    """Wav2Lip数字人服务"""

    def __init__(self, wav2lip_path: str):
        self.wav2lip_path = wav2lip_path
        self.checkpoint_path = os.path.join(wav2lip_path, "checkpoints/wav2lip_gan.pth")

    def generate_video(self, audio_path: str, face_image: str, output_path: str):
        """生成数字人视频"""
        try:
            # 构建命令
            cmd = [
                "python", os.path.join(self.wav2lip_path, "inference.py"),
                "--checkpoint_path", self.checkpoint_path,
                "--face", face_image,
                "--audio", audio_path,
                "--outfile", output_path
            ]
            # 执行生成
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                print(f"视频生成成功: {output_path}")
                return output_path
            else:
                print(f"视频生成失败: {result.stderr}")
                return None
        except Exception as e:
            print(f"Wav2Lip异常: {e}")
            return None
```

> 踩坑记录:Windows上运行Wav2Lip需要安装Visual C++ Redistributable,否则PyTorch会报DLL加载失败错误。

### 3.5 主程序整合

```python
import asyncio
import os
from datetime import datetime


class DigitalHumanSystem:
    """数字人对话系统主类"""

    def __init__(self):
        self.asr = ASRService()
        self.llm = LLMService()
        self.tts = TTSService()
        self.wav2lip = Wav2LipService("./Wav2Lip")
        self.history = []

    async def chat(self, audio_input: str, face_image: str) -> str:
        """完整对话流程"""
        print(f"[{datetime.now()}] 开始对话...")

        # 1. ASR识别
        print("1. 语音识别中...")
        text = await self.asr.recognize_file(audio_input)
        if not text:
            return None
        print(f"   识别结果: {text}")

        # 2. LLM生成回复
        print("2. 生成回复中...")
        reply = await self.llm.chat(text, self.history)
        print(f"   回复内容: {reply}")

        # 3. TTS语音合成
        print("3. 语音合成中...")
        audio_output = self.tts.text_to_speech(reply)
        if not audio_output:
            return None
        print(f"   音频文件: {audio_output}")

        # 4. Wav2Lip生成视频
        print("4. 生成数字人视频中...")
        video_output = f"output_{int(datetime.now().timestamp())}.mp4"
        video_path = self.wav2lip.generate_video(audio_output, face_image, video_output)

        # 保存对话历史
        self.history.append({"user": text, "assistant": reply})

        print(f"[{datetime.now()}] 对话完成!")
        return video_path


# 运行示例
async def main():
    system = DigitalHumanSystem()

    # 输入:用户语音 + 数字人底图
    audio_file = "user_input.wav"
    face_image = "avatar.png"

    # 执行对话
    result = await system.chat(audio_file, face_image)
    if result:
        print(f"生成视频: {result}")
    else:
        print("生成失败")


if __name__ == "__main__":
    asyncio.run(main())
```

## 四、项目部署

### 4.1 环境依赖

```bash
pip install aiohttp httpx pyttsx3
```

Wav2Lip需要额外安装:

```bash
pip install torch torchvision opencv-python librosa
```

### 4.2 目录结构

```
digital-human/
├── main.py              # 主程序
├── config.py            # 配置文件
├── asr_service.py       # ASR模块
├── llm_service.py       # LLM模块
├── tts_service.py       # TTS模块
├── wav2lip_service.py   # 数字人模块
├── Wav2Lip/             # Wav2Lip源码
│   ├── inference.py
│   └── checkpoints/
├── assets/              # 资源文件
│   └── avatar.png       # 数字人底图
└── output/              # 输出目录
```

### 4.3 配置文件

```python
# config.py

# ASR配置
ASR_CONFIG = {
    "http_url": "http://your-asr-server:31211/funasr/file",
    "timeout": 30
}

# LLM配置
LLM_CONFIG = {
    "api_url": "http://your-llm-server:1025/v1/chat/completions",
    "model": "qwen3-72b"
}

# Wav2Lip配置
WAV2LIP_CONFIG = {
    "path": "./Wav2Lip",
    "checkpoint": "./Wav2Lip/checkpoints/wav2lip_gan.pth"
}
```

## 五、踩坑总结

**坑1:ASR接口404**
- 现象:调用ASR接口返回404
- 原因:接口路径错误(正确路径是`/funasr/file`,不是`/api/funasr`)
- 解决:先用curl测试,确认正确路径再写代码

**坑2:Wav2Lip DLL错误**
- 现象:PyTorch报DLL加载失败
- 原因:Windows缺少VC++运行库
- 解决:安装Visual C++ Redistributable

**坑3:C盘空间不足**
- 现象:项目运行报错磁盘空间不足
- 解决:将项目迁移到D盘,创建符号链接

**坑4:WebSocket连接断开**
- 现象:ASR WebSocket经常断开
- 解决:改用HTTP接口,稳定性更好

## 六、效果展示

系统运行流程:

1. 用户说话 → 2. ASR识别文字 → 3. LLM生成回复 → 4. TTS合成语音 → 5. Wav2Lip生成视频

整个流程约3-5秒,数字人能够实时对话。

## 七、后续优化方向

- 实时性优化:改用流式ASR,减少延迟
- 多模态:加入表情识别,数字人更生动
- 个性化:支持自定义数字人形象
- 部署优化:Docker容器化部署

## 八、源码获取

完整代码已上传到GitHub,包含:

- 完整Python代码
- Wav2Lip配置文件
- 部署脚本

更多文章