Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions main/xiaozhi-server/config/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,11 @@ def formatter(record):
return record["message"]


def get_logger(tag):
    """Return the module-level logger bound to ``tag``.

    Intended as a way to obtain a pre-configured logger while avoiding
    circular imports: only ``logger.bind`` is called here and the project
    config is not consulted.

    Args:
        tag: value attached to the returned logger under the ``tag`` key.

    Returns:
        The result of ``logger.bind(tag=tag)``.
    """
    # Basic setup only; nothing here depends on config.
    tagged_logger = logger.bind(tag=tag)
    return tagged_logger

def setup_logging():
check_config_file()
"""从配置文件中读取日志配置,并设置日志输出格式和级别"""
Expand Down
107 changes: 107 additions & 0 deletions main/xiaozhi-server/core/providers/tts/local_cosyvoice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import os
import sys
import uuid
import requests
from config.logger import get_logger
from datetime import datetime
from core.providers.tts.base import TTSProviderBase
import torch
import torchaudio

# Tag used for this module's log records: the module's import name.
TAG = __name__
# Module-level logger obtained from get_logger with the tag above.
logger = get_logger(TAG)


class TTSProvider(TTSProviderBase):
    """Text-to-speech provider that runs a CosyVoice2 model in-process.

    Configuration keys read by ``__init__``:
        cosyvoice_path: directory inserted into ``sys.path`` while the
            ``cosyvoice`` package is imported.
        cosyvoice_model_dir: passed to ``CosyVoice2`` as its first argument.
        matcha_tts_path: optional; defaults to
            ``<cosyvoice_path>/third_party/Matcha-TTS``.
        prompt_speech_16k: optional path of the prompt audio; defaults to
            ``<cosyvoice_path>/asset/zero_shot_prompt.wav``.
        prompt_speech_16k_text: optional text for the prompt audio. When it
            is set, ``inference_zero_shot`` is used; otherwise
            ``inference_cross_lingual`` is used.
    """

    def _initialize_model(self):
        """Import CosyVoice, build the model and load the prompt audio.

        ``sys.path`` is extended only while the imports and model construction
        run and is restored afterwards, whether or not they succeed.

        Returns:
            True on success.

        Raises:
            ImportError: if a CosyVoice module cannot be imported. The
                original error is chained as ``__cause__``.
        """
        # Snapshot taken before the try block so the finally clause can
        # always restore it.
        original_path = sys.path.copy()
        try:
            # Temporarily make the CosyVoice and Matcha-TTS sources importable.
            sys.path.insert(0, self.matcha_tts_path)
            sys.path.insert(0, self.cosy_voice_path)

            # Import the required modules.
            from cosyvoice.cli.cosyvoice import CosyVoice2
            from cosyvoice.utils.file_utils import load_wav

            # Build the model.
            self.model = CosyVoice2(
                self.cosy_voice_model_dir,
                load_jit=False, load_trt=False, fp16=False)

            # Keep the imported class around for later use.
            self.CosyVoice2 = CosyVoice2
            # From here on this attribute holds what load_wav returned for
            # the configured path, not the path itself.
            self.prompt_speech_16k = load_wav(self.prompt_speech_16k, 16000)

            return True
        except ImportError as e:
            logger.bind(tag=TAG).error(f"导入 CosyVoice 模块失败: {e}")
            # Chain the original error so its traceback is not lost.
            raise ImportError(f"导入 CosyVoice 模块失败: {e}") from e
        finally:
            # Restore in place so every existing reference to the sys.path
            # list sees the original entries again.
            sys.path[:] = original_path

    def __init__(self, config, delete_audio_file):
        """Read the CosyVoice settings from ``config`` and load the model.

        Args:
            config: mapping with the keys described in the class docstring.
            delete_audio_file: forwarded unchanged to the base class.

        Raises:
            ImportError: propagated from ``_initialize_model``.
        """
        super().__init__(config, delete_audio_file)
        self.cosy_voice_path = config.get("cosyvoice_path")
        self.cosy_voice_model_dir = config.get("cosyvoice_model_dir")
        self.matcha_tts_path = (
            config.get("matcha_tts_path")
            or f"{self.cosy_voice_path}/third_party/Matcha-TTS")

        self.prompt_speech_16k = (
            config.get("prompt_speech_16k")
            or f"{self.cosy_voice_path}/asset/zero_shot_prompt.wav")
        # Optional: when absent (or empty) it is stored as None and
        # text_to_speak falls back to cross-lingual inference.
        self.prompt_speech_16k_text = config.get("prompt_speech_16k_text") or None

        self._initialize_model()

    def inference_to_single_file(self, inference_func, output_path, *args, **kwargs):
        """Run inference and save all produced segments as one audio file.

        Args:
            inference_func: inference callable (for example
                ``self.model.inference_zero_shot``); it is iterated and each
                item must provide a ``'tts_speech'`` entry.
            output_path: path of the file to write.
            *args, **kwargs: forwarded to ``inference_func``.

        Returns:
            The concatenated speech tensor, or None when ``inference_func``
            yielded nothing (in which case no file is written).
        """
        speech_segments = [
            segment['tts_speech']
            for segment in inference_func(*args, **kwargs)
        ]
        if not speech_segments:
            return None
        # Segments are joined along dim 1.
        combined_speech = torch.cat(speech_segments, dim=1)
        torchaudio.save(output_path, combined_speech, self.model.sample_rate)
        return combined_speech

    def generate_filename(self):
        """Return a fresh output path built from today's date and a random UUID."""
        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}.{self.format}")

    async def text_to_speak(self, text, output_file):
        """Synthesize ``text`` into ``output_file``.

        Zero-shot inference is used when a prompt text is configured,
        cross-lingual inference otherwise. Failures are logged and not
        re-raised; the inference itself runs synchronously inside this
        coroutine.

        Args:
            text: text to synthesize.
            output_file: path of the audio file to write.
        """
        try:
            if self.prompt_speech_16k_text:
                speech = self.inference_to_single_file(
                    self.model.inference_zero_shot,
                    output_file,
                    text,
                    self.prompt_speech_16k_text,
                    self.prompt_speech_16k,
                    stream=False
                )
            else:
                speech = self.inference_to_single_file(
                    self.model.inference_cross_lingual,
                    output_file,
                    text,
                    self.prompt_speech_16k,
                    stream=False
                )
            if speech is None:
                # No segment was produced, so no file was written; make that
                # visible instead of returning silently.
                logger.bind(tag=TAG).error(f"CosyVoice TTS未生成任何音频: {output_file}")
        except Exception as e:
            logger.bind(tag=TAG).exception(f"CosyVoice TTS请求失败: {e}")
4 changes: 2 additions & 2 deletions main/xiaozhi-server/core/utils/tts.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import os
import re
import sys
from config.logger import setup_logging
from config.logger import get_logger
import importlib

logger = setup_logging()
logger = get_logger(__name__)


def create_instance(class_name, *args, **kwargs):
Expand Down
16 changes: 13 additions & 3 deletions main/xiaozhi-server/models/SenseVoiceSmall/demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,26 @@
hub="hf",
)

res0 = model.generate(
input=f"{model.model_path}/example/en.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
merge_vad=True, #
merge_length_s=15,
)

# en
res = model.generate(
input=f"{model.model_path}/example/en.mp3",
input=f"{model.model_path}/example/zh.mp3",
cache={},
language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
use_itn=True,
batch_size_s=60,
merge_vad=True, #
merge_length_s=15,
)
text = rich_transcription_postprocess(res[0]["text"])
print(text)
print(rich_transcription_postprocess(res[0]["text"]))
print(rich_transcription_postprocess(res0[0]["text"]))

2 changes: 1 addition & 1 deletion main/xiaozhi-server/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
pyyml==0.0.2
torch==2.2.2
torchaudio==2.2.2
silero_vad==5.1.2
websockets==14.2
opuslib_next==1.1.2
numpy==1.26.4
pydub==0.25.1
funasr==1.2.3
torchaudio==2.2.2
openai==1.61.0
google-generativeai==0.8.4
edge_tts==7.0.0
Expand Down
14 changes: 14 additions & 0 deletions main/xiaozhi-server/test/unit/tts/cosyvoice_requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
HyperPyYAML==1.2.2
openai-whisper==20231117
inflect==7.3.1
transformers==4.40.1
conformer==0.3.2
diffusers==0.29.0
lightning==2.2.4
rich==13.7.1
gdown==5.1.0
matplotlib==3.7.5
wget==3.2
pyarrow==19.0.1
pyworld==0.3.4
onnxruntime-gpu==1.21.0
81 changes: 81 additions & 0 deletions main/xiaozhi-server/test/unit/tts/local_cosyvoice_real.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
import os
import unittest
import time
from datetime import datetime
import uuid
import wave
import torchaudio
from core.providers.tts.local_cosyvoice import TTSProvider


class TestRealTTSGeneration(unittest.TestCase):
    """Integration test that generates real speech with CosyVoice.

    It needs a working CosyVoice checkout and model. Locations can be
    overridden through environment variables; when a variable is unset the
    previously hard-coded value is used:

        COSYVOICE_PATH             CosyVoice checkout directory
        COSYVOICE_MODEL_DIR        model directory
                                   (default: <path>/pretrained_models/CosyVoice2-0.5B)
        COSYVOICE_PROMPT_WAV       prompt audio
                                   (default: <path>/asset/zero_shot_prompt.wav)
        COSYVOICE_TEST_OUTPUT_DIR  output directory (default: /tmp)

    The test is skipped when the checkout directory does not exist.
    """

    def setUp(self):
        """Build the provider config and skip when CosyVoice is unavailable."""
        cosyvoice_path = os.environ.get(
            "COSYVOICE_PATH",
            "/home/shangjun/xt_workspace/python_workspace/CosyVoice")
        self.config = {
            "output_dir": os.environ.get("COSYVOICE_TEST_OUTPUT_DIR", "/tmp"),
            "cosyvoice_path": cosyvoice_path,
            "cosyvoice_model_dir": os.environ.get(
                "COSYVOICE_MODEL_DIR",
                f"{cosyvoice_path}/pretrained_models/CosyVoice2-0.5B"),
            "prompt_speech_16k": os.environ.get(
                "COSYVOICE_PROMPT_WAV",
                f"{cosyvoice_path}/asset/zero_shot_prompt.wav"),
            "prompt_speech_16k_text": "希望你以后能够做的比我还好呦。"
        }
        # Without a real checkout the provider cannot be constructed, so
        # report a skip rather than an error.
        if not os.path.isdir(cosyvoice_path):
            self.skipTest(f"CosyVoice 目录不存在,跳过集成测试: {cosyvoice_path}")
        # Make sure the output directory exists.
        os.makedirs(self.config["output_dir"], exist_ok=True)

    def test_real_tts_generation(self):
        """Generate a speech file with the real model and check its properties."""
        # Create the provider; generated files are kept (delete flag False).
        provider = TTSProvider(self.config, False)
        provider.format = "wav"

        # Text to synthesize.
        test_text = "王骀受了刖刑,被砍去了一只脚。孔子有个弟子叫常季,他见老师时提出了自己的疑问。他说:老师你看,王骀被砍去了一只脚,可是他的学识和品行好像都超过了先生您,至于跟平常人相比,好像水平就更高了。像他这样的人,运用心智是怎样的与众不同呢?孔子的学生觉得很是奇怪,这个人一只脚被砍掉了,但是他的名声却很大,很多人都喜欢跟他学习,这个学生感到很不理解,一见到老师就向老师提出自己心中的疑问。文中庄子又是借孔子之口,表达了自己这样的观点:说死和生都是人生中的大事,可是死和生都不能使王骀这样的人随之变化,你说王骀是个什么样的人呢?即使天翻过来地坠下去,他也不会因此而被毁灭,他通晓无所依凭的道理,当然也就不随物变迁,而是听任事物的变化而信守自己的宗本。孔子的这段话把常季给说晕了,他忍不住再问:老师您这些话是什么意思啊?孔子怎么回答的呢?这段话很重要,来看一下完整的译文:孔子说:“从事物千差万别的一面去看,邻近的肝胆虽处于一体之中,也像是楚国和越国那样相距甚远;如果从事物相同的一面来看,万事万物又都是同一的,没有差别的。像王骀这样的人,耳朵和眼睛最适宜何种声音和色彩这样的事,已经不在他考虑范围之内了。他让自己的心思自由自在地遨游在忘形、忘情的浑同境域之中,就把这些东西的差别都忘掉了。所以他看待自己丧失了一只脚这件事,就像是看待失落的土块一样。”学了前面《庄子》的几篇文章,这段话的观点我们已不陌生。另外,有没有觉得这段话的句式很熟悉?中学时我们就学过苏东坡的《前赤壁赋》,其中就有这样的句式:“自其变者而观之,则天地曾不能以一瞬;自其不变者而观之,则物与我皆无尽也。”可以说,东坡不仅化用了庄子的句式,而且思想也和庄子是一样的。"

        # Run text-to-speech and time it.
        start_time = time.time()
        result_file = provider.to_tts(test_text)
        end_time = time.time()

        # Report timing and output location.
        print(f"语音生成耗时: {end_time - start_time:.2f}秒")
        print(f"生成的文件路径: {result_file}")

        # The file must exist and be a WAV file.
        self.assertTrue(os.path.exists(result_file), "语音文件未成功生成")
        self.assertTrue(result_file.endswith(".wav"), "生成的不是WAV文件")

        # Inspect the audio metadata. Errors are left to propagate so the
        # real traceback / assertion message is reported by unittest.
        audio_info = torchaudio.info(result_file)
        print(f"采样率: {audio_info.sample_rate}Hz")
        print(f"声道数: {audio_info.num_channels}")
        print(f"音频长度: {audio_info.num_frames / audio_info.sample_rate:.2f}秒")

        # Load the audio and check its basic properties.
        waveform, sample_rate = torchaudio.load(result_file)
        self.assertEqual(sample_rate, 24000, "采样率应为24kHz")
        self.assertGreater(waveform.size(0), 0, "音频数据不应为空")
        self.assertGreater(waveform.size(1), 0, "音频长度不应为0")

        print(f"音频形状: {waveform.shape}")
        print(f"最大值: {waveform.max().item():.4f}, 最小值: {waveform.min().item():.4f}")

    def tearDown(self):
        """Leave generated files in place so they can be inspected by hand."""
        # NOTE: never remove the whole output directory here; it defaults to
        # /tmp, which is shared with other programs.
        pass
Loading