feat: 功能优化

This commit is contained in:
2025-11-19 21:57:16 +08:00
parent f052b0af65
commit 75abf48bc1
11 changed files with 818 additions and 164 deletions

View File

@@ -3,9 +3,15 @@ package cn.iocoder.yudao.module.tik.voice.client;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.audio.ttsv2.enrollment.Voice;
import com.alibaba.dashscope.audio.ttsv2.enrollment.VoiceEnrollmentService;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
@@ -17,6 +23,7 @@ import okhttp3.RequestBody;
import okhttp3.Response;
import org.springframework.stereotype.Component;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Base64;
@@ -53,28 +60,130 @@ public class CosyVoiceClient {
if (request == null || StrUtil.isBlank(request.getText())) {
throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空");
}
if (StrUtil.isBlank(request.getVoiceId())) {
throw exception0(VOICE_TTS_FAILED.getCode(), "必须提供 voiceId");
}
SpeechSynthesizer synthesizer = null;
try {
String payload = objectMapper.writeValueAsString(buildPayload(request));
Request httpRequest = new Request.Builder()
.url(properties.getTtsUrl())
.addHeader("Authorization", "Bearer " + properties.getApiKey())
.addHeader("Content-Type", "application/json")
.post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}]",
request.getVoiceId(),
request.getText().length(),
StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()));
// 使用 DashScope SDK 构建参数(严格按文档)
SpeechSynthesisParam param = SpeechSynthesisParam.builder()
.apiKey(properties.getApiKey())
.model(StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()))
.voice(request.getVoiceId())
.build();
try (Response response = getHttpClient().newCall(httpRequest).execute()) {
String body = response.body() != null ? response.body().string() : "";
if (!response.isSuccessful()) {
log.error("[CosyVoice][TTS失败][status={}, body={}]", response.code(), body);
throw buildException(body);
}
return parseTtsResult(body, request);
// 初始化合成器(同步调用传 null
synthesizer = new SpeechSynthesizer(param, null);
// 阻塞调用,获取完整音频
ByteBuffer audioData = synthesizer.call(request.getText());
if (audioData == null) {
throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频数据");
}
// 转换为字节数组(严格按照文档:直接使用 array()
byte[] audioBytes = audioData.array();
log.info("[CosyVoice][TTS合成成功][Request ID: {}, audioSize={}, 首包延迟={}ms]",
synthesizer.getLastRequestId(),
audioBytes.length,
synthesizer.getFirstPackageDelay());
// 构建返回结果
CosyVoiceTtsResult result = new CosyVoiceTtsResult();
result.setAudio(audioBytes);
result.setFormat(request.getAudioFormat() != null ? request.getAudioFormat() : properties.getAudioFormat());
result.setSampleRate(request.getSampleRate() != null ? request.getSampleRate() : properties.getSampleRate());
result.setRequestId(synthesizer.getLastRequestId());
result.setVoiceId(request.getVoiceId());
return result;
} catch (ServiceException ex) {
throw ex;
} catch (Exception ex) {
log.error("[CosyVoice][TTS异常]", ex);
log.error("[CosyVoice][TTS异常][voiceId={}, text={}]", request.getVoiceId(), request.getText(), ex);
throw exception(VOICE_TTS_FAILED);
} finally {
// 关闭 WebSocket 连接
if (synthesizer != null) {
try {
synthesizer.getDuplexApi().close(1000, "任务结束");
} catch (Exception e) {
log.warn("[CosyVoice][关闭连接失败]", e);
}
}
}
}
/**
 * Fallback TTS path: posts the serialized request to the configured HTTP endpoint
 * instead of going through the SDK.
 *
 * @param request the TTS request; it is serialized via {@code buildPayload} and also
 *                handed to {@code parseTtsResult} together with the response body
 * @return the result produced by {@code parseTtsResult} from the response body
 * @throws Exception if serialization or the HTTP call fails, or the exception built by
 *                   {@code buildException} when the response status is not successful
 */
private CosyVoiceTtsResult synthesizeViaHttp(CosyVoiceTtsRequest request) throws Exception {
    String json = objectMapper.writeValueAsString(buildPayload(request));

    Request.Builder builder = new Request.Builder();
    builder.url(properties.getTtsUrl());
    builder.addHeader("Authorization", "Bearer " + properties.getApiKey());
    builder.addHeader("Content-Type", "application/json");
    builder.post(RequestBody.create(json.getBytes(StandardCharsets.UTF_8), JSON));
    Request outbound = builder.build();

    try (Response reply = getHttpClient().newCall(outbound).execute()) {
        // A response without a body is treated as an empty string.
        String text = "";
        if (reply.body() != null) {
            text = reply.body().string();
        }
        if (reply.isSuccessful()) {
            return parseTtsResult(text, request);
        }
        log.error("[CosyVoice][TTS失败][status={}, body={}]", reply.code(), text);
        throw buildException(text);
    }
}
/**
 * Calls the CosyVoice voice-cloning (voice enrollment) API through the DashScope SDK.
 *
 * @param request clone parameters; {@code url}, {@code targetModel} and {@code prefix}
 *                must all be non-blank
 * @return a result carrying the voice id and the request id reported by the SDK
 * @throws ServiceException if the client is not enabled, a required field is blank,
 *                          the SDK returns no voice id, or the SDK call fails
 */
public CosyVoiceCloneResult cloneVoice(CosyVoiceCloneRequest request) {
    if (!properties.isEnabled()) {
        throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
    }
    if (request == null || StrUtil.isBlank(request.getUrl())) {
        throw exception0(VOICE_TTS_FAILED.getCode(), "复刻音频URL不能为空");
    }
    // request is known to be non-null from here on, so the null test is not repeated.
    if (StrUtil.isBlank(request.getTargetModel())) {
        throw exception0(VOICE_TTS_FAILED.getCode(), "复刻模型不能为空");
    }
    if (StrUtil.isBlank(request.getPrefix())) {
        throw exception0(VOICE_TTS_FAILED.getCode(), "音色前缀不能为空");
    }
    // The audio URL may be a presigned URL whose query string carries a signature;
    // log only the part before '?' so the credential does not end up in the logs.
    String loggableUrl = StrUtil.subBefore(request.getUrl(), "?", false);
    try {
        log.info("[CosyVoice][开始语音复刻][targetModel={}, prefix={}, url={}]",
                request.getTargetModel(), request.getPrefix(), loggableUrl);
        // Create the cloned voice through the DashScope SDK.
        VoiceEnrollmentService service = new VoiceEnrollmentService(properties.getApiKey());
        Voice voice = service.createVoice(request.getTargetModel(), request.getPrefix(), request.getUrl());
        String requestId = service.getLastRequestId();
        // Fail with an explicit message instead of an NPE (or a blank id handed to the caller)
        // when the SDK does not give back a usable voice id.
        if (voice == null || StrUtil.isBlank(voice.getVoiceId())) {
            log.error("[CosyVoice][语音复刻失败][Request ID: {}, 未返回 voice_id]", requestId);
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 未返回 voice_id");
        }
        log.info("[CosyVoice][语音复刻成功][Request ID: {}, Voice ID: {}]",
                requestId, voice.getVoiceId());
        // Build the return value.
        CosyVoiceCloneResult result = new CosyVoiceCloneResult();
        result.setVoiceId(voice.getVoiceId());
        result.setRequestId(requestId);
        return result;
    } catch (ServiceException ex) {
        // Already a business exception (including the one thrown above); pass it through unchanged.
        throw ex;
    } catch (Exception ex) {
        log.error("[CosyVoice][语音复刻异常][targetModel={}, prefix={}]",
                request.getTargetModel(), request.getPrefix(), ex);
        throw exception(VOICE_TTS_FAILED);
    }
}

View File

@@ -0,0 +1,36 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * Request parameters for CosyVoice voice cloning (voice enrollment).
 */
@Data
public class CosyVoiceCloneRequest {
/**
 * Target model to clone against: cosyvoice-v1 or cosyvoice-v2.
 */
private String targetModel;
/**
 * Custom prefix for the voice (digits and lowercase letters only, fewer than 10 characters).
 * NOTE(review): this constraint is documented only; this class does not enforce it.
 */
private String prefix;
/**
 * Publicly reachable URL of the audio file.
 */
private String url;
/**
 * Sample rate; documented default is 24000.
 * NOTE(review): no default is assigned on this field — confirm where the default is applied.
 */
private Integer sampleRate;
/**
 * Audio format; documented default is wav.
 * NOTE(review): no default is assigned on this field — confirm where the default is applied.
 */
private String audioFormat;
}

View File

@@ -0,0 +1,21 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * Result of a CosyVoice voice-cloning call.
 */
@Data
public class CosyVoiceCloneResult {
/**
 * The generated voice_id.
 */
private String voiceId;
/**
 * Request ID of the cloning call.
 */
private String requestId;
}

View File

@@ -50,6 +50,11 @@ public class CosyVoiceProperties {
*/
private String ttsUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis";
/**
* 语音复刻接口地址(声音注册)
*/
private String voiceEnrollmentUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/voice-enrollment";
/**
* 连接超时时间
*/

View File

@@ -54,6 +54,10 @@ public class TikUserVoiceDO extends TenantBaseDO {
* 备注信息
*/
private String note;
/**
* Cloned voice ID (the voice_id generated by CosyVoice voice cloning)
*/
private String voiceId;
}

View File

@@ -20,6 +20,8 @@ import cn.iocoder.yudao.module.tik.file.service.TikUserFileService;
import cn.iocoder.yudao.module.tik.tikhup.service.TikHupService;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.tik.voice.client.CosyVoiceClient;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
@@ -138,20 +140,30 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.setTranscription(null); // 初始为空,表示未识别
voiceMapper.insert(voice);
// 4. 如果开启自动识别,异步执行识别(添加防重复检查)
if (Boolean.TRUE.equals(createReqVO.getAutoTranscribe())) {
// 再次检查是否已经有识别结果(防止并发重复创建)
TikUserVoiceDO checkVoice = voiceMapper.selectById(voice.getId());
if (StrUtil.isBlank(checkVoice.getTranscription())) {
String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
log.info("[createVoice][开启自动识别,配音编号({})文件ID({})预签名URL({})]",
voice.getId(), fileDO.getId(), fileAccessUrl);
asyncTranscribeVoice(voice.getId(), fileAccessUrl);
} else {
log.info("[createVoice][配音已经有识别结果,跳过自动识别,配音编号({})]", voice.getId());
}
// 4. 调用阿里云语音复刻服务,生成 voice_id
try {
log.info("[createVoice][开始语音复刻,配音编号({})文件ID({})]", voice.getId(), fileDO.getId());
String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
CosyVoiceCloneRequest cloneRequest = new CosyVoiceCloneRequest();
cloneRequest.setTargetModel("cosyvoice-v2"); // 使用v2模型效果更好
cloneRequest.setPrefix("voice" + voice.getId()); // 音色前缀,格式要求
cloneRequest.setUrl(fileAccessUrl);
CosyVoiceCloneResult cloneResult = cosyVoiceClient.cloneVoice(cloneRequest);
String voiceId = cloneResult.getVoiceId();
// 更新配音记录,保存 voice_id
voice.setVoiceId(voiceId);
voiceMapper.updateById(voice);
log.info("[createVoice][语音复刻成功,配音编号({})voice_id({})]", voice.getId(), voiceId);
} catch (Exception e) {
log.error("[createVoice][语音复刻失败,配音编号({}),错误信息: {}]", voice.getId(), e.getMessage(), e);
// 复刻失败不影响配音记录创建,只记录日志
}
log.info("[createVoice][用户({})创建配音成功,配音编号({})]", userId, voice.getId());
return voice.getId();
}
@@ -361,17 +373,25 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
}
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 优先使用复刻的 voice_id如果不存在则使用文件URL兼容旧数据
if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[synthesizeVoice][使用复刻音色ID合成配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId();
transcriptionText = voice.getTranscription();
} else {
log.info("[synthesizeVoice][使用文件URL合成配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
}
}
// 2. 如果没有配置ID使用voiceId或fileUrl系统音色或直接URL方式
@@ -512,21 +532,31 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
voiceConfigId, voice.getUserId(), userId);
throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
}
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
// 优先使用复刻的 voice_id如果不存在则使用文件URL兼容旧数据
if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[previewVoice][使用复刻音色ID试听配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId();
transcriptionText = voice.getTranscription();
inputText = StrUtil.blankToDefault(reqVO.getInputText(),
StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
} else {
log.info("[previewVoice][使用文件URL试听配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
inputText = StrUtil.blankToDefault(reqVO.getInputText(),
StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
inputText = StrUtil.blankToDefault(reqVO.getInputText(),
StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
}
// 3. 如果没有配置ID使用系统配音配置需要前端传voiceId
else {

View File

@@ -38,6 +38,9 @@ public class AppTikUserVoiceRespVO {
@Schema(description = "备注", example = "这是一个测试配音")
private String note;
@Schema(description = "复刻音色IDCosyVoice 语音复刻生成的 voice_id")
private String voiceId;
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
private LocalDateTime createTime;