语音优化

This commit is contained in:
2026-02-25 16:28:31 +08:00
parent 214c1f0f37
commit 0efca50be3
39 changed files with 237 additions and 1093 deletions

View File

@@ -1,355 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.audio.ttsv2.enrollment.Voice;
import com.alibaba.dashscope.audio.ttsv2.enrollment.VoiceEnrollmentService;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.springframework.stereotype.Component;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception0;
import static cn.iocoder.yudao.module.tik.enums.ErrorCodeConstants.VOICE_TTS_FAILED;
/**
 * CosyVoice client.
 *
 * <p>Wraps the Alibaba DashScope CosyVoice SDK for text-to-speech (TTS) and
 * voice cloning (voice enrollment). A plain-HTTP fallback for the TTS REST
 * endpoint is also kept ({@code synthesizeViaHttp}); it is currently
 * unreferenced within this class.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class CosyVoiceClient {

    private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");

    private final CosyVoiceProviderConfig config;
    private final ObjectMapper objectMapper;

    // Lazily created client for the HTTP fallback path; guarded by
    // double-checked locking in getHttpClient().
    private volatile OkHttpClient httpClient;

    /**
     * Calls the CosyVoice TTS endpoint via the DashScope SDK.
     *
     * @param request TTS request; {@code text} and {@code voiceId} are required
     * @return synthesized audio plus format / sample-rate metadata
     * @throws cn.iocoder.yudao.framework.common.exception.ServiceException
     *         with {@code VOICE_TTS_FAILED} when disabled, the request is invalid,
     *         or the SDK call fails
     */
    public CosyVoiceTtsResult synthesize(CosyVoiceTtsRequest request) {
        if (!config.isEnabled()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
        }
        // Parameter trace for troubleshooting. Logged at DEBUG (was ERROR,
        // which polluted error monitoring with routine traffic).
        String text = request != null ? request.getText() : null;
        log.debug("[CosyVoice][TTS参数检查][request={}, text={}, voiceId={}, model={}]",
                request != null ? "存在" : "为null",
                text != null ? "'" + text + "' (长度:" + text.length() + ")" : "为null",
                request != null ? request.getVoiceId() : null,
                request != null ? request.getModel() : null);
        if (request == null || StrUtil.isBlank(request.getText())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空");
        }
        if (StrUtil.isBlank(request.getVoiceId())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "必须提供 voiceId");
        }
        SpeechSynthesizer synthesizer = null;
        try {
            log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}, speechRate={}, instruction={}]",
                    request.getVoiceId(),
                    request.getText().length(),
                    StrUtil.blankToDefault(request.getModel(), config.getDefaultModel()),
                    request.getSpeechRate(),
                    request.getInstruction());
            // Build SDK parameters strictly per the DashScope docs;
            // speechRate and volume must be passed as int.
            SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                    .apiKey(config.getApiKey())
                    .model(StrUtil.blankToDefault(request.getModel(), config.getDefaultModel()))
                    .voice(request.getVoiceId())
                    .speechRate(request.getSpeechRate() != null ? request.getSpeechRate().intValue() : 1)
                    .volume(request.getVolume() != null ? request.getVolume().intValue() : 0)
                    .build();
            if (StrUtil.isNotBlank(request.getInstruction())) {
                param.setInstruction(request.getInstruction());
            }
            // DEBUG (was ERROR): SDK parameter dump is diagnostic output only.
            log.debug("[CosyVoice][SDK参数][param={}, text='{}']", param, request.getText());
            // A null callback selects the synchronous (blocking) call mode.
            synthesizer = new SpeechSynthesizer(param, null);
            // Blocking call; returns the complete audio buffer.
            ByteBuffer audioData = synthesizer.call(request.getText());
            if (audioData == null) {
                throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频数据");
            }
            // Copy only the readable region: ByteBuffer.array() exposes the whole
            // backing array (which may be larger than the payload) and throws for
            // read-only or direct buffers.
            byte[] audioBytes = new byte[audioData.remaining()];
            audioData.get(audioBytes);
            log.info("[CosyVoice][TTS合成成功][Request ID: {}, audioSize={}, 首包延迟={}ms]",
                    synthesizer.getLastRequestId(),
                    audioBytes.length,
                    synthesizer.getFirstPackageDelay());
            // Assemble the result; format / sample rate fall back to the configured defaults.
            CosyVoiceTtsResult result = new CosyVoiceTtsResult();
            result.setAudio(audioBytes);
            result.setFormat(request.getAudioFormat() != null ? request.getAudioFormat() : config.getAudioFormat());
            result.setSampleRate(request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate());
            result.setRequestId(synthesizer.getLastRequestId());
            result.setVoiceId(request.getVoiceId());
            return result;
        } catch (ServiceException ex) {
            throw ex;
        } catch (Exception ex) {
            log.error("[CosyVoice][TTS异常][voiceId={}, text={}]", request.getVoiceId(), request.getText(), ex);
            throw exception(VOICE_TTS_FAILED);
        } finally {
            // Close the underlying WebSocket connection; never let cleanup mask the result.
            if (synthesizer != null) {
                try {
                    synthesizer.getDuplexApi().close(1000, "任务结束");
                } catch (Exception e) {
                    log.warn("[CosyVoice][关闭连接失败]", e);
                }
            }
        }
    }

    /**
     * Performs TTS via the plain HTTP API (fallback path, currently unused).
     *
     * @param request TTS request used to build the JSON payload
     * @return parsed TTS result
     * @throws Exception on transport or parsing failures
     */
    private CosyVoiceTtsResult synthesizeViaHttp(CosyVoiceTtsRequest request) throws Exception {
        String payload = objectMapper.writeValueAsString(buildPayload(request));
        Request httpRequest = new Request.Builder()
                .url(config.getTtsUrl())
                .addHeader("Authorization", "Bearer " + config.getApiKey())
                .addHeader("Content-Type", "application/json")
                .post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
                .build();
        try (Response response = getHttpClient().newCall(httpRequest).execute()) {
            String body = response.body() != null ? response.body().string() : "";
            if (!response.isSuccessful()) {
                log.error("[CosyVoice][TTS失败][status={}, body={}]", response.code(), body);
                throw buildException(body);
            }
            return parseTtsResult(body, request);
        }
    }

    /**
     * Calls the CosyVoice voice-cloning (voice enrollment) endpoint.
     *
     * @param request cloning request; {@code url}, {@code targetModel} and
     *                {@code prefix} are required
     * @return the newly enrolled voiceId and request id
     */
    public CosyVoiceCloneResult cloneVoice(CosyVoiceCloneRequest request) {
        if (!config.isEnabled()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
        }
        // Single null check up front; the original re-tested `request == null`
        // before every field check, which was unreachable after the first one.
        if (request == null || StrUtil.isBlank(request.getUrl())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻音频URL不能为空");
        }
        if (StrUtil.isBlank(request.getTargetModel())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻模型不能为空");
        }
        if (StrUtil.isBlank(request.getPrefix())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "音色前缀不能为空");
        }
        try {
            log.info("[CosyVoice][开始语音复刻][targetModel={}, prefix={}, url={}]",
                    request.getTargetModel(), request.getPrefix(), request.getUrl());
            // Create the voice enrollment via the DashScope SDK.
            VoiceEnrollmentService service = new VoiceEnrollmentService(config.getApiKey());
            Voice voice = service.createVoice(request.getTargetModel(), request.getPrefix(), request.getUrl());
            log.info("[CosyVoice][语音复刻成功][Request ID: {}, Voice ID: {}]",
                    service.getLastRequestId(), voice.getVoiceId());
            CosyVoiceCloneResult result = new CosyVoiceCloneResult();
            result.setVoiceId(voice.getVoiceId());
            result.setRequestId(service.getLastRequestId());
            return result;
        } catch (ServiceException ex) {
            throw ex;
        } catch (Exception ex) {
            log.error("[CosyVoice][语音复刻异常][targetModel={}, prefix={}]",
                    request.getTargetModel(), request.getPrefix(), ex);
            throw exception(VOICE_TTS_FAILED);
        }
    }

    /**
     * Builds the JSON payload for the HTTP TTS API.
     *
     * <p>Selection rule: a non-blank {@code fileUrl} switches to voice-clone mode
     * ({@code audio_url} + optional {@code reference_text}); otherwise the system
     * voice ({@code voice}) is used, falling back to the configured default.
     */
    private Map<String, Object> buildPayload(CosyVoiceTtsRequest request) {
        Map<String, Object> payload = new HashMap<>();
        String model = StrUtil.blankToDefault(request.getModel(), config.getDefaultModel());
        payload.put("model", model);
        Map<String, Object> input = new HashMap<>();
        input.put("text", request.getText());
        if (StrUtil.isNotBlank(request.getFileUrl())) {
            // Pass the pre-signed URL as-is (signature + expiry); the API requires the signed form.
            input.put("audio_url", request.getFileUrl());
            log.info("[CosyVoice][使用语音克隆][audio_url={}]", request.getFileUrl());
            // Reference text improves clone quality when available.
            if (StrUtil.isNotBlank(request.getReferenceText())) {
                input.put("reference_text", request.getReferenceText());
                log.info("[CosyVoice][添加参考文本][length={}]", request.getReferenceText().length());
            }
        } else {
            String voiceId = StrUtil.blankToDefault(request.getVoiceId(), config.getDefaultVoiceId());
            if (StrUtil.isNotBlank(voiceId)) {
                input.put("voice", voiceId);
                log.info("[CosyVoice][使用系统音色][voice={}]", voiceId);
            } else {
                log.warn("[CosyVoice][未提供voiceId或fileUrl]");
            }
        }
        payload.put("input", input);
        Map<String, Object> parameters = new HashMap<>();
        int sampleRate = request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate();
        parameters.put("sample_rate", sampleRate);
        // The official docs use lowercase format identifiers.
        String format = StrUtil.blankToDefault(request.getAudioFormat(), config.getAudioFormat()).toLowerCase();
        parameters.put("format", format);
        if (request.getSpeechRate() != null) {
            parameters.put("speech_rate", request.getSpeechRate());
        }
        if (request.getVolume() != null) {
            // Documented volume range is 0-100; round the float.
            parameters.put("volume", Math.round(request.getVolume()));
        }
        if (request.isPreview()) {
            parameters.put("preview", true);
        }
        payload.put("parameters", parameters);
        log.info("[CosyVoice][请求参数][model={}, sample_rate={}, format={}, text_length={}]",
                model, sampleRate, format, request.getText().length());
        return payload;
    }

    /**
     * Parses an HTTP TTS response body into a {@link CosyVoiceTtsResult}.
     *
     * @throws Exception when the body is not valid JSON
     */
    private CosyVoiceTtsResult parseTtsResult(String body, CosyVoiceTtsRequest request) throws Exception {
        JsonNode root = objectMapper.readTree(body);
        // Error responses carry a top-level "code" field.
        if (root.has("code")) {
            String message = root.has("message") ? root.get("message").asText() : body;
            log.error("[CosyVoice][TTS失败][code={}, message={}]", root.get("code").asText(), message);
            throw exception0(VOICE_TTS_FAILED.getCode(), message);
        }
        JsonNode audioNode = root.path("output").path("audio");
        if (!audioNode.isArray() || audioNode.isEmpty()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回的音频为空");
        }
        JsonNode firstAudio = audioNode.get(0);
        String content = firstAudio.path("content").asText();
        if (StrUtil.isBlank(content)) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频内容");
        }
        byte[] audioBytes = Base64.getDecoder().decode(content);
        CosyVoiceTtsResult result = new CosyVoiceTtsResult();
        result.setAudio(audioBytes);
        result.setFormat(firstAudio.path("format").asText(StrUtil.blankToDefault(request.getAudioFormat(), config.getAudioFormat())));
        result.setSampleRate(firstAudio.path("sample_rate").asInt(request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate()));
        result.setRequestId(root.path("request_id").asText());
        result.setVoiceId(firstAudio.path("voice").asText(request.getVoiceId()));
        return result;
    }

    /**
     * Returns the shared OkHttp client, creating it on first use
     * (double-checked locking over the volatile field).
     */
    private OkHttpClient getHttpClient() {
        if (httpClient == null) {
            synchronized (this) {
                if (httpClient == null) {
                    // Duration is already imported; no need for the fully-qualified name.
                    Duration connect = defaultDuration(config.getConnectTimeout(), 10);
                    Duration read = defaultDuration(config.getReadTimeout(), 60);
                    httpClient = new OkHttpClient.Builder()
                            .connectTimeout(connect.toMillis(), TimeUnit.MILLISECONDS)
                            .readTimeout(read.toMillis(), TimeUnit.MILLISECONDS)
                            .build();
                }
            }
        }
        return httpClient;
    }

    /** Returns {@code duration}, or a {@code seconds}-long default when null. */
    private Duration defaultDuration(Duration duration, long seconds) {
        return duration == null ? Duration.ofSeconds(seconds) : duration;
    }

    /**
     * Builds a ServiceException from an error response body.
     *
     * <p>Prefers the first non-blank of the top-level {@code message} and
     * {@code output.message}. The previous {@code CollUtil.getFirst} call
     * returned the first list element even when it was null, so the
     * {@code output.message} fallback was never reached.
     */
    private ServiceException buildException(String body) {
        try {
            JsonNode root = objectMapper.readTree(body);
            String message = StrUtil.blankToDefault(
                    root.path("message").asText(null),
                    root.path("output").path("message").asText(null));
            return exception0(VOICE_TTS_FAILED.getCode(), StrUtil.blankToDefault(message, "CosyVoice 调用失败"));
        } catch (Exception ignored) {
            // Body is not JSON; surface it verbatim.
            return exception0(VOICE_TTS_FAILED.getCode(), body);
        }
    }

    /**
     * Strips query parameters and fragment from a URL, keeping protocol, host
     * and path only. Currently unreferenced within this class.
     *
     * @param url possibly signed / parameterized URL
     * @return the bare URL, or the input when blank
     */
    private String extractRawUrl(String url) {
        if (StrUtil.isBlank(url)) {
            return url;
        }
        try {
            java.net.URL urlObj = new java.net.URL(url);
            return urlObj.getProtocol() + "://" + urlObj.getHost() + urlObj.getPath();
        } catch (Exception e) {
            // Malformed URL: fall back to naive truncation at '?' / '#'.
            return url.split("\\?")[0].split("#")[0];
        }
    }
}

View File

@@ -1,124 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig;
import cn.iocoder.yudao.module.tik.voice.config.VoiceProviderProperties;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
/**
 * CosyVoice Provider implementation.
 *
 * <p>{@link VoiceCloneProvider} adapter for the Alibaba Cloud CosyVoice speech
 * service. Delegates the actual API calls to {@link CosyVoiceClient} and maps
 * between the provider-neutral DTOs and the CosyVoice-specific ones.
 *
 * @author 芋道源码
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class CosyVoiceProvider implements VoiceCloneProvider {

    private final CosyVoiceClient cosyVoiceClient;
    private final VoiceProviderProperties voiceProviderProperties;

    /**
     * Resolves the CosyVoice-specific configuration; returns an empty default
     * config when none (or a non-CosyVoice config) is registered.
     * NOTE(review): not referenced by the methods in this class — confirm callers before removing.
     */
    private CosyVoiceProviderConfig getConfig() {
        var baseConfig = voiceProviderProperties.getProviderConfig("cosyvoice");
        if (baseConfig instanceof CosyVoiceProviderConfig config) {
            return config;
        }
        return new CosyVoiceProviderConfig();
    }

    /**
     * Clones a voice via CosyVoice voice enrollment.
     *
     * @param request provider-neutral clone request (audioUrl / model / prefix)
     * @return the enrolled voiceId and request id
     */
    @Override
    public VoiceCloneResult cloneVoice(VoiceCloneRequest request) {
        log.info("[CosyVoiceProvider][语音克隆][audioUrl={}, model={}]",
                request.getAudioUrl(), request.getModel());
        // Map the neutral request onto the CosyVoice-specific DTO.
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest cosyRequest =
                new cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest();
        cosyRequest.setUrl(request.getAudioUrl());
        cosyRequest.setTargetModel(request.getModel());
        cosyRequest.setPrefix(request.getPrefix());
        if (request.getSampleRate() != null) {
            cosyRequest.setSampleRate(request.getSampleRate());
        }
        if (request.getAudioFormat() != null) {
            cosyRequest.setAudioFormat(request.getAudioFormat());
        }
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult cosyResult =
                cosyVoiceClient.cloneVoice(cosyRequest);
        // Map back to the neutral result.
        VoiceCloneResult result = new VoiceCloneResult();
        result.setVoiceId(cosyResult.getVoiceId());
        result.setRequestId(cosyResult.getRequestId());
        log.info("[CosyVoiceProvider][语音克隆成功][voiceId={}]", result.getVoiceId());
        return result;
    }

    /**
     * Synthesizes speech via CosyVoice TTS.
     *
     * @param request provider-neutral TTS request
     * @return synthesized audio plus metadata
     */
    @Override
    public VoiceTtsResult synthesize(VoiceTtsRequest request) {
        log.info("[CosyVoiceProvider][语音合成][voiceId={}, textLength={}, model={}]",
                request.getVoiceId(),
                request.getText() != null ? request.getText().length() : 0,
                request.getModel());
        // Map the neutral request onto the CosyVoice-specific DTO.
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest cosyRequest =
                cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest.builder()
                        .text(request.getText())
                        .voiceId(request.getVoiceId())
                        .fileUrl(request.getFileUrl())
                        .referenceText(request.getReferenceText())
                        .model(request.getModel())
                        .speechRate(request.getSpeechRate())
                        .volume(request.getVolume())
                        .instruction(request.getInstruction())
                        .sampleRate(request.getSampleRate())
                        .audioFormat(request.getAudioFormat())
                        .preview(request.isPreview())
                        .build();
        // Diagnostic dump of the mapped request; DEBUG level (was ERROR).
        log.debug("[CosyVoiceProvider][构建的cosyRequest][text='{}', voiceId={}, fileUrl={}]",
                cosyRequest.getText(), cosyRequest.getVoiceId(), cosyRequest.getFileUrl());
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult cosyResult =
                cosyVoiceClient.synthesize(cosyRequest);
        // Map back to the neutral result.
        VoiceTtsResult result = new VoiceTtsResult();
        result.setRequestId(cosyResult.getRequestId());
        result.setFormat(cosyResult.getFormat());
        result.setSampleRate(cosyResult.getSampleRate());
        result.setAudio(cosyResult.getAudio());
        result.setVoiceId(cosyResult.getVoiceId());
        log.info("[CosyVoiceProvider][语音合成成功][format={}, audioSize={}]",
                result.getFormat(), result.getAudio() != null ? result.getAudio().length : 0);
        return result;
    }

    /** Matches the "cosyvoice" provider type (case-insensitive). */
    @Override
    public boolean supports(String providerType) {
        return "cosyvoice".equalsIgnoreCase(providerType);
    }

    /** Returns this provider's type identifier. */
    @Override
    public String getProviderType() {
        return "cosyvoice";
    }
}

View File

@@ -4,6 +4,7 @@ import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.json.JSONUtil;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowReference;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowVoiceUploadRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowVoiceUploadResponse;
@@ -21,6 +22,7 @@ import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.Base64;
import java.util.Collections;
/**
* 硅基流动 Provider 实现
@@ -108,19 +110,42 @@ public class SiliconFlowProvider implements VoiceCloneProvider {
throw new RuntimeException("硅基流动供应商未配置或已禁用");
}
log.info("[SiliconFlowProvider][语音合成][voiceId={}, textLength={}, model={}]",
// 判断使用哪种模式
boolean useReferenceMode = StrUtil.isBlank(request.getVoiceId())
&& StrUtil.isNotBlank(request.getFileUrl());
log.info("[SiliconFlowProvider][语音合成][voiceId={}, fileUrl={}, textLength={}, model={}, mode={}]",
request.getVoiceId(),
request.getFileUrl() != null ? "存在" : "",
request.getText() != null ? request.getText().length() : 0,
request.getModel());
request.getModel(),
useReferenceMode ? "动态音色" : "标准音色");
try {
SiliconFlowTtsRequest sfRequest = SiliconFlowTtsRequest.builder()
SiliconFlowTtsRequest.SiliconFlowTtsRequestBuilder requestBuilder = SiliconFlowTtsRequest.builder()
.model(getOrDefault(request.getModel(), getOrDefault(config.getDefaultModel(), "IndexTeam/IndexTTS-2")))
.input(request.getText())
.voice(request.getVoiceId())
.speed(request.getSpeechRate() != null ? request.getSpeechRate() : 1.0f)
.responseFormat(getOrDefault(request.getAudioFormat(), config.getAudioFormat()))
.build();
.gain(request.getVolume());
if (useReferenceMode) {
// 用户动态音色模式voice 传空,使用 references
log.info("[SiliconFlowProvider][使用动态音色模式][fileUrl={}]", request.getFileUrl());
requestBuilder.voice("");
SiliconFlowReference reference = SiliconFlowReference.builder()
.audio(request.getFileUrl())
.text(request.getReferenceText())
.build();
requestBuilder.references(Collections.singletonList(reference));
} else {
// 标准模式:使用 voiceId
log.info("[SiliconFlowProvider][使用标准音色模式][voiceId={}]", request.getVoiceId());
requestBuilder.voice(request.getVoiceId());
}
SiliconFlowTtsRequest sfRequest = requestBuilder.build();
String url = config.getBaseUrl() + config.getTtsUrl();
String requestBody = JSONUtil.toJsonStr(sfRequest);
@@ -141,15 +166,16 @@ public class SiliconFlowProvider implements VoiceCloneProvider {
}
byte[] audioBytes = response.bodyBytes();
String base64Audio = Base64.getEncoder().encodeToString(audioBytes);
VoiceTtsResult result = new VoiceTtsResult();
result.setAudio(Base64.getDecoder().decode(base64Audio));
result.setAudio(audioBytes);
result.setFormat(sfRequest.getResponseFormat());
result.setVoiceId(request.getVoiceId());
log.info("[SiliconFlowProvider][语音合成成功][format={}, audioSize={}]",
result.getFormat(), result.getAudio() != null ? result.getAudio().length : 0);
log.info("[SiliconFlowProvider][语音合成成功][format={}, audioSize={}, mode={}]",
result.getFormat(),
result.getAudio() != null ? result.getAudio().length : 0,
useReferenceMode ? "动态音色" : "标准音色");
return result;
} catch (Exception e) {

View File

@@ -41,7 +41,7 @@ public interface VoiceCloneProvider {
/**
* 检查是否支持指定的供应商类型
*
* @param providerType 供应商类型(如 "cosyvoice", "siliconflow"
* @param providerType 供应商类型(如 "siliconflow"
* @return true 如果支持false 否则
*/
boolean supports(String providerType);
@@ -49,7 +49,7 @@ public interface VoiceCloneProvider {
/**
* 获取供应商类型标识
*
* @return 供应商类型,如 "cosyvoice", "siliconflow"
* @return 供应商类型,如 "siliconflow"
*/
String getProviderType();
}

View File

@@ -75,7 +75,7 @@ public class VoiceCloneProviderFactory {
/**
* 根据类型获取 Provider
*
* @param providerType 供应商类型(如 "cosyvoice", "siliconflow"
* @param providerType 供应商类型(如 "siliconflow"
* @return 对应的 Provider 实例
* @throws ServiceException 当 Provider 不存在时抛出
*/

View File

@@ -1,36 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice voice-cloning (voice enrollment) request.
 */
@Data
public class CosyVoiceCloneRequest {

    /**
     * Target cloning model (e.g. {@code cosyvoice-v3-flash}).
     */
    private String targetModel;

    /**
     * Custom voice prefix (digits and lowercase letters only, length < 10 characters).
     */
    private String prefix;

    /**
     * Publicly reachable URL of the audio file to clone from.
     */
    private String url;

    /**
     * Sample rate; defaults to 24000 when unset.
     */
    private Integer sampleRate;

    /**
     * Audio format; defaults to wav when unset.
     */
    private String audioFormat;
}

View File

@@ -1,21 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice voice-cloning result.
 */
@Data
public class CosyVoiceCloneResult {

    /**
     * Generated voice_id of the enrolled voice.
     */
    private String voiceId;

    /**
     * Upstream request id (for tracing).
     */
    private String requestId;
}

View File

@@ -1,69 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * CosyVoice TTS request.
 */
@Data
@Builder
public class CosyVoiceTtsRequest {

    /**
     * Text to synthesize.
     */
    private String text;

    /**
     * Voice id; optional — configuration default applies when unset.
     */
    private String voiceId;

    /**
     * Audio file URL for URL-based (clone) synthesis; used instead of voiceId.
     */
    private String fileUrl;

    /**
     * Reference transcript paired with fileUrl, used to improve clone quality.
     */
    private String referenceText;

    /**
     * Model name (defaults to cosyvoice-v3-flash).
     */
    private String model;

    /**
     * Speech rate.
     */
    private Float speechRate;

    /**
     * Volume; optional.
     */
    private Float volume;

    /**
     * Style/voice instruction; optional.
     */
    private String instruction;

    /**
     * Sample rate.
     */
    private Integer sampleRate;

    /**
     * Audio format.
     */
    private String audioFormat;

    /**
     * Whether this is a preview-only request, so the service side can rate-limit it.
     */
    private boolean preview;
}

View File

@@ -1,37 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice TTS response.
 */
@Data
public class CosyVoiceTtsResult {

    /**
     * Upstream request id (for tracing).
     */
    private String requestId;

    /**
     * Format of the returned audio.
     */
    private String format;

    /**
     * Sample rate.
     */
    private Integer sampleRate;

    /**
     * Raw audio bytes.
     */
    private byte[] audio;

    /**
     * voiceId used for this synthesis.
     */
    private String voiceId;
}

View File

@@ -0,0 +1,27 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * SiliconFlow reference-audio configuration.
 *
 * <p>Used in dynamic-voice mode: passed via {@code references} to achieve
 * on-the-fly voice cloning.
 *
 * @author 芋道源码
 */
@Data
@Builder
public class SiliconFlowReference {

    /**
     * Reference audio URL (base64 content is also accepted).
     */
    private String audio;

    /**
     * Transcript of the reference audio.
     */
    private String text;
}

View File

@@ -4,6 +4,8 @@ import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Builder;
import lombok.Data;
import java.util.List;
/**
* 硅基流动文本转语音请求
*
@@ -42,4 +44,18 @@ public class SiliconFlowTtsRequest {
@JsonProperty("response_format")
private String responseFormat;
/**
* 音量增益(-10 到 10默认 0
*
* <p>正值增加音量,负值降低音量
*/
private Float gain;
/**
* 参考音频列表(用于用户动态音色模式)
*
* <p>当 voice 为空时,使用此字段传递参考音频实现实时语音克隆
*/
private List<SiliconFlowReference> references;
}

View File

@@ -16,7 +16,6 @@ public class VoiceCloneRequest {
/**
* 音频文件公网URL
*
* <p>CosyVoice: 对应 {@code url} 字段</p>
* <p>SiliconFlow: 对应 {@code audio} 字段需base64编码</p>
*/
private String audioUrl;
@@ -24,7 +23,6 @@ public class VoiceCloneRequest {
/**
* 模型名称
*
* <p>CosyVoice: 对应 {@code targetModel},如 {@code cosyvoice-v3-flash}</p>
* <p>SiliconFlow: 对应 {@code model},如 {@code indextts-2}</p>
*/
private String model;
@@ -32,7 +30,6 @@ public class VoiceCloneRequest {
/**
* 音色自定义前缀(可选)
*
* <p>CosyVoice: 必填,仅允许数字和小写字母,长度<10字符</p>
* <p>SiliconFlow: 不适用</p>
*/
private String prefix;
@@ -53,7 +50,6 @@ public class VoiceCloneRequest {
* 转录文本(可选)
*
* <p>SiliconFlow: 音频对应的文本内容</p>
* <p>CosyVoice: 不适用</p>
*/
private String transcriptionText;
}

View File

@@ -1,68 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.config;
import lombok.Data;
import lombok.EqualsAndHashCode;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.time.Duration;
/**
 * CosyVoice provider configuration.
 *
 * <p>Extends the common provider config with CosyVoice-specific fields.
 *
 * @author 芋道源码
 */
@Data
@EqualsAndHashCode(callSuper = true)
@Component
@ConfigurationProperties(prefix = "yudao.voice.cosyvoice")
public class CosyVoiceProviderConfig extends VoiceProviderProperties.ProviderConfig {

    /**
     * Default model.
     */
    private String defaultModel = "cosyvoice-v3-flash";

    /**
     * Default voiceId; optional.
     */
    private String defaultVoiceId;

    /**
     * Default sample rate.
     */
    private Integer sampleRate = 24000;

    /**
     * Default audio format.
     */
    private String audioFormat = "mp3";

    /**
     * Default sample text for previews.
     */
    private String previewText = "您好,欢迎体验专属音色。";

    /**
     * TTS endpoint URL.
     */
    private String ttsUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis";

    /**
     * Voice enrollment (cloning) endpoint URL.
     */
    private String voiceEnrollmentUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/voice-enrollment";

    /**
     * Connect timeout.
     */
    private Duration connectTimeout = Duration.ofSeconds(10);

    /**
     * Read timeout (3 minutes, to improve synthesis success rate).
     */
    private Duration readTimeout = Duration.ofSeconds(180);
}

View File

@@ -1,8 +1,5 @@
package cn.iocoder.yudao.module.tik.voice.config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
@@ -11,25 +8,6 @@ import org.springframework.context.annotation.Configuration;
* @author 芋道源码
*/
@Configuration
@Slf4j
public class VoiceAutoConfiguration {
/**
* CosyVoice 供应商配置 Bean
*/
@Bean
@ConditionalOnProperty(prefix = "yudao.voice.providers.cosyvoice", name = "enabled", havingValue = "true", matchIfMissing = true)
public CosyVoiceProviderConfig cosyVoiceProviderConfig(VoiceProviderProperties properties) {
VoiceProviderProperties.ProviderConfig baseConfig = properties.getProviderConfig("cosyvoice");
if (baseConfig == null) {
baseConfig = new VoiceProviderProperties.ProviderConfig();
}
CosyVoiceProviderConfig config = new CosyVoiceProviderConfig();
config.setEnabled(baseConfig.isEnabled());
config.setApiKey(baseConfig.getApiKey());
config.setPriority(baseConfig.getPriority());
return config;
}
}

View File

@@ -22,14 +22,14 @@ public class VoiceProviderProperties {
/**
* 默认供应商类型
*
* <p>可选值: cosyvoice, siliconflow 等
* <p>可选值: siliconflow 等
*/
private String defaultProvider = "cosyvoice";
private String defaultProvider = "siliconflow";
/**
* 各供应商配置
*
* <p>key 为供应商类型(如 cosyvoice, siliconflow
* <p>key 为供应商类型(如 siliconflow
*/
private Map<String, ProviderConfig> providers = new HashMap<>();

View File

@@ -80,7 +80,7 @@ public class AppTikUserVoiceController {
}
@PostMapping("/tts")
@Operation(summary = "CosyVoice 文本转语音")
@Operation(summary = "文本转语音")
public CommonResult<AppTikVoiceTtsRespVO> synthesizeVoice(@Valid @RequestBody AppTikVoiceTtsReqVO reqVO) {
return success(voiceService.synthesizeVoice(reqVO));
}

View File

@@ -55,9 +55,13 @@ public class TikDigitalHumanTaskDO extends TenantBaseDO {
// ========== TTS参数 ==========
/**
* 音色IDCosyVoice voiceId
* 音色ID系统音色使用
*/
private String voiceId;
/**
* 用户配音IDtik_user_voice.id用户配音使用
*/
private Long voiceConfigId;
/**
* 输入文本(用于语音合成)
*/

View File

@@ -54,10 +54,6 @@ public class TikUserVoiceDO extends TenantBaseDO {
* 备注信息
*/
private String note;
/**
* 复刻音色IDCosyVoice 语音复刻生成的 voice_id
*/
private String voiceId;
}

View File

@@ -1,38 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.enums;
import cn.hutool.core.util.StrUtil;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * CosyVoice emotion enum.
 *
 * <p>Values follow the Alibaba Cloud DashScope TTS documentation:
 * https://help.aliyun.com/zh/dashscope/developer-reference/tts-api
 */
public enum CosyVoiceEmotionEnum {

    NEUTRAL("neutral", "中性"),
    HAPPY("happy", "高兴"),
    SAD("sad", "悲伤"),
    ANGRY("angry", "愤怒"),
    SURPRISED("surprised", "惊讶"),
    DISGUSTED("disgusted", "厌恶"),
    SCARED("scared", "害怕");

    private final String code;
    private final String description;

    CosyVoiceEmotionEnum(String code, String description) {
        this.code = code;
        this.description = description;
    }

    public String getCode() {
        return code;
    }

    public String getDescription() {
        return description;
    }

    /**
     * Resolves an emotion by its code, ignoring case.
     * Blank (null / whitespace-only) or unknown codes map to {@link #NEUTRAL}.
     */
    public static CosyVoiceEmotionEnum getByCode(String code) {
        if (code == null || code.trim().isEmpty()) {
            return NEUTRAL;
        }
        for (CosyVoiceEmotionEnum candidate : values()) {
            if (candidate.code.equalsIgnoreCase(code)) {
                return candidate;
            }
        }
        return NEUTRAL;
    }
}

View File

@@ -390,8 +390,9 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
throw ServiceExceptionUtil.exception(ErrorCodeConstants.GENERAL_FORBIDDEN, "无权访问该音色");
}
if (StrUtil.isBlank(userVoice.getVoiceId())) {
throw new IllegalArgumentException("该音色配置无效缺少voiceId");
// 验证识别文本是否存在(用于动态音色模式)
if (StrUtil.isBlank(userVoice.getTranscription())) {
throw new IllegalArgumentException("该音色配置无效,请先进行语音识别");
}
}
@@ -399,14 +400,8 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
* 创建任务记录
*/
private TikDigitalHumanTaskDO createTaskRecord(AppTikDigitalHumanCreateReqVO reqVO, Long userId) {
// 如果是用户音色,需要从voiceConfigId获取voiceId
// 直接使用前端传递的 voiceId系统预置音色用户音色通过 voiceConfigId 在合成时处理
String voiceId = reqVO.getVoiceId();
if (voiceId == null && reqVO.getVoiceConfigId() != null) {
TikUserVoiceDO userVoice = userVoiceMapper.selectById(reqVO.getVoiceConfigId());
if (userVoice != null) {
voiceId = userVoice.getVoiceId();
}
}
// ✅ 预生成音频信息(无需存储时长,前端严格校验)
if (reqVO.getPreGeneratedAudio() != null) {
@@ -427,6 +422,7 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
.videoFileId(reqVO.getVideoFileId())
.videoUrl(reqVO.getVideoUrl())
.voiceId(voiceId)
.voiceConfigId(reqVO.getVoiceConfigId())
.inputText(reqVO.getInputText())
.speechRate(reqVO.getSpeechRate() != null ? reqVO.getSpeechRate() : 1.0f)
.volume(reqVO.getVolume() != null ? reqVO.getVolume() : 0f)
@@ -550,7 +546,7 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
}
/**
* 语音合成使用CosyVoice v3 Flash
* 语音合成
*/
private String synthesizeVoice(TikDigitalHumanTaskDO task) throws Exception {
// ✅ 优先使用预生成的音频(前端传递)
@@ -561,21 +557,25 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
}
// 如果没有预生成音频则走正常的TTS流程
// 参数验证
if (StrUtil.isBlank(task.getVoiceId())) {
throw new Exception("音色ID不能为空");
// 参数验证voiceId系统音色和 voiceConfigId用户配音二选一
boolean hasVoiceId = StrUtil.isNotBlank(task.getVoiceId());
boolean hasVoiceConfigId = task.getVoiceConfigId() != null;
if (!hasVoiceId && !hasVoiceConfigId) {
throw new Exception("音色ID不能为空需提供voiceId或voiceConfigId");
}
if (StrUtil.isBlank(task.getInputText())) {
throw new Exception("输入文本不能为空");
}
log.info("[synthesizeVoice][任务({})开始语音合成][voiceId={}, textLength={}]",
task.getId(), task.getVoiceId(), task.getInputText().length());
log.info("[synthesizeVoice][任务({})开始语音合成][voiceId={}, voiceConfigId={}, textLength={}]",
task.getId(), task.getVoiceId(), task.getVoiceConfigId(), task.getInputText().length());
// 构建TTS请求参数
AppTikVoiceTtsReqVO ttsReqVO = new AppTikVoiceTtsReqVO();
ttsReqVO.setInputText(task.getInputText());
ttsReqVO.setVoiceId(task.getVoiceId());
ttsReqVO.setVoiceId(task.getVoiceId()); // 系统音色
ttsReqVO.setVoiceConfigId(task.getVoiceConfigId()); // 用户配音
ttsReqVO.setSpeechRate(task.getSpeechRate() != null ? task.getSpeechRate() : 1.0f);
ttsReqVO.setVolume(task.getVolume() != null ? task.getVolume() : 0f);
ttsReqVO.setInstruction(task.getInstruction());

View File

@@ -63,7 +63,7 @@ public interface TikUserVoiceService {
void transcribeVoice(Long id);
/**
* CosyVoice 文本转语音
* 文本转语音
*/
AppTikVoiceTtsRespVO synthesizeVoice(AppTikVoiceTtsReqVO reqVO);

View File

@@ -22,8 +22,6 @@ import cn.iocoder.yudao.module.tik.tikhup.service.TikHupService;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.tik.voice.client.VoiceCloneProvider;
import cn.iocoder.yudao.module.tik.voice.client.VoiceCloneProviderFactory;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsResult;
import cn.iocoder.yudao.module.tik.muye.aimodelconfig.dal.AiModelConfigDO;
@@ -89,9 +87,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
@Resource
private VoiceCloneProviderFactory voiceProviderFactory;
@Resource
private cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig cosyVoiceProviderConfig;
@Resource
private StringRedisTemplate stringRedisTemplate;
@@ -102,22 +97,16 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
private static final long PREVIEW_CACHE_TTL_SECONDS = 3600;
private static final long SYNTH_CACHE_TTL_SECONDS = 24 * 3600;
/** 供应商类型常量 */
private static final String PROVIDER_COSYVOICE = "cosyvoice";
private static final String PROVIDER_SILICONFLOW = "siliconflow";
/** 模型常量 */
private static final String MODEL_COSYVOICE = "cosyvoice-v3-flash";
private static final String MODEL_SILICONFLOW = "IndexTeam/IndexTTS-2";
/** 积分平台和类型常量 */
private static final String PLATFORM_VOICE = "voice";
private static final String MODEL_CODE_TTS = "tts";
private static final String MODEL_CODE_CLONE = "clone";
@Resource
private PointsService pointsService;
/** SiliconFlow 参考音频最大大小5MB */
private static final int MAX_REFERENCE_AUDIO_SIZE = 5 * 1024 * 1024;
@Override
@Transactional(rollbackFor = Exception.class)
public Long createVoice(AppTikUserVoiceCreateReqVO createReqVO) {
@@ -128,7 +117,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 校验文件大小SiliconFlow API 限制参考音频不超过 5MB
if (fileDO.getSize() != null && fileDO.getSize() > MAX_REFERENCE_AUDIO_SIZE) {
double sizeMB = fileDO.getSize() / (1024.0 * 1024.0);
throw exception(VOICE_FILE_NOT_EXISTS,
String.format("音频文件过大(%.1fMB请上传小于5MB的音频文件", sizeMB));
}
// 验证文件分类是否为voice通过tik_user_file表查询
TikUserFileDO userFile = userFileMapper.selectOne(new LambdaQueryWrapperX<TikUserFileDO>()
.eq(TikUserFileDO::getFileId, createReqVO.getFileId())
@@ -158,51 +154,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.setTranscription(createReqVO.getText()); // 使用前端传入的文本
voiceMapper.insert(voice);
// 4. 调用语音克隆服务,生成 voice_id
if (StrUtil.isNotBlank(createReqVO.getText())) {
try {
// 4.1 获取积分配置并预检
AiModelConfigDO config = pointsService.getConfig(PLATFORM_VOICE, MODEL_CODE_CLONE);
pointsService.checkPoints(userId.toString(), config.getConsumePoints());
log.info("[createVoice][开始语音复刻,配音编号({})文件ID({}),供应商({})]",
voice.getId(), fileDO.getId(), createReqVO.getProviderType());
String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
VoiceCloneProvider provider = voiceProviderFactory.getProvider(createReqVO.getProviderType());
String providerType = getProviderType(createReqVO.getProviderType(), provider);
String model = getModelByProvider(providerType);
VoiceCloneRequest cloneRequest = new VoiceCloneRequest();
cloneRequest.setAudioUrl(fileAccessUrl);
cloneRequest.setModel(model);
cloneRequest.setPrefix("voice" + voice.getId());
cloneRequest.setTranscriptionText(createReqVO.getText()); // 使用前端传入的文本
VoiceCloneResult cloneResult = provider.cloneVoice(cloneRequest);
String voiceId = cloneResult.getVoiceId();
voice.setVoiceId(voiceId);
voiceMapper.updateById(voice);
// 4.2 音色克隆成功,扣减积分
try {
pointsService.deductPoints(userId.toString(), config.getConsumePoints(), "voice_clone", voice.getId().toString());
log.info("[createVoice][用户 {} 扣减 {} 积分(音色克隆)]", userId, config.getConsumePoints());
} catch (Exception e) {
log.error("[createVoice][积分扣减失败: {}]", e.getMessage());
}
log.info("[createVoice][语音复刻成功,配音编号({})voice_id({})]", voice.getId(), voiceId);
} catch (Exception e) {
log.error("[createVoice][语音复刻失败,配音编号({}),错误信息: {}]", voice.getId(), e.getMessage(), e);
// 失败不扣费
}
} else {
log.info("[createVoice][未提供文本,跳过语音复刻,配音编号({})]", voice.getId());
}
log.info("[createVoice][用户({})创建配音成功,配音编号({})]", userId, voice.getId());
return voice.getId();
}
@@ -300,10 +251,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.collect(Collectors.toList());
if (CollUtil.isNotEmpty(fileIds)) {
List<FileDO> files = fileMapper.selectBatchIds(fileIds);
Map<Long, FileDO> tempFileMap = files.stream()
.collect(Collectors.toMap(FileDO::getId, file -> file));
fileMap.putAll(tempFileMap);
fileMapper.selectBatchIds(fileIds).forEach(file -> fileMap.put(file.getId(), file));
}
}
@@ -412,26 +360,18 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
}
// 优先使用复刻的 voice_id如果不存在则使用文件URL兼容旧数据
if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[synthesizeVoice][使用复刻音色ID合成配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId();
// 注意:使用 voiceId 时,不依赖 transcriptionText直接使用前端传入的 inputText
transcriptionText = null; // 清除 transcriptionText让 determineSynthesisText 只使用 inputText
} else {
log.info("[synthesizeVoice][使用文件URL合成配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用动态音色模式fileUrl + transcriptionText
log.info("[synthesizeVoice][使用动态音色模式,配音编号({})]", voiceConfigId);
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
}
// 2. 如果没有配置ID使用voiceId或fileUrl系统音色或直接URL方式
@@ -555,21 +495,17 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
throw exception(VOICE_NOT_EXISTS, "配音不存在");
}
voiceId = voice.getVoiceId();
if (StrUtil.isNotBlank(voiceId)) {
fileUrl = null;
referenceText = null;
} else {
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
referenceText = voice.getTranscription();
if (StrUtil.isBlank(referenceText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
// 使用动态音色模式
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
referenceText = voice.getTranscription();
if (StrUtil.isBlank(referenceText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
voiceId = null;
}
// 3. 系统配音
else {
@@ -623,21 +559,10 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
return buildPreviewResp(audioBase64, format, voiceId);
}
/**
* 获取 CosyVoice 配置
*/
private cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig getCosyVoiceConfig() {
return cosyVoiceProviderConfig;
}
/**
* 获取默认音频格式
*/
private String getDefaultFormat() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getAudioFormat();
}
return "mp3";
}
@@ -645,10 +570,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取默认采样率
*/
private Integer getDefaultSampleRate() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getSampleRate();
}
return 24000;
}
@@ -664,16 +585,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
}
private String resolveContentType(String format) {
if ("wav".equalsIgnoreCase(format)) {
return "audio/wav";
}
if ("mp3".equalsIgnoreCase(format)) {
if (format == null) {
return "audio/mpeg";
}
if ("flac".equalsIgnoreCase(format)) {
return "audio/flac";
}
return "audio/mpeg";
return switch (format.toLowerCase()) {
case "wav" -> "audio/wav";
case "flac" -> "audio/flac";
default -> "audio/mpeg";
};
}
private String determineSynthesisText(String transcriptionText, String inputText, boolean allowFallback) {
@@ -828,74 +747,25 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
return respVO;
}
@lombok.Data
@lombok.NoArgsConstructor
@lombok.AllArgsConstructor
private static class PreviewCacheEntry {
private String audioBase64;
private String format;
private Integer sampleRate;
private String requestId;
public PreviewCacheEntry() {}
public PreviewCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId) {
this.audioBase64 = audioBase64;
this.format = format;
this.sampleRate = sampleRate;
this.requestId = requestId;
}
public String getAudioBase64() {
return audioBase64;
}
public String getFormat() {
return format;
}
public Integer getSampleRate() {
return sampleRate;
}
public String getRequestId() {
return requestId;
}
}
@lombok.Data
@lombok.NoArgsConstructor
@lombok.AllArgsConstructor
private static class SynthCacheEntry {
private String audioBase64;
private String format;
private Integer sampleRate;
private String requestId;
private String voiceId;
public SynthCacheEntry() {}
public SynthCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId, String voiceId) {
this.audioBase64 = audioBase64;
this.format = format;
this.sampleRate = sampleRate;
this.requestId = requestId;
this.voiceId = voiceId;
}
public String getAudioBase64() {
return audioBase64;
}
public String getFormat() {
return format;
}
public Integer getSampleRate() {
return sampleRate;
}
public String getRequestId() {
return requestId;
}
public String getVoiceId() {
return voiceId;
}
}
/**
@@ -1116,10 +986,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取默认音色ID
*/
private String getDefaultVoiceId() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getDefaultVoiceId();
}
return null;
}
@@ -1127,32 +993,8 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取试听文本
*/
private String getPreviewText() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getPreviewText();
}
return "您好,欢迎体验专属音色。";
}
/**
* 获取供应商类型
*/
private String getProviderType(String requestProviderType, VoiceCloneProvider provider) {
if (StrUtil.isNotBlank(requestProviderType)) {
return requestProviderType;
}
return provider.getProviderType();
}
/**
* 根据供应商类型获取对应的模型
*/
private String getModelByProvider(String providerType) {
if (PROVIDER_SILICONFLOW.equalsIgnoreCase(providerType)) {
return MODEL_SILICONFLOW;
}
return MODEL_COSYVOICE; // 默认使用 CosyVoice 模型
}
}

View File

@@ -35,7 +35,7 @@ public class AppTikDigitalHumanCreateReqVO {
@Size(max = 1024, message = "视频URL不能超过1024个字符")
private String videoUrl;
@Schema(description = "音色IDCosyVoice voiceId系统音色使用)", example = "cosyvoice-v3-flash-sys-xxx")
@Schema(description = "音色ID系统音色使用", example = "alex")
private String voiceId;
@Schema(description = "用户音色配置IDtik_user_voice.id用户音色使用", example = "123")

View File

@@ -37,7 +37,7 @@ public class AppTikDigitalHumanRespVO {
@Schema(description = "配音配置ID", example = "789")
private Long voiceConfigId;
@Schema(description = "voice_id", example = "cosyvoice-v3-flash-xxx")
@Schema(description = "voice_id", example = "voice-xxx")
private String voiceId;
@Schema(description = "语速", example = "1.0")

View File

@@ -39,7 +39,7 @@ public class AppTikUserVoiceCreateReqVO {
@Size(max = 4000, message = "音频文本不能超过 4000 个字符")
private String text;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -38,9 +38,6 @@ public class AppTikUserVoiceRespVO {
@Schema(description = "备注", example = "这是一个测试配音")
private String note;
@Schema(description = "复刻音色IDCosyVoice 语音复刻生成的 voice_id")
private String voiceId;
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
private LocalDateTime createTime;

View File

@@ -14,7 +14,7 @@ public class AppTikVoicePreviewReqVO {
@Schema(description = "配音编号tik_user_voice.id用户配音必传系统配音可不传")
private Long voiceConfigId;
@Schema(description = "CosyVoice音色ID系统配音必传用户配音可不传")
@Schema(description = "音色ID系统配音必传用户配音可不传")
private String voiceId;
@Schema(description = "语音文件URL当使用语音URL合成时必传替代voiceId")
@@ -43,7 +43,7 @@ public class AppTikVoicePreviewReqVO {
@Schema(description = "指令(用于控制音色风格)", example = "请用温柔专业的语调朗读")
private String instruction;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -20,7 +20,7 @@ public class AppTikVoicePreviewRespVO {
@Schema(description = "采样率", example = "24000")
private Integer sampleRate;
@Schema(description = "CosyVoice 请求ID")
@Schema(description = "请求ID")
private String requestId;
@Schema(description = "使用的音色 ID")

View File

@@ -21,13 +21,13 @@ public class AppTikVoiceTtsReqVO {
@Size(max = 4000, message = "识别文本不能超过 4000 个字符")
private String transcriptionText;
@Schema(description = "音色 IDCosyVoice voiceId", example = "cosyvoice-v3-flash-myvoice-xxx")
@Schema(description = "音色 ID系统音色", example = "alex")
private String voiceId;
@Schema(description = "音色源音频 OSS 地址(当没有 voiceId 时必传)")
private String fileUrl;
@Schema(description = "模型名称,默认 cosyvoice-v3-flash", example = "cosyvoice-v3-flash")
@Schema(description = "模型名称", example = "IndexTeam/IndexTTS-2")
private String model;
@Schema(description = "语速,默认 1.0", example = "1.0")
@@ -45,7 +45,7 @@ public class AppTikVoiceTtsReqVO {
@Schema(description = "音频格式,默认 wav可选 mp3")
private String audioFormat;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -4,7 +4,7 @@ import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
@Data
@Schema(description = "CosyVoice 文本转语音响应")
@Schema(description = "文本转语音响应")
public class AppTikVoiceTtsRespVO {
@Schema(description = "用户文件编号", example = "1024")
@@ -23,7 +23,7 @@ public class AppTikVoiceTtsRespVO {
@Schema(description = "采样率", example = "24000")
private Integer sampleRate;
@Schema(description = "CosyVoice 请求ID")
@Schema(description = "请求ID")
private String requestId;
@Schema(description = "使用的音色 ID")