语音优化

This commit is contained in:
2026-02-25 16:28:31 +08:00
parent 214c1f0f37
commit 0efca50be3
39 changed files with 237 additions and 1093 deletions

View File

@@ -1,355 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.audio.ttsv2.enrollment.Voice;
import com.alibaba.dashscope.audio.ttsv2.enrollment.VoiceEnrollmentService;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.springframework.stereotype.Component;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception0;
import static cn.iocoder.yudao.module.tik.enums.ErrorCodeConstants.VOICE_TTS_FAILED;
/**
 * CosyVoice client.
 *
 * <p>Wraps the Alibaba DashScope CosyVoice SDK for text-to-speech (TTS) and
 * voice cloning (voice enrollment). A plain-HTTP fallback for the TTS REST
 * endpoint is also kept ({@code synthesizeViaHttp}); it is currently
 * unreferenced within this class.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class CosyVoiceClient {

    private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");

    private final CosyVoiceProviderConfig config;
    private final ObjectMapper objectMapper;

    // Lazily created client for the HTTP fallback path; guarded by
    // double-checked locking in getHttpClient().
    private volatile OkHttpClient httpClient;

    /**
     * Calls the CosyVoice TTS endpoint via the DashScope SDK.
     *
     * @param request TTS request; {@code text} and {@code voiceId} are required
     * @return synthesized audio plus format / sample-rate metadata
     * @throws cn.iocoder.yudao.framework.common.exception.ServiceException
     *         with {@code VOICE_TTS_FAILED} when disabled, the request is invalid,
     *         or the SDK call fails
     */
    public CosyVoiceTtsResult synthesize(CosyVoiceTtsRequest request) {
        if (!config.isEnabled()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
        }
        // Parameter trace for troubleshooting. Logged at DEBUG (was ERROR,
        // which polluted error monitoring with routine traffic).
        String text = request != null ? request.getText() : null;
        log.debug("[CosyVoice][TTS参数检查][request={}, text={}, voiceId={}, model={}]",
                request != null ? "存在" : "为null",
                text != null ? "'" + text + "' (长度:" + text.length() + ")" : "为null",
                request != null ? request.getVoiceId() : null,
                request != null ? request.getModel() : null);
        if (request == null || StrUtil.isBlank(request.getText())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空");
        }
        if (StrUtil.isBlank(request.getVoiceId())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "必须提供 voiceId");
        }
        SpeechSynthesizer synthesizer = null;
        try {
            log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}, speechRate={}, instruction={}]",
                    request.getVoiceId(),
                    request.getText().length(),
                    StrUtil.blankToDefault(request.getModel(), config.getDefaultModel()),
                    request.getSpeechRate(),
                    request.getInstruction());
            // Build SDK parameters strictly per the DashScope docs;
            // speechRate and volume must be passed as int.
            SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                    .apiKey(config.getApiKey())
                    .model(StrUtil.blankToDefault(request.getModel(), config.getDefaultModel()))
                    .voice(request.getVoiceId())
                    .speechRate(request.getSpeechRate() != null ? request.getSpeechRate().intValue() : 1)
                    .volume(request.getVolume() != null ? request.getVolume().intValue() : 0)
                    .build();
            if (StrUtil.isNotBlank(request.getInstruction())) {
                param.setInstruction(request.getInstruction());
            }
            // DEBUG (was ERROR): SDK parameter dump is diagnostic output only.
            log.debug("[CosyVoice][SDK参数][param={}, text='{}']", param, request.getText());
            // A null callback selects the synchronous (blocking) call mode.
            synthesizer = new SpeechSynthesizer(param, null);
            // Blocking call; returns the complete audio buffer.
            ByteBuffer audioData = synthesizer.call(request.getText());
            if (audioData == null) {
                throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频数据");
            }
            // Copy only the readable region: ByteBuffer.array() exposes the whole
            // backing array (which may be larger than the payload) and throws for
            // read-only or direct buffers.
            byte[] audioBytes = new byte[audioData.remaining()];
            audioData.get(audioBytes);
            log.info("[CosyVoice][TTS合成成功][Request ID: {}, audioSize={}, 首包延迟={}ms]",
                    synthesizer.getLastRequestId(),
                    audioBytes.length,
                    synthesizer.getFirstPackageDelay());
            // Assemble the result; format / sample rate fall back to the configured defaults.
            CosyVoiceTtsResult result = new CosyVoiceTtsResult();
            result.setAudio(audioBytes);
            result.setFormat(request.getAudioFormat() != null ? request.getAudioFormat() : config.getAudioFormat());
            result.setSampleRate(request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate());
            result.setRequestId(synthesizer.getLastRequestId());
            result.setVoiceId(request.getVoiceId());
            return result;
        } catch (ServiceException ex) {
            throw ex;
        } catch (Exception ex) {
            log.error("[CosyVoice][TTS异常][voiceId={}, text={}]", request.getVoiceId(), request.getText(), ex);
            throw exception(VOICE_TTS_FAILED);
        } finally {
            // Close the underlying WebSocket connection; never let cleanup mask the result.
            if (synthesizer != null) {
                try {
                    synthesizer.getDuplexApi().close(1000, "任务结束");
                } catch (Exception e) {
                    log.warn("[CosyVoice][关闭连接失败]", e);
                }
            }
        }
    }

    /**
     * Performs TTS via the plain HTTP API (fallback path, currently unused).
     *
     * @param request TTS request used to build the JSON payload
     * @return parsed TTS result
     * @throws Exception on transport or parsing failures
     */
    private CosyVoiceTtsResult synthesizeViaHttp(CosyVoiceTtsRequest request) throws Exception {
        String payload = objectMapper.writeValueAsString(buildPayload(request));
        Request httpRequest = new Request.Builder()
                .url(config.getTtsUrl())
                .addHeader("Authorization", "Bearer " + config.getApiKey())
                .addHeader("Content-Type", "application/json")
                .post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
                .build();
        try (Response response = getHttpClient().newCall(httpRequest).execute()) {
            String body = response.body() != null ? response.body().string() : "";
            if (!response.isSuccessful()) {
                log.error("[CosyVoice][TTS失败][status={}, body={}]", response.code(), body);
                throw buildException(body);
            }
            return parseTtsResult(body, request);
        }
    }

    /**
     * Calls the CosyVoice voice-cloning (voice enrollment) endpoint.
     *
     * @param request cloning request; {@code url}, {@code targetModel} and
     *                {@code prefix} are required
     * @return the newly enrolled voiceId and request id
     */
    public CosyVoiceCloneResult cloneVoice(CosyVoiceCloneRequest request) {
        if (!config.isEnabled()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
        }
        // Single null check up front; the original re-tested `request == null`
        // before every field check, which was unreachable after the first one.
        if (request == null || StrUtil.isBlank(request.getUrl())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻音频URL不能为空");
        }
        if (StrUtil.isBlank(request.getTargetModel())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻模型不能为空");
        }
        if (StrUtil.isBlank(request.getPrefix())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "音色前缀不能为空");
        }
        try {
            log.info("[CosyVoice][开始语音复刻][targetModel={}, prefix={}, url={}]",
                    request.getTargetModel(), request.getPrefix(), request.getUrl());
            // Create the voice enrollment via the DashScope SDK.
            VoiceEnrollmentService service = new VoiceEnrollmentService(config.getApiKey());
            Voice voice = service.createVoice(request.getTargetModel(), request.getPrefix(), request.getUrl());
            log.info("[CosyVoice][语音复刻成功][Request ID: {}, Voice ID: {}]",
                    service.getLastRequestId(), voice.getVoiceId());
            CosyVoiceCloneResult result = new CosyVoiceCloneResult();
            result.setVoiceId(voice.getVoiceId());
            result.setRequestId(service.getLastRequestId());
            return result;
        } catch (ServiceException ex) {
            throw ex;
        } catch (Exception ex) {
            log.error("[CosyVoice][语音复刻异常][targetModel={}, prefix={}]",
                    request.getTargetModel(), request.getPrefix(), ex);
            throw exception(VOICE_TTS_FAILED);
        }
    }

    /**
     * Builds the JSON payload for the HTTP TTS API.
     *
     * <p>Selection rule: a non-blank {@code fileUrl} switches to voice-clone mode
     * ({@code audio_url} + optional {@code reference_text}); otherwise the system
     * voice ({@code voice}) is used, falling back to the configured default.
     */
    private Map<String, Object> buildPayload(CosyVoiceTtsRequest request) {
        Map<String, Object> payload = new HashMap<>();
        String model = StrUtil.blankToDefault(request.getModel(), config.getDefaultModel());
        payload.put("model", model);
        Map<String, Object> input = new HashMap<>();
        input.put("text", request.getText());
        if (StrUtil.isNotBlank(request.getFileUrl())) {
            // Pass the pre-signed URL as-is (signature + expiry); the API requires the signed form.
            input.put("audio_url", request.getFileUrl());
            log.info("[CosyVoice][使用语音克隆][audio_url={}]", request.getFileUrl());
            // Reference text improves clone quality when available.
            if (StrUtil.isNotBlank(request.getReferenceText())) {
                input.put("reference_text", request.getReferenceText());
                log.info("[CosyVoice][添加参考文本][length={}]", request.getReferenceText().length());
            }
        } else {
            String voiceId = StrUtil.blankToDefault(request.getVoiceId(), config.getDefaultVoiceId());
            if (StrUtil.isNotBlank(voiceId)) {
                input.put("voice", voiceId);
                log.info("[CosyVoice][使用系统音色][voice={}]", voiceId);
            } else {
                log.warn("[CosyVoice][未提供voiceId或fileUrl]");
            }
        }
        payload.put("input", input);
        Map<String, Object> parameters = new HashMap<>();
        int sampleRate = request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate();
        parameters.put("sample_rate", sampleRate);
        // The official docs use lowercase format identifiers.
        String format = StrUtil.blankToDefault(request.getAudioFormat(), config.getAudioFormat()).toLowerCase();
        parameters.put("format", format);
        if (request.getSpeechRate() != null) {
            parameters.put("speech_rate", request.getSpeechRate());
        }
        if (request.getVolume() != null) {
            // Documented volume range is 0-100; round the float.
            parameters.put("volume", Math.round(request.getVolume()));
        }
        if (request.isPreview()) {
            parameters.put("preview", true);
        }
        payload.put("parameters", parameters);
        log.info("[CosyVoice][请求参数][model={}, sample_rate={}, format={}, text_length={}]",
                model, sampleRate, format, request.getText().length());
        return payload;
    }

    /**
     * Parses an HTTP TTS response body into a {@link CosyVoiceTtsResult}.
     *
     * @throws Exception when the body is not valid JSON
     */
    private CosyVoiceTtsResult parseTtsResult(String body, CosyVoiceTtsRequest request) throws Exception {
        JsonNode root = objectMapper.readTree(body);
        // Error responses carry a top-level "code" field.
        if (root.has("code")) {
            String message = root.has("message") ? root.get("message").asText() : body;
            log.error("[CosyVoice][TTS失败][code={}, message={}]", root.get("code").asText(), message);
            throw exception0(VOICE_TTS_FAILED.getCode(), message);
        }
        JsonNode audioNode = root.path("output").path("audio");
        if (!audioNode.isArray() || audioNode.isEmpty()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回的音频为空");
        }
        JsonNode firstAudio = audioNode.get(0);
        String content = firstAudio.path("content").asText();
        if (StrUtil.isBlank(content)) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频内容");
        }
        byte[] audioBytes = Base64.getDecoder().decode(content);
        CosyVoiceTtsResult result = new CosyVoiceTtsResult();
        result.setAudio(audioBytes);
        result.setFormat(firstAudio.path("format").asText(StrUtil.blankToDefault(request.getAudioFormat(), config.getAudioFormat())));
        result.setSampleRate(firstAudio.path("sample_rate").asInt(request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate()));
        result.setRequestId(root.path("request_id").asText());
        result.setVoiceId(firstAudio.path("voice").asText(request.getVoiceId()));
        return result;
    }

    /**
     * Returns the shared OkHttp client, creating it on first use
     * (double-checked locking over the volatile field).
     */
    private OkHttpClient getHttpClient() {
        if (httpClient == null) {
            synchronized (this) {
                if (httpClient == null) {
                    // Duration is already imported; no need for the fully-qualified name.
                    Duration connect = defaultDuration(config.getConnectTimeout(), 10);
                    Duration read = defaultDuration(config.getReadTimeout(), 60);
                    httpClient = new OkHttpClient.Builder()
                            .connectTimeout(connect.toMillis(), TimeUnit.MILLISECONDS)
                            .readTimeout(read.toMillis(), TimeUnit.MILLISECONDS)
                            .build();
                }
            }
        }
        return httpClient;
    }

    /** Returns {@code duration}, or a {@code seconds}-long default when null. */
    private Duration defaultDuration(Duration duration, long seconds) {
        return duration == null ? Duration.ofSeconds(seconds) : duration;
    }

    /**
     * Builds a ServiceException from an error response body.
     *
     * <p>Prefers the first non-blank of the top-level {@code message} and
     * {@code output.message}. The previous {@code CollUtil.getFirst} call
     * returned the first list element even when it was null, so the
     * {@code output.message} fallback was never reached.
     */
    private ServiceException buildException(String body) {
        try {
            JsonNode root = objectMapper.readTree(body);
            String message = StrUtil.blankToDefault(
                    root.path("message").asText(null),
                    root.path("output").path("message").asText(null));
            return exception0(VOICE_TTS_FAILED.getCode(), StrUtil.blankToDefault(message, "CosyVoice 调用失败"));
        } catch (Exception ignored) {
            // Body is not JSON; surface it verbatim.
            return exception0(VOICE_TTS_FAILED.getCode(), body);
        }
    }

    /**
     * Strips query parameters and fragment from a URL, keeping protocol, host
     * and path only. Currently unreferenced within this class.
     *
     * @param url possibly signed / parameterized URL
     * @return the bare URL, or the input when blank
     */
    private String extractRawUrl(String url) {
        if (StrUtil.isBlank(url)) {
            return url;
        }
        try {
            java.net.URL urlObj = new java.net.URL(url);
            return urlObj.getProtocol() + "://" + urlObj.getHost() + urlObj.getPath();
        } catch (Exception e) {
            // Malformed URL: fall back to naive truncation at '?' / '#'.
            return url.split("\\?")[0].split("#")[0];
        }
    }
}

View File

@@ -1,124 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig;
import cn.iocoder.yudao.module.tik.voice.config.VoiceProviderProperties;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.stereotype.Component;
/**
 * CosyVoice Provider implementation.
 *
 * <p>{@link VoiceCloneProvider} adapter for the Alibaba Cloud CosyVoice speech
 * service. Delegates the actual API calls to {@link CosyVoiceClient} and maps
 * between the provider-neutral DTOs and the CosyVoice-specific ones.
 *
 * @author 芋道源码
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class CosyVoiceProvider implements VoiceCloneProvider {

    private final CosyVoiceClient cosyVoiceClient;
    private final VoiceProviderProperties voiceProviderProperties;

    /**
     * Resolves the CosyVoice-specific configuration; returns an empty default
     * config when none (or a non-CosyVoice config) is registered.
     * NOTE(review): not referenced by the methods in this class — confirm callers before removing.
     */
    private CosyVoiceProviderConfig getConfig() {
        var baseConfig = voiceProviderProperties.getProviderConfig("cosyvoice");
        if (baseConfig instanceof CosyVoiceProviderConfig config) {
            return config;
        }
        return new CosyVoiceProviderConfig();
    }

    /**
     * Clones a voice via CosyVoice voice enrollment.
     *
     * @param request provider-neutral clone request (audioUrl / model / prefix)
     * @return the enrolled voiceId and request id
     */
    @Override
    public VoiceCloneResult cloneVoice(VoiceCloneRequest request) {
        log.info("[CosyVoiceProvider][语音克隆][audioUrl={}, model={}]",
                request.getAudioUrl(), request.getModel());
        // Map the neutral request onto the CosyVoice-specific DTO.
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest cosyRequest =
                new cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest();
        cosyRequest.setUrl(request.getAudioUrl());
        cosyRequest.setTargetModel(request.getModel());
        cosyRequest.setPrefix(request.getPrefix());
        if (request.getSampleRate() != null) {
            cosyRequest.setSampleRate(request.getSampleRate());
        }
        if (request.getAudioFormat() != null) {
            cosyRequest.setAudioFormat(request.getAudioFormat());
        }
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult cosyResult =
                cosyVoiceClient.cloneVoice(cosyRequest);
        // Map back to the neutral result.
        VoiceCloneResult result = new VoiceCloneResult();
        result.setVoiceId(cosyResult.getVoiceId());
        result.setRequestId(cosyResult.getRequestId());
        log.info("[CosyVoiceProvider][语音克隆成功][voiceId={}]", result.getVoiceId());
        return result;
    }

    /**
     * Synthesizes speech via CosyVoice TTS.
     *
     * @param request provider-neutral TTS request
     * @return synthesized audio plus metadata
     */
    @Override
    public VoiceTtsResult synthesize(VoiceTtsRequest request) {
        log.info("[CosyVoiceProvider][语音合成][voiceId={}, textLength={}, model={}]",
                request.getVoiceId(),
                request.getText() != null ? request.getText().length() : 0,
                request.getModel());
        // Map the neutral request onto the CosyVoice-specific DTO.
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest cosyRequest =
                cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest.builder()
                        .text(request.getText())
                        .voiceId(request.getVoiceId())
                        .fileUrl(request.getFileUrl())
                        .referenceText(request.getReferenceText())
                        .model(request.getModel())
                        .speechRate(request.getSpeechRate())
                        .volume(request.getVolume())
                        .instruction(request.getInstruction())
                        .sampleRate(request.getSampleRate())
                        .audioFormat(request.getAudioFormat())
                        .preview(request.isPreview())
                        .build();
        // Diagnostic dump of the mapped request; DEBUG level (was ERROR).
        log.debug("[CosyVoiceProvider][构建的cosyRequest][text='{}', voiceId={}, fileUrl={}]",
                cosyRequest.getText(), cosyRequest.getVoiceId(), cosyRequest.getFileUrl());
        cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult cosyResult =
                cosyVoiceClient.synthesize(cosyRequest);
        // Map back to the neutral result.
        VoiceTtsResult result = new VoiceTtsResult();
        result.setRequestId(cosyResult.getRequestId());
        result.setFormat(cosyResult.getFormat());
        result.setSampleRate(cosyResult.getSampleRate());
        result.setAudio(cosyResult.getAudio());
        result.setVoiceId(cosyResult.getVoiceId());
        log.info("[CosyVoiceProvider][语音合成成功][format={}, audioSize={}]",
                result.getFormat(), result.getAudio() != null ? result.getAudio().length : 0);
        return result;
    }

    /** Matches the "cosyvoice" provider type (case-insensitive). */
    @Override
    public boolean supports(String providerType) {
        return "cosyvoice".equalsIgnoreCase(providerType);
    }

    /** Returns this provider's type identifier. */
    @Override
    public String getProviderType() {
        return "cosyvoice";
    }
}

View File

@@ -4,6 +4,7 @@ import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpRequest;
import cn.hutool.http.HttpResponse;
import cn.hutool.json.JSONUtil;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowReference;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowVoiceUploadRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.SiliconFlowVoiceUploadResponse;
@@ -21,6 +22,7 @@ import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.Base64;
import java.util.Collections;
/**
* 硅基流动 Provider 实现
@@ -108,19 +110,42 @@ public class SiliconFlowProvider implements VoiceCloneProvider {
throw new RuntimeException("硅基流动供应商未配置或已禁用");
}
log.info("[SiliconFlowProvider][语音合成][voiceId={}, textLength={}, model={}]",
// 判断使用哪种模式
boolean useReferenceMode = StrUtil.isBlank(request.getVoiceId())
&& StrUtil.isNotBlank(request.getFileUrl());
log.info("[SiliconFlowProvider][语音合成][voiceId={}, fileUrl={}, textLength={}, model={}, mode={}]",
request.getVoiceId(),
request.getFileUrl() != null ? "存在" : "",
request.getText() != null ? request.getText().length() : 0,
request.getModel());
request.getModel(),
useReferenceMode ? "动态音色" : "标准音色");
try {
SiliconFlowTtsRequest sfRequest = SiliconFlowTtsRequest.builder()
SiliconFlowTtsRequest.SiliconFlowTtsRequestBuilder requestBuilder = SiliconFlowTtsRequest.builder()
.model(getOrDefault(request.getModel(), getOrDefault(config.getDefaultModel(), "IndexTeam/IndexTTS-2")))
.input(request.getText())
.voice(request.getVoiceId())
.speed(request.getSpeechRate() != null ? request.getSpeechRate() : 1.0f)
.responseFormat(getOrDefault(request.getAudioFormat(), config.getAudioFormat()))
.build();
.gain(request.getVolume());
if (useReferenceMode) {
// 用户动态音色模式voice 传空,使用 references
log.info("[SiliconFlowProvider][使用动态音色模式][fileUrl={}]", request.getFileUrl());
requestBuilder.voice("");
SiliconFlowReference reference = SiliconFlowReference.builder()
.audio(request.getFileUrl())
.text(request.getReferenceText())
.build();
requestBuilder.references(Collections.singletonList(reference));
} else {
// 标准模式:使用 voiceId
log.info("[SiliconFlowProvider][使用标准音色模式][voiceId={}]", request.getVoiceId());
requestBuilder.voice(request.getVoiceId());
}
SiliconFlowTtsRequest sfRequest = requestBuilder.build();
String url = config.getBaseUrl() + config.getTtsUrl();
String requestBody = JSONUtil.toJsonStr(sfRequest);
@@ -141,15 +166,16 @@ public class SiliconFlowProvider implements VoiceCloneProvider {
}
byte[] audioBytes = response.bodyBytes();
String base64Audio = Base64.getEncoder().encodeToString(audioBytes);
VoiceTtsResult result = new VoiceTtsResult();
result.setAudio(Base64.getDecoder().decode(base64Audio));
result.setAudio(audioBytes);
result.setFormat(sfRequest.getResponseFormat());
result.setVoiceId(request.getVoiceId());
log.info("[SiliconFlowProvider][语音合成成功][format={}, audioSize={}]",
result.getFormat(), result.getAudio() != null ? result.getAudio().length : 0);
log.info("[SiliconFlowProvider][语音合成成功][format={}, audioSize={}, mode={}]",
result.getFormat(),
result.getAudio() != null ? result.getAudio().length : 0,
useReferenceMode ? "动态音色" : "标准音色");
return result;
} catch (Exception e) {

View File

@@ -41,7 +41,7 @@ public interface VoiceCloneProvider {
/**
* 检查是否支持指定的供应商类型
*
* @param providerType 供应商类型(如 "cosyvoice", "siliconflow"
* @param providerType 供应商类型(如 "siliconflow"
* @return true 如果支持false 否则
*/
boolean supports(String providerType);
@@ -49,7 +49,7 @@ public interface VoiceCloneProvider {
/**
* 获取供应商类型标识
*
* @return 供应商类型,如 "cosyvoice", "siliconflow"
* @return 供应商类型,如 "siliconflow"
*/
String getProviderType();
}

View File

@@ -75,7 +75,7 @@ public class VoiceCloneProviderFactory {
/**
* 根据类型获取 Provider
*
* @param providerType 供应商类型(如 "cosyvoice", "siliconflow"
* @param providerType 供应商类型(如 "siliconflow"
* @return 对应的 Provider 实例
* @throws ServiceException 当 Provider 不存在时抛出
*/

View File

@@ -1,36 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice voice-cloning (voice enrollment) request.
 */
@Data
public class CosyVoiceCloneRequest {

    /**
     * Target cloning model (e.g. {@code cosyvoice-v3-flash}).
     */
    private String targetModel;

    /**
     * Custom voice prefix (digits and lowercase letters only, length < 10 characters).
     */
    private String prefix;

    /**
     * Publicly reachable URL of the audio file to clone from.
     */
    private String url;

    /**
     * Sample rate; defaults to 24000 when unset.
     */
    private Integer sampleRate;

    /**
     * Audio format; defaults to wav when unset.
     */
    private String audioFormat;
}

View File

@@ -1,21 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice voice-cloning result.
 */
@Data
public class CosyVoiceCloneResult {

    /**
     * Generated voice_id of the enrolled voice.
     */
    private String voiceId;

    /**
     * Upstream request id (for tracing).
     */
    private String requestId;
}

View File

@@ -1,69 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * CosyVoice TTS request.
 */
@Data
@Builder
public class CosyVoiceTtsRequest {

    /**
     * Text to synthesize.
     */
    private String text;

    /**
     * Voice id; optional — configuration default applies when unset.
     */
    private String voiceId;

    /**
     * Audio file URL for URL-based (clone) synthesis; used instead of voiceId.
     */
    private String fileUrl;

    /**
     * Reference transcript paired with fileUrl, used to improve clone quality.
     */
    private String referenceText;

    /**
     * Model name (defaults to cosyvoice-v3-flash).
     */
    private String model;

    /**
     * Speech rate.
     */
    private Float speechRate;

    /**
     * Volume; optional.
     */
    private Float volume;

    /**
     * Style/voice instruction; optional.
     */
    private String instruction;

    /**
     * Sample rate.
     */
    private Integer sampleRate;

    /**
     * Audio format.
     */
    private String audioFormat;

    /**
     * Whether this is a preview-only request, so the service side can rate-limit it.
     */
    private boolean preview;
}

View File

@@ -1,37 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice TTS response.
 */
@Data
public class CosyVoiceTtsResult {

    /**
     * Upstream request id (for tracing).
     */
    private String requestId;

    /**
     * Format of the returned audio.
     */
    private String format;

    /**
     * Sample rate.
     */
    private Integer sampleRate;

    /**
     * Raw audio bytes.
     */
    private byte[] audio;

    /**
     * voiceId used for this synthesis.
     */
    private String voiceId;
}

View File

@@ -0,0 +1,27 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * SiliconFlow reference-audio configuration.
 *
 * <p>Used in dynamic-voice mode: passed via {@code references} to achieve
 * on-the-fly voice cloning.
 *
 * @author 芋道源码
 */
@Data
@Builder
public class SiliconFlowReference {

    /**
     * Reference audio URL (base64 content is also accepted).
     */
    private String audio;

    /**
     * Transcript of the reference audio.
     */
    private String text;
}

View File

@@ -4,6 +4,8 @@ import com.fasterxml.jackson.annotation.JsonProperty;
import lombok.Builder;
import lombok.Data;
import java.util.List;
/**
* 硅基流动文本转语音请求
*
@@ -42,4 +44,18 @@ public class SiliconFlowTtsRequest {
@JsonProperty("response_format")
private String responseFormat;
/**
* 音量增益(-10 到 10默认 0
*
* <p>正值增加音量,负值降低音量
*/
private Float gain;
/**
* 参考音频列表(用于用户动态音色模式)
*
* <p>当 voice 为空时,使用此字段传递参考音频实现实时语音克隆
*/
private List<SiliconFlowReference> references;
}

View File

@@ -16,7 +16,6 @@ public class VoiceCloneRequest {
/**
* 音频文件公网URL
*
* <p>CosyVoice: 对应 {@code url} 字段</p>
* <p>SiliconFlow: 对应 {@code audio} 字段需base64编码</p>
*/
private String audioUrl;
@@ -24,7 +23,6 @@ public class VoiceCloneRequest {
/**
* 模型名称
*
* <p>CosyVoice: 对应 {@code targetModel},如 {@code cosyvoice-v3-flash}</p>
* <p>SiliconFlow: 对应 {@code model},如 {@code indextts-2}</p>
*/
private String model;
@@ -32,7 +30,6 @@ public class VoiceCloneRequest {
/**
* 音色自定义前缀(可选)
*
* <p>CosyVoice: 必填,仅允许数字和小写字母,长度<10字符</p>
* <p>SiliconFlow: 不适用</p>
*/
private String prefix;
@@ -53,7 +50,6 @@ public class VoiceCloneRequest {
* 转录文本(可选)
*
* <p>SiliconFlow: 音频对应的文本内容</p>
* <p>CosyVoice: 不适用</p>
*/
private String transcriptionText;
}

View File

@@ -1,68 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.config;
import lombok.Data;
import lombok.EqualsAndHashCode;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.time.Duration;
/**
 * CosyVoice provider configuration.
 *
 * <p>Extends the common provider config with CosyVoice-specific fields.
 *
 * @author 芋道源码
 */
@Data
@EqualsAndHashCode(callSuper = true)
@Component
@ConfigurationProperties(prefix = "yudao.voice.cosyvoice")
public class CosyVoiceProviderConfig extends VoiceProviderProperties.ProviderConfig {

    /**
     * Default model.
     */
    private String defaultModel = "cosyvoice-v3-flash";

    /**
     * Default voiceId; optional.
     */
    private String defaultVoiceId;

    /**
     * Default sample rate.
     */
    private Integer sampleRate = 24000;

    /**
     * Default audio format.
     */
    private String audioFormat = "mp3";

    /**
     * Default sample text for previews.
     */
    private String previewText = "您好,欢迎体验专属音色。";

    /**
     * TTS endpoint URL.
     */
    private String ttsUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis";

    /**
     * Voice enrollment (cloning) endpoint URL.
     */
    private String voiceEnrollmentUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/voice-enrollment";

    /**
     * Connect timeout.
     */
    private Duration connectTimeout = Duration.ofSeconds(10);

    /**
     * Read timeout (3 minutes, to improve synthesis success rate).
     */
    private Duration readTimeout = Duration.ofSeconds(180);
}

View File

@@ -1,8 +1,5 @@
package cn.iocoder.yudao.module.tik.voice.config;
import lombok.extern.slf4j.Slf4j;
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
/**
@@ -11,25 +8,6 @@ import org.springframework.context.annotation.Configuration;
* @author 芋道源码
*/
@Configuration
@Slf4j
public class VoiceAutoConfiguration {
/**
* CosyVoice 供应商配置 Bean
*/
@Bean
@ConditionalOnProperty(prefix = "yudao.voice.providers.cosyvoice", name = "enabled", havingValue = "true", matchIfMissing = true)
public CosyVoiceProviderConfig cosyVoiceProviderConfig(VoiceProviderProperties properties) {
VoiceProviderProperties.ProviderConfig baseConfig = properties.getProviderConfig("cosyvoice");
if (baseConfig == null) {
baseConfig = new VoiceProviderProperties.ProviderConfig();
}
CosyVoiceProviderConfig config = new CosyVoiceProviderConfig();
config.setEnabled(baseConfig.isEnabled());
config.setApiKey(baseConfig.getApiKey());
config.setPriority(baseConfig.getPriority());
return config;
}
}

View File

@@ -22,14 +22,14 @@ public class VoiceProviderProperties {
/**
* 默认供应商类型
*
* <p>可选值: cosyvoice, siliconflow 等
* <p>可选值: siliconflow 等
*/
private String defaultProvider = "cosyvoice";
private String defaultProvider = "siliconflow";
/**
* 各供应商配置
*
* <p>key 为供应商类型(如 cosyvoice, siliconflow
* <p>key 为供应商类型(如 siliconflow
*/
private Map<String, ProviderConfig> providers = new HashMap<>();

View File

@@ -80,7 +80,7 @@ public class AppTikUserVoiceController {
}
@PostMapping("/tts")
@Operation(summary = "CosyVoice 文本转语音")
@Operation(summary = "文本转语音")
public CommonResult<AppTikVoiceTtsRespVO> synthesizeVoice(@Valid @RequestBody AppTikVoiceTtsReqVO reqVO) {
return success(voiceService.synthesizeVoice(reqVO));
}

View File

@@ -55,9 +55,13 @@ public class TikDigitalHumanTaskDO extends TenantBaseDO {
// ========== TTS参数 ==========
/**
* 音色IDCosyVoice voiceId
* 音色ID系统音色使用
*/
private String voiceId;
/**
* 用户配音IDtik_user_voice.id用户配音使用
*/
private Long voiceConfigId;
/**
* 输入文本(用于语音合成)
*/

View File

@@ -54,10 +54,6 @@ public class TikUserVoiceDO extends TenantBaseDO {
* 备注信息
*/
private String note;
/**
* 复刻音色IDCosyVoice 语音复刻生成的 voice_id
*/
private String voiceId;
}

View File

@@ -1,38 +0,0 @@
package cn.iocoder.yudao.module.tik.voice.enums;
import cn.hutool.core.util.StrUtil;
import lombok.AllArgsConstructor;
import lombok.Getter;
/**
 * CosyVoice emotion enum.
 *
 * <p>Values follow the Alibaba Cloud DashScope TTS documentation:
 * https://help.aliyun.com/zh/dashscope/developer-reference/tts-api
 */
public enum CosyVoiceEmotionEnum {

    NEUTRAL("neutral", "中性"),
    HAPPY("happy", "高兴"),
    SAD("sad", "悲伤"),
    ANGRY("angry", "愤怒"),
    SURPRISED("surprised", "惊讶"),
    DISGUSTED("disgusted", "厌恶"),
    SCARED("scared", "害怕");

    private final String code;
    private final String description;

    CosyVoiceEmotionEnum(String code, String description) {
        this.code = code;
        this.description = description;
    }

    public String getCode() {
        return code;
    }

    public String getDescription() {
        return description;
    }

    /**
     * Resolves an emotion by its code, ignoring case.
     * Blank (null / whitespace-only) or unknown codes map to {@link #NEUTRAL}.
     */
    public static CosyVoiceEmotionEnum getByCode(String code) {
        if (code == null || code.trim().isEmpty()) {
            return NEUTRAL;
        }
        for (CosyVoiceEmotionEnum candidate : values()) {
            if (candidate.code.equalsIgnoreCase(code)) {
                return candidate;
            }
        }
        return NEUTRAL;
    }
}

View File

@@ -390,8 +390,9 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
throw ServiceExceptionUtil.exception(ErrorCodeConstants.GENERAL_FORBIDDEN, "无权访问该音色");
}
if (StrUtil.isBlank(userVoice.getVoiceId())) {
throw new IllegalArgumentException("该音色配置无效缺少voiceId");
// 验证识别文本是否存在(用于动态音色模式)
if (StrUtil.isBlank(userVoice.getTranscription())) {
throw new IllegalArgumentException("该音色配置无效,请先进行语音识别");
}
}
@@ -399,14 +400,8 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
* 创建任务记录
*/
private TikDigitalHumanTaskDO createTaskRecord(AppTikDigitalHumanCreateReqVO reqVO, Long userId) {
// 如果是用户音色,需要从voiceConfigId获取voiceId
// 直接使用前端传递的 voiceId系统预置音色用户音色通过 voiceConfigId 在合成时处理
String voiceId = reqVO.getVoiceId();
if (voiceId == null && reqVO.getVoiceConfigId() != null) {
TikUserVoiceDO userVoice = userVoiceMapper.selectById(reqVO.getVoiceConfigId());
if (userVoice != null) {
voiceId = userVoice.getVoiceId();
}
}
// ✅ 预生成音频信息(无需存储时长,前端严格校验)
if (reqVO.getPreGeneratedAudio() != null) {
@@ -427,6 +422,7 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
.videoFileId(reqVO.getVideoFileId())
.videoUrl(reqVO.getVideoUrl())
.voiceId(voiceId)
.voiceConfigId(reqVO.getVoiceConfigId())
.inputText(reqVO.getInputText())
.speechRate(reqVO.getSpeechRate() != null ? reqVO.getSpeechRate() : 1.0f)
.volume(reqVO.getVolume() != null ? reqVO.getVolume() : 0f)
@@ -550,7 +546,7 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
}
/**
* 语音合成使用CosyVoice v3 Flash
* 语音合成
*/
private String synthesizeVoice(TikDigitalHumanTaskDO task) throws Exception {
// ✅ 优先使用预生成的音频(前端传递)
@@ -561,21 +557,25 @@ public class DigitalHumanTaskServiceImpl implements DigitalHumanTaskService {
}
// 如果没有预生成音频则走正常的TTS流程
// 参数验证
if (StrUtil.isBlank(task.getVoiceId())) {
throw new Exception("音色ID不能为空");
// 参数验证voiceId系统音色和 voiceConfigId用户配音二选一
boolean hasVoiceId = StrUtil.isNotBlank(task.getVoiceId());
boolean hasVoiceConfigId = task.getVoiceConfigId() != null;
if (!hasVoiceId && !hasVoiceConfigId) {
throw new Exception("音色ID不能为空需提供voiceId或voiceConfigId");
}
if (StrUtil.isBlank(task.getInputText())) {
throw new Exception("输入文本不能为空");
}
log.info("[synthesizeVoice][任务({})开始语音合成][voiceId={}, textLength={}]",
task.getId(), task.getVoiceId(), task.getInputText().length());
log.info("[synthesizeVoice][任务({})开始语音合成][voiceId={}, voiceConfigId={}, textLength={}]",
task.getId(), task.getVoiceId(), task.getVoiceConfigId(), task.getInputText().length());
// 构建TTS请求参数
AppTikVoiceTtsReqVO ttsReqVO = new AppTikVoiceTtsReqVO();
ttsReqVO.setInputText(task.getInputText());
ttsReqVO.setVoiceId(task.getVoiceId());
ttsReqVO.setVoiceId(task.getVoiceId()); // 系统音色
ttsReqVO.setVoiceConfigId(task.getVoiceConfigId()); // 用户配音
ttsReqVO.setSpeechRate(task.getSpeechRate() != null ? task.getSpeechRate() : 1.0f);
ttsReqVO.setVolume(task.getVolume() != null ? task.getVolume() : 0f);
ttsReqVO.setInstruction(task.getInstruction());

View File

@@ -63,7 +63,7 @@ public interface TikUserVoiceService {
void transcribeVoice(Long id);
/**
* CosyVoice 文本转语音
* 文本转语音
*/
AppTikVoiceTtsRespVO synthesizeVoice(AppTikVoiceTtsReqVO reqVO);

View File

@@ -22,8 +22,6 @@ import cn.iocoder.yudao.module.tik.tikhup.service.TikHupService;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.tik.voice.client.VoiceCloneProvider;
import cn.iocoder.yudao.module.tik.voice.client.VoiceCloneProviderFactory;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceCloneResult;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.VoiceTtsResult;
import cn.iocoder.yudao.module.tik.muye.aimodelconfig.dal.AiModelConfigDO;
@@ -89,9 +87,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
@Resource
private VoiceCloneProviderFactory voiceProviderFactory;
@Resource
private cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig cosyVoiceProviderConfig;
@Resource
private StringRedisTemplate stringRedisTemplate;
@@ -102,22 +97,16 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
private static final long PREVIEW_CACHE_TTL_SECONDS = 3600;
private static final long SYNTH_CACHE_TTL_SECONDS = 24 * 3600;
/** 供应商类型常量 */
private static final String PROVIDER_COSYVOICE = "cosyvoice";
private static final String PROVIDER_SILICONFLOW = "siliconflow";
/** 模型常量 */
private static final String MODEL_COSYVOICE = "cosyvoice-v3-flash";
private static final String MODEL_SILICONFLOW = "IndexTeam/IndexTTS-2";
/** 积分平台和类型常量 */
private static final String PLATFORM_VOICE = "voice";
private static final String MODEL_CODE_TTS = "tts";
private static final String MODEL_CODE_CLONE = "clone";
@Resource
private PointsService pointsService;
/** SiliconFlow 参考音频最大大小5MB */
private static final int MAX_REFERENCE_AUDIO_SIZE = 5 * 1024 * 1024;
@Override
@Transactional(rollbackFor = Exception.class)
public Long createVoice(AppTikUserVoiceCreateReqVO createReqVO) {
@@ -128,7 +117,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 校验文件大小SiliconFlow API 限制参考音频不超过 5MB
if (fileDO.getSize() != null && fileDO.getSize() > MAX_REFERENCE_AUDIO_SIZE) {
double sizeMB = fileDO.getSize() / (1024.0 * 1024.0);
throw exception(VOICE_FILE_NOT_EXISTS,
String.format("音频文件过大(%.1fMB请上传小于5MB的音频文件", sizeMB));
}
// 验证文件分类是否为voice通过tik_user_file表查询
TikUserFileDO userFile = userFileMapper.selectOne(new LambdaQueryWrapperX<TikUserFileDO>()
.eq(TikUserFileDO::getFileId, createReqVO.getFileId())
@@ -158,51 +154,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.setTranscription(createReqVO.getText()); // 使用前端传入的文本
voiceMapper.insert(voice);
// 4. 调用语音克隆服务,生成 voice_id
if (StrUtil.isNotBlank(createReqVO.getText())) {
try {
// 4.1 获取积分配置并预检
AiModelConfigDO config = pointsService.getConfig(PLATFORM_VOICE, MODEL_CODE_CLONE);
pointsService.checkPoints(userId.toString(), config.getConsumePoints());
log.info("[createVoice][开始语音复刻,配音编号({})文件ID({}),供应商({})]",
voice.getId(), fileDO.getId(), createReqVO.getProviderType());
String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
VoiceCloneProvider provider = voiceProviderFactory.getProvider(createReqVO.getProviderType());
String providerType = getProviderType(createReqVO.getProviderType(), provider);
String model = getModelByProvider(providerType);
VoiceCloneRequest cloneRequest = new VoiceCloneRequest();
cloneRequest.setAudioUrl(fileAccessUrl);
cloneRequest.setModel(model);
cloneRequest.setPrefix("voice" + voice.getId());
cloneRequest.setTranscriptionText(createReqVO.getText()); // 使用前端传入的文本
VoiceCloneResult cloneResult = provider.cloneVoice(cloneRequest);
String voiceId = cloneResult.getVoiceId();
voice.setVoiceId(voiceId);
voiceMapper.updateById(voice);
// 4.2 音色克隆成功,扣减积分
try {
pointsService.deductPoints(userId.toString(), config.getConsumePoints(), "voice_clone", voice.getId().toString());
log.info("[createVoice][用户 {} 扣减 {} 积分(音色克隆)]", userId, config.getConsumePoints());
} catch (Exception e) {
log.error("[createVoice][积分扣减失败: {}]", e.getMessage());
}
log.info("[createVoice][语音复刻成功,配音编号({})voice_id({})]", voice.getId(), voiceId);
} catch (Exception e) {
log.error("[createVoice][语音复刻失败,配音编号({}),错误信息: {}]", voice.getId(), e.getMessage(), e);
// 失败不扣费
}
} else {
log.info("[createVoice][未提供文本,跳过语音复刻,配音编号({})]", voice.getId());
}
log.info("[createVoice][用户({})创建配音成功,配音编号({})]", userId, voice.getId());
return voice.getId();
}
@@ -300,10 +251,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.collect(Collectors.toList());
if (CollUtil.isNotEmpty(fileIds)) {
List<FileDO> files = fileMapper.selectBatchIds(fileIds);
Map<Long, FileDO> tempFileMap = files.stream()
.collect(Collectors.toMap(FileDO::getId, file -> file));
fileMap.putAll(tempFileMap);
fileMapper.selectBatchIds(fileIds).forEach(file -> fileMap.put(file.getId(), file));
}
}
@@ -412,26 +360,18 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
}
// 优先使用复刻的 voice_id如果不存在则使用文件URL兼容旧数据
if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[synthesizeVoice][使用复刻音色ID合成配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId();
// 注意:使用 voiceId 时,不依赖 transcriptionText直接使用前端传入的 inputText
transcriptionText = null; // 清除 transcriptionText让 determineSynthesisText 只使用 inputText
} else {
log.info("[synthesizeVoice][使用文件URL合成配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用动态音色模式fileUrl + transcriptionText
log.info("[synthesizeVoice][使用动态音色模式,配音编号({})]", voiceConfigId);
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
// 使用文件URL和识别文本进行合成
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
transcriptionText = voice.getTranscription();
if (StrUtil.isBlank(transcriptionText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
}
// 2. 如果没有配置ID使用voiceId或fileUrl系统音色或直接URL方式
@@ -555,21 +495,17 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
throw exception(VOICE_NOT_EXISTS, "配音不存在");
}
voiceId = voice.getVoiceId();
if (StrUtil.isNotBlank(voiceId)) {
fileUrl = null;
referenceText = null;
} else {
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
referenceText = voice.getTranscription();
if (StrUtil.isBlank(referenceText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
// 使用动态音色模式
FileDO fileDO = fileMapper.selectById(voice.getFileId());
if (fileDO == null) {
throw exception(VOICE_FILE_NOT_EXISTS);
}
fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
referenceText = voice.getTranscription();
if (StrUtil.isBlank(referenceText)) {
throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别");
}
voiceId = null;
}
// 3. 系统配音
else {
@@ -623,21 +559,10 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
return buildPreviewResp(audioBase64, format, voiceId);
}
/**
* 获取 CosyVoice 配置
*/
private cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProviderConfig getCosyVoiceConfig() {
return cosyVoiceProviderConfig;
}
/**
* 获取默认音频格式
*/
private String getDefaultFormat() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getAudioFormat();
}
return "mp3";
}
@@ -645,10 +570,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取默认采样率
*/
private Integer getDefaultSampleRate() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getSampleRate();
}
return 24000;
}
@@ -664,16 +585,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
}
private String resolveContentType(String format) {
if ("wav".equalsIgnoreCase(format)) {
return "audio/wav";
}
if ("mp3".equalsIgnoreCase(format)) {
if (format == null) {
return "audio/mpeg";
}
if ("flac".equalsIgnoreCase(format)) {
return "audio/flac";
}
return "audio/mpeg";
return switch (format.toLowerCase()) {
case "wav" -> "audio/wav";
case "flac" -> "audio/flac";
default -> "audio/mpeg";
};
}
private String determineSynthesisText(String transcriptionText, String inputText, boolean allowFallback) {
@@ -828,74 +747,25 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
return respVO;
}
@lombok.Data
@lombok.NoArgsConstructor
@lombok.AllArgsConstructor
private static class PreviewCacheEntry {
private String audioBase64;
private String format;
private Integer sampleRate;
private String requestId;
public PreviewCacheEntry() {}
public PreviewCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId) {
this.audioBase64 = audioBase64;
this.format = format;
this.sampleRate = sampleRate;
this.requestId = requestId;
}
public String getAudioBase64() {
return audioBase64;
}
public String getFormat() {
return format;
}
public Integer getSampleRate() {
return sampleRate;
}
public String getRequestId() {
return requestId;
}
}
@lombok.Data
@lombok.NoArgsConstructor
@lombok.AllArgsConstructor
private static class SynthCacheEntry {
private String audioBase64;
private String format;
private Integer sampleRate;
private String requestId;
private String voiceId;
public SynthCacheEntry() {}
public SynthCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId, String voiceId) {
this.audioBase64 = audioBase64;
this.format = format;
this.sampleRate = sampleRate;
this.requestId = requestId;
this.voiceId = voiceId;
}
public String getAudioBase64() {
return audioBase64;
}
public String getFormat() {
return format;
}
public Integer getSampleRate() {
return sampleRate;
}
public String getRequestId() {
return requestId;
}
public String getVoiceId() {
return voiceId;
}
}
/**
@@ -1116,10 +986,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取默认音色ID
*/
private String getDefaultVoiceId() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getDefaultVoiceId();
}
return null;
}
@@ -1127,32 +993,8 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
* 获取试听文本
*/
private String getPreviewText() {
var config = getCosyVoiceConfig();
if (config != null) {
return config.getPreviewText();
}
return "您好,欢迎体验专属音色。";
}
/**
* 获取供应商类型
*/
private String getProviderType(String requestProviderType, VoiceCloneProvider provider) {
if (StrUtil.isNotBlank(requestProviderType)) {
return requestProviderType;
}
return provider.getProviderType();
}
/**
* 根据供应商类型获取对应的模型
*/
private String getModelByProvider(String providerType) {
if (PROVIDER_SILICONFLOW.equalsIgnoreCase(providerType)) {
return MODEL_SILICONFLOW;
}
return MODEL_COSYVOICE; // 默认使用 CosyVoice 模型
}
}

View File

@@ -35,7 +35,7 @@ public class AppTikDigitalHumanCreateReqVO {
@Size(max = 1024, message = "视频URL不能超过1024个字符")
private String videoUrl;
@Schema(description = "音色IDCosyVoice voiceId系统音色使用)", example = "cosyvoice-v3-flash-sys-xxx")
@Schema(description = "音色ID系统音色使用", example = "alex")
private String voiceId;
@Schema(description = "用户音色配置IDtik_user_voice.id用户音色使用", example = "123")

View File

@@ -37,7 +37,7 @@ public class AppTikDigitalHumanRespVO {
@Schema(description = "配音配置ID", example = "789")
private Long voiceConfigId;
@Schema(description = "voice_id", example = "cosyvoice-v3-flash-xxx")
@Schema(description = "voice_id", example = "voice-xxx")
private String voiceId;
@Schema(description = "语速", example = "1.0")

View File

@@ -39,7 +39,7 @@ public class AppTikUserVoiceCreateReqVO {
@Size(max = 4000, message = "音频文本不能超过 4000 个字符")
private String text;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -38,9 +38,6 @@ public class AppTikUserVoiceRespVO {
@Schema(description = "备注", example = "这是一个测试配音")
private String note;
@Schema(description = "复刻音色IDCosyVoice 语音复刻生成的 voice_id")
private String voiceId;
@Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
private LocalDateTime createTime;

View File

@@ -14,7 +14,7 @@ public class AppTikVoicePreviewReqVO {
@Schema(description = "配音编号tik_user_voice.id用户配音必传系统配音可不传")
private Long voiceConfigId;
@Schema(description = "CosyVoice音色ID系统配音必传用户配音可不传")
@Schema(description = "音色ID系统配音必传用户配音可不传")
private String voiceId;
@Schema(description = "语音文件URL当使用语音URL合成时必传替代voiceId")
@@ -43,7 +43,7 @@ public class AppTikVoicePreviewReqVO {
@Schema(description = "指令(用于控制音色风格)", example = "请用温柔专业的语调朗读")
private String instruction;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -20,7 +20,7 @@ public class AppTikVoicePreviewRespVO {
@Schema(description = "采样率", example = "24000")
private Integer sampleRate;
@Schema(description = "CosyVoice 请求ID")
@Schema(description = "请求ID")
private String requestId;
@Schema(description = "使用的音色 ID")

View File

@@ -21,13 +21,13 @@ public class AppTikVoiceTtsReqVO {
@Size(max = 4000, message = "识别文本不能超过 4000 个字符")
private String transcriptionText;
@Schema(description = "音色 IDCosyVoice voiceId", example = "cosyvoice-v3-flash-myvoice-xxx")
@Schema(description = "音色 ID系统音色", example = "alex")
private String voiceId;
@Schema(description = "音色源音频 OSS 地址(当没有 voiceId 时必传)")
private String fileUrl;
@Schema(description = "模型名称,默认 cosyvoice-v3-flash", example = "cosyvoice-v3-flash")
@Schema(description = "模型名称", example = "IndexTeam/IndexTTS-2")
private String model;
@Schema(description = "语速,默认 1.0", example = "1.0")
@@ -45,7 +45,7 @@ public class AppTikVoiceTtsReqVO {
@Schema(description = "音频格式,默认 wav可选 mp3")
private String audioFormat;
@Schema(description = "供应商类型:cosyvoice-阿里云,siliconflow-硅基流动(不传则使用默认)", example = "cosyvoice")
@Schema(description = "供应商类型siliconflow-硅基流动(不传则使用默认)", example = "siliconflow")
private String providerType;
}

View File

@@ -4,7 +4,7 @@ import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
@Data
@Schema(description = "CosyVoice 文本转语音响应")
@Schema(description = "文本转语音响应")
public class AppTikVoiceTtsRespVO {
@Schema(description = "用户文件编号", example = "1024")
@@ -23,7 +23,7 @@ public class AppTikVoiceTtsRespVO {
@Schema(description = "采样率", example = "24000")
private Integer sampleRate;
@Schema(description = "CosyVoice 请求ID")
@Schema(description = "请求ID")
private String requestId;
@Schema(description = "使用的音色 ID")