From 75abf48bc1c18f89d6bc02c29e03deaa7686d266 Mon Sep 17 00:00:00 2001
From: sion123 <450702724@qq.com>
Date: Wed, 19 Nov 2025 21:57:16 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E5=8A=9F=E8=83=BD=E4=BC=98=E5=8C=96?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/cosyvoice-copy.md                        | 129 +++++++++
 docs/cosyvoice-generate.md                    | 271 ++++++++++++++++++
 docs/cosyvoice-ram-config.md                  | 156 ++++++++++
 .../app/web-gold/src/views/dh/VoiceCopy.vue   | 118 +-------
 .../tik/voice/client/CosyVoiceClient.java     | 137 ++++++++-
 .../client/dto/CosyVoiceCloneRequest.java     |  36 +++
 .../client/dto/CosyVoiceCloneResult.java      |  21 ++
 .../tik/voice/config/CosyVoiceProperties.java |   5 +
 .../voice/dal/dataobject/TikUserVoiceDO.java  |   4 +
 .../service/TikUserVoiceServiceImpl.java      | 102 ++++---
 .../tik/voice/vo/AppTikUserVoiceRespVO.java   |   3 +
 11 files changed, 818 insertions(+), 164 deletions(-)
 create mode 100644 docs/cosyvoice-copy.md
 create mode 100644 docs/cosyvoice-generate.md
 create mode 100644 docs/cosyvoice-ram-config.md
 create mode 100644 yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneRequest.java
 create mode 100644 yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneResult.java

diff --git a/docs/cosyvoice-copy.md b/docs/cosyvoice-copy.md
new file mode 100644
index 0000000000..898d996d39
--- /dev/null
+++ b/docs/cosyvoice-copy.md
@@ -0,0 +1,129 @@
+# 阿里云CosyVoice声音复刻API说明文档
+## 一、接口概述
+CosyVoice声音复刻API依托大模型技术，仅需10~20秒清晰音频，即可快速生成高度拟真的定制音色（`voice_id`），支持`cosyvoice-v1`和`cosyvoice-v2`模型（v2效果更优）。复刻服务免费，使用复刻音色进行语音合成时按字符计费。
+
+核心功能：音色的创建、查询、更新、删除，生成的`voice_id`可直接用于CosyVoice语音合成接口。
+
+## 二、前提条件
+1. 开通CosyVoice服务，获取API Key（推荐配置到环境变量，避免硬编码）。
+2. 安装最新版DashScope SDK（Java/Python），其他语言需调用RESTful API。
+3. 准备公网可访问的音频URL（推荐上传至阿里云OSS，支持WAV/MP3/M4A格式）。
+
+## 三、核心接口详情（支持Java/Python SDK + RESTful API）
+### 1. 创建音色（生成voice_id）
+#### 功能描述
+上传10~20秒音频，生成专属`voice_id`，用于后续语音合成。
+#### 请求参数
+| 参数名       | 类型   | 是否必填 | 说明                                                                 |
+|--------------|--------|----------|----------------------------------------------------------------------|
+| target_model | string | 是       | 复刻模型，支持`cosyvoice-v1`/`cosyvoice-v2`                          |
+| prefix       | string | 是       | 音色自定义前缀，仅允许数字和小写字母，长度<10字符                     |
+| url          | string | 是       | 音频文件公网URL，需满足格式要求（采样率≥16kHz、文件≤10MB、含≥5秒连续语音） |
+#### 响应参数
+| 参数名   | 类型   | 说明                     |
+|----------|--------|--------------------------|
+| voice_id | string | 定制音色ID，用于语音合成 |
+| request_id | string | 任务唯一标识，用于排查问题 |
+#### 示例代码（Python SDK）
+```python
+import os
+import dashscope
+from dashscope.audio.tts_v2 import VoiceEnrollmentService
+
+dashscope.api_key = os.getenv('DASHSCOPE_API_KEY')
+service = VoiceEnrollmentService()
+# 调用创建接口
+voice_id = service.create_voice(target_model="cosyvoice-v2", prefix="test", url="音频公网URL")
+print(f"生成的voice_id: {voice_id}")
+```
+
+### 2. 查询所有音色
+#### 功能描述
+查询账号下已创建的所有音色，支持按前缀筛选和分页。
+#### 请求参数
+| 参数名     | 类型   | 是否必填 | 说明                          |
+|------------|--------|----------|-------------------------------|
+| prefix     | string | 否       | 音色前缀，为空则返回所有音色  |
+| page_index | int    | 否       | 页码索引，默认0               |
+| page_size  | int    | 否       | 每页条数，默认10              |
+#### 响应参数
+| 参数名       | 类型   | 说明                                                                 |
+|--------------|--------|----------------------------------------------------------------------|
+| voice_list   | array  | 音色列表，含每个音色的`voice_id`、创建时间（gmt_create）、状态（status） |
+| status       | string | 音色状态：DEPLOYING（审核中）/OK（可用）/UNDEPLOYED（审核失败）        |
+| request_id   | string | 任务唯一标识                                                         |
+
+### 3. 查询指定音色
+#### 功能描述
+查询单个`voice_id`的详细信息（状态、原始音频URL等）。
+#### 请求参数
+| 参数名   | 类型   | 是否必填 | 说明               |
+|----------|--------|----------|--------------------|
+| voice_id | string | 是       | 需查询的音色ID     |
+#### 响应参数
+| 参数名         | 类型   | 说明                                                                 |
+|----------------|--------|----------------------------------------------------------------------|
+| voice_id       | string | 音色ID                                                               |
+| resource_link  | string | 复刻所用音频的公网URL                                                |
+| target_model   | string | 复刻时使用的模型                                                     |
+| status         | string | 音色状态（DEPLOYING/OK/UNDEPLOYED）                                  |
+| gmt_create     | string | 音色创建时间                                                         |
+
+### 4. 更新音色
+#### 功能描述
+使用新的音频URL更新已有`voice_id`的音色。
+#### 请求参数
+| 参数名   | 类型   | 是否必填 | 说明                                                                 |
+|----------|--------|----------|----------------------------------------------------------------------|
+| voice_id | string | 是       | 需更新的音色ID                                                       |
+| url      | string | 是       | 新的音频公网URL（需满足格式要求）                                     |
+#### 响应参数
+| 参数名     | 类型   | 说明               |
+|------------|--------|--------------------|
+| request_id | string | 任务唯一标识       |
+
+### 5. 删除音色
+#### 功能描述
+删除无需使用的`voice_id`，释放配额（账号最多保留1000个音色）。
+#### 请求参数
+| 参数名   | 类型   | 是否必填 | 说明               |
+|----------|--------|----------|--------------------|
+| voice_id | string | 是       | 需删除的音色ID     |
+#### 响应参数
+| 参数名     | 类型   | 说明               |
+|------------|--------|--------------------|
+| request_id | string | 任务唯一标识       |
+
+## 四、音频文件要求
+1. 格式：支持WAV（16bit）、MP3、M4A。
+2. 采样率：≥16000Hz。
+3. 时长：推荐10~20秒（最长不超过60秒），含至少一段≥5秒的连续语音。
+4. 大小：≤10MB。
+5. 质量：语音清晰、无杂音，朗读连贯。
+
+## 五、使用流程（复刻→合成）
+1. 调用「创建音色」接口，传入音频URL，获取`voice_id`。
+2. 调用CosyVoice语音合成接口，将`voice_id`作为`voice`参数传入，即可使用定制音色合成语音。
+3. （可选）通过「查询指定音色」接口确认`status`为`OK`后再使用。
+
+## 六、关键限制
+1. 配额限制：每个主账号最多保留1000个复刻音色，删除后释放配额。
+2. 并发限制：复刻接口总并发≤10 RPS（v1+v2合计），语音合成接口并发≤3 RPS。
+3. 模型匹配：v1版本`voice_id`仅用于v1合成，v2版本`voice_id`仅用于v2合成，不可混用。
+4. 有效期：超过1年未使用的音色将自动下线。
+
+## 七、常见错误码及解决方案
+| 错误码                  | 说明                                  | 解决方案                                                         |
+|-------------------------|---------------------------------------|------------------------------------------------------------------|
+| Throttling.AllocationQuota | 音色数量达限额                        | 删除无用音色或提交工单申请扩容                                   |
+| Audio.AudioShortError   | 音频有效时长过短                      | 重新录制10~20秒连续语音                                          |
+| InvalidApiKey           | API Key无效                           | 检查API Key是否正确，无多余空格或缺失字符                         |
+| Model.AccessDenied      | 模型访问权限不足                      | 使用“默认业务空间”下的API Key调用                                 |
+| BadRequest.UnsupportedFileFormat | 音频格式不支持                  | 转换为WAV/MP3/M4A格式，确认文件实际编码与后缀一致                 |
+| Audio.FileSizeExceed    | 音频文件超过10MB                      | 压缩文件大小或截取有效片段                                       |
+
+## 八、注意事项
+1. 版权要求：需对复刻音频的所有权及合法使用权负责，遵守服务协议。
+2. 音频URL：确保公网可访问，推荐使用阿里云OSS生成临时访问链接（避免长期公开泄露）。
+3. 升级建议：v1音色可使用原始音频重新复刻为v2版本，获得更优效果。
+4. 合成调节：使用`voice_id`合成语音时，可通过`volume`（音量）、`speechRate`（语速）等参数调节输出效果。
diff --git a/docs/cosyvoice-generate.md b/docs/cosyvoice-generate.md
new file mode 100644
index 0000000000..18406d9473
--- /dev/null
+++ b/docs/cosyvoice-generate.md
@@ -0,0 +1,271 @@
+# 阿里云CosyVoice Java SDK 调用模板（参数+示例）
+## 一、前提条件
+1. 开通CosyVoice服务，获取API Key（建议配置到环境变量，避免硬编码）。
+2. 安装最新版DashScope SDK（支持2.20.3+版本，SSML功能需此版本及以上）。
+3. 模型与音色需匹配（如v2模型对应v2音色，v3模型对应v3音色）。
+
+## 二、核心参数汇总
+| 参数名       | 类型       | 是否必填 | 默认值                  | 取值范围/说明                                                                 |
+|--------------|------------|----------|-------------------------|------------------------------------------------------------------------------|
+| model        | String     | 是       | -                       | cosyvoice-v1/v2/v3/v3-plus（v3系列需申请邀测）                               |
+| voice        | String     | 是       | -                       | 对应模型的音色（如v2：longxiaochun_v2；v3：longhuohuo_v3，详见文档音色列表） |
+| format       | enum       | 否       | 因音色而异（默认MP3 22050Hz） | 支持WAV/MP3/PCM/OGG_OPUS，如PCM_22050HZ_MONO_16BIT、MP3_24000HZ_MONO_256KBPS |
+| volume       | int        | 否       | 50                      | 0~100（音量大小）                                                            |
+| speechRate   | float      | 否       | 1.0                     | 0.5~2.0（语速，1.0为默认，约4字/秒）                                          |
+| pitchRate    | float      | 否       | 1.0                     | 0.5~2.0（语调）                                                              |
+| bit_rate     | int        | 否       | 32                      | 6~510kbps（仅opus格式支持，v1模型不支持）                                    |
+| seed         | int        | 否       | 0                       | 0~65535（随机数种子，仅v3/v3-plus支持）                                       |
+| style        | int        | 否       | 0                       | ≥0整数（风格调整，仅v3/v3-plus支持）                                          |
+| languageHints| List<String> | 否     | -                       | 仅v3/v3-plus支持，单次配置1个语种（"zh"/"en"）                                |
+| instruction  | String     | 否       | -                       | 仅v3/v3-plus支持，格式："你说话的情感是<情感值>"（如"Happy"/"Angry"）          |
+
+## 三、四种核心调用方式模板
+### 1. 同步调用（阻塞式，适合短文本）
+```java
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.nio.ByteBuffer;
+
+public class SyncCallTemplate {
+    // 配置参数（根据需求修改）
+    private static final String MODEL = "cosyvoice-v3"; // 模型
+    private static final String VOICE = "longhuohuo_v3"; // 音色
+    private static final String TEXT = "今天天气真好，适合出门散步！"; // ≤2000字符
+    private static final String OUTPUT_FILE = "output.mp3"; // 输出文件
+
+    public static void main(String[] args) {
+        // 1. 构建请求参数
+        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
+                // .apiKey("your-api-key") // 未配置环境变量时打开
+                .model(MODEL)
+                .voice(VOICE)
+                .volume(60) // 可选：调整音量
+                .speechRate(1.1f) // 可选：调整语速
+                .build();
+
+        // 2. 初始化合成器（同步调用传null）
+        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
+        ByteBuffer audioData = null;
+
+        try {
+            // 3. 阻塞调用，获取完整音频
+            audioData = synthesizer.call(TEXT);
+            // 4. 保存音频到本地
+            if (audioData != null) {
+                try (FileOutputStream fos = new FileOutputStream(new File(OUTPUT_FILE))) {
+                    fos.write(audioData.array());
+                }
+                System.out.println("合成成功！输出文件：" + OUTPUT_FILE);
+                System.out.println("RequestId：" + synthesizer.getLastRequestId());
+                System.out.println("首包延迟：" + synthesizer.getFirstPackageDelay() + "ms");
+            }
+        } catch (Exception e) {
+            System.err.println("合成失败：" + e.getMessage());
+        } finally {
+            // 5. 关闭WebSocket连接
+            synthesizer.getDuplexApi().close(1000, "任务结束");
+        }
+    }
+}
+```
+
+### 2. 异步调用（非阻塞，短文本实时接收）
+```java
+import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
+import com.alibaba.dashscope.common.ResultCallback;
+import java.util.concurrent.CountDownLatch;
+
+public class AsyncCallTemplate {
+    private static final String MODEL = "cosyvoice-v2";
+    private static final String VOICE = "longxiaochun_v2";
+    private static final String TEXT = "欢迎使用阿里云CosyVoice语音合成服务！"; // ≤2000字符
+
+    public static void main(String[] args) throws InterruptedException {
+        CountDownLatch latch = new CountDownLatch(1);
+
+        // 1. 配置回调（实时接收音频）
+        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
+            @Override
+            public void onEvent(SpeechSynthesisResult result) {
+                // 接收音频分片（可实时播放或写入文件）
+                if (result.getAudioFrame() != null) {
+                    System.out.println("收到音频分片，大小：" + result.getAudioFrame().capacity() + "字节");
+                    // 此处可添加音频处理逻辑（如流式播放、追加写入文件）
+                }
+                // 查看计费字符数（最终以最后一次为准）
+                if (result.getUsage() != null) {
+                    System.out.println("当前计费字符数：" + result.getUsage().getCharacters());
+                }
+            }
+
+            @Override
+            public void onComplete() {
+                System.out.println("合成完成！");
+                latch.countDown();
+            }
+
+            @Override
+            public void onError(Exception e) {
+                System.err.println("合成失败：" + e.getMessage());
+                latch.countDown();
+            }
+        };
+
+        // 2. 构建参数并初始化合成器
+        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
+                // .apiKey("your-api-key")
+                .model(MODEL)
+                .voice(VOICE)
+                .format(com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat.MP3_16000HZ_MONO_128KBPS) // 可选配置格式（本示例未import该枚举，故使用全限定名）
+                .build();
+        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);
+
+        try {
+            // 3. 非阻塞调用
+            synthesizer.call(TEXT);
+            latch.await(); // 等待合成完成
+            System.out.println("RequestId：" + synthesizer.getLastRequestId());
+        } catch (Exception e) {
+            System.err.println("调用异常：" + e.getMessage());
+        } finally {
+            synthesizer.getDuplexApi().close(1000, "任务结束");
+        }
+    }
+}
+```
+
+### 3. 流式调用（分段传文本，适合长文本）
+```java
+import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
+import com.alibaba.dashscope.common.ResultCallback;
+
+public class StreamingCallTemplate {
+    // 分段文本（每段≤2000字符，累计≤20万字符）
+    private static final String[] TEXT_SEGMENTS = {
+        "流式语音合成适合长文本场景，",
+        "可以分段发送文本，",
+        "服务端实时返回音频，",
+        "减少等待时间。"
+    };
+    private static final String MODEL = "cosyvoice-v3";
+    private static final String VOICE = "longchuanshu_v3";
+
+    public static void main(String[] args) {
+        // 1. 配置回调
+        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
+            @Override
+            public void onEvent(SpeechSynthesisResult result) {
+                if (result.getAudioFrame() != null) {
+                    System.out.println("收到流式音频分片");
+                    // 处理音频（如实时播放、写入缓冲文件）
+                }
+            }
+
+            @Override
+            public void onComplete() {
+                System.out.println("流式合成全部完成！");
+            }
+
+            @Override
+            public void onError(Exception e) {
+                System.err.println("流式合成失败：" + e.getMessage());
+            }
+        };
+
+        // 2. 构建参数
+        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
+                // .apiKey("your-api-key")
+                .model(MODEL)
+                .voice(VOICE)
+                .format(SpeechSynthesisAudioFormat.PCM_22050HZ_MONO_16BIT)
+                .speechRate(0.9f)
+                .build();
+        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);
+
+        try {
+            // 3. 分段发送文本（间隔≤23秒）
+            for (String segment : TEXT_SEGMENTS) {
+                synthesizer.streamingCall(segment);
+                Thread.sleep(500); // 模拟文本输入间隔
+            }
+            // 4. 必须调用：结束流式合成（触发剩余文本合成）
+            synthesizer.streamingComplete();
+            System.out.println("RequestId：" + synthesizer.getLastRequestId());
+        } catch (Exception e) {
+            System.err.println("调用异常：" + e.getMessage());
+        } finally {
+            synthesizer.getDuplexApi().close(1000, "任务结束");
+        }
+    }
+}
+```
+
+### 4. Flowable调用（响应式编程，支持流式输入输出）
+```java
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
+import com.alibaba.dashscope.exception.NoApiKeyException;
+import io.reactivex.Flowable;
+import io.reactivex.BackpressureStrategy;
+
+public class FlowableCallTemplate {
+    private static final String MODEL = "cosyvoice-v2";
+    private static final String VOICE = "longyingtian_v2";
+    private static final String[] TEXT_ARRAY = {"响应式编程模式，", "支持流式输入输出，", "适合高并发场景。"};
+
+    public static void main(String[] args) throws NoApiKeyException {
+        // 1. 模拟流式文本输入
+        Flowable<String> textStream = Flowable.create(emitter -> {
+            new Thread(() -> {
+                for (String text : TEXT_ARRAY) {
+                    emitter.onNext(text);
+                    try {
+                        Thread.sleep(800);
+                    } catch (InterruptedException e) {
+                        emitter.onError(e);
+                    }
+                }
+                emitter.onComplete();
+            }).start();
+        }, BackpressureStrategy.BUFFER);
+
+        // 2. 构建参数
+        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
+                // .apiKey("your-api-key")
+                .model(MODEL)
+                .voice(VOICE)
+                .volume(70)
+                .build();
+        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
+
+        try {
+            // 3. 流式调用并处理结果
+            synthesizer.streamingCallAsFlowable(textStream)
+                    .blockingForEach(result -> {
+                        if (result.getAudioFrame() != null) {
+                            System.out.println("Flowable收到音频，大小：" + result.getAudioFrame().capacity() + "字节");
+                            // 处理音频逻辑
+                        }
+                    });
+            System.out.println("Flowable合成完成！RequestId：" + synthesizer.getLastRequestId());
+        } finally {
+            synthesizer.getDuplexApi().close(1000, "任务结束");
+        }
+    }
+}
+```
+
+## 四、核心注意事项
+1. 文本长度限制：非流式单次≤2000字符，流式累计≤20万字符（含SSML标签）。
+2. 字符计算规则：汉字=2字符，英文/数字/标点/空格=1字符。
+3. 流式调用必须调用`streamingComplete()`，否则结尾文本无法合成。
+4. 每次调用`call()`前需重新初始化`SpeechSynthesizer`实例。
+5. 音频格式需与播放器兼容（如MP3/OPUS支持流式播放，推荐使用ffmpeg、AudioFormat等工具）。
+
diff --git a/docs/cosyvoice-ram-config.md b/docs/cosyvoice-ram-config.md
new file mode 100644
index 0000000000..6a520b2865
--- /dev/null
+++ b/docs/cosyvoice-ram-config.md
@@ -0,0 +1,156 @@
+# CosyVoice访问OSS配置指南
+
+## 问题说明
+CosyVoice复刻服务需要访问OSS存储的音频文件，但默认情况下CosyVoice没有访问用户OSS的权限。
+
+## 解决方案：配置RAM权限
+
+### 1. 创建RAM角色
+在阿里云RAM控制台创建角色，允许CosyVoice服务访问OSS：
+
+```json
+{
+  "Version": "1",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Action": [
+        "oss:GetObject",
+        "oss:HeadObject"
+      ],
+      "Resource": [
+        "acs:oss:*:*:{bucket-name}/*"
+      ]
+    }
+  ]
+}
+```
+
+**参数说明：**
+- `{bucket-name}`：替换为你的OSS存储桶名称
+
+### 2. 配置信任策略
+为RAM角色添加信任策略，允许DashScope服务扮演该角色：
+
+```json
+{
+  "Version": "1",
+  "Statement": [
+    {
+      "Effect": "Allow",
+      "Principal": {
+        "Service": [
+          "dashscope.aliyuncs.com"
+        ]
+      },
+      "Action": "sts:AssumeRole"
+    }
+  ]
+}
+```
+
+### 3. 授权步骤（控制台操作）
+
+**步骤1：创建自定义权限策略**
+1. 登录阿里云控制台 → 访问控制(RAM) → 权限管理 → 权限策略
+2. 点击"创建策略"
+3. 选择"脚本配置"
+4. 粘贴上述JSON权限策略
+5. 策略名称：`CosyVoice-OSS-Access`
+6. 点击"确定"
+
+**步骤2：创建RAM角色**
+1. RAM → 身份管理 → 角色 → 创建角色
+2. 选择"阿里云服务" → "DashScope"
+3. 输入角色名称：`CosyVoice-OSS-Role`
+4. 完成创建
+
+**步骤3：授权角色访问OSS**
+1. 在角色详情页面，点击"添加权限"
+2. 搜索并添加：
+   - `AliyunOSSReadOnlyAccess`（阿里云OSS只读权限）
+   - `CosyVoice-OSS-Access`（自定义权限）
+3. 点击"确定"
+
+**步骤4：获取ARN**
+在角色详情页面，复制"ARN"：
+```
+acs:ram::{你的AccountID}:role/CosyVoice-OSS-Role
+```
+
+### 4. 配置application.yaml
+
+在 `yudao-server/src/main/resources/application.yaml` 中添加：
+
+```yaml
+yudao:
+  cosyvoice:
+    # ... 其他配置
+    # RAM角色ARN（用于授权CosyVoice访问OSS）
+    ram-role-arn: "acs:ram::{AccountID}:role/CosyVoice-OSS-Role"
+```
+
+### 5. 修改OSS访问方式
+
+在 `CosyVoiceClient.buildClonePayload()` 中，添加授权信息：
+
+```java
+// 在请求头中添加Authorization
+Request httpRequest = new Request.Builder()
+        .url(properties.getVoiceEnrollmentUrl())
+        .addHeader("Authorization", "Bearer " + properties.getApiKey())
+        .addHeader("Content-Type", "application/json")
+        // 添加STS临时凭证（需要先调用AssumeRole获取临时凭证）
+        .addHeader("x-acs-security-token", "{security-token}")
+        .post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
+        .build();
+```
+
+### 6. 获取STS临时凭证（Java代码）
+
+```java
+import com.aliyun.sts20150401.Client;
+import com.aliyun.sts20150401.models.AssumeRoleRequest;
+import com.aliyun.sts20150401.models.AssumeRoleResponse;
+
+public String getSecurityToken() throws Exception {
+    // 需要配置RAM角色ARN和访问密钥；client 为已使用AccessKey初始化的 com.aliyun.sts20150401.Client 实例
+    AssumeRoleRequest request = new AssumeRoleRequest()
+            .setRoleArn(properties.getRamRoleArn())
+            .setRoleSessionName("cosyvoice-session");
+
+    AssumeRoleResponse response = client.assumeRole(request);
+    return response.body.credentials.securityToken;
+}
+```
+
+### 7. 完整实现思路
+
+1. **本地开发**：使用STS临时凭证
+2. **生产环境**：
+   - 方案A：配置RAM角色，让CosyVoice直接访问OSS
+   - 方案B：将音频文件上传到CosyVoice可访问的公共OSS存储桶
+
+## 替代方案：使用公共OSS存储桶
+
+如果RAM权限配置复杂，可以：
+
+1. 创建公共可读的OSS存储桶
+2. 将音频文件上传到该存储桶
+3. 使用公共URL进行复刻
+
+**注意**：公共存储桶存在安全风险，仅用于测试！
+
+## 验证配置
+
+配置完成后，重新测试语音复刻功能：
+
+1. 查看日志中的请求URL是否可公网访问
+2. 查看是否还有"url error"错误
+3. 查看复刻是否成功返回voice_id
+
+## 参考资料
+
+- [阿里云RAM权限管理](https://help.aliyun.com/zh/ram/instance/role/parameter-overview)
+- [STS临时凭证](https://help.aliyun.com/zh/acs/STS/usage-scenarios/usage-scenarios)
+- [CosyVoice错误代码](https://help.aliyun.com/zh/model-studio/error-code#error-url)
diff --git a/frontend/app/web-gold/src/views/dh/VoiceCopy.vue b/frontend/app/web-gold/src/views/dh/VoiceCopy.vue
index 0d0281d4a2..c03c70c275 100644
--- a/frontend/app/web-gold/src/views/dh/VoiceCopy.vue
+++ b/frontend/app/web-gold/src/views/dh/VoiceCopy.vue
@@ -39,9 +39,6 @@
           <div v-if="column.key === 'name'" class="voice-name">
             {{ record.name || '未命名' }}
           </div>
-          <div v-else-if="column.key === 'transcription'" class="transcription-text">
-            {{ formatTranscription(record.transcription) }}
-          </div>
           <span v-else-if="column.key === 'createTime'">
             {{ formatDateTime(record.createTime) }}
           </span>
@@ -51,15 +48,6 @@
           </a-button>
           <a-space v-else-if="column.key === 'actions'">
             <a-button type="link" size="small" @click="handleEdit(record)">编辑</a-button>
-            <a-button
-              type="link"
-              size="small"
-              :loading="transcribingId === record.id"
-              :disabled="!!record.transcription"
-              @click="handleTranscribe(record)"
-            >
-              {{ record.transcription ? '已识别' : '识别' }}
-            </a-button>
             <a-button type="link" size="small" danger @click="handleDelete(record)">删除</a-button>
           </a-space>
         </template>
@@ -109,14 +97,6 @@
         <a-form-item label="备注" name="note">
           <a-textarea v-model="formData.note" :rows="3" placeholder="请输入备注信息" />
         </a-form-item>
-
-        <a-form-item v-if="!isCreateMode" label="识别内容" name="transcription">
-          <a-textarea
-            v-model="formData.transcription"
-            :rows="4"
-            placeholder="识别内容，支持手动修改"
-          />
-        </a-form-item>
       </a-form>
     </a-modal>
 
@@ -125,7 +105,7 @@
 </template>
 
 <script setup>
-import { ref, reactive, computed, onMounted, onUnmounted, nextTick } from 'vue'
+import { ref, reactive, computed, onMounted, nextTick } from 'vue'
 import { message, Modal } from 'ant-design-vue'
 import { PlusOutlined, SearchOutlined, UploadOutlined, PlayCircleOutlined } from '@ant-design/icons-vue'
 import { VoiceService } from '@/api/voice'
@@ -133,12 +113,6 @@ import { MaterialService } from '@/api/material'
 import dayjs from 'dayjs'
 
 // ========== 常量 ==========
-const POLLING_CONFIG = {
-  interval: 10000,
-  maxCount: 30,
-  transcriptionMaxLength: 50
-}
-
 const DEFAULT_FORM_DATA = {
   id: null,
   name: '',
@@ -146,8 +120,7 @@ const DEFAULT_FORM_DATA = {
   autoTranscribe: true,
   language: 'zh-CN',
   gender: 'female',
-  note: '',
-  transcription: ''
+  note: ''
 }
 
 // ========== 响应式数据 ==========
@@ -155,13 +128,11 @@ const loading = ref(false)
 const submitting = ref(false)
 const uploading = ref(false)
 const voiceList = ref([])
-const transcribingId = ref(null)
 const modalVisible = ref(false)
 const formMode = ref('create')
 const formRef = ref(null)
 const audioPlayer = ref(null)
 const fileList = ref([])
-let pollingTimer = null
 
 const searchParams = reactive({
   name: '',
@@ -185,7 +156,6 @@ const isCreateMode = computed(() => formMode.value === 'create')
 // ========== 表格配置 ==========
 const columns = [
   { title: '配音名称', key: 'name', dataIndex: 'name', width: 200 },
-  { title: '识别内容', key: 'transcription', dataIndex: 'transcription', width: 300 },
   { title: '创建时间', key: 'createTime', dataIndex: 'createTime', width: 180 },
   { title: '操作', key: 'actions', width: 200, fixed: 'right' }
 ]
@@ -197,12 +167,6 @@ const formRules = {
 }
 
 // ========== 工具函数 ==========
-const formatTranscription = (transcription) => {
-  if (!transcription) return '未识别'
-  if (transcription.length <= POLLING_CONFIG.transcriptionMaxLength) return transcription
-  return transcription.substring(0, POLLING_CONFIG.transcriptionMaxLength) + '...'
-}
-
 const formatDateTime = (value) => {
   if (!value) return '-'
   return dayjs(value).format('YYYY-MM-DD HH:mm:ss')
@@ -215,8 +179,7 @@ const fillFormData = (data) => {
     fileId: data.fileId || null,
     language: data.language || 'zh-CN',
     gender: data.gender || 'female',
-    note: data.note || '',
-    transcription: data.transcription || ''
+    note: data.note || ''
   })
 }
 
@@ -300,62 +263,6 @@ const handleDelete = (record) => {
   })
 }
 
-// ========== 语音识别 ==========
-const handleTranscribe = async (record) => {
-  transcribingId.value = record.id
-  try {
-    const res = await VoiceService.transcribe(record.id)
-    if (res.code !== 0) {
-      message.error(res.msg || '识别失败')
-      transcribingId.value = null
-      return
-    }
-
-    message.success('识别任务已提交，正在识别中...')
-    startPollingTranscription(record.id)
-  } catch (error) {
-    console.error('识别失败:', error)
-    message.error('识别失败，请稍后重试')
-    transcribingId.value = null
-  }
-}
-
-const stopPolling = () => {
-  if (pollingTimer) {
-    clearInterval(pollingTimer)
-    pollingTimer = null
-  }
-  transcribingId.value = null
-}
-
-const startPollingTranscription = (voiceId) => {
-  stopPolling()
-
-  let pollCount = 0
-  pollingTimer = setInterval(async () => {
-    pollCount++
-
-    try {
-      const res = await VoiceService.get(voiceId)
-      if (res.code === 0 && res.data?.transcription) {
-        stopPolling()
-        message.success('识别完成')
-        loadVoiceList()
-        return
-      }
-
-      if (pollCount >= POLLING_CONFIG.maxCount) {
-        stopPolling()
-        message.warning('识别超时，请稍后手动刷新查看结果')
-        loadVoiceList()
-      }
-    } catch (error) {
-      console.error('轮询识别结果失败:', error)
-      if (pollCount >= POLLING_CONFIG.maxCount) stopPolling()
-    }
-  }, POLLING_CONFIG.interval)
-}
-
 // ========== 音频播放 ==========
 const handlePlayAudio = (record) => {
   if (record.fileUrl && audioPlayer.value) {
@@ -460,8 +367,7 @@ const handleSubmit = async () => {
         name: formData.name,
         language: formData.language,
         gender: formData.gender,
-        note: formData.note,
-        transcription: formData.transcription
+        note: formData.note
       }
 
   try {
@@ -476,12 +382,6 @@ const handleSubmit = async () => {
 
     message.success(isCreateMode.value ? '创建成功' : '更新成功')
     modalVisible.value = false
-
-    if (isCreateMode.value && formData.autoTranscribe && res.data) {
-      message.info('自动识别已启动，正在识别中...')
-      startPollingTranscription(res.data)
-    }
-
     loadVoiceList()
   } catch (error) {
     console.error('提交失败:', error)
@@ -506,10 +406,6 @@ const resetForm = () => {
 onMounted(() => {
   loadVoiceList()
 })
-
-onUnmounted(() => {
-  stopPolling()
-})
 </script>
 
 <style scoped>
@@ -550,12 +446,6 @@ onUnmounted(() => {
   color: var(--color-text);
 }
 
-.transcription-text {
-  color: var(--color-text-secondary);
-  font-size: 13px;
-  line-height: 1.5;
-}
-
 .upload-hint {
   font-size: 12px;
   color: var(--color-text-secondary);
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java
index fbe2f32a60..73cd624866 100644
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java
@@ -3,9 +3,15 @@ package cn.iocoder.yudao.module.tik.voice.client;
 import cn.hutool.core.collection.CollUtil;
 import cn.hutool.core.util.StrUtil;
 import cn.iocoder.yudao.framework.common.exception.ServiceException;
+import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
+import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
 import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
 import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
 import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
+import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
+import com.alibaba.dashscope.audio.ttsv2.enrollment.Voice;
+import com.alibaba.dashscope.audio.ttsv2.enrollment.VoiceEnrollmentService;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import lombok.RequiredArgsConstructor;
@@ -17,6 +23,7 @@ import okhttp3.RequestBody;
 import okhttp3.Response;
 import org.springframework.stereotype.Component;
 
+import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
 import java.time.Duration;
 import java.util.Base64;
@@ -53,28 +60,130 @@ public class CosyVoiceClient {
         if (request == null || StrUtil.isBlank(request.getText())) {
             throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空");
         }
+        if (StrUtil.isBlank(request.getVoiceId())) {
+            throw exception0(VOICE_TTS_FAILED.getCode(), "必须提供 voiceId");
+        }
 
+        SpeechSynthesizer synthesizer = null;
         try {
-            String payload = objectMapper.writeValueAsString(buildPayload(request));
-            Request httpRequest = new Request.Builder()
-                    .url(properties.getTtsUrl())
-                    .addHeader("Authorization", "Bearer " + properties.getApiKey())
-                    .addHeader("Content-Type", "application/json")
-                    .post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
+            log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}]",
+                    request.getVoiceId(),
+                    request.getText().length(),
+                    StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()));
+
+            // 使用 DashScope SDK 构建参数（严格按文档）
+            SpeechSynthesisParam param = SpeechSynthesisParam.builder()
+                    .apiKey(properties.getApiKey())
+                    .model(StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()))
+                    .voice(request.getVoiceId())
                     .build();
 
-            try (Response response = getHttpClient().newCall(httpRequest).execute()) {
-                String body = response.body() != null ? response.body().string() : "";
-                if (!response.isSuccessful()) {
-                    log.error("[CosyVoice][TTS失败][status={}, body={}]", response.code(), body);
-                    throw buildException(body);
-                }
-                return parseTtsResult(body, request);
+            // 初始化合成器（同步调用传 null）
+            synthesizer = new SpeechSynthesizer(param, null);
+
+            // 阻塞调用，获取完整音频
+            ByteBuffer audioData = synthesizer.call(request.getText());
+
+            if (audioData == null) {
+                throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频数据");
             }
+
+            // 转换为字节数组（严格按照文档：直接使用 array()）
+            byte[] audioBytes = audioData.array();
+
+            log.info("[CosyVoice][TTS合成成功][Request ID: {}, audioSize={}, 首包延迟={}ms]",
+                    synthesizer.getLastRequestId(),
+                    audioBytes.length,
+                    synthesizer.getFirstPackageDelay());
+
+            // 构建返回结果
+            CosyVoiceTtsResult result = new CosyVoiceTtsResult();
+            result.setAudio(audioBytes);
+            result.setFormat(request.getAudioFormat() != null ? request.getAudioFormat() : properties.getAudioFormat());
+            result.setSampleRate(request.getSampleRate() != null ? request.getSampleRate() : properties.getSampleRate());
+            result.setRequestId(synthesizer.getLastRequestId());
+            result.setVoiceId(request.getVoiceId());
+
+            return result;
+
         } catch (ServiceException ex) {
             throw ex;
         } catch (Exception ex) {
-            log.error("[CosyVoice][TTS异常]", ex);
+            log.error("[CosyVoice][TTS异常][voiceId={}, text={}]", request.getVoiceId(), request.getText(), ex);
+            throw exception(VOICE_TTS_FAILED);
+        } finally {
+            // 关闭 WebSocket 连接
+            if (synthesizer != null) {
+                try {
+                    synthesizer.getDuplexApi().close(1000, "任务结束");
+                } catch (Exception e) {
+                    log.warn("[CosyVoice][关闭连接失败]", e);
+                }
+            }
+        }
+    }
+
+    /**
+     * Fallback TTS path: POSTs the JSON payload to the configured TTS URL and parses the response body.
+     */
+    private CosyVoiceTtsResult synthesizeViaHttp(CosyVoiceTtsRequest request) throws Exception {
+        byte[] jsonBytes = objectMapper.writeValueAsString(buildPayload(request)).getBytes(StandardCharsets.UTF_8);
+        Request.Builder builder = new Request.Builder();
+        builder.url(properties.getTtsUrl());
+        builder.addHeader("Authorization", "Bearer " + properties.getApiKey());
+        builder.addHeader("Content-Type", "application/json");
+        builder.post(RequestBody.create(jsonBytes, JSON));
+        Request ttsCall = builder.build();
+
+        try (Response reply = getHttpClient().newCall(ttsCall).execute()) {
+            String text = reply.body() == null ? "" : reply.body().string();
+            if (reply.isSuccessful()) {
+                return parseTtsResult(text, request);
+            }
+            log.error("[CosyVoice][TTS失败][status={}, body={}]", reply.code(), text);
+            throw buildException(text);
+        }
+    }
+
+    /**
+     * Registers a cloned voice with CosyVoice via the DashScope SDK and returns the resulting voice_id.
+     * Validation and SDK failures are raised through exception0/exception with the VOICE_TTS_FAILED code.
+     *
+     * @param request clone parameters; url, targetModel and prefix must all be non-blank
+     * @return result holding the voice_id and request id reported by the SDK
+     */
+    public CosyVoiceCloneResult cloneVoice(CosyVoiceCloneRequest request) {
+        if (!properties.isEnabled()) {
+            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
+        }
+        if (request == null || StrUtil.isBlank(request.getUrl())) {
+            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻音频URL不能为空");
+        }
+        if (StrUtil.isBlank(request.getTargetModel())) {
+            throw exception0(VOICE_TTS_FAILED.getCode(), "复刻模型不能为空");
+        }
+        if (StrUtil.isBlank(request.getPrefix())) {
+            throw exception0(VOICE_TTS_FAILED.getCode(), "音色前缀不能为空");
+        }
+        try {
+            // Log the URL without its query string: callers may pass a presigned URL whose
+            // query parameters carry the access signature, which must not end up in log files.
+            log.info("[CosyVoice][开始语音复刻][targetModel={}, prefix={}, url={}]",
+                    request.getTargetModel(), request.getPrefix(),
+                    StrUtil.subBefore(request.getUrl(), "?", false));
+            VoiceEnrollmentService service = new VoiceEnrollmentService(properties.getApiKey());
+            Voice voice = service.createVoice(request.getTargetModel(), request.getPrefix(), request.getUrl());
+            String requestId = service.getLastRequestId();
+            log.info("[CosyVoice][语音复刻成功][Request ID: {}, Voice ID: {}]", requestId, voice.getVoiceId());
+            CosyVoiceCloneResult result = new CosyVoiceCloneResult();
+            result.setVoiceId(voice.getVoiceId());
+            result.setRequestId(requestId);
+            return result;
+        } catch (ServiceException ex) {
+            throw ex; // already carries a specific code/message; do not re-wrap
+        } catch (Exception ex) {
+            log.error("[CosyVoice][语音复刻异常][targetModel={}, prefix={}]",
+                    request.getTargetModel(), request.getPrefix(), ex);
             throw exception(VOICE_TTS_FAILED);
         }
     }
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneRequest.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneRequest.java
new file mode 100644
index 0000000000..9e9156e0a9
--- /dev/null
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneRequest.java
@@ -0,0 +1,36 @@
+package cn.iocoder.yudao.module.tik.voice.client.dto;
+
+import lombok.Data;
+
+/**
+ * Request parameters for a CosyVoice voice-clone (voice enrollment) call.
+ */
+@Data
+public class CosyVoiceCloneRequest {
+
+    /**
+     * Model the cloned voice targets (cosyvoice-v1 or cosyvoice-v2).
+     */
+    private String targetModel;
+
+    /**
+     * Custom voice prefix; per the original note: digits and lowercase letters only, fewer than 10 characters.
+     */
+    private String prefix;
+
+    /**
+     * Publicly reachable URL of the audio file.
+     */
+    private String url;
+
+    /**
+     * Sample rate. The original note says "default 24000", but no default is assigned here - TODO confirm.
+     */
+    private Integer sampleRate;
+
+    /**
+     * Audio format. The original note says "default wav", but no default is assigned here - TODO confirm.
+     */
+    private String audioFormat;
+
+}
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneResult.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneResult.java
new file mode 100644
index 0000000000..d01421a269
--- /dev/null
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceCloneResult.java
@@ -0,0 +1,21 @@
+package cn.iocoder.yudao.module.tik.voice.client.dto;
+
+import lombok.Data;
+
+/**
+ * Result of a CosyVoice voice-clone call.
+ */
+@Data
+public class CosyVoiceCloneResult {
+
+    /**
+     * The generated voice_id.
+     */
+    private String voiceId;
+
+    /**
+     * Request ID of the clone call.
+     */
+    private String requestId;
+
+}
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/config/CosyVoiceProperties.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/config/CosyVoiceProperties.java
index 60b39c4abb..19cc522a35 100644
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/config/CosyVoiceProperties.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/config/CosyVoiceProperties.java
@@ -50,6 +50,11 @@ public class CosyVoiceProperties {
      */
     private String ttsUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis";
 
+    /**
+     * Voice-clone (voice enrollment) endpoint URL. NOTE(review): no hunk in this patch reads it - confirm usage.
+     */
+    private String voiceEnrollmentUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/voice-enrollment";
+
     /**
      * 连接超时时间
      */
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/dal/dataobject/TikUserVoiceDO.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/dal/dataobject/TikUserVoiceDO.java
index 4ab397b217..6e2a666610 100644
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/dal/dataobject/TikUserVoiceDO.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/dal/dataobject/TikUserVoiceDO.java
@@ -54,6 +54,10 @@ public class TikUserVoiceDO extends TenantBaseDO {
      * 备注信息
      */
     private String note;
+    /**
+     * Cloned voice ID (the voice_id produced by CosyVoice voice cloning).
+     */
+    private String voiceId;
 
 }
 
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java
index 3db54fb1cd..83d7c9eb17 100644
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java
@@ -20,6 +20,8 @@ import cn.iocoder.yudao.module.tik.file.service.TikUserFileService;
 import cn.iocoder.yudao.module.tik.tikhup.service.TikHupService;
 import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
 import cn.iocoder.yudao.module.tik.voice.client.CosyVoiceClient;
+import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneRequest;
+import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceCloneResult;
 import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
 import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
 import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
@@ -138,20 +140,30 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                 .setTranscription(null); // 初始为空，表示未识别
         voiceMapper.insert(voice);
 
-        // 4. 如果开启自动识别，异步执行识别（添加防重复检查）
-        if (Boolean.TRUE.equals(createReqVO.getAutoTranscribe())) {
-            // 再次检查是否已经有识别结果（防止并发重复创建）
-            TikUserVoiceDO checkVoice = voiceMapper.selectById(voice.getId());
-            if (StrUtil.isBlank(checkVoice.getTranscription())) {
-                String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
-                log.info("[createVoice][开启自动识别，配音编号({})，文件ID({})，预签名URL({})]",
-                        voice.getId(), fileDO.getId(), fileAccessUrl);
-                asyncTranscribeVoice(voice.getId(), fileAccessUrl);
-            } else {
-                log.info("[createVoice][配音已经有识别结果，跳过自动识别，配音编号({})]", voice.getId());
-            }
+        // 4. 调用阿里云语音复刻服务，生成 voice_id
+        try {
+            log.info("[createVoice][开始语音复刻，配音编号({})，文件ID({})]", voice.getId(), fileDO.getId());
+            String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
+
+            CosyVoiceCloneRequest cloneRequest = new CosyVoiceCloneRequest();
+            cloneRequest.setTargetModel("cosyvoice-v2"); // 使用v2模型，效果更好
+            cloneRequest.setPrefix("voice" + voice.getId()); // 音色前缀，格式要求
+            cloneRequest.setUrl(fileAccessUrl);
+
+            CosyVoiceCloneResult cloneResult = cosyVoiceClient.cloneVoice(cloneRequest);
+            String voiceId = cloneResult.getVoiceId();
+
+            // 更新配音记录，保存 voice_id
+            voice.setVoiceId(voiceId);
+            voiceMapper.updateById(voice);
+
+            log.info("[createVoice][语音复刻成功，配音编号({})，voice_id({})]", voice.getId(), voiceId);
+        } catch (Exception e) {
+            log.error("[createVoice][语音复刻失败，配音编号({})，错误信息: {}]", voice.getId(), e.getMessage(), e);
+            // 复刻失败不影响配音记录创建，只记录日志
         }
 
+
         log.info("[createVoice][用户({})创建配音成功，配音编号({})]", userId, voice.getId());
         return voice.getId();
     }
@@ -361,17 +373,25 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                 throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
             }
 
-            // 获取文件信息，用于获取文件URL
-            FileDO fileDO = fileMapper.selectById(voice.getFileId());
-            if (fileDO == null) {
-                throw exception(VOICE_FILE_NOT_EXISTS);
-            }
+            // 优先使用复刻的 voice_id，如果不存在则使用文件URL（兼容旧数据）
+            if (StrUtil.isNotBlank(voice.getVoiceId())) {
+                log.info("[synthesizeVoice][使用复刻音色ID合成，配音编号({})，voice_id({})]", voiceConfigId, voice.getVoiceId());
+                voiceId = voice.getVoiceId();
+                transcriptionText = voice.getTranscription();
+            } else {
+                log.info("[synthesizeVoice][使用文件URL合成，配音编号({})]", voiceConfigId);
+                // 获取文件信息，用于获取文件URL
+                FileDO fileDO = fileMapper.selectById(voice.getFileId());
+                if (fileDO == null) {
+                    throw exception(VOICE_FILE_NOT_EXISTS);
+                }
 
-            // 使用文件URL和识别文本进行合成
-            fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
-            transcriptionText = voice.getTranscription();
-            if (StrUtil.isBlank(transcriptionText)) {
-                throw exception(VOICE_NOT_EXISTS, "配音识别文本为空，请先进行语音识别");
+                // 使用文件URL和识别文本进行合成
+                fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
+                transcriptionText = voice.getTranscription();
+                if (StrUtil.isBlank(transcriptionText)) {
+                    throw exception(VOICE_NOT_EXISTS, "配音识别文本为空，请先进行语音识别");
+                }
             }
         }
         // 2. 如果没有配置ID，使用voiceId或fileUrl（系统音色或直接URL方式）
@@ -512,21 +532,31 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                         voiceConfigId, voice.getUserId(), userId);
                 throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户");
             }
-            
-            // 获取文件信息，用于获取文件URL
-            FileDO fileDO = fileMapper.selectById(voice.getFileId());
-            if (fileDO == null) {
-                throw exception(VOICE_FILE_NOT_EXISTS);
+
+            // 优先使用复刻的 voice_id，如果不存在则使用文件URL（兼容旧数据）
+            if (StrUtil.isNotBlank(voice.getVoiceId())) {
+                log.info("[previewVoice][使用复刻音色ID试听，配音编号({})，voice_id({})]", voiceConfigId, voice.getVoiceId());
+                voiceId = voice.getVoiceId();
+                transcriptionText = voice.getTranscription();
+                inputText = StrUtil.blankToDefault(reqVO.getInputText(),
+                        StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
+            } else {
+                log.info("[previewVoice][使用文件URL试听，配音编号({})]", voiceConfigId);
+                // 获取文件信息，用于获取文件URL
+                FileDO fileDO = fileMapper.selectById(voice.getFileId());
+                if (fileDO == null) {
+                    throw exception(VOICE_FILE_NOT_EXISTS);
+                }
+
+                // 使用文件URL和识别文本进行合成
+                fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
+                transcriptionText = voice.getTranscription();
+                if (StrUtil.isBlank(transcriptionText)) {
+                    throw exception(VOICE_NOT_EXISTS, "配音识别文本为空，请先进行语音识别");
+                }
+                inputText = StrUtil.blankToDefault(reqVO.getInputText(),
+                        StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
             }
-            
-            // 使用文件URL和识别文本进行合成
-            fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
-            transcriptionText = voice.getTranscription();
-            if (StrUtil.isBlank(transcriptionText)) {
-                throw exception(VOICE_NOT_EXISTS, "配音识别文本为空，请先进行语音识别");
-            }
-            inputText = StrUtil.blankToDefault(reqVO.getInputText(), 
-                    StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
         }
         // 3. 如果没有配置ID，使用系统配音配置（需要前端传voiceId）
         else {
diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikUserVoiceRespVO.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikUserVoiceRespVO.java
index ff5a1e4993..7f4f2cd6e7 100644
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikUserVoiceRespVO.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikUserVoiceRespVO.java
@@ -38,6 +38,9 @@ public class AppTikUserVoiceRespVO {
     @Schema(description = "备注", example = "这是一个测试配音")
     private String note;
 
+    @Schema(description = "复刻音色ID（CosyVoice 语音复刻生成的 voice_id）")
+    private String voiceId;
+
     @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
     private LocalDateTime createTime;