Files
sionrui/docs/cosyvoice-generate.md
2025-11-19 21:57:16 +08:00

272 lines
13 KiB
Markdown
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# 阿里云CosyVoice Java SDK 调用模板(参数+示例)
## 一、前提条件
1. 开通CosyVoice服务,获取API Key(建议配置到环境变量,避免硬编码)。
2. 安装最新版DashScope SDK(支持2.20.3+版本,SSML功能需此版本及以上)。
3. 模型与音色需匹配(如v2模型对应v2音色,v3模型对应v3音色)。
## 二、核心参数汇总
| 参数名 | 类型 | 是否必填 | 默认值 | 取值范围/说明 |
|--------------|------------|----------|-------------------------|------------------------------------------------------------------------------|
| model | String | 是 | - | cosyvoice-v1/v2/v3/v3-plus(v3系列需申请邀测) |
| voice | String | 是 | - | 对应模型的音色(如v2:longxiaochun_v2,v3:longhuohuo_v3),详见文档音色列表 |
| format | enum | 否 | 因音色而异(默认MP3 22050Hz) | 支持WAV/MP3/PCM/OGG_OPUS(如PCM_22050HZ_MONO_16BIT、MP3_24000HZ_MONO_256KBPS) |
| volume | int | 否 | 50 | 0~100(音量大小) |
| speechRate | float | 否 | 1.0 | 0.5~2.0(语速,1.0为默认,约4字/秒) |
| pitchRate | float | 否 | 1.0 | 0.5~2.0(语调) |
| bit_rate | int | 否 | 32 | 6~510kbps(仅opus格式支持,v1模型不支持) |
| seed | int | 否 | 0 | 0~65535(随机数种子,仅v3/v3-plus支持) |
| style | int | 否 | 0 | ≥0整数(风格调整,仅v3/v3-plus支持) |
| languageHints| List<String> | 否 | - | 仅v3/v3-plus支持,单次配置1个语种(如"zh"/"en") |
| instruction | String | 否 | - | 仅v3/v3-plus支持,格式:"你说话的情感是<情感值>"(如"Happy"/"Angry") |
## 三、四种核心调用方式模板
### 1. 同步调用(阻塞式,适合短文本)
```java
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;
/**
 * Template for a blocking (synchronous) CosyVoice synthesis call, suited to short text.
 *
 * <p>Builds the request parameters, calls the synthesizer with a {@code null} callback so that
 * {@code call} returns the complete audio, writes the audio to {@link #OUTPUT_FILE}, and always
 * closes the underlying connection in {@code finally}.
 */
public class SyncCallTemplate {
    // Configuration (adjust to your needs).
    private static final String MODEL = "cosyvoice-v3"; // model
    private static final String VOICE = "longhuohuo_v3"; // voice; must match the model version
    private static final String TEXT = "今天天气真好,适合出门散步!"; // at most 2000 characters
    private static final String OUTPUT_FILE = "output.mp3"; // output file

    /**
     * Runs one synchronous synthesis and saves the result to {@link #OUTPUT_FILE}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. Build the request parameters.
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key") // enable when the API key is not set via environment variable
                .model(MODEL)
                .voice(VOICE)
                .volume(60) // optional: adjust volume
                .speechRate(1.1f) // optional: adjust speech rate
                .build();

        // 2. Create the synthesizer; a null callback selects the synchronous mode.
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);
        try {
            // 3. Blocking call that returns the complete audio.
            ByteBuffer audioData = synthesizer.call(TEXT);

            // 4. Persist the audio locally.
            if (audioData != null) {
                writeAudio(audioData, OUTPUT_FILE);
                System.out.println("合成成功!输出文件:" + OUTPUT_FILE);
                System.out.println("RequestId: " + synthesizer.getLastRequestId());
                System.out.println("首包延迟:" + synthesizer.getFirstPackageDelay() + "ms");
            } else {
                // Previously a null result was silently ignored; surface it instead.
                System.err.println("合成失败:未返回音频数据");
            }
        } catch (Exception e) {
            System.err.println("合成失败:" + e.getMessage());
        } finally {
            // 5. Close the WebSocket connection.
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }

    /**
     * Writes the content of {@code audio} to the file at {@code path}.
     *
     * <p>Array-backed buffers are written via their backing array (the original behavior).
     * Buffers without an accessible backing array (direct or read-only) would make
     * {@code array()} throw, so their bytes are copied out through a duplicate instead, which
     * leaves the caller's buffer position untouched.
     *
     * @param audio the synthesized audio; must not be {@code null}
     * @param path  destination file path
     * @throws java.io.IOException if the file cannot be opened or written
     */
    private static void writeAudio(ByteBuffer audio, String path) throws java.io.IOException {
        byte[] bytes;
        if (audio.hasArray()) {
            bytes = audio.array();
        } else {
            ByteBuffer view = audio.duplicate();
            view.clear(); // cover the whole buffer, mirroring what array() exposes
            bytes = new byte[view.remaining()];
            view.get(bytes);
        }
        try (FileOutputStream fos = new FileOutputStream(new File(path))) {
            fos.write(bytes);
        }
    }
}
```
### 2. 异步调用(非阻塞,短文本实时接收)
```java
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.ResultCallback;
import java.util.concurrent.CountDownLatch;
/**
 * Template for a non-blocking (asynchronous) CosyVoice synthesis call for short text.
 *
 * <p>Audio frames are delivered to a {@link ResultCallback}; the main thread waits on a
 * {@link CountDownLatch} that the callback releases on completion or error.
 */
public class AsyncCallTemplate {
    private static final String MODEL = "cosyvoice-v2";
    private static final String VOICE = "longxiaochun_v2";
    private static final String TEXT = "欢迎使用阿里云CosyVoice语音合成服务"; // at most 2000 characters

    /**
     * Starts one asynchronous synthesis and waits until the callback reports the end of the task.
     *
     * @param args unused
     * @throws InterruptedException kept for signature compatibility; interruption is handled
     *     inside the method by restoring the interrupt flag
     */
    public static void main(String[] args) throws InterruptedException {
        CountDownLatch latch = new CountDownLatch(1);

        // 1. Configure the callback that receives audio as it arrives.
        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                // Audio fragment received (can be played or appended to a file here).
                if (result.getAudioFrame() != null) {
                    System.out.println("收到音频分片,大小:" + result.getAudioFrame().capacity() + "字节");
                    // Add audio handling here (e.g. streaming playback, appending to a file).
                }
                // Billed character count; the value of the last event is the final one.
                if (result.getUsage() != null) {
                    System.out.println("当前计费字符数:" + result.getUsage().getCharacters());
                }
            }

            @Override
            public void onComplete() {
                System.out.println("合成完成!");
                latch.countDown();
            }

            @Override
            public void onError(Exception e) {
                System.err.println("合成失败:" + e.getMessage());
                // Release the waiting main thread on failure as well.
                latch.countDown();
            }
        };

        // 2. Build the parameters and create the synthesizer.
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .format(SpeechSynthesisAudioFormat.MP3_16000HZ_MONO_128KBPS) // optional: output format
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);

        try {
            // 3. Non-blocking call; results arrive through the callback.
            synthesizer.call(TEXT);
            latch.await(); // wait for the synthesis to finish
            System.out.println("RequestId: " + synthesizer.getLastRequestId());
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers up the stack can observe the interruption.
            Thread.currentThread().interrupt();
            System.err.println("调用异常:" + e.getMessage());
        } catch (Exception e) {
            System.err.println("调用异常:" + e.getMessage());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}
```
### 3. 流式调用(分段传文本,适合长文本)
```java
import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.ResultCallback;
/**
 * Template for a streaming CosyVoice synthesis call: text is sent in segments (suited to long
 * text) and audio is delivered to a {@link ResultCallback} as it is produced.
 */
public class StreamingCallTemplate {
    // Text segments: each at most 2000 characters, at most 200,000 characters in total.
    private static final String[] TEXT_SEGMENTS = {
        "流式语音合成适合长文本场景,",
        "可以分段发送文本,",
        "服务端实时返回音频,",
        "减少等待时间。"
    };
    private static final String MODEL = "cosyvoice-v3";
    private static final String VOICE = "longchuanshu_v3";

    /**
     * Sends every entry of {@link #TEXT_SEGMENTS} to the synthesizer, then finishes the stream.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // 1. Configure the callback.
        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                if (result.getAudioFrame() != null) {
                    System.out.println("收到流式音频分片");
                    // Handle the audio here (e.g. real-time playback, writing to a buffer file).
                }
            }

            @Override
            public void onComplete() {
                System.out.println("流式合成全部完成!");
            }

            @Override
            public void onError(Exception e) {
                System.err.println("流式合成失败:" + e.getMessage());
            }
        };

        // 2. Build the parameters.
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .format(SpeechSynthesisAudioFormat.PCM_22050HZ_MONO_16BIT)
                .speechRate(0.9f)
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);

        try {
            // 3. Send the text in segments (keep the gap between segments within 23 seconds).
            for (String segment : TEXT_SEGMENTS) {
                synthesizer.streamingCall(segment);
                Thread.sleep(500); // simulate the interval between text inputs
            }
            // 4. Required: finish the stream so the remaining text gets synthesized.
            synthesizer.streamingComplete();
            System.out.println("RequestId: " + synthesizer.getLastRequestId());
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of swallowing the interruption.
            Thread.currentThread().interrupt();
            System.err.println("调用异常:" + e.getMessage());
        } catch (Exception e) {
            System.err.println("调用异常:" + e.getMessage());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}
```
### 4. Flowable调用(响应式编程,支持流式输入输出)
```java
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.exception.NoApiKeyException;
import io.reactivex.Flowable;
import io.reactivex.BackpressureStrategy;
/**
 * Template for a reactive CosyVoice synthesis call: text is fed in as a {@link Flowable} and the
 * synthesis results are consumed from the {@link Flowable} returned by the synthesizer.
 */
public class FlowableCallTemplate {
    private static final String MODEL = "cosyvoice-v2";
    private static final String VOICE = "longyingtian_v2";
    private static final String[] TEXT_ARRAY = {"响应式编程模式,", "支持流式输入输出,", "适合高并发场景。"};

    /**
     * Streams {@link #TEXT_ARRAY} into the synthesizer and prints the size of each audio frame.
     *
     * @param args unused
     * @throws NoApiKeyException declared by the original template; presumably raised when no
     *     API key is configured (confirm against the SDK documentation)
     */
    public static void main(String[] args) throws NoApiKeyException {
        // 1. Simulate streaming text input from a background thread.
        Flowable<String> textStream = Flowable.create(emitter -> {
            new Thread(() -> {
                for (String text : TEXT_ARRAY) {
                    if (emitter.isCancelled()) {
                        return; // downstream is gone; stop producing
                    }
                    emitter.onNext(text);
                    try {
                        Thread.sleep(800);
                    } catch (InterruptedException e) {
                        // Restore the flag and terminate: after onError no further
                        // onNext/onComplete signal must be sent.
                        Thread.currentThread().interrupt();
                        emitter.onError(e);
                        return;
                    }
                }
                emitter.onComplete();
            }).start();
        }, BackpressureStrategy.BUFFER);

        // 2. Build the parameters.
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .volume(70)
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);

        try {
            // 3. Start the streaming call and handle each result; blocks until the stream ends.
            synthesizer.streamingCallAsFlowable(textStream)
                    .blockingForEach(result -> {
                        if (result.getAudioFrame() != null) {
                            System.out.println("Flowable收到音频,大小:" + result.getAudioFrame().capacity() + "字节");
                            // Handle the audio here.
                        }
                    });
            System.out.println("Flowable合成完成,RequestId: " + synthesizer.getLastRequestId());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}
```
## 四、核心注意事项
1. 文本长度限制:非流式单次≤2000字符,流式累计≤20万字符(含SSML标签)。
2. 字符计算规则:汉字=2字符,英文/数字/标点/空格=1字符。
3. 流式调用必须调用`streamingComplete()`,否则结尾文本无法合成。
4. 每次调用`call()`前需重新初始化`SpeechSynthesizer`实例。
5. 音频格式需与播放器兼容(如MP3/OPUS支持流式播放,推荐使用ffmpeg、AudioFormat等工具)。