Files
sionrui/docs/cosyvoice-generate.md
2025-11-19 21:57:16 +08:00

13 KiB
Raw Blame History

阿里云CosyVoice Java SDK 调用模板(参数+示例)

一、前提条件

  1. 开通CosyVoice服务,获取API Key(建议配置到环境变量,避免硬编码)。
  2. 安装最新版DashScope SDK(建议2.20.3+版本,SSML功能需此版本及以上)。
  3. 模型与音色需匹配:如v2模型对应v2音色,v3模型对应v3音色。

二、核心参数汇总

参数名 类型 是否必填 默认值 取值范围/说明
model String - cosyvoice-v1/v2/v3/v3-plusv3系列需申请邀测
voice String - 对应模型的音色如v2longxiaochun_v2v3longhuohuo_v3详见文档音色列表
format enum 因音色而异默认MP3 22050Hz 支持WAV/MP3/PCM/OGG_OPUS如PCM_22050HZ_MONO_16BIT、MP3_24000HZ_MONO_256KBPS
volume int 50 0~100音量大小
speechRate float 1.0 0.5~2.0语速1.0为默认约4字/秒)
pitchRate float 1.0 0.5~2.0(语调)
bit_rate int 32 6~510kbps仅opus格式支持v1模型不支持
seed int 0 0~65535随机数种子仅v3/v3-plus支持
style int 0 ≥0整数风格调整仅v3/v3-plus支持
languageHints List - 仅v3/v3-plus支持单次配置1个语种"zh"/"en"
instruction String - 仅v3/v3-plus支持,格式"你说话的情感是<情感值>"(如"Happy"/"Angry")

三、四种核心调用方式模板

1. 同步调用(阻塞式,适合短文本)

import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import java.io.File;
import java.io.FileOutputStream;
import java.nio.ByteBuffer;

public class SyncCallTemplate {
    // Configuration (adjust as needed)
    private static final String MODEL = "cosyvoice-v3"; // model version
    private static final String VOICE = "longhuohuo_v3"; // voice; must match the model version
    private static final String TEXT = "今天天气真好,适合出门散步!"; // ≤2000 characters
    private static final String OUTPUT_FILE = "output.mp3"; // output file path

    /**
     * Synchronous (blocking) synthesis: sends the full text in one call, waits
     * for the complete audio, and writes it to {@code OUTPUT_FILE}.
     */
    public static void main(String[] args) {
        // 1. Build request parameters
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key") // uncomment if the env variable is not set
                .model(MODEL)
                .voice(VOICE)
                .volume(60) // optional: volume, 0~100
                .speechRate(1.1f) // optional: speech rate, 0.5~2.0
                .build();

        // 2. Create the synthesizer; a null callback selects synchronous mode
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);

        try {
            // 3. Blocking call that returns the complete audio
            ByteBuffer audioData = synthesizer.call(TEXT);
            // 4. Persist the audio to disk
            if (audioData != null) {
                // Copy only the valid region of the buffer. ByteBuffer.array()
                // exposes the whole backing array — which may be larger than the
                // data or start at an offset — and throws for buffers without an
                // accessible array, so it must not be written to the file directly.
                byte[] bytes = new byte[audioData.remaining()];
                audioData.get(bytes);
                try (FileOutputStream fos = new FileOutputStream(new File(OUTPUT_FILE))) {
                    fos.write(bytes);
                }
                System.out.println("合成成功!输出文件:" + OUTPUT_FILE);
                System.out.println("RequestId:" + synthesizer.getLastRequestId());
                System.out.println("首包延迟:" + synthesizer.getFirstPackageDelay() + "ms");
            }
        } catch (Exception e) {
            System.err.println("合成失败:" + e.getMessage());
        } finally {
            // 5. Close the underlying WebSocket connection
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}

2. 异步调用(非阻塞,短文本实时接收)

import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.ResultCallback;
import java.util.concurrent.CountDownLatch;

public class AsyncCallTemplate {
    private static final String MODEL = "cosyvoice-v2"; // model version
    private static final String VOICE = "longxiaochun_v2"; // voice; must match the model version
    private static final String TEXT = "欢迎使用阿里云CosyVoice语音合成服务"; // ≤2000 characters

    /**
     * Asynchronous (non-blocking) synthesis: audio chunks are delivered to the
     * callback as they are produced; a latch blocks main until completion.
     */
    public static void main(String[] args) throws InterruptedException {
        CountDownLatch latch = new CountDownLatch(1);

        // 1. Callback that receives audio chunks in real time
        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                // Audio chunk received (stream-play it or append to a file)
                if (result.getAudioFrame() != null) {
                    System.out.println("收到音频分片,大小:" + result.getAudioFrame().capacity() + "字节");
                    // Insert audio handling here (streaming playback, file append, ...)
                }
                // Billed character count (the last reported value is authoritative)
                if (result.getUsage() != null) {
                    System.out.println("当前计费字符数:" + result.getUsage().getCharacters());
                }
            }

            @Override
            public void onComplete() {
                System.out.println("合成完成!");
                latch.countDown();
            }

            @Override
            public void onError(Exception e) {
                System.err.println("合成失败:" + e.getMessage());
                latch.countDown(); // release main on failure too, so it never hangs
            }
        };

        // 2. Build parameters and create the synthesizer
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .format(SpeechSynthesisAudioFormat.MP3_16000HZ_MONO_128KBPS) // optional: audio format
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);

        try {
            // 3. Non-blocking call; results arrive via the callback
            synthesizer.call(TEXT);
            latch.await(); // wait for onComplete/onError
            System.out.println("RequestId:" + synthesizer.getLastRequestId());
        } catch (Exception e) {
            System.err.println("调用异常:" + e.getMessage());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}

3. 流式调用(分段传文本,适合长文本)

import com.alibaba.dashscope.audio.tts.SpeechSynthesisResult;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisAudioFormat;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.common.ResultCallback;

public class StreamingCallTemplate {
    // Text segments: each ≤2000 characters, ≤200k characters cumulative
    private static final String[] TEXT_SEGMENTS = {
        "流式语音合成适合长文本场景,",
        "可以分段发送文本,",
        "服务端实时返回音频,",
        "减少等待时间。"
    };
    private static final String MODEL = "cosyvoice-v3"; // model version
    private static final String VOICE = "longchuanshu_v3"; // voice; must match the model version

    /**
     * Streaming synthesis: text is sent in segments and audio is returned
     * incrementally via the callback. streamingComplete() MUST be called to
     * flush the final segment.
     */
    public static void main(String[] args) {
        // 1. Callback that receives streamed audio chunks
        ResultCallback<SpeechSynthesisResult> callback = new ResultCallback<SpeechSynthesisResult>() {
            @Override
            public void onEvent(SpeechSynthesisResult result) {
                if (result.getAudioFrame() != null) {
                    System.out.println("收到流式音频分片");
                    // Insert audio handling here (live playback, buffered file write, ...)
                }
            }

            @Override
            public void onComplete() {
                System.out.println("流式合成全部完成!");
            }

            @Override
            public void onError(Exception e) {
                System.err.println("流式合成失败:" + e.getMessage());
            }
        };

        // 2. Build parameters
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .format(SpeechSynthesisAudioFormat.PCM_22050HZ_MONO_16BIT)
                .speechRate(0.9f)
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, callback);

        try {
            // 3. Send segments; the gap between two sends must be ≤23 seconds
            for (String segment : TEXT_SEGMENTS) {
                synthesizer.streamingCall(segment);
                Thread.sleep(500); // simulate incremental text arrival
            }
            // 4. Mandatory: end the streaming session (flushes remaining text)
            synthesizer.streamingComplete();
            System.out.println("RequestId:" + synthesizer.getLastRequestId());
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of silently swallowing it
            Thread.currentThread().interrupt();
            System.err.println("调用异常:" + e.getMessage());
        } catch (Exception e) {
            System.err.println("调用异常:" + e.getMessage());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}

4. Flowable调用响应式编程支持流式输入输出

import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesisParam;
import com.alibaba.dashscope.audio.ttsv2.SpeechSynthesizer;
import com.alibaba.dashscope.exception.NoApiKeyException;
import io.reactivex.Flowable;
import io.reactivex.BackpressureStrategy;

public class FlowableCallTemplate {
    private static final String MODEL = "cosyvoice-v2"; // model version
    private static final String VOICE = "longyingtian_v2"; // voice; must match the model version
    private static final String[] TEXT_ARRAY = {"响应式编程模式,", "支持流式输入输出,", "适合高并发场景。"};

    /**
     * Reactive (Flowable-based) synthesis: text flows in as a stream and audio
     * results are consumed as a stream.
     */
    public static void main(String[] args) throws NoApiKeyException {
        // 1. Simulate streaming text input
        Flowable<String> textStream = Flowable.create(emitter -> {
            new Thread(() -> {
                for (String text : TEXT_ARRAY) {
                    emitter.onNext(text);
                    try {
                        Thread.sleep(800);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt(); // restore interrupt flag
                        emitter.onError(e);
                        // Reactive Streams forbids any signal after onError, so
                        // the producer thread must stop here.
                        return;
                    }
                }
                emitter.onComplete();
            }).start();
        }, BackpressureStrategy.BUFFER);

        // 2. Build parameters
        SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                // .apiKey("your-api-key")
                .model(MODEL)
                .voice(VOICE)
                .volume(70)
                .build();
        SpeechSynthesizer synthesizer = new SpeechSynthesizer(param, null);

        try {
            // 3. Streaming call; block and consume each result as it arrives
            synthesizer.streamingCallAsFlowable(textStream)
                    .blockingForEach(result -> {
                        if (result.getAudioFrame() != null) {
                            System.out.println("Flowable收到音频大小:" + result.getAudioFrame().capacity() + "字节");
                            // Insert audio handling here
                        }
                    });
            System.out.println("Flowable合成完成!RequestId:" + synthesizer.getLastRequestId());
        } finally {
            synthesizer.getDuplexApi().close(1000, "任务结束");
        }
    }
}

四、核心注意事项

  1. 文本长度限制:非流式单次≤2000字符,流式累计≤20万字符(含SSML标签)。
  2. 字符计算规则:汉字=2字符英文/数字/标点/空格=1字符。
  3. 流式调用必须调用streamingComplete(),否则结尾文本无法合成。
  4. 每次调用call()前需重新初始化SpeechSynthesizer实例。
  5. 音频格式需与播放器兼容如MP3/OPUS支持流式播放推荐使用ffmpeg、AudioFormat等工具