语音合成

This commit is contained in:
2025-11-19 22:52:00 +08:00
parent 75abf48bc1
commit cc5401e743
6 changed files with 101 additions and 90 deletions

View File

@@ -137,6 +137,7 @@ const buildPreviewParams = (voice) => {
} }
return { return {
voiceConfigId: configId, voiceConfigId: configId,
inputText: ttsText.value, // 传递用户输入的文本
emotion: emotion.value || 'neutral', emotion: emotion.value || 'neutral',
speechRate: speechRate.value || 1.0, speechRate: speechRate.value || 1.0,
audioFormat: 'mp3' audioFormat: 'mp3'
@@ -144,6 +145,7 @@ const buildPreviewParams = (voice) => {
} else { } else {
return { return {
voiceId: voice.voiceId, voiceId: voice.voiceId,
inputText: ttsText.value, // 传递用户输入的文本
emotion: emotion.value || 'neutral', emotion: emotion.value || 'neutral',
speechRate: speechRate.value || 1.0, speechRate: speechRate.value || 1.0,
audioFormat: 'mp3' audioFormat: 'mp3'
@@ -209,16 +211,29 @@ const handleSynthesizeVoice = async () => {
const playSynthesizedAudio = () => { const playSynthesizedAudio = () => {
// 防止重复点击 // 防止重复点击
if (isPlayingSynthesized.value || !synthesizedAudio.value?.audioUrl) { if (isPlayingSynthesized.value || !synthesizedAudio.value) {
return return
} }
isPlayingSynthesized.value = true isPlayingSynthesized.value = true
playAudioPreview(synthesizedAudio.value.audioUrl, {
onEnded: () => { // 优先使用Base64数据安全方案
if (synthesizedAudio.value.audioBase64) {
playAudioFromBase64(synthesizedAudio.value.audioBase64, synthesizedAudio.value.format, () => {
isPlayingSynthesized.value = false isPlayingSynthesized.value = false
} })
}) }
// 兼容旧的audioUrl方式(已废弃)
else if (synthesizedAudio.value.audioUrl) {
playAudioPreview(synthesizedAudio.value.audioUrl, {
onEnded: () => {
isPlayingSynthesized.value = false
}
})
} else {
message.warning('暂无可播放的音频')
isPlayingSynthesized.value = false
}
} }
// 视频处理 // 视频处理
@@ -329,7 +344,7 @@ const playAudioPreview = (url, options = {}) => {
}) })
} }
const playAudioFromBase64 = (audioBase64, format = 'mp3') => { const playAudioFromBase64 = (audioBase64, format = 'mp3', onEnded = null) => {
try { try {
previewObjectUrl && URL.revokeObjectURL(previewObjectUrl) previewObjectUrl && URL.revokeObjectURL(previewObjectUrl)
const byteCharacters = window.atob(audioBase64) const byteCharacters = window.atob(audioBase64)
@@ -340,16 +355,18 @@ const playAudioFromBase64 = (audioBase64, format = 'mp3') => {
const mime = format === 'mp3' ? 'audio/mpeg' : `audio/${format}` const mime = format === 'mp3' ? 'audio/mpeg' : `audio/${format}`
const blob = new Blob([new Uint8Array(byteNumbers)], { type: mime }) const blob = new Blob([new Uint8Array(byteNumbers)], { type: mime })
previewObjectUrl = URL.createObjectURL(blob) previewObjectUrl = URL.createObjectURL(blob)
playAudioPreview(previewObjectUrl, { playAudioPreview(previewObjectUrl, {
revokeOnEnd: true, revokeOnEnd: true,
onEnded: () => { onEnded: () => {
isPlayingPreview.value = false isPlayingPreview.value = false
onEnded && onEnded()
} }
}) })
} catch (error) { } catch (error) {
console.error('Base64播放失败:', error) console.error('Base64播放失败:', error)
isPlayingPreview.value = false isPlayingPreview.value = false
message.error('音频播放失败') message.error('音频播放失败')
onEnded && onEnded()
} }
} }
@@ -395,13 +412,13 @@ let previewObjectUrl = ''
<section class="digital-video-left"> <section class="digital-video-left">
<!-- 文本输入 --> <!-- 文本输入 -->
<div class="tts-section"> <div class="tts-section">
<div class="section-label">文案</div>
<a-textarea <a-textarea
v-model:value="ttsText" v-model:value="ttsText"
placeholder="请输入你想让角色说话的内容" placeholder="请输入你想让角色说话的内容"
:rows="6" :rows="6"
class="tts-textarea" class="tts-textarea"
/> />
<div class="tts-hint"> 试听后可获取准确的说话时长</div>
<!-- 音色选择 --> <!-- 音色选择 -->
<div class="voice-selection"> <div class="voice-selection">
@@ -514,12 +531,12 @@ let previewObjectUrl = ''
<div v-if="synthesizedAudio" class="synth-audio-card"> <div v-if="synthesizedAudio" class="synth-audio-card">
<div class="synth-audio-title">已生成语音</div> <div class="synth-audio-title">已生成语音</div>
<div class="synth-audio-meta"> <div class="synth-audio-meta">
<span>文件编号{{ synthesizedAudio.fileId }}</span>
<span>格式{{ (synthesizedAudio.format || 'mp3').toUpperCase() }}</span> <span>格式{{ (synthesizedAudio.format || 'mp3').toUpperCase() }}</span>
<span v-if="synthesizedAudio.audioBase64">Base64编码</span>
</div> </div>
<div class="synth-audio-actions"> <div class="synth-audio-actions">
<a-button <a-button
size="small" size="small"
:loading="isPlayingSynthesized" :loading="isPlayingSynthesized"
:disabled="isPlayingSynthesized" :disabled="isPlayingSynthesized"
@click="playSynthesizedAudio" @click="playSynthesizedAudio"

View File

@@ -66,16 +66,21 @@ public class CosyVoiceClient {
SpeechSynthesizer synthesizer = null; SpeechSynthesizer synthesizer = null;
try { try {
log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}]", log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}, speechRate={}, emotion={}]",
request.getVoiceId(), request.getVoiceId(),
request.getText().length(), request.getText().length(),
StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel())); StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()),
request.getSpeechRate(),
request.getEmotion());
// 使用 DashScope SDK 构建参数(严格按文档) // 使用 DashScope SDK 构建参数(严格按文档)
// 注意:speechRate 和 volume 需要转换为 int 类型(intValue() 会截断小数部分,例如 1.5 → 1)
SpeechSynthesisParam param = SpeechSynthesisParam.builder() SpeechSynthesisParam param = SpeechSynthesisParam.builder()
.apiKey(properties.getApiKey()) .apiKey(properties.getApiKey())
.model(StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel())) .model(StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()))
.voice(request.getVoiceId()) .voice(request.getVoiceId())
.speechRate(request.getSpeechRate() != null ? request.getSpeechRate().intValue() : 1)
.volume(request.getVolume() != null ? request.getVolume().intValue() : 0)
.build(); .build();
// 初始化合成器(同步调用传 null) // 初始化合成器(同步调用传 null)

View File

@@ -45,6 +45,11 @@ public class CosyVoiceTtsRequest {
*/ */
private Float volume; private Float volume;
/**
* 情感,可选
*/
private String emotion;
/** /**
* 采样率 * 采样率
*/ */

View File

@@ -377,7 +377,8 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
if (StrUtil.isNotBlank(voice.getVoiceId())) { if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[synthesizeVoice][使用复刻音色ID合成配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId()); log.info("[synthesizeVoice][使用复刻音色ID合成配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId(); voiceId = voice.getVoiceId();
transcriptionText = voice.getTranscription(); // 注意:使用 voiceId 时,不依赖 transcriptionText直接使用前端传入的 inputText
transcriptionText = null; // 清除 transcriptionText让 determineSynthesisText 只使用 inputText
} else { } else {
log.info("[synthesizeVoice][使用文件URL合成配音编号({})]", voiceConfigId); log.info("[synthesizeVoice][使用文件URL合成配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL // 获取文件信息用于获取文件URL
@@ -440,6 +441,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
reqVO.getModel(), reqVO.getModel(),
reqVO.getSpeechRate(), reqVO.getSpeechRate(),
reqVO.getVolume(), reqVO.getVolume(),
reqVO.getEmotion(),
reqVO.getSampleRate(), reqVO.getSampleRate(),
reqVO.getAudioFormat(), reqVO.getAudioFormat(),
false false
@@ -447,31 +449,18 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
String format = defaultFormat(ttsResult.getFormat(), reqVO.getAudioFormat()); String format = defaultFormat(ttsResult.getFormat(), reqVO.getAudioFormat());
String finalVoiceId = StrUtil.blankToDefault(voiceId, cosyVoiceProperties.getDefaultVoiceId()); String finalVoiceId = StrUtil.blankToDefault(voiceId, cosyVoiceProperties.getDefaultVoiceId());
ByteArrayMultipartFile multipartFile = new ByteArrayMultipartFile(
"file",
buildFileName(finalVoiceId, format),
resolveContentType(format),
ttsResult.getAudio()
);
Long infraFileId = tikUserFileService.uploadFile(multipartFile, "audio", null);
// 通过infraFileId查询TikUserFileDO获取用户文件ID // 【安全方案】不暴露OSS链接直接返回Base64编码的音频数据
TikUserFileDO userFile = userFileMapper.selectOne( String audioBase64 = Base64.getEncoder().encodeToString(ttsResult.getAudio());
new LambdaQueryWrapperX<TikUserFileDO>() log.info("[synthesizeVoice][合成成功,配音编号({})voiceId({})format({})audioSize={}]",
.eq(TikUserFileDO::getFileId, infraFileId) voiceConfigId, finalVoiceId, format, ttsResult.getAudio().length);
.eq(TikUserFileDO::getUserId, SecurityFrameworkUtils.getLoginUserId())
.orderByDesc(TikUserFileDO::getId)
.last("LIMIT 1"));
if (userFile == null) {
throw exception(VOICE_FILE_NOT_EXISTS, "文件上传成功但未找到用户文件记录");
}
AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO(); AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
respVO.setFileId(infraFileId); // 返回infraFileId保持与原有逻辑一致 respVO.setFileId(null); // 不返回fileId,避免暴露
respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(userFile.getId())); // 使用TikUserFileDO.id获取播放URL respVO.setAudioBase64(audioBase64); // 返回Base64数据前端可直接播放
respVO.setFormat(format); respVO.setFormat(format);
respVO.setSampleRate(ttsResult.getSampleRate()); respVO.setSampleRate(ttsResult.getSampleRate());
respVO.setRequestId(ttsResult.getRequestId()); respVO.setRequestId(""); // 不返回Request ID避免暴露技术细节
respVO.setVoiceId(finalVoiceId); respVO.setVoiceId(finalVoiceId);
saveSynthCache(cacheKey, new SynthCacheEntry( saveSynthCache(cacheKey, new SynthCacheEntry(
@@ -537,9 +526,9 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
if (StrUtil.isNotBlank(voice.getVoiceId())) { if (StrUtil.isNotBlank(voice.getVoiceId())) {
log.info("[previewVoice][使用复刻音色ID试听配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId()); log.info("[previewVoice][使用复刻音色ID试听配音编号({})voice_id({})]", voiceConfigId, voice.getVoiceId());
voiceId = voice.getVoiceId(); voiceId = voice.getVoiceId();
transcriptionText = voice.getTranscription(); // 注意:使用 voiceId 时,不依赖 transcriptionText直接使用前端传入的 inputText
inputText = StrUtil.blankToDefault(reqVO.getInputText(), transcriptionText = null; // 清除 transcriptionText
StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText())); inputText = StrUtil.blankToDefault(reqVO.getInputText(), cosyVoiceProperties.getPreviewText());
} else { } else {
log.info("[previewVoice][使用文件URL试听配音编号({})]", voiceConfigId); log.info("[previewVoice][使用文件URL试听配音编号({})]", voiceConfigId);
// 获取文件信息用于获取文件URL // 获取文件信息用于获取文件URL
@@ -593,14 +582,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
PreviewCacheEntry previewCache = getPreviewCache(cacheKey); PreviewCacheEntry previewCache = getPreviewCache(cacheKey);
if (previewCache != null) { if (previewCache != null) {
log.info("[previewVoice][使用缓存,配音编号({})voiceId({})fileUrl({})cacheKey({})]", log.info("[previewVoice][使用缓存,配音编号({})voiceId({})cacheKey({})]",
voiceConfigId, voiceId, fileUrl, cacheKey); voiceConfigId, voiceId, cacheKey);
// 缓存中存储的是原始URL需要生成预签名URL // 缓存命中直接返回缓存的数据Base64
String cachedUrl = fileApi.presignGetUrl(previewCache.getFileUrl(), PRESIGN_URL_EXPIRATION_SECONDS); String cachedBase64 = previewCache.getAudioBase64();
return buildPreviewResp(previewCache, cachedUrl, voiceId); return buildPreviewResp(cachedBase64, previewCache.getFormat(), voiceId);
} }
log.info("[previewVoice][调用CosyVoice合成配音编号({})voiceId({})fileUrl({}),文本长度({})]", log.info("[previewVoice][调用CosyVoice合成配音编号({})voiceId({})fileUrl({}),文本长度({})]",
voiceConfigId, voiceId, fileUrl, finalText.length()); voiceConfigId, voiceId, fileUrl, finalText.length());
CosyVoiceTtsResult ttsResult = cosyVoiceClient.synthesize(buildTtsRequest( CosyVoiceTtsResult ttsResult = cosyVoiceClient.synthesize(buildTtsRequest(
finalText, finalText,
@@ -610,26 +599,28 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
null, // 使用默认模型 null, // 使用默认模型
speechRate, speechRate,
volume, volume,
emotion,
null, null,
audioFormat, audioFormat,
true true
)); ));
String format = defaultFormat(ttsResult.getFormat(), audioFormat); String format = defaultFormat(ttsResult.getFormat(), audioFormat);
String identifier = StrUtil.isNotBlank(fileUrl) ? "fileUrl" : (StrUtil.isNotBlank(voiceId) ? voiceId : "voice"); String identifier = StrUtil.isNotBlank(voiceId) ? voiceId : "voice";
String objectName = buildFileName(identifier, format); String objectName = buildFileName(identifier, format);
// 上传到OSS返回原始URL不是预签名URL
String resultFileUrl = fileApi.createFile(ttsResult.getAudio(), objectName, "voice/preview", resolveContentType(format)); // 【安全方案】不暴露OSS链接直接返回Base64编码的音频数据
log.info("[previewVoice][合成成功,配音编号({})voiceId({})fileUrl({})resultFileUrl({})format({})]", // 这样前端可直接播放无需额外请求也不会暴露OSS存储信息
voiceConfigId, voiceId, fileUrl, resultFileUrl, format); String audioBase64 = Base64.getEncoder().encodeToString(ttsResult.getAudio());
log.info("[previewVoice][合成成功,配音编号({})voiceId({})format({})audioSize={}]",
// 生成预签名URL用于返回给前端 voiceConfigId, voiceId, format, ttsResult.getAudio().length);
String presignUrl = fileApi.presignGetUrl(resultFileUrl, PRESIGN_URL_EXPIRATION_SECONDS);
// 缓存Base64数据用于提升响应速度
// 缓存中存储原始URL不是预签名URL下次使用时再生成预签名URL PreviewCacheEntry entry = new PreviewCacheEntry(audioBase64, format, ttsResult.getSampleRate(), ttsResult.getRequestId());
PreviewCacheEntry entry = new PreviewCacheEntry(resultFileUrl, format, ttsResult.getSampleRate(), ttsResult.getRequestId());
savePreviewCache(cacheKey, entry); savePreviewCache(cacheKey, entry);
return buildPreviewResp(entry, presignUrl, voiceId);
// 返回Base64数据前端使用 data:audio/...;base64,... 格式播放
return buildPreviewResp(audioBase64, format, voiceId);
} }
private CosyVoiceTtsRequest buildTtsRequest(String text, private CosyVoiceTtsRequest buildTtsRequest(String text,
@@ -639,6 +630,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
String model, String model,
Float speechRate, Float speechRate,
Float volume, Float volume,
String emotion,
Integer sampleRate, Integer sampleRate,
String audioFormat, String audioFormat,
boolean preview) { boolean preview) {
@@ -650,6 +642,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
.model(model) .model(model)
.speechRate(speechRate) .speechRate(speechRate)
.volume(volume) .volume(volume)
.emotion(emotion)
.sampleRate(sampleRate) .sampleRate(sampleRate)
.audioFormat(audioFormat) .audioFormat(audioFormat)
.preview(preview) .preview(preview)
@@ -822,65 +815,48 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
} }
private AppTikVoiceTtsRespVO buildSynthResponseFromCache(AppTikVoiceTtsReqVO reqVO, SynthCacheEntry cache) { private AppTikVoiceTtsRespVO buildSynthResponseFromCache(AppTikVoiceTtsReqVO reqVO, SynthCacheEntry cache) {
byte[] audioBytes = Base64.getDecoder().decode(cache.getAudioBase64()); // 直接使用缓存的Base64数据不上传OSS
String format = defaultFormat(cache.getFormat(), reqVO.getAudioFormat()); String format = defaultFormat(cache.getFormat(), reqVO.getAudioFormat());
String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cache.getVoiceId()); String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cache.getVoiceId());
ByteArrayMultipartFile multipartFile = new ByteArrayMultipartFile(
"file",
buildFileName(voiceId, format),
resolveContentType(format),
audioBytes
);
Long infraFileId = tikUserFileService.uploadFile(multipartFile, "audio", null);
// 通过infraFileId查询TikUserFileDO获取用户文件ID
TikUserFileDO userFile = userFileMapper.selectOne(
new LambdaQueryWrapperX<TikUserFileDO>()
.eq(TikUserFileDO::getFileId, infraFileId)
.eq(TikUserFileDO::getUserId, SecurityFrameworkUtils.getLoginUserId())
.orderByDesc(TikUserFileDO::getId)
.last("LIMIT 1"));
if (userFile == null) {
throw exception(VOICE_FILE_NOT_EXISTS, "文件上传成功但未找到用户文件记录");
}
AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO(); AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
respVO.setFileId(infraFileId); // 返回infraFileId保持与原有逻辑一致 respVO.setFileId(null); // 不返回fileId,避免暴露
respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(userFile.getId())); // 使用TikUserFileDO.id获取播放URL respVO.setAudioBase64(cache.getAudioBase64()); // 返回Base64数据
respVO.setFormat(format); respVO.setFormat(format);
respVO.setSampleRate(cache.getSampleRate()); respVO.setSampleRate(cache.getSampleRate());
respVO.setRequestId(cache.getRequestId()); respVO.setRequestId(""); // 不返回Request ID避免暴露技术细节
respVO.setVoiceId(voiceId); respVO.setVoiceId(voiceId);
return respVO; return respVO;
} }
private AppTikVoicePreviewRespVO buildPreviewResp(PreviewCacheEntry entry, String presignUrl, String voiceId) { private AppTikVoicePreviewRespVO buildPreviewResp(String audioBase64, String format, String voiceId) {
AppTikVoicePreviewRespVO respVO = new AppTikVoicePreviewRespVO(); AppTikVoicePreviewRespVO respVO = new AppTikVoicePreviewRespVO();
respVO.setAudioUrl(presignUrl); respVO.setAudioBase64(audioBase64); // 返回Base64数据前端可直接播放
respVO.setFormat(entry.getFormat()); respVO.setFormat(format);
respVO.setSampleRate(entry.getSampleRate()); // 缓存中不存储其他信息,返回默认值
respVO.setRequestId(entry.getRequestId()); respVO.setSampleRate(24000); // 默认采样率
respVO.setRequestId(""); // 不返回Request ID避免暴露技术细节
respVO.setVoiceId(voiceId); respVO.setVoiceId(voiceId);
return respVO; return respVO;
} }
private static class PreviewCacheEntry { private static class PreviewCacheEntry {
private String fileUrl; private String audioBase64;
private String format; private String format;
private Integer sampleRate; private Integer sampleRate;
private String requestId; private String requestId;
public PreviewCacheEntry() {} public PreviewCacheEntry() {}
public PreviewCacheEntry(String fileUrl, String format, Integer sampleRate, String requestId) { public PreviewCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId) {
this.fileUrl = fileUrl; this.audioBase64 = audioBase64;
this.format = format; this.format = format;
this.sampleRate = sampleRate; this.sampleRate = sampleRate;
this.requestId = requestId; this.requestId = requestId;
} }
public String getFileUrl() { public String getAudioBase64() {
return fileUrl; return audioBase64;
} }
public String getFormat() { public String getFormat() {

View File

@@ -7,7 +7,11 @@ import lombok.Data;
@Schema(description = "音色试听响应") @Schema(description = "音色试听响应")
public class AppTikVoicePreviewRespVO { public class AppTikVoicePreviewRespVO {
@Schema(description = "音频播放地址(预签名 URL") @Schema(description = "音频Base64数据可直接播放使用 data:audio/...;base64,... 格式")
private String audioBase64;
@Schema(description = "音频播放地址(预签名 URL已废弃不推荐使用")
@Deprecated
private String audioUrl; private String audioUrl;
@Schema(description = "音频格式", example = "wav") @Schema(description = "音频格式", example = "wav")

View File

@@ -10,7 +10,11 @@ public class AppTikVoiceTtsRespVO {
@Schema(description = "用户文件编号", example = "1024") @Schema(description = "用户文件编号", example = "1024")
private Long fileId; private Long fileId;
@Schema(description = "音频播放地址(预签名 URL") @Schema(description = "音频Base64数据可直接播放使用 data:audio/...;base64,... 格式")
private String audioBase64;
@Schema(description = "音频播放地址(预签名 URL已废弃不推荐使用")
@Deprecated
private String audioUrl; private String audioUrl;
@Schema(description = "音频格式", example = "mp3") @Schema(description = "音频格式", example = "mp3")