语音合成

2025-11-19 22:52:00 +08:00
parent 75abf48bc1
commit cc5401e743
6 changed files with 101 additions and 90 deletions
--- a/frontend/app/web-gold/src/views/dh/Video.vue
+++ b/frontend/app/web-gold/src/views/dh/Video.vue
@@ -137,6 +137,7 @@ const buildPreviewParams = (voice) => {
    }
    return {
      voiceConfigId: configId,
+      inputText: ttsText.value,  // 传递用户输入的文本
      emotion: emotion.value || 'neutral',
      speechRate: speechRate.value || 1.0,
      audioFormat: 'mp3'
@@ -144,6 +145,7 @@ const buildPreviewParams = (voice) => {
  } else {
    return {
      voiceId: voice.voiceId,
+      inputText: ttsText.value,  // 传递用户输入的文本
      emotion: emotion.value || 'neutral',
      speechRate: speechRate.value || 1.0,
      audioFormat: 'mp3'
@@ -209,16 +211,29 @@ const handleSynthesizeVoice = async () => {

 const playSynthesizedAudio = () => {
  // 防止重复点击
-  if (isPlayingSynthesized.value || !synthesizedAudio.value?.audioUrl) {
+  if (isPlayingSynthesized.value || !synthesizedAudio.value) {  // bail out while a playback is in flight or when nothing has been synthesized
    return
  }
-  
+
  isPlayingSynthesized.value = true
-  playAudioPreview(synthesizedAudio.value.audioUrl, {
-    onEnded: () => {
+
+  // Prefer the Base64 payload (the secure approach); the callback passed below clears the playing flag
+  if (synthesizedAudio.value.audioBase64) {
+    playAudioFromBase64(synthesizedAudio.value.audioBase64, synthesizedAudio.value.format, () => {
      isPlayingSynthesized.value = false
-    }
-  })
+    })
+  }
+  // Fall back to the legacy audioUrl field (deprecated) for responses that still carry a URL
+  else if (synthesizedAudio.value.audioUrl) {
+    playAudioPreview(synthesizedAudio.value.audioUrl, {
+      onEnded: () => {
+        isPlayingSynthesized.value = false
+      }
+    })
+  } else {
+    message.warning('暂无可播放的音频')
+    isPlayingSynthesized.value = false  // neither payload is present: release the flag set above
+  }
 }

 // 视频处理
@@ -329,7 +344,7 @@ const playAudioPreview = (url, options = {}) => {
  })
 }

-const playAudioFromBase64 = (audioBase64, format = 'mp3') => {
+const playAudioFromBase64 = (audioBase64, format = 'mp3', onEnded = null) => {
  try {
    previewObjectUrl && URL.revokeObjectURL(previewObjectUrl)
    const byteCharacters = window.atob(audioBase64)
@@ -340,16 +355,18 @@ const playAudioFromBase64 = (audioBase64, format = 'mp3') => {
    const mime = format === 'mp3' ? 'audio/mpeg' : `audio/${format}`
    const blob = new Blob([new Uint8Array(byteNumbers)], { type: mime })
    previewObjectUrl = URL.createObjectURL(blob)
-    playAudioPreview(previewObjectUrl, { 
+    playAudioPreview(previewObjectUrl, {
      revokeOnEnd: true,
      onEnded: () => {
        isPlayingPreview.value = false
+        onEnded && onEnded()
      }
    })
  } catch (error) {
    console.error('Base64播放失败:', error)
    isPlayingPreview.value = false
    message.error('音频播放失败')
+    onEnded && onEnded()
  }
 }

@@ -395,13 +412,13 @@ let previewObjectUrl = ''
      <section class="digital-video-left">
        <!-- 文本输入 -->
        <div class="tts-section">
+          <div class="section-label">文案</div>
          <a-textarea
            v-model:value="ttsText"
            placeholder="请输入你想让角色说话的内容"
            :rows="6"
            class="tts-textarea"
          />
-          <div class="tts-hint">▶ 试听后可获取准确的说话时长</div>

          <!-- 音色选择 -->
          <div class="voice-selection">
@@ -514,12 +531,12 @@ let previewObjectUrl = ''
          <div v-if="synthesizedAudio" class="synth-audio-card">
            <div class="synth-audio-title">已生成语音</div>
            <div class="synth-audio-meta">
-              <span>文件编号：{{ synthesizedAudio.fileId }}</span>
              <span>格式：{{ (synthesizedAudio.format || 'mp3').toUpperCase() }}</span>
+              <span v-if="synthesizedAudio.audioBase64">Base64编码</span>
            </div>
            <div class="synth-audio-actions">
-              <a-button 
-                size="small" 
+              <a-button
+                size="small"
                :loading="isPlayingSynthesized"
                :disabled="isPlayingSynthesized"
                @click="playSynthesizedAudio"
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java
@@ -66,16 +66,21 @@ public class CosyVoiceClient {

        SpeechSynthesizer synthesizer = null;
        try {
-            log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}]",
+            log.info("[CosyVoice][开始TTS][voiceId={}, textLength={}, model={}, speechRate={}, emotion={}]",
                    request.getVoiceId(),
                    request.getText().length(),
-                    StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()));
+                    StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()),
+                    request.getSpeechRate(),
+                    request.getEmotion());

            // 使用 DashScope SDK 构建参数（严格按文档）
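+            // Note: speechRate and volume are converted to int here. NOTE(review): intValue() truncates fractional rates (1.5 -> 1, 0.8 -> 0) and a null volume falls back to 0 - verify both against the DashScope SpeechSynthesisParam types and ranges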
            SpeechSynthesisParam param = SpeechSynthesisParam.builder()
                    .apiKey(properties.getApiKey())
                    .model(StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel()))
                    .voice(request.getVoiceId())
+                    .speechRate(request.getSpeechRate() != null ? request.getSpeechRate().intValue() : 1)
+                    .volume(request.getVolume() != null ? request.getVolume().intValue() : 0)
                    .build();

            // 初始化合成器（同步调用传 null）
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceTtsRequest.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/CosyVoiceTtsRequest.java
@@ -45,6 +45,11 @@ public class CosyVoiceTtsRequest {
     */
    private Float volume;

+    /**
+     * 情感，可选
+     */
+    private String emotion;
+
    /**
     * 采样率
     */
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java
@@ -377,7 +377,8 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
            if (StrUtil.isNotBlank(voice.getVoiceId())) {
                log.info("[synthesizeVoice][使用复刻音色ID合成，配音编号({})，voice_id({})]", voiceConfigId, voice.getVoiceId());
                voiceId = voice.getVoiceId();
-                transcriptionText = voice.getTranscription();
+                // 注意：使用 voiceId 时，不依赖 transcriptionText，直接使用前端传入的 inputText
+                transcriptionText = null;  // 清除 transcriptionText，让 determineSynthesisText 只使用 inputText
            } else {
                log.info("[synthesizeVoice][使用文件URL合成，配音编号({})]", voiceConfigId);
                // 获取文件信息，用于获取文件URL
@@ -440,6 +441,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                reqVO.getModel(),
                reqVO.getSpeechRate(),
                reqVO.getVolume(),
+                reqVO.getEmotion(),
                reqVO.getSampleRate(),
                reqVO.getAudioFormat(),
                false
@@ -447,31 +449,18 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {

        String format = defaultFormat(ttsResult.getFormat(), reqVO.getAudioFormat());
        String finalVoiceId = StrUtil.blankToDefault(voiceId, cosyVoiceProperties.getDefaultVoiceId());
-        ByteArrayMultipartFile multipartFile = new ByteArrayMultipartFile(
-                "file",
-                buildFileName(finalVoiceId, format),
-                resolveContentType(format),
-                ttsResult.getAudio()
-        );
-        Long infraFileId = tikUserFileService.uploadFile(multipartFile, "audio", null);

-        // 通过infraFileId查询TikUserFileDO，获取用户文件ID
-        TikUserFileDO userFile = userFileMapper.selectOne(
-                new LambdaQueryWrapperX<TikUserFileDO>()
-                        .eq(TikUserFileDO::getFileId, infraFileId)
-                        .eq(TikUserFileDO::getUserId, SecurityFrameworkUtils.getLoginUserId())
-                        .orderByDesc(TikUserFileDO::getId)
-                        .last("LIMIT 1"));
-        if (userFile == null) {
-            throw exception(VOICE_FILE_NOT_EXISTS, "文件上传成功但未找到用户文件记录");
-        }
+        // 【安全方案】不暴露OSS链接，直接返回Base64编码的音频数据
+        String audioBase64 = Base64.getEncoder().encodeToString(ttsResult.getAudio());
+        log.info("[synthesizeVoice][合成成功，配音编号({})，voiceId({})，format({})，audioSize={}]",
+                voiceConfigId, finalVoiceId, format, ttsResult.getAudio().length);

        AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
-        respVO.setFileId(infraFileId); // 返回infraFileId，保持与原有逻辑一致
-        respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(userFile.getId())); // 使用TikUserFileDO.id获取播放URL
+        respVO.setFileId(null);  // 不返回fileId（避免暴露）
+        respVO.setAudioBase64(audioBase64);  // 返回Base64数据，前端可直接播放
        respVO.setFormat(format);
        respVO.setSampleRate(ttsResult.getSampleRate());
-        respVO.setRequestId(ttsResult.getRequestId());
+        respVO.setRequestId("");  // 不返回Request ID（避免暴露技术细节）
        respVO.setVoiceId(finalVoiceId);

        saveSynthCache(cacheKey, new SynthCacheEntry(
@@ -537,9 +526,9 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
            if (StrUtil.isNotBlank(voice.getVoiceId())) {
                log.info("[previewVoice][使用复刻音色ID试听，配音编号({})，voice_id({})]", voiceConfigId, voice.getVoiceId());
                voiceId = voice.getVoiceId();
-                transcriptionText = voice.getTranscription();
-                inputText = StrUtil.blankToDefault(reqVO.getInputText(),
-                        StrUtil.blankToDefault(transcriptionText, cosyVoiceProperties.getPreviewText()));
+                // 注意：使用 voiceId 时，不依赖 transcriptionText，直接使用前端传入的 inputText
+                transcriptionText = null;  // 清除 transcriptionText
+                inputText = StrUtil.blankToDefault(reqVO.getInputText(), cosyVoiceProperties.getPreviewText());
            } else {
                log.info("[previewVoice][使用文件URL试听，配音编号({})]", voiceConfigId);
                // 获取文件信息，用于获取文件URL
@@ -593,14 +582,14 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
        PreviewCacheEntry previewCache = getPreviewCache(cacheKey);

        if (previewCache != null) {
-            log.info("[previewVoice][使用缓存，配音编号({})，voiceId({})，fileUrl({})，cacheKey({})]", 
-                    voiceConfigId, voiceId, fileUrl, cacheKey);
-            // 缓存中存储的是原始URL，需要生成预签名URL
-            String cachedUrl = fileApi.presignGetUrl(previewCache.getFileUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
-            return buildPreviewResp(previewCache, cachedUrl, voiceId);
+            log.info("[previewVoice][使用缓存，配音编号({})，voiceId({})，cacheKey({})]",
+                    voiceConfigId, voiceId, cacheKey);
+            // 缓存命中，直接返回缓存的数据（Base64）
+            String cachedBase64 = previewCache.getAudioBase64();
+            return buildPreviewResp(cachedBase64, previewCache.getFormat(), voiceId);
        }

-        log.info("[previewVoice][调用CosyVoice合成，配音编号({})，voiceId({})，fileUrl({})，文本长度({})]", 
+        log.info("[previewVoice][调用CosyVoice合成，配音编号({})，voiceId({})，fileUrl({})，文本长度({})]",
                voiceConfigId, voiceId, fileUrl, finalText.length());
        CosyVoiceTtsResult ttsResult = cosyVoiceClient.synthesize(buildTtsRequest(
                finalText,
@@ -610,26 +599,28 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                null, // 使用默认模型
                speechRate,
                volume,
+                emotion,
                null,
                audioFormat,
                true
        ));

        String format = defaultFormat(ttsResult.getFormat(), audioFormat);
-        String identifier = StrUtil.isNotBlank(fileUrl) ? "fileUrl" : (StrUtil.isNotBlank(voiceId) ? voiceId : "voice");
+        String identifier = StrUtil.isNotBlank(voiceId) ? voiceId : "voice";
        String objectName = buildFileName(identifier, format);
-        // 上传到OSS，返回原始URL（不是预签名URL）
-        String resultFileUrl = fileApi.createFile(ttsResult.getAudio(), objectName, "voice/preview", resolveContentType(format));
-        log.info("[previewVoice][合成成功，配音编号({})，voiceId({})，fileUrl({})，resultFileUrl({})，format({})]", 
-                voiceConfigId, voiceId, fileUrl, resultFileUrl, format);
-        
-        // 生成预签名URL用于返回给前端
-        String presignUrl = fileApi.presignGetUrl(resultFileUrl, PRESIGN_URL_EXPIRATION_SECONDS);
-        
-        // 缓存中存储原始URL（不是预签名URL），下次使用时再生成预签名URL
-        PreviewCacheEntry entry = new PreviewCacheEntry(resultFileUrl, format, ttsResult.getSampleRate(), ttsResult.getRequestId());
+
+        // 【安全方案】不暴露OSS链接，直接返回Base64编码的音频数据
+        // 这样前端可直接播放，无需额外请求，也不会暴露OSS存储信息
+        String audioBase64 = Base64.getEncoder().encodeToString(ttsResult.getAudio());
+        log.info("[previewVoice][合成成功，配音编号({})，voiceId({})，format({})，audioSize={}]",
+                voiceConfigId, voiceId, format, ttsResult.getAudio().length);
+
+        // 缓存Base64数据（用于提升响应速度）
+        PreviewCacheEntry entry = new PreviewCacheEntry(audioBase64, format, ttsResult.getSampleRate(), ttsResult.getRequestId());
        savePreviewCache(cacheKey, entry);
-        return buildPreviewResp(entry, presignUrl, voiceId);
+
+        // Return the Base64 data; Video.vue's playAudioFromBase64 decodes it with atob into a Blob object URL for playback (not a data: URI)
+        return buildPreviewResp(audioBase64, format, voiceId);
    }

    private CosyVoiceTtsRequest buildTtsRequest(String text,
@@ -639,6 +630,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                                                String model,
                                                Float speechRate,
                                                Float volume,
+                                                String emotion,
                                                Integer sampleRate,
                                                String audioFormat,
                                                boolean preview) {
@@ -650,6 +642,7 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
                .model(model)
                .speechRate(speechRate)
                .volume(volume)
+                .emotion(emotion)
                .sampleRate(sampleRate)
                .audioFormat(audioFormat)
                .preview(preview)
@@ -822,65 +815,48 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService {
    }

    private AppTikVoiceTtsRespVO buildSynthResponseFromCache(AppTikVoiceTtsReqVO reqVO, SynthCacheEntry cache) {
-        byte[] audioBytes = Base64.getDecoder().decode(cache.getAudioBase64());
+        // Build the response straight from the cached Base64 audio; nothing is uploaded to OSS
        String format = defaultFormat(cache.getFormat(), reqVO.getAudioFormat());
        String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cache.getVoiceId());
-        ByteArrayMultipartFile multipartFile = new ByteArrayMultipartFile(
-                "file",
-                buildFileName(voiceId, format),
-                resolveContentType(format),
-                audioBytes
-        );
-        Long infraFileId = tikUserFileService.uploadFile(multipartFile, "audio", null);
-        
-        // 通过infraFileId查询TikUserFileDO，获取用户文件ID
-        TikUserFileDO userFile = userFileMapper.selectOne(
-                new LambdaQueryWrapperX<TikUserFileDO>()
-                        .eq(TikUserFileDO::getFileId, infraFileId)
-                        .eq(TikUserFileDO::getUserId, SecurityFrameworkUtils.getLoginUserId())
-                        .orderByDesc(TikUserFileDO::getId)
-                        .last("LIMIT 1"));
-        if (userFile == null) {
-            throw exception(VOICE_FILE_NOT_EXISTS, "文件上传成功但未找到用户文件记录");
-        }

        AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
-        respVO.setFileId(infraFileId); // 返回infraFileId，保持与原有逻辑一致
-        respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(userFile.getId())); // 使用TikUserFileDO.id获取播放URL
+        respVO.setFileId(null);  // fileId is intentionally not returned (avoid exposing it)
+        respVO.setAudioBase64(cache.getAudioBase64());  // return the cached Base64 audio data as-is
        respVO.setFormat(format);
        respVO.setSampleRate(cache.getSampleRate());
-        respVO.setRequestId(cache.getRequestId());
+        respVO.setRequestId("");  // request ID is withheld (avoid exposing technical details)
        respVO.setVoiceId(voiceId);
        return respVO;
    }

-    private AppTikVoicePreviewRespVO buildPreviewResp(PreviewCacheEntry entry, String presignUrl, String voiceId) {
+    private AppTikVoicePreviewRespVO buildPreviewResp(String audioBase64, String format, String voiceId) {
        AppTikVoicePreviewRespVO respVO = new AppTikVoicePreviewRespVO();
-        respVO.setAudioUrl(presignUrl);
-        respVO.setFormat(entry.getFormat());
-        respVO.setSampleRate(entry.getSampleRate());
-        respVO.setRequestId(entry.getRequestId());
+        respVO.setAudioBase64(audioBase64);  // return the Base64 audio so the frontend can play it directly
+        respVO.setFormat(format);
+        // NOTE(review): this signature receives no sample rate, so a constant is returned; PreviewCacheEntry does keep sampleRate and requestId
+        respVO.setSampleRate(24000);  // fixed default; may differ from the rate actually synthesized - TODO confirm
+        respVO.setRequestId("");  // request ID is withheld (avoid exposing technical details)
        respVO.setVoiceId(voiceId);
        return respVO;
    }

    private static class PreviewCacheEntry {
-        private String fileUrl;
+        private String audioBase64;
        private String format;
        private Integer sampleRate;
        private String requestId;

        public PreviewCacheEntry() {}

-        public PreviewCacheEntry(String fileUrl, String format, Integer sampleRate, String requestId) {
-            this.fileUrl = fileUrl;
+        public PreviewCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId) {
+            this.audioBase64 = audioBase64;
            this.format = format;
            this.sampleRate = sampleRate;
            this.requestId = requestId;
        }

-        public String getFileUrl() {
-            return fileUrl;
+        public String getAudioBase64() {
+            return audioBase64;
        }

        public String getFormat() {
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewRespVO.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewRespVO.java
@@ -7,7 +7,11 @@ import lombok.Data;
@Schema(description = "音色试听响应")
 public class AppTikVoicePreviewRespVO {

-    @Schema(description = "音频播放地址（预签名 URL）")
+    @Schema(description = "音频Base64数据（可直接播放，使用 data:audio/...;base64,... 格式）")
+    private String audioBase64;
+
+    @Schema(description = "音频播放地址（预签名 URL，已废弃，不推荐使用）")
+    @Deprecated
    private String audioUrl;

    @Schema(description = "音频格式", example = "wav")
--- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoiceTtsRespVO.java
+++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoiceTtsRespVO.java
@@ -10,7 +10,11 @@ public class AppTikVoiceTtsRespVO {
    @Schema(description = "用户文件编号", example = "1024")
    private Long fileId;

-    @Schema(description = "音频播放地址（预签名 URL）")
+    @Schema(description = "音频Base64数据（可直接播放，使用 data:audio/...;base64,... 格式）")
+    private String audioBase64;
+
+    @Schema(description = "音频播放地址（预签名 URL，已废弃，不推荐使用）")
+    @Deprecated
    private String audioUrl;

    @Schema(description = "音频格式", example = "mp3")