diff --git a/frontend/app/web-gold/src/components/VoiceSelector.vue b/frontend/app/web-gold/src/components/VoiceSelector.vue index 468bac0880..2eb8e21b43 100644 --- a/frontend/app/web-gold/src/components/VoiceSelector.vue +++ b/frontend/app/web-gold/src/components/VoiceSelector.vue @@ -50,7 +50,7 @@ const { setSpeechRate, resetPreviewState } = useTTS({ - provider: TTS_PROVIDERS.QWEN + provider: TTS_PROVIDERS.SILICONFLOW }) // 当前选中的音色ID diff --git a/frontend/app/web-gold/src/composables/useTTS.js b/frontend/app/web-gold/src/composables/useTTS.js index 48ef4f9d2f..34a9b02d1c 100644 --- a/frontend/app/web-gold/src/composables/useTTS.js +++ b/frontend/app/web-gold/src/composables/useTTS.js @@ -10,28 +10,15 @@ import { normalizeProviderType, VOICE_PROVIDER_TYPES } from '@/config/voiceConfi // 兼容旧代码的导出 const TTS_PROVIDERS = VOICE_PROVIDER_TYPES -// 供应商默认配置(使用标准化后的键名) -const DEFAULT_CONFIG = { - cosyvoice: { +const DEFAULT_CONFIG = { apiEndpoint: '/api/tik/voice/tts', audioFormat: 'mp3', supportedFormats: ['mp3', 'wav'] - }, - azure: { - apiEndpoint: '/api/tik/voice/azure/tts', - audioFormat: 'mp3', - supportedFormats: ['mp3', 'wav', 'ogg'] - }, - aws: { - apiEndpoint: '/api/tik/voice/aws/tts', - audioFormat: 'mp3', - supportedFormats: ['mp3', 'wav', 'ogg'] - } } export function useTTS(options = {}) { const { - provider = VOICE_PROVIDER_TYPES.COSYVOICE, + provider = VOICE_PROVIDER_TYPES.SILICONFLOW, customConfig = {} } = options @@ -49,9 +36,7 @@ export function useTTS(options = {}) { // 获取当前供应商配置 const getProviderConfig = () => { - const normalizedProvider = normalizeProviderType(provider) - const config = DEFAULT_CONFIG[normalizedProvider] || DEFAULT_CONFIG.cosyvoice - return { ...config, ...customConfig } + return DEFAULT_CONFIG } /** diff --git a/frontend/app/web-gold/src/config/voiceConfig.js b/frontend/app/web-gold/src/config/voiceConfig.js index 1e100a3795..a89aa7e0ba 100644 --- a/frontend/app/web-gold/src/config/voiceConfig.js +++ b/frontend/app/web-gold/src/config/voiceConfig.js @@ -20,17 +20,14 @@ export const VOICE_PROVIDER_OPTIONS = [ { label: '硅基流动 SiliconFlow', value: VOICE_PROVIDER_TYPES.SILICONFLOW } ] -// 供应商别名映射(兼容旧名称) -export const PROVIDER_ALIAS_MAP = { - [VOICE_PROVIDER_TYPES.QWEN]: VOICE_PROVIDER_TYPES.COSYVOICE -} + /** - * 标准化供应商类型(处理别名映射) + * 标准化供应商类型 */ export function normalizeProviderType(providerType) { if (!providerType) return DEFAULT_VOICE_PROVIDER - return PROVIDER_ALIAS_MAP[providerType] || providerType + return VOICE_PROVIDER_TYPES[providerType] || providerType } /** @@ -41,21 +38,13 @@ export function getProviderLabel(providerType) { return option?.label || providerType } -/** - * 检查供应商是否支持 - */ -export function isProviderSupported(providerType) { - const normalized = normalizeProviderType(providerType) - return Object.values(VOICE_PROVIDER_TYPES).includes(normalized) -} + // 默认导出配置对象 export default { VOICE_PROVIDER_TYPES, DEFAULT_VOICE_PROVIDER, VOICE_PROVIDER_OPTIONS, - PROVIDER_ALIAS_MAP, normalizeProviderType, getProviderLabel, - isProviderSupported } diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java index b8af69d005..0534ef43f7 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceClient.java @@ -57,6 +57,15 @@ public class CosyVoiceClient { if (!config.isEnabled()) { throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key"); } + + // 添加详细的参数检查日志 + String text = request != null ? request.getText() : null; + log.error("[CosyVoice][TTS参数检查][request={}, text={}, voiceId={}, model={}]", + request != null ? "存在" : "为null", + text != null ? "'" + text + "' (长度:" + text.length() + ")" : "为null", + request != null ? request.getVoiceId() : null, + request != null ? request.getModel() : null); + if (request == null || StrUtil.isBlank(request.getText())) { throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空"); } @@ -86,7 +95,9 @@ public class CosyVoiceClient { if (StrUtil.isNotBlank(request.getInstruction())) { param.setInstruction(request.getInstruction()); } - + + log.error("[CosyVoice][SDK参数][param={}, text='{}']", param, request.getText()); + // 初始化合成器(同步调用传 null) synthesizer = new SpeechSynthesizer(param, null); diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceProvider.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceProvider.java index 4797cba7ff..91473c5df0 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceProvider.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/CosyVoiceProvider.java @@ -92,6 +92,9 @@ public class CosyVoiceProvider implements VoiceCloneProvider { .preview(request.isPreview()) .build(); + log.error("[CosyVoiceProvider][构建的cosyRequest][text='{}', voiceId={}, fileUrl={}]", + cosyRequest.getText(), cosyRequest.getVoiceId(), cosyRequest.getFileUrl()); + // 调用底层 Client cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult cosyResult = cosyVoiceClient.synthesize(cosyRequest); diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/SiliconFlowProvider.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/SiliconFlowProvider.java index 1d4535298e..66fb1637ab 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/SiliconFlowProvider.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/SiliconFlowProvider.java @@ -120,7 +120,6 @@ public class SiliconFlowProvider implements VoiceCloneProvider { .input(request.getText()) .voice(request.getVoiceId()) .speed(request.getSpeechRate() != null ? request.getSpeechRate() : 1.0f) - .sampleRate(request.getSampleRate() != null ? request.getSampleRate() : config.getSampleRate()) .responseFormat(getOrDefault(request.getAudioFormat(), config.getAudioFormat())) .build(); @@ -150,7 +149,6 @@ public class SiliconFlowProvider implements VoiceCloneProvider { VoiceTtsResult result = new VoiceTtsResult(); result.setAudio(Base64.getDecoder().decode(base64Audio)); result.setFormat(sfRequest.getResponseFormat()); - result.setSampleRate(sfRequest.getSampleRate()); result.setVoiceId(request.getVoiceId()); log.info("[SiliconFlowProvider][语音合成成功][format={}, audioSize={}]", diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/SiliconFlowTtsRequest.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/SiliconFlowTtsRequest.java index e60e1eaf09..d9add1609e 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/SiliconFlowTtsRequest.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/client/dto/SiliconFlowTtsRequest.java @@ -1,5 +1,6 @@ package cn.iocoder.yudao.module.tik.voice.client.dto; +import com.fasterxml.jackson.annotation.JsonProperty; import lombok.Builder; import lombok.Data; @@ -20,8 +21,9 @@ public class SiliconFlowTtsRequest { private String model; /** - * 待合成文本 + * 待合成文本(API 参数名:input) */ + @JsonProperty("input") private String input; /** @@ -34,14 +36,12 @@ public class SiliconFlowTtsRequest { */ private Float speed; - /** - * 采样率(如 24000) - */ - private Integer sampleRate; + /** - * 响应格式(mp3, wav, pcm) + * 响应格式(mp3, opus, wav, pcm)(API 参数名:response_format) */ + @JsonProperty("response_format") private String responseFormat; } diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java index 09e7a0cf8e..2b7db6aaad 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/service/TikUserVoiceServiceImpl.java @@ -427,7 +427,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService { transcriptionText, reqVO.getInputText(), false); - // 移除appendEmotion调用,情感通过instruction参数传递 String cacheKey = buildCacheKey(SYNTH_CACHE_PREFIX, voiceId, @@ -493,128 +492,75 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService { Long userId = SecurityFrameworkUtils.getLoginUserId(); Long voiceConfigId = reqVO.getVoiceConfigId(); - // 增加请求参数日志 - log.info("[previewVoice][开始试听,请求参数:voiceConfigId={}, voiceId={}, fileUrl={}, userId={}]", - voiceConfigId, reqVO.getVoiceId(), reqVO.getFileUrl(), userId); + log.info("[previewVoice][试听,voiceConfigId={}, voiceId={}, userId={}]", + voiceConfigId, reqVO.getVoiceId(), userId); String voiceId = null; String fileUrl = null; - String transcriptionText = null; - String inputText; + String referenceText = null; - // 1. 如果传入了fileUrl和transcriptionText,直接使用(通过语音URL合成) + // 1. 通过语音URL合成 if (StrUtil.isNotBlank(reqVO.getFileUrl()) && StrUtil.isNotBlank(reqVO.getTranscriptionText())) { - log.info("[previewVoice][使用语音URL合成,用户({})]", userId); - // 如果传入的是预签名URL,提取原始URL(去除查询参数),避免二次签名 String rawFileUrl = extractRawUrl(reqVO.getFileUrl()); - // 如果提取后的URL与原始URL不同,说明是预签名URL,需要重新生成预签名URL - // 否则直接使用(可能是原始URL或公开URL) - if (!rawFileUrl.equals(reqVO.getFileUrl())) { - // 重新生成预签名URL,确保有效期足够长 - fileUrl = fileApi.presignGetUrl(rawFileUrl, PRESIGN_URL_EXPIRATION_SECONDS); - log.info("[previewVoice][检测到预签名URL,已提取原始URL并重新生成预签名URL]"); - } else { - fileUrl = reqVO.getFileUrl(); - } - transcriptionText = reqVO.getTranscriptionText(); - inputText = StrUtil.blankToDefault(reqVO.getInputText(), transcriptionText); + fileUrl = rawFileUrl.equals(reqVO.getFileUrl()) + ? reqVO.getFileUrl() + : fileApi.presignGetUrl(rawFileUrl, PRESIGN_URL_EXPIRATION_SECONDS); + referenceText = reqVO.getTranscriptionText(); } - // 2. 如果有配置ID,根据配置ID查询配音信息(用户配音) + // 2. 用户配音 else if (voiceConfigId != null) { - log.info("[previewVoice][开始试听,配音编号({}),用户({})]", voiceConfigId, userId); - TikUserVoiceDO voice = voiceMapper.selectById(voiceConfigId); - log.info("[previewVoice][查询配音结果:voice={},配音编号={},用户ID={}]", - voice != null ? "存在" : "不存在", voiceConfigId, userId); - - if (voice == null) { - log.warn("[previewVoice][配音不存在,配音编号({}),用户({})]", voiceConfigId, userId); - throw exception(VOICE_NOT_EXISTS, "配音不存在,编号:" + voiceConfigId); - } - if (!voice.getUserId().equals(userId)) { - log.warn("[previewVoice][配音不属于当前用户,配音编号({}),配音用户({}),当前用户({})]", - voiceConfigId, voice.getUserId(), userId); - throw exception(VOICE_NOT_EXISTS, "配音不属于当前用户"); + if (voice == null || !voice.getUserId().equals(userId)) { + throw exception(VOICE_NOT_EXISTS, "配音不存在"); } - // 优先使用复刻的 voice_id,如果不存在则使用文件URL(兼容旧数据) if (StrUtil.isNotBlank(voice.getVoiceId())) { - log.info("[previewVoice][使用复刻音色ID试听,配音编号({}),voice_id({})]", voiceConfigId, voice.getVoiceId()); voiceId = voice.getVoiceId(); - // 注意:使用 voiceId 时,不依赖 transcriptionText,直接使用前端传入的 inputText - transcriptionText = null; // 清除 transcriptionText - inputText = StrUtil.blankToDefault(reqVO.getInputText(), getPreviewText()); } else { - log.info("[previewVoice][使用文件URL试听,配音编号({})]", voiceConfigId); - // 获取文件信息,用于获取文件URL FileDO fileDO = fileMapper.selectById(voice.getFileId()); if (fileDO == null) { throw exception(VOICE_FILE_NOT_EXISTS); } - - // 使用文件URL和识别文本进行合成 fileUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS); - transcriptionText = voice.getTranscription(); - if (StrUtil.isBlank(transcriptionText)) { + referenceText = voice.getTranscription(); + if (StrUtil.isBlank(referenceText)) { throw exception(VOICE_NOT_EXISTS, "配音识别文本为空,请先进行语音识别"); } - inputText = StrUtil.blankToDefault(reqVO.getInputText(), - StrUtil.blankToDefault(transcriptionText, getPreviewText())); } } - // 3. 如果没有配置ID,使用系统配音配置(需要前端传voiceId) + // 3. 系统配音 else { - log.info("[previewVoice][开始试听,使用系统配音配置,用户({})]", userId); voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), getDefaultVoiceId()); if (StrUtil.isBlank(voiceId)) { throw exception(VOICE_NOT_EXISTS, "系统配音音色ID不能为空"); } - inputText = StrUtil.blankToDefault(reqVO.getInputText(), getPreviewText()); } - - String finalText = determineSynthesisText( - transcriptionText, - inputText, - true); - - // 使用请求参数或默认值 + + // 统一处理:使用前端传入的 inputText,否则使用默认试听文本 + String finalText = StrUtil.blankToDefault(reqVO.getInputText(), getPreviewText()); + String instruction = reqVO.getInstruction(); - // 注意:instruction参数现在直接传递给CosyVoice,不再添加到文本中 Float speechRate = reqVO.getSpeechRate() != null ? reqVO.getSpeechRate() : 1.0f; Float volume = reqVO.getVolume() != null ? reqVO.getVolume() : 0f; String audioFormat = StrUtil.blankToDefault(reqVO.getAudioFormat(), "mp3"); - // 构建缓存key(使用fileUrl或voiceId) - String cacheKey = buildCacheKey(PREVIEW_CACHE_PREFIX, - voiceId, - fileUrl, - finalText, - speechRate, - volume, - instruction, - audioFormat, - null); + // 缓存 + String cacheKey = buildCacheKey(PREVIEW_CACHE_PREFIX, voiceId, fileUrl, finalText, + speechRate, volume, instruction, audioFormat, null); PreviewCacheEntry previewCache = getPreviewCache(cacheKey); - if (previewCache != null) { - log.info("[previewVoice][使用缓存,配音编号({}),voiceId({}),cacheKey({})]", - voiceConfigId, voiceId, cacheKey); - // 缓存命中,直接返回缓存的数据(Base64) - String cachedBase64 = previewCache.getAudioBase64(); - return buildPreviewResp(cachedBase64, previewCache.getFormat(), voiceId); + return buildPreviewResp(previewCache.getAudioBase64(), previewCache.getFormat(), voiceId); } - log.info("[previewVoice][调用语音合成服务,配音编号({}),voiceId({}),fileUrl({}),文本长度({}),供应商({})]", - voiceConfigId, voiceId, fileUrl, finalText.length(), reqVO.getProviderType()); - - // 使用 Provider 接口进行 TTS 合成(支持前端选择供应商,不传则使用默认) + // TTS 合成 + log.info("[previewVoice][TTS,voiceId={}, textLen={}]", voiceId, finalText.length()); VoiceCloneProvider provider = voiceProviderFactory.getProvider(reqVO.getProviderType()); VoiceTtsRequest ttsRequest = VoiceTtsRequest.builder() .text(finalText) .voiceId(voiceId) .fileUrl(fileUrl) - .referenceText(transcriptionText) - .model(null) // 使用默认模型 + .referenceText(referenceText) + .model(null) .speechRate(speechRate) .volume(volume) .instruction(instruction) @@ -624,22 +570,13 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService { .build(); VoiceTtsResult ttsResult = provider.synthesize(ttsRequest); - String format = defaultFormat(ttsResult.getFormat(), audioFormat); - String identifier = StrUtil.isNotBlank(voiceId) ? voiceId : "voice"; - String objectName = buildFileName(identifier, format); - - // 【安全方案】不暴露OSS链接,直接返回Base64编码的音频数据 - // 这样前端可直接播放,无需额外请求,也不会暴露OSS存储信息 String audioBase64 = Base64.getEncoder().encodeToString(ttsResult.getAudio()); - log.info("[previewVoice][合成成功,配音编号({}),voiceId({}),format({}),audioSize={}]", - voiceConfigId, voiceId, format, ttsResult.getAudio().length); - // 缓存Base64数据(用于提升响应速度) - PreviewCacheEntry entry = new PreviewCacheEntry(audioBase64, format, ttsResult.getSampleRate(), ttsResult.getRequestId()); - savePreviewCache(cacheKey, entry); + savePreviewCache(cacheKey, new PreviewCacheEntry(audioBase64, format, + ttsResult.getSampleRate(), ttsResult.getRequestId())); - // 返回Base64数据,前端使用 data:audio/...;base64,... 格式播放 + log.info("[previewVoice][成功,voiceId={}, format={}, size={}]", voiceId, format, ttsResult.getAudio().length); return buildPreviewResp(audioBase64, format, voiceId); } @@ -716,25 +653,6 @@ public class TikUserVoiceServiceImpl implements TikUserVoiceService { throw exception(VOICE_TTS_FAILED, "请提供需要合成的文本内容"); } - private String appendEmotion(String text, String emotion) { - if (StrUtil.isBlank(text)) { - return text; - } - if (StrUtil.isBlank(emotion) || "neutral".equalsIgnoreCase(emotion)) { - return text; - } - String emotionLabel = switch (emotion.toLowerCase()) { - case "happy" -> "高兴"; - case "angry" -> "愤怒"; - case "sad" -> "悲伤"; - case "scared" -> "害怕"; - case "disgusted" -> "厌恶"; - case "surprised" -> "惊讶"; - default -> emotion; - }; - return "【情感:" + emotionLabel + "】" + text; - } - /** * 从URL中提取原始URL(去除查询参数和锚点) * diff --git a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewReqVO.java b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewReqVO.java index a10bd5955f..3e6796f5ad 100644 --- a/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewReqVO.java +++ b/yudao-module-tik/src/main/java/cn/iocoder/yudao/module/tik/voice/vo/AppTikVoicePreviewReqVO.java @@ -24,7 +24,7 @@ public class AppTikVoicePreviewReqVO { @Size(max = 4000, message = "语音文本不能超过 4000 个字符") private String transcriptionText; - @Schema(description = "输入文本(可选,如果不传则使用配音的识别文本或默认文本)") + @Schema(description = "输入文本(可选,不传则使用默认试听文本)") @Size(max = 4000, message = "输入文本不能超过 4000 个字符") private String inputText; diff --git a/yudao-server/src/main/resources/application-local.yaml b/yudao-server/src/main/resources/application-local.yaml index 6275812b75..2b163e6661 100644 --- a/yudao-server/src/main/resources/application-local.yaml +++ b/yudao-server/src/main/resources/application-local.yaml @@ -231,7 +231,7 @@ yudao: default-model: cosyvoice-v3-flash siliconflow: enabled: true - api-key: sk-epsakfenqnyzoxhmbucsxlhkdqlcbnimslqoivkshalvdozz + api-key: sk-kcvifijrafkzxsmnxbgxspnxdvjiaawcbyoiqhmfobykynpx base-url: https://api.siliconflow.cn default-model: IndexTeam/IndexTTS-2 ice: diff --git a/yudao-server/src/main/resources/application.yaml b/yudao-server/src/main/resources/application.yaml index a134437bd5..876fcd2c12 100644 --- a/yudao-server/src/main/resources/application.yaml +++ b/yudao-server/src/main/resources/application.yaml @@ -214,7 +214,7 @@ spring: yudao: voice: - default-provider: cosyvoice + default-provider: siliconflow cosyvoice: enabled: true api-key: sk-10c746f8cb8640738f8d6b71af699003 @@ -225,8 +225,8 @@ yudao: tts-url: https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis voice-enrollment-url: https://dashscope.aliyuncs.com/api/v1/services/audio/tts/voice-enrollment siliconflow: - enabled: false - api-key: ${SILICONFLOW_API_KEY:} + enabled: true + api-key: sk-kcvifijrafkzxsmnxbgxspnxdvjiaawcbyoiqhmfobykynpx base-url: https://api.siliconflow.cn default-model: IndexTeam/IndexTTS-2 sample-rate: 24000