send-stream

This commit is contained in:
wing
2025-11-19 00:15:18 +08:00
parent 33abc33b58
commit eee3206e90
31 changed files with 3000 additions and 0 deletions

View File

@@ -0,0 +1,178 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.springframework.stereotype.Component;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception0;
import static cn.iocoder.yudao.module.tik.enmus.ErrorCodeConstants.VOICE_TTS_FAILED;
/**
 * HTTP client for the DashScope CosyVoice text-to-speech (TTS) service.
 *
 * <p>Serializes a {@link CosyVoiceTtsRequest} into the DashScope JSON payload,
 * posts it to the configured endpoint, and decodes the Base64 audio in the
 * response into a {@link CosyVoiceTtsResult}.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class CosyVoiceClient {

    /** JSON media type used for all request bodies. */
    private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");

    private final CosyVoiceProperties properties;
    private final ObjectMapper objectMapper;

    // Lazily created, shared OkHttp client; volatile for double-checked locking in getHttpClient().
    private volatile OkHttpClient httpClient;

    /**
     * Calls the CosyVoice TTS endpoint synchronously.
     *
     * @param request TTS request; {@code text} is mandatory
     * @return decoded audio plus metadata (format, sample rate, request id, voice id)
     * @throws ServiceException with {@code VOICE_TTS_FAILED} when the client is not
     *         configured, the text is blank, or the remote call fails
     */
    public CosyVoiceTtsResult synthesize(CosyVoiceTtsRequest request) {
        if (!properties.isEnabled()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "未配置 CosyVoice API Key");
        }
        if (request == null || StrUtil.isBlank(request.getText())) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "TTS 文本不能为空");
        }
        try {
            String payload = objectMapper.writeValueAsString(buildPayload(request));
            Request httpRequest = new Request.Builder()
                    .url(properties.getTtsUrl())
                    .addHeader("Authorization", "Bearer " + properties.getApiKey())
                    .addHeader("Content-Type", "application/json")
                    .post(RequestBody.create(payload.getBytes(StandardCharsets.UTF_8), JSON))
                    .build();
            try (Response response = getHttpClient().newCall(httpRequest).execute()) {
                String body = response.body() != null ? response.body().string() : "";
                if (!response.isSuccessful()) {
                    log.error("[CosyVoice][TTS失败][status={}, body={}]", response.code(), body);
                    throw buildException(body);
                }
                return parseTtsResult(body, request);
            }
        } catch (ServiceException ex) {
            // Business exceptions (including those built above) pass through unchanged.
            throw ex;
        } catch (Exception ex) {
            log.error("[CosyVoice][TTS异常]", ex);
            throw exception(VOICE_TTS_FAILED);
        }
    }

    /**
     * Builds the DashScope request payload, falling back to the configured defaults
     * for model, voice, sample rate and audio format.
     */
    private Map<String, Object> buildPayload(CosyVoiceTtsRequest request) {
        Map<String, Object> payload = new HashMap<>();
        String model = StrUtil.blankToDefault(request.getModel(), properties.getDefaultModel());
        payload.put("model", model);
        Map<String, Object> input = new HashMap<>();
        input.put("text", request.getText());
        String voiceId = StrUtil.blankToDefault(request.getVoiceId(), properties.getDefaultVoiceId());
        if (StrUtil.isNotBlank(voiceId)) {
            input.put("voice", voiceId);
        }
        payload.put("input", input);
        Map<String, Object> parameters = new HashMap<>();
        int sampleRate = request.getSampleRate() != null ? request.getSampleRate() : properties.getSampleRate();
        parameters.put("sample_rate", sampleRate);
        String format = StrUtil.blankToDefault(request.getAudioFormat(), properties.getAudioFormat());
        parameters.put("format", format);
        if (request.getSpeechRate() != null) {
            parameters.put("speech_rate", request.getSpeechRate());
        }
        if (request.getVolume() != null) {
            parameters.put("volume", request.getVolume());
        }
        if (request.isPreview()) {
            // Marks preview calls so the server side can rate-limit them separately.
            parameters.put("preview", true);
        }
        payload.put("parameters", parameters);
        return payload;
    }

    /**
     * Parses a successful (HTTP 2xx) response body into a {@link CosyVoiceTtsResult}.
     * Format / sample rate / voice fall back to the request values (and then the
     * configured defaults) when the response omits them.
     */
    private CosyVoiceTtsResult parseTtsResult(String body, CosyVoiceTtsRequest request) throws Exception {
        JsonNode root = objectMapper.readTree(body);
        // Error responses carry a "code" field even with an HTTP success status.
        if (root.has("code")) {
            String message = root.has("message") ? root.get("message").asText() : body;
            log.error("[CosyVoice][TTS失败][code={}, message={}]", root.get("code").asText(), message);
            throw exception0(VOICE_TTS_FAILED.getCode(), message);
        }
        JsonNode audioNode = root.path("output").path("audio");
        if (!audioNode.isArray() || audioNode.isEmpty()) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回的音频为空");
        }
        JsonNode firstAudio = audioNode.get(0);
        String content = firstAudio.path("content").asText();
        if (StrUtil.isBlank(content)) {
            throw exception0(VOICE_TTS_FAILED.getCode(), "CosyVoice 返回空音频内容");
        }
        byte[] audioBytes = Base64.getDecoder().decode(content);
        CosyVoiceTtsResult result = new CosyVoiceTtsResult();
        result.setAudio(audioBytes);
        result.setFormat(firstAudio.path("format").asText(StrUtil.blankToDefault(request.getAudioFormat(), properties.getAudioFormat())));
        result.setSampleRate(firstAudio.path("sample_rate").asInt(request.getSampleRate() != null ? request.getSampleRate() : properties.getSampleRate()));
        result.setRequestId(root.path("request_id").asText());
        result.setVoiceId(firstAudio.path("voice").asText(request.getVoiceId()));
        return result;
    }

    /**
     * Returns the lazily-initialized shared OkHttp client (double-checked locking
     * on the volatile {@link #httpClient} field).
     */
    private OkHttpClient getHttpClient() {
        if (httpClient == null) {
            synchronized (this) {
                if (httpClient == null) {
                    // Use the imported Duration type (was fully-qualified java.time.Duration).
                    Duration connect = defaultDuration(properties.getConnectTimeout(), 10);
                    Duration read = defaultDuration(properties.getReadTimeout(), 60);
                    httpClient = new OkHttpClient.Builder()
                            .connectTimeout(connect.toMillis(), TimeUnit.MILLISECONDS)
                            .readTimeout(read.toMillis(), TimeUnit.MILLISECONDS)
                            .build();
                }
            }
        }
        return httpClient;
    }

    /** Returns {@code duration}, or a {@code seconds}-second default when it is null. */
    private Duration defaultDuration(Duration duration, long seconds) {
        return duration == null ? Duration.ofSeconds(seconds) : duration;
    }

    /**
     * Builds a {@link ServiceException} from an error response body, preferring the
     * top-level "message" field and then "output.message".
     *
     * <p>FIX: the previous implementation used {@code CollUtil.getFirst} over a list
     * of possibly-null values, which returns the first LIST element (often null)
     * rather than the first non-blank message — so "output.message" was never used.
     */
    private ServiceException buildException(String body) {
        try {
            JsonNode root = objectMapper.readTree(body);
            String message = root.path("message").asText(null);
            if (StrUtil.isBlank(message)) {
                message = root.path("output").path("message").asText(null);
            }
            return exception0(VOICE_TTS_FAILED.getCode(), StrUtil.blankToDefault(message, "CosyVoice 调用失败"));
        } catch (Exception ignored) {
            // Body is not JSON; surface it verbatim.
            return exception0(VOICE_TTS_FAILED.getCode(), body);
        }
    }
}

View File

@@ -0,0 +1,141 @@
package cn.iocoder.yudao.module.tik.voice.client;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.framework.common.exception.ServiceException;
import cn.iocoder.yudao.module.tik.voice.client.dto.LatentsyncSubmitRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.LatentsyncSubmitResponse;
import cn.iocoder.yudao.module.tik.voice.config.LatentsyncProperties;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import okhttp3.MediaType;
import okhttp3.OkHttpClient;
import okhttp3.Request;
import okhttp3.RequestBody;
import okhttp3.Response;
import org.springframework.stereotype.Component;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception0;
import static cn.iocoder.yudao.module.tik.enmus.ErrorCodeConstants.LATENTSYNC_SUBMIT_FAILED;
/**
 * HTTP client for the 302.AI "Latentsync" lip-sync API.
 *
 * <p>Submits an (audio URL, video URL) pair to the configured submit endpoint
 * and returns the queued task descriptor.
 */
@Slf4j
@Component
@RequiredArgsConstructor
public class LatentsyncClient {

    /** JSON media type used for all request bodies. */
    private static final MediaType JSON = MediaType.parse("application/json; charset=utf-8");

    private final LatentsyncProperties properties;
    private final ObjectMapper objectMapper;

    // Lazily created, shared OkHttp client; volatile for double-checked locking in getHttpClient().
    private volatile OkHttpClient httpClient;

    /**
     * Submits a lip-sync task.
     *
     * @param request audio/video URLs plus optional guidance scale and seed
     * @return the submit response; {@code requestId} is guaranteed non-blank
     * @throws ServiceException with {@code LATENTSYNC_SUBMIT_FAILED} when the client
     *         is disabled, the request is invalid, or the remote call fails
     */
    public LatentsyncSubmitResponse submitTask(LatentsyncSubmitRequest request) {
        if (!properties.isEnabled()) {
            throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "未配置 Latentsync API Key");
        }
        validateRequest(request);
        Map<String, Object> payload = buildPayload(request);
        try {
            String body = objectMapper.writeValueAsString(payload);
            Request httpRequest = new Request.Builder()
                    .url(properties.getSubmitUrl())
                    .addHeader("Authorization", "Bearer " + properties.getApiKey())
                    .addHeader("Content-Type", "application/json")
                    .post(RequestBody.create(body.getBytes(StandardCharsets.UTF_8), JSON))
                    .build();
            try (Response response = getHttpClient().newCall(httpRequest).execute()) {
                String responseBody = response.body() != null ? response.body().string() : "";
                if (!response.isSuccessful()) {
                    log.error("[Latentsync][submit failed][status={}, body={}]", response.code(), responseBody);
                    throw buildException(responseBody);
                }
                LatentsyncSubmitResponse submitResponse =
                        objectMapper.readValue(responseBody, LatentsyncSubmitResponse.class);
                // A missing requestId means the task was not actually queued.
                if (StrUtil.isBlank(submitResponse.getRequestId())) {
                    log.error("[Latentsync][submit failed][response={}]", responseBody);
                    throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "Latentsync 返回 requestId 为空");
                }
                return submitResponse;
            }
        } catch (ServiceException ex) {
            // Business exceptions (including those built above) pass through unchanged.
            throw ex;
        } catch (Exception ex) {
            log.error("[Latentsync][submit exception]", ex);
            throw exception(LATENTSYNC_SUBMIT_FAILED);
        }
    }

    /** Validates mandatory fields and the guidanceScale range (1-2). */
    private void validateRequest(LatentsyncSubmitRequest request) {
        if (request == null) {
            throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "请求体不能为空");
        }
        if (StrUtil.isBlank(request.getAudioUrl())) {
            throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "音频地址不能为空");
        }
        if (StrUtil.isBlank(request.getVideoUrl())) {
            throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "视频地址不能为空");
        }
        Integer scale = request.getGuidanceScale();
        if (scale != null && (scale < 1 || scale > 2)) {
            throw exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), "guidanceScale 取值范围 1-2");
        }
    }

    /** Builds the JSON payload, applying configured defaults for guidance scale and seed. */
    private Map<String, Object> buildPayload(LatentsyncSubmitRequest request) {
        Map<String, Object> payload = new HashMap<>();
        payload.put("audio_url", request.getAudioUrl());
        payload.put("video_url", request.getVideoUrl());
        Integer scale = request.getGuidanceScale() != null
                ? request.getGuidanceScale() : properties.getDefaultGuidanceScale();
        payload.put("guidance_scale", scale);
        Integer seed = request.getSeed() != null ? request.getSeed() : properties.getDefaultSeed();
        payload.put("seed", seed);
        return payload;
    }

    /**
     * Returns the lazily-initialized shared OkHttp client (double-checked locking
     * on the volatile {@link #httpClient} field).
     */
    private OkHttpClient getHttpClient() {
        if (httpClient == null) {
            synchronized (this) {
                if (httpClient == null) {
                    Duration connect = defaultDuration(properties.getConnectTimeout(), 10);
                    Duration read = defaultDuration(properties.getReadTimeout(), 60);
                    httpClient = new OkHttpClient.Builder()
                            .connectTimeout(connect.toMillis(), TimeUnit.MILLISECONDS)
                            .readTimeout(read.toMillis(), TimeUnit.MILLISECONDS)
                            .build();
                }
            }
        }
        return httpClient;
    }

    /** Returns {@code duration}, or a {@code seconds}-second default when it is null. */
    private Duration defaultDuration(Duration duration, long seconds) {
        return duration == null ? Duration.ofSeconds(seconds) : duration;
    }

    /** Extracts "message" from an error body, falling back to the raw body text. */
    private ServiceException buildException(String body) {
        try {
            JsonNode root = objectMapper.readTree(body);
            String message = root.path("message").asText(body);
            return exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), message);
        } catch (Exception ignored) {
            // Body is not JSON; surface it verbatim.
            return exception0(LATENTSYNC_SUBMIT_FAILED.getCode(), body);
        }
    }
}

View File

@@ -0,0 +1,54 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * CosyVoice TTS request.
 */
@Data
@Builder
public class CosyVoiceTtsRequest {

    /**
     * Text to synthesize (mandatory).
     */
    private String text;

    /**
     * Voice id; optional, falls back to the configured default.
     */
    private String voiceId;

    /**
     * Model name; optional (defaults to "cosyvoice-v2").
     */
    private String model;

    /**
     * Speech rate; optional.
     */
    private Float speechRate;

    /**
     * Volume; optional.
     */
    private Float volume;

    /**
     * Sample rate; optional, falls back to the configured default.
     */
    private Integer sampleRate;

    /**
     * Audio format; optional, falls back to the configured default.
     */
    private String audioFormat;

    /**
     * Whether this is a preview-only call, so the server side can rate-limit it.
     */
    private boolean preview;
}

View File

@@ -0,0 +1,37 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
/**
 * CosyVoice TTS response.
 */
@Data
public class CosyVoiceTtsResult {

    /**
     * Remote request id.
     */
    private String requestId;

    /**
     * Audio format returned by the service.
     */
    private String format;

    /**
     * Sample rate of the returned audio.
     */
    private Integer sampleRate;

    /**
     * Raw (decoded) audio bytes.
     */
    private byte[] audio;

    /**
     * Voice id the audio was synthesized with.
     */
    private String voiceId;
}

View File

@@ -0,0 +1,34 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Builder;
import lombok.Data;
/**
 * Latentsync task submit request.
 */
@Data
@Builder
public class LatentsyncSubmitRequest {

    /**
     * Audio URL (mandatory).
     */
    private String audioUrl;

    /**
     * Video URL (mandatory).
     */
    private String videoUrl;

    /**
     * Lip-sync guidance strength; valid range 1-2 (validated by the client).
     */
    private Integer guidanceScale;

    /**
     * Random seed; optional, falls back to the configured default.
     */
    private Integer seed;
}

View File

@@ -0,0 +1,39 @@
package cn.iocoder.yudao.module.tik.voice.client.dto;
import lombok.Data;
import java.util.Map;
/**
 * Latentsync task submit response.
 *
 * <p>NOTE(review): the remote API looks like it returns snake_case fields
 * (e.g. {@code request_id}); confirm the shared {@code ObjectMapper} uses a
 * snake_case naming strategy, otherwise {@code requestId}/{@code queuePosition}
 * will not be populated and the client's requestId check will always fail.
 */
@Data
public class LatentsyncSubmitResponse {

    /**
     * Log content (not returned by the API yet; reserved).
     */
    private Object logs;

    /**
     * Metrics information.
     */
    private Map<String, Object> metrics;

    /**
     * Position in the processing queue.
     */
    private Integer queuePosition;

    /**
     * Task id; the client treats a blank value as a failed submit.
     */
    private String requestId;

    /**
     * Current task status.
     */
    private String status;
}

View File

@@ -0,0 +1,74 @@
package cn.iocoder.yudao.module.tik.voice.config;
import cn.hutool.core.util.StrUtil;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.time.Duration;
/**
 * CosyVoice (DashScope) configuration, bound from the "yudao.cosyvoice" prefix.
 */
@Data
@Component
@ConfigurationProperties(prefix = "yudao.cosyvoice")
public class CosyVoiceProperties {

    /**
     * DashScope API Key; the client is effectively disabled while this is blank.
     */
    private String apiKey;

    /**
     * Default model name.
     */
    private String defaultModel = "cosyvoice-v2";

    /**
     * Default voice id; optional.
     */
    private String defaultVoiceId;

    /**
     * Default sample rate.
     */
    private Integer sampleRate = 24000;

    /**
     * Default audio format.
     */
    private String audioFormat = "wav";

    /**
     * Default sample text used for voice previews.
     */
    private String previewText = "您好,欢迎体验专属音色。";

    /**
     * TTS endpoint URL.
     */
    private String ttsUrl = "https://dashscope.aliyuncs.com/api/v1/services/audio/tts/speech-synthesis";

    /**
     * HTTP connect timeout.
     */
    private Duration connectTimeout = Duration.ofSeconds(10);

    /**
     * HTTP read timeout.
     */
    private Duration readTimeout = Duration.ofSeconds(60);

    /**
     * Feature switch.
     */
    private boolean enabled = true;

    /**
     * Returns true only when the feature is switched on AND an API key is configured.
     */
    public boolean isEnabled() {
        return enabled && StrUtil.isNotBlank(apiKey);
    }
}

View File

@@ -0,0 +1,78 @@
package cn.iocoder.yudao.module.tik.voice.config;
import cn.hutool.core.util.StrUtil;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.time.Duration;
/**
 * Latentsync (302.AI) configuration, bound from the "tik.latentsync" prefix.
 */
@Data
@Component
@ConfigurationProperties(prefix = "tik.latentsync")
public class LatentsyncProperties {

    /**
     * 302AI API Key.
     *
     * <p>SECURITY FIX: must be supplied via external configuration
     * ({@code tik.latentsync.api-key}); the previous hard-coded default leaked a
     * live credential into source control. While no key is configured,
     * {@link #isEnabled()} returns {@code false} and calls are rejected.
     */
    private String apiKey;

    /**
     * Default (overseas) gateway.
     */
    private String baseUrl = "https://api.302.ai";

    /**
     * Domestic relay gateway.
     */
    private String domesticBaseUrl = "https://api.302ai.cn";

    /**
     * Whether to prefer the domestic gateway.
     */
    private boolean preferDomestic = false;

    /**
     * Submit-task path.
     */
    private String submitPath = "/302/submit/latentsync";

    /**
     * Default guidance_scale value (valid range 1-2).
     */
    private Integer defaultGuidanceScale = 1;

    /**
     * Default random seed.
     */
    private Integer defaultSeed = 8888;

    /**
     * HTTP connect timeout.
     */
    private Duration connectTimeout = Duration.ofSeconds(10);

    /**
     * HTTP read timeout.
     */
    private Duration readTimeout = Duration.ofSeconds(60);

    /**
     * Feature switch.
     */
    private boolean enabled = true;

    /**
     * Returns the full submit URL: the preferred gateway (falling back to
     * {@link #baseUrl} when the preferred one is blank) plus the submit path.
     */
    public String getSubmitUrl() {
        String base = preferDomestic ? domesticBaseUrl : baseUrl;
        return StrUtil.blankToDefault(base, baseUrl) + submitPath;
    }

    /**
     * Returns true only when the feature is switched on AND an API key is configured.
     */
    public boolean isEnabled() {
        return enabled && StrUtil.isNotBlank(apiKey);
    }
}

View File

@@ -0,0 +1,38 @@
package cn.iocoder.yudao.module.tik.voice.controller;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.module.tik.voice.service.LatentsyncService;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitRespVO;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import jakarta.annotation.Resource;
import jakarta.validation.Valid;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
/**
 * User App - Latentsync lip-sync controller.
 */
@Tag(name = "用户 App - Latentsync 口型同步")
@RestController
@RequestMapping("/api/tik/latentsync")
@Validated
public class AppTikLatentsyncController {

    @Resource
    private LatentsyncService latentsyncService;

    /**
     * Submits a 302AI Latentsync lip-sync task and returns the queued task info.
     */
    @PostMapping("/submit")
    @Operation(summary = "提交 302AI Latentsync 口型任务")
    public CommonResult<AppTikLatentsyncSubmitRespVO> submitTask(@Valid @RequestBody AppTikLatentsyncSubmitReqVO reqVO) {
        return success(latentsyncService.submitTask(reqVO));
    }
}

View File

@@ -0,0 +1,95 @@
package cn.iocoder.yudao.module.tik.voice.controller;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.tik.voice.service.TikUserVoiceService;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceCreateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoicePageReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceUpdateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsRespVO;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.tags.Tag;
import jakarta.annotation.Resource;
import jakarta.validation.Valid;
import lombok.extern.slf4j.Slf4j;
import org.springframework.validation.annotation.Validated;
import org.springframework.web.bind.annotation.*;
import static cn.iocoder.yudao.framework.common.pojo.CommonResult.success;
/**
 * User App - voice-over management controller: CRUD, speech recognition,
 * CosyVoice TTS and voice preview.
 *
 * @author 芋道源码
 */
@Tag(name = "用户 App - 配音管理")
@RestController
@RequestMapping("/api/tik/voice")
@Validated
@Slf4j
public class AppTikUserVoiceController {

    @Resource
    private TikUserVoiceService voiceService;

    /** Creates a voice-over record; returns the new record id. */
    @PostMapping("/create")
    @Operation(summary = "创建配音")
    public CommonResult<Long> createVoice(@Valid @RequestBody AppTikUserVoiceCreateReqVO createReqVO) {
        return success(voiceService.createVoice(createReqVO));
    }

    /** Updates a voice-over record (partial update). */
    @PutMapping("/update")
    @Operation(summary = "更新配音")
    public CommonResult<Boolean> updateVoice(@Valid @RequestBody AppTikUserVoiceUpdateReqVO updateReqVO) {
        voiceService.updateVoice(updateReqVO);
        return success(true);
    }

    /** Deletes a voice-over record (and its backing file). */
    @DeleteMapping("/delete")
    @Operation(summary = "删除配音")
    @Parameter(name = "id", description = "配音编号", required = true, example = "1")
    public CommonResult<Boolean> deleteVoice(@RequestParam("id") Long id) {
        voiceService.deleteVoice(id);
        return success(true);
    }

    /** Pages the current user's voice-over records. */
    @GetMapping("/page")
    @Operation(summary = "分页查询配音列表")
    public CommonResult<PageResult<AppTikUserVoiceRespVO>> getVoicePage(@Valid AppTikUserVoicePageReqVO pageReqVO) {
        return success(voiceService.getVoicePage(pageReqVO));
    }

    /** Fetches a single voice-over record. */
    @GetMapping("/get")
    @Operation(summary = "获取单个配音")
    @Parameter(name = "id", description = "配音编号", required = true, example = "1")
    public CommonResult<AppTikUserVoiceRespVO> getVoice(@RequestParam("id") Long id) {
        return success(voiceService.getVoice(id));
    }

    /** Manually triggers speech recognition for a voice-over record. */
    @PostMapping("/transcribe")
    @Operation(summary = "手动触发语音识别")
    @Parameter(name = "id", description = "配音编号", required = true, example = "1")
    public CommonResult<Boolean> transcribeVoice(@RequestParam("id") Long id) {
        voiceService.transcribeVoice(id);
        return success(true);
    }

    /** Synthesizes speech from text via CosyVoice. */
    @PostMapping("/tts")
    @Operation(summary = "CosyVoice 文本转语音")
    public CommonResult<AppTikVoiceTtsRespVO> synthesizeVoice(@Valid @RequestBody AppTikVoiceTtsReqVO reqVO) {
        return success(voiceService.synthesizeVoice(reqVO));
    }

    /** Generates a preview sample of the user's voice. */
    @PostMapping("/preview")
    @Operation(summary = "我的音色试听")
    public CommonResult<AppTikVoicePreviewRespVO> previewVoice(@Valid @RequestBody AppTikVoicePreviewReqVO reqVO) {
        return success(voiceService.previewVoice(reqVO));
    }
}

View File

@@ -0,0 +1,59 @@
package cn.iocoder.yudao.module.tik.voice.dal.dataobject;
import cn.iocoder.yudao.framework.tenant.core.db.TenantBaseDO;
import com.baomidou.mybatisplus.annotation.KeySequence;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.*;
/**
 * User voice-over DO, mapped to table {@code tik_user_voice}.
 *
 * @author 芋道源码
 */
@TableName("tik_user_voice")
@KeySequence("tik_user_voice_seq") // Primary-key sequence for Oracle/PostgreSQL/Kingbase/DB2/H2; unnecessary for MySQL-style auto-increment.
@Data
@EqualsAndHashCode(callSuper = true)
@ToString(callSuper = true)
@Builder
@NoArgsConstructor
@AllArgsConstructor
public class TikUserVoiceDO extends TenantBaseDO {

    /**
     * Voice-over id (primary key).
     */
    @TableId
    private Long id;

    /**
     * Owning user id.
     */
    private Long userId;

    /**
     * Voice-over display name.
     */
    private String name;

    /**
     * Audio file id (references infra_file.id).
     */
    private Long fileId;

    /**
     * Speech-recognition transcript; null/empty means not yet transcribed.
     */
    private String transcription;

    /**
     * Language: zh-CN (Simplified Chinese), zh-TW (Traditional Chinese), en-US (English).
     */
    private String language;

    /**
     * Voice gender: female, male.
     */
    private String gender;

    /**
     * Free-form note.
     */
    private String note;
}

View File

@@ -0,0 +1,26 @@
package cn.iocoder.yudao.module.tik.voice.dal.mysql;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.mybatis.core.mapper.BaseMapperX;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.tik.voice.dal.dataobject.TikUserVoiceDO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoicePageReqVO;
import org.apache.ibatis.annotations.Mapper;
/**
 * User voice-over Mapper.
 *
 * @author 芋道源码
 */
@Mapper
public interface TikUserVoiceMapper extends BaseMapperX<TikUserVoiceDO> {

    /**
     * Pages voice-over records, filtering by owner (exact) and name (fuzzy)
     * when present, newest record first.
     */
    default PageResult<TikUserVoiceDO> selectPage(AppTikUserVoicePageReqVO reqVO) {
        LambdaQueryWrapperX<TikUserVoiceDO> wrapper = new LambdaQueryWrapperX<>();
        wrapper.eqIfPresent(TikUserVoiceDO::getUserId, reqVO.getUserId());
        wrapper.likeIfPresent(TikUserVoiceDO::getName, reqVO.getName());
        wrapper.orderByDesc(TikUserVoiceDO::getId);
        return selectPage(reqVO, wrapper);
    }
}

View File

@@ -0,0 +1,20 @@
package cn.iocoder.yudao.module.tik.voice.service;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitRespVO;
/**
 * Latentsync lip-sync Service.
 */
public interface LatentsyncService {

    /**
     * Submits a 302AI Latentsync task.
     *
     * @param reqVO request VO (audio/video URLs, optional guidance scale and seed)
     * @return task response (request id, status, queue position)
     */
    AppTikLatentsyncSubmitRespVO submitTask(AppTikLatentsyncSubmitReqVO reqVO);
}

View File

@@ -0,0 +1,42 @@
package cn.iocoder.yudao.module.tik.voice.service;
import cn.hutool.core.util.StrUtil;
import cn.iocoder.yudao.module.tik.voice.client.LatentsyncClient;
import cn.iocoder.yudao.module.tik.voice.client.dto.LatentsyncSubmitRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.LatentsyncSubmitResponse;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikLatentsyncSubmitRespVO;
import jakarta.validation.Valid;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;
import org.springframework.validation.annotation.Validated;
/**
 * Latentsync Service implementation: maps App-layer VOs to the client layer
 * and delegates the submit call to {@link LatentsyncClient}.
 */
@Service
@Validated
@RequiredArgsConstructor
public class LatentsyncServiceImpl implements LatentsyncService {

    private final LatentsyncClient latentsyncClient;

    @Override
    public AppTikLatentsyncSubmitRespVO submitTask(@Valid AppTikLatentsyncSubmitReqVO reqVO) {
        LatentsyncSubmitResponse clientResponse = latentsyncClient.submitTask(toClientRequest(reqVO));
        return toRespVO(clientResponse);
    }

    /** Builds the client-layer request from the App-layer VO, trimming both URLs. */
    private LatentsyncSubmitRequest toClientRequest(AppTikLatentsyncSubmitReqVO reqVO) {
        return LatentsyncSubmitRequest.builder()
                .audioUrl(StrUtil.trim(reqVO.getAudioUrl()))
                .videoUrl(StrUtil.trim(reqVO.getVideoUrl()))
                .guidanceScale(reqVO.getGuidanceScale())
                .seed(reqVO.getSeed())
                .build();
    }

    /** Copies the client-layer response fields into the App-layer response VO. */
    private AppTikLatentsyncSubmitRespVO toRespVO(LatentsyncSubmitResponse response) {
        AppTikLatentsyncSubmitRespVO respVO = new AppTikLatentsyncSubmitRespVO();
        respVO.setRequestId(response.getRequestId());
        respVO.setStatus(response.getStatus());
        respVO.setQueuePosition(response.getQueuePosition());
        return respVO;
    }
}

View File

@@ -0,0 +1,75 @@
package cn.iocoder.yudao.module.tik.voice.service;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceCreateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoicePageReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceUpdateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsRespVO;
/**
 * User voice-over Service interface: record CRUD, speech recognition,
 * CosyVoice TTS and voice preview.
 *
 * @author 芋道源码
 */
public interface TikUserVoiceService {

    /**
     * Creates a voice-over record (uploaded file + optional auto transcription).
     *
     * @param createReqVO create request VO
     * @return new voice-over id
     */
    Long createVoice(AppTikUserVoiceCreateReqVO createReqVO);

    /**
     * Updates a voice-over record.
     *
     * @param updateReqVO update request VO
     */
    void updateVoice(AppTikUserVoiceUpdateReqVO updateReqVO);

    /**
     * Deletes a voice-over record.
     *
     * @param id voice-over id
     */
    void deleteVoice(Long id);

    /**
     * Pages voice-over records.
     *
     * @param pageReqVO paging query VO
     * @return voice-over page
     */
    PageResult<AppTikUserVoiceRespVO> getVoicePage(AppTikUserVoicePageReqVO pageReqVO);

    /**
     * Fetches a single voice-over record.
     *
     * @param id voice-over id
     * @return voice-over info
     */
    AppTikUserVoiceRespVO getVoice(Long id);

    /**
     * Manually triggers speech recognition.
     *
     * @param id voice-over id
     */
    void transcribeVoice(Long id);

    /**
     * Synthesizes speech from text via CosyVoice.
     *
     * @param reqVO TTS request VO
     * @return TTS response VO
     */
    AppTikVoiceTtsRespVO synthesizeVoice(AppTikVoiceTtsReqVO reqVO);

    /**
     * Generates a preview sample of the user's voice.
     *
     * @param reqVO preview request VO
     * @return preview response VO
     */
    AppTikVoicePreviewRespVO previewVoice(AppTikVoicePreviewReqVO reqVO);
}

View File

@@ -0,0 +1,864 @@
package cn.iocoder.yudao.module.tik.voice.service;
import cn.hutool.core.collection.CollUtil;
import cn.hutool.core.util.StrUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONArray;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;
import cn.iocoder.yudao.framework.common.pojo.CommonResult;
import cn.iocoder.yudao.framework.common.pojo.PageResult;
import cn.iocoder.yudao.framework.common.util.collection.CollectionUtils;
import cn.iocoder.yudao.framework.common.util.object.BeanUtils;
import cn.iocoder.yudao.framework.security.core.util.SecurityFrameworkUtils;
import cn.iocoder.yudao.module.infra.api.file.FileApi;
import cn.iocoder.yudao.module.infra.dal.dataobject.file.FileDO;
import cn.iocoder.yudao.module.infra.dal.mysql.file.FileMapper;
import cn.iocoder.yudao.module.tik.file.dal.dataobject.TikUserFileDO;
import cn.iocoder.yudao.module.tik.file.dal.mysql.TikUserFileMapper;
import cn.iocoder.yudao.module.tik.file.service.TikUserFileService;
import cn.iocoder.yudao.module.tik.tikhup.service.TikHupService;
import cn.iocoder.yudao.framework.mybatis.core.query.LambdaQueryWrapperX;
import cn.iocoder.yudao.module.tik.voice.client.CosyVoiceClient;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsRequest;
import cn.iocoder.yudao.module.tik.voice.client.dto.CosyVoiceTtsResult;
import cn.iocoder.yudao.module.tik.voice.config.CosyVoiceProperties;
import cn.iocoder.yudao.module.tik.voice.dal.dataobject.TikUserVoiceDO;
import cn.iocoder.yudao.module.tik.voice.dal.mysql.TikUserVoiceMapper;
import cn.iocoder.yudao.module.tik.voice.util.ByteArrayMultipartFile;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceCreateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoicePageReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikUserVoiceUpdateReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoicePreviewRespVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsReqVO;
import cn.iocoder.yudao.module.tik.voice.vo.AppTikVoiceTtsRespVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.scheduling.annotation.Async;
import org.springframework.stereotype.Service;
import org.springframework.transaction.annotation.Transactional;
import org.springframework.validation.annotation.Validated;
import jakarta.annotation.Resource;
import java.util.Arrays;
import java.util.Base64;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import static cn.iocoder.yudao.framework.common.exception.util.ServiceExceptionUtil.exception;
import static cn.iocoder.yudao.module.tik.enmus.ErrorCodeConstants.*;
/**
* 用户配音 Service 实现类
*
* @author 芋道源码
*/
@Service
@Validated
@Slf4j
public class TikUserVoiceServiceImpl implements TikUserVoiceService {
@Resource
private TikUserVoiceMapper voiceMapper;
@Resource
private FileMapper fileMapper;
@Resource
private TikUserFileMapper userFileMapper;
@Resource
private TikUserFileService tikUserFileService;
@Resource
private FileApi fileApi;
@Resource
private TikHupService tikHupService;
@Resource
private CosyVoiceClient cosyVoiceClient;
@Resource
private CosyVoiceProperties cosyVoiceProperties;
@Resource
private StringRedisTemplate stringRedisTemplate;
/** 预签名URL过期时间1小时单位 */
private static final int PRESIGN_URL_EXPIRATION_SECONDS = 3600;
private static final String PREVIEW_CACHE_PREFIX = "tik:voice:preview:";
private static final String SYNTH_CACHE_PREFIX = "tik:voice:tts:";
private static final long PREVIEW_CACHE_TTL_SECONDS = 3600;
private static final long SYNTH_CACHE_TTL_SECONDS = 24 * 3600;
    @Override
    @Transactional(rollbackFor = Exception.class)
    public Long createVoice(AppTikUserVoiceCreateReqVO createReqVO) {
        Long userId = SecurityFrameworkUtils.getLoginUserId();
        // 1. The referenced file must exist
        FileDO fileDO = fileMapper.selectById(createReqVO.getFileId());
        if (fileDO == null) {
            throw exception(VOICE_FILE_NOT_EXISTS);
        }
        // Verify via tik_user_file that the file belongs to this user and to the "voice" category
        TikUserFileDO userFile = userFileMapper.selectOne(new LambdaQueryWrapperX<TikUserFileDO>()
                .eq(TikUserFileDO::getFileId, createReqVO.getFileId())
                .eq(TikUserFileDO::getFileCategory, "voice")
                .eq(TikUserFileDO::getUserId, userId));
        if (userFile == null) {
            throw exception(VOICE_FILE_NOT_EXISTS, "文件不存在或不属于voice分类");
        }
        // 2. Reject a duplicate voice-over name for the same user
        TikUserVoiceDO existingVoice = voiceMapper.selectOne(new LambdaQueryWrapperX<TikUserVoiceDO>()
                .eq(TikUserVoiceDO::getUserId, userId)
                .eq(TikUserVoiceDO::getName, createReqVO.getName())
                .eq(TikUserVoiceDO::getDeleted, false));
        if (existingVoice != null) {
            throw exception(VOICE_NAME_DUPLICATE);
        }
        // 3. Insert the record; language/gender fall back to "zh-CN"/"female"
        TikUserVoiceDO voice = new TikUserVoiceDO()
                .setUserId(userId)
                .setName(createReqVO.getName())
                .setFileId(createReqVO.getFileId())
                .setLanguage(StrUtil.blankToDefault(createReqVO.getLanguage(), "zh-CN"))
                .setGender(StrUtil.blankToDefault(createReqVO.getGender(), "female"))
                .setNote(createReqVO.getNote())
                .setTranscription(null); // null = not transcribed yet
        voiceMapper.insert(voice);
        // 4. When auto-transcribe is requested, start async recognition against a presigned file URL
        if (Boolean.TRUE.equals(createReqVO.getAutoTranscribe())) {
            String fileAccessUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
            log.info("[createVoice][开启自动识别,配音编号({})文件ID({})预签名URL({})]",
                    voice.getId(), fileDO.getId(), fileAccessUrl);
            asyncTranscribeVoice(voice.getId(), fileAccessUrl);
        }
        log.info("[createVoice][用户({})创建配音成功,配音编号({})]", userId, voice.getId());
        return voice.getId();
    }
    @Override
    @Transactional(rollbackFor = Exception.class)
    public void updateVoice(AppTikUserVoiceUpdateReqVO updateReqVO) {
        Long userId = SecurityFrameworkUtils.getLoginUserId();
        // 1. The voice-over must exist and belong to the current user
        TikUserVoiceDO voice = voiceMapper.selectById(updateReqVO.getId());
        if (voice == null || !voice.getUserId().equals(userId)) {
            throw exception(VOICE_NOT_EXISTS);
        }
        // 2. When renaming, reject a name already used by another record of this user
        if (StrUtil.isNotBlank(updateReqVO.getName()) && !updateReqVO.getName().equals(voice.getName())) {
            TikUserVoiceDO existingVoice = voiceMapper.selectOne(new LambdaQueryWrapperX<TikUserVoiceDO>()
                    .eq(TikUserVoiceDO::getUserId, userId)
                    .eq(TikUserVoiceDO::getName, updateReqVO.getName())
                    .eq(TikUserVoiceDO::getDeleted, false)
                    .ne(TikUserVoiceDO::getId, updateReqVO.getId()));
            if (existingVoice != null) {
                throw exception(VOICE_NAME_DUPLICATE);
            }
        }
        // 3. Partial update: only non-blank strings / non-null note & transcription are written
        TikUserVoiceDO updateObj = new TikUserVoiceDO()
                .setId(updateReqVO.getId());
        if (StrUtil.isNotBlank(updateReqVO.getName())) {
            updateObj.setName(updateReqVO.getName());
        }
        if (StrUtil.isNotBlank(updateReqVO.getLanguage())) {
            updateObj.setLanguage(updateReqVO.getLanguage());
        }
        if (StrUtil.isNotBlank(updateReqVO.getGender())) {
            updateObj.setGender(updateReqVO.getGender());
        }
        if (updateReqVO.getNote() != null) {
            updateObj.setNote(updateReqVO.getNote());
        }
        if (updateReqVO.getTranscription() != null) {
            updateObj.setTranscription(updateReqVO.getTranscription());
        }
        voiceMapper.updateById(updateObj);
        log.info("[updateVoice][用户({})更新配音成功,配音编号({})]", userId, updateReqVO.getId());
    }
/**
 * Deletes a voice record owned by the current user together with its backing
 * audio file (user-file record + OSS object).
 */
@Override
@Transactional(rollbackFor = Exception.class)
public void deleteVoice(Long id) {
    Long userId = SecurityFrameworkUtils.getLoginUserId();
    // 1. The voice must exist and belong to the current user
    TikUserVoiceDO voice = voiceMapper.selectById(id);
    if (voice == null || !voice.getUserId().equals(userId)) {
        throw exception(VOICE_NOT_EXISTS);
    }
    // 2. Logically delete the voice record first: this DB write participates in the
    //    transaction, whereas the file deletion below also removes the OSS object,
    //    which cannot be rolled back. Doing the irreversible step last avoids losing
    //    the audio while the voice row still exists if the method fails midway.
    voiceMapper.deleteById(id);
    // 3. Delete the backing audio file (user-file record + OSS object), if any
    TikUserFileDO userFile = userFileMapper.selectOne(new LambdaQueryWrapperX<TikUserFileDO>()
            .eq(TikUserFileDO::getFileId, voice.getFileId())
            .eq(TikUserFileDO::getUserId, userId));
    if (userFile != null) {
        tikUserFileService.deleteFiles(Collections.singletonList(userFile.getId()));
    }
    log.info("[deleteVoice][用户({})删除配音成功,配音编号({})]", userId, id);
}
/**
 * Pages the current user's voices, attaching a short-lived presigned URL for
 * each voice's backing file. Files are batch-loaded to avoid N+1 queries.
 */
@Override
public PageResult<AppTikUserVoiceRespVO> getVoicePage(AppTikUserVoicePageReqVO pageReqVO) {
    // Always scope the query to the current login user, ignoring any client-sent userId
    Long userId = SecurityFrameworkUtils.getLoginUserId();
    pageReqVO.setUserId(userId);
    PageResult<TikUserVoiceDO> pageResult = voiceMapper.selectPage(pageReqVO);
    // Batch-load referenced files up front
    Map<Long, FileDO> fileMap = new HashMap<>();
    if (CollUtil.isNotEmpty(pageResult.getList())) {
        List<Long> fileIds = pageResult.getList().stream()
                .map(TikUserVoiceDO::getFileId)
                .filter(fileId -> fileId != null) // guard against rows without a file
                .distinct()
                .collect(Collectors.toList());
        if (CollUtil.isNotEmpty(fileIds)) {
            fileMap.putAll(fileMapper.selectBatchIds(fileIds).stream()
                    // keep-first merge function: toMap without one throws on duplicate keys
                    .collect(Collectors.toMap(FileDO::getId, file -> file, (first, second) -> first)));
        }
    }
    // Convert to VOs, attaching a presigned URL (1 hour) for each backing file
    return CollectionUtils.convertPage(pageResult, voice -> {
        AppTikUserVoiceRespVO vo = BeanUtils.toBean(voice, AppTikUserVoiceRespVO.class);
        FileDO fileDO = fileMap.get(voice.getFileId());
        if (fileDO != null) {
            vo.setFileUrl(fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS));
        }
        return vo;
    });
}
/**
 * Returns one voice owned by the current user, with a short-lived presigned
 * URL for its backing audio file.
 */
@Override
public AppTikUserVoiceRespVO getVoice(Long id) {
    Long userId = SecurityFrameworkUtils.getLoginUserId();
    // Only the owner may read the voice
    TikUserVoiceDO voice = voiceMapper.selectById(id);
    if (voice == null || !voice.getUserId().equals(userId)) {
        throw exception(VOICE_NOT_EXISTS);
    }
    AppTikUserVoiceRespVO respVO = BeanUtils.toBean(voice, AppTikUserVoiceRespVO.class);
    // Resolve the backing file and expose it through a presigned URL (1 hour)
    FileDO fileDO = fileMapper.selectById(voice.getFileId());
    if (fileDO != null) {
        respVO.setFileUrl(fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS));
    }
    return respVO;
}
/**
 * Triggers speech recognition for a voice owned by the current user.
 */
@Override
@Transactional(rollbackFor = Exception.class)
public void transcribeVoice(Long id) {
    Long userId = SecurityFrameworkUtils.getLoginUserId();
    // Only the owner may trigger transcription
    TikUserVoiceDO voice = voiceMapper.selectById(id);
    if (voice == null || !voice.getUserId().equals(userId)) {
        throw exception(VOICE_NOT_EXISTS);
    }
    FileDO fileDO = fileMapper.selectById(voice.getFileId());
    if (fileDO == null) {
        throw exception(VOICE_FILE_NOT_EXISTS);
    }
    // Hand the ASR service a presigned URL it can fetch.
    // NOTE(review): this is a same-class call, so Spring's @Async proxy on
    // asyncTranscribeVoice is bypassed and the work runs synchronously on this
    // thread — confirm whether that is intended.
    String presignedUrl = fileApi.presignGetUrl(fileDO.getUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
    asyncTranscribeVoice(id, presignedUrl);
}
/**
 * Synthesizes speech for the request, persists the audio as a user file and
 * returns its playable URL.
 * <p>
 * Flow: build final text -> check synthesis cache -> on miss call CosyVoice ->
 * upload audio as a user file -> populate response -> write cache. Note that
 * even on a cache hit a NEW user file is created (see
 * {@link #buildSynthResponseFromCache}); only the CosyVoice call is skipped.
 */
@Override
public AppTikVoiceTtsRespVO synthesizeVoice(AppTikVoiceTtsReqVO reqVO) {
    // Combine transcription + manual input; no fallback text, so blank input throws
    String finalText = determineSynthesisText(
            reqVO.getTranscriptionText(),
            reqVO.getInputText(),
            false);
    finalText = appendEmotion(finalText, reqVO.getEmotion());
    // The cache key covers every parameter that influences the produced audio
    String cacheKey = buildCacheKey(SYNTH_CACHE_PREFIX,
            reqVO.getVoiceId(),
            reqVO.getFileUrl(),
            finalText,
            reqVO.getSpeechRate(),
            reqVO.getVolume(),
            reqVO.getEmotion(),
            reqVO.getAudioFormat(),
            reqVO.getSampleRate());
    SynthCacheEntry synthCache = getSynthCache(cacheKey);
    if (synthCache != null) {
        return buildSynthResponseFromCache(reqVO, synthCache);
    }
    CosyVoiceTtsResult ttsResult = cosyVoiceClient.synthesize(buildTtsRequest(
            finalText,
            reqVO.getVoiceId(),
            reqVO.getModel(),
            reqVO.getSpeechRate(),
            reqVO.getVolume(),
            reqVO.getSampleRate(),
            reqVO.getAudioFormat(),
            false
    ));
    // Prefer the format reported by CosyVoice, then the requested/configured one
    String format = defaultFormat(ttsResult.getFormat(), reqVO.getAudioFormat());
    String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cosyVoiceProperties.getDefaultVoiceId());
    // Persist the audio bytes as a user file so the client gets a durable URL
    ByteArrayMultipartFile multipartFile = new ByteArrayMultipartFile(
            "file",
            buildFileName(voiceId, format),
            resolveContentType(format),
            ttsResult.getAudio()
    );
    Long fileId = tikUserFileService.uploadFile(multipartFile, "audio", null);
    AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
    respVO.setFileId(fileId);
    respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(fileId));
    respVO.setFormat(format);
    respVO.setSampleRate(ttsResult.getSampleRate());
    respVO.setRequestId(ttsResult.getRequestId());
    respVO.setVoiceId(voiceId);
    // Cache the raw audio (base64) so identical requests can skip the TTS call.
    // NOTE(review): full audio payloads are stored in Redis — confirm entry sizes
    // stay within acceptable limits for the deployment.
    saveSynthCache(cacheKey, new SynthCacheEntry(
            Base64.getEncoder().encodeToString(ttsResult.getAudio()),
            format,
            ttsResult.getSampleRate(),
            ttsResult.getRequestId(),
            voiceId
    ));
    return respVO;
}
/**
 * Generates an audition ("试听") clip. Unlike {@link #synthesizeVoice}, the
 * produced audio is stored once under a shared preview path and only its OSS
 * location is cached, so repeated previews reuse the same object.
 */
@Override
public AppTikVoicePreviewRespVO previewVoice(AppTikVoicePreviewReqVO reqVO) {
    // Previews may fall back to the configured sample sentence when no text is given
    String finalText = determineSynthesisText(
            reqVO.getTranscriptionText(),
            reqVO.getInputText(),
            true);
    finalText = appendEmotion(finalText, reqVO.getEmotion());
    // Sample rate is not client-controlled for previews, hence the trailing null
    String cacheKey = buildCacheKey(PREVIEW_CACHE_PREFIX,
            reqVO.getVoiceId(),
            reqVO.getFileUrl(),
            finalText,
            reqVO.getSpeechRate(),
            reqVO.getVolume(),
            reqVO.getEmotion(),
            reqVO.getAudioFormat(),
            null);
    PreviewCacheEntry previewCache = getPreviewCache(cacheKey);
    String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cosyVoiceProperties.getDefaultVoiceId());
    if (previewCache != null) {
        // Cache hit: only a fresh presigned URL is needed
        String cachedUrl = fileApi.presignGetUrl(previewCache.getFileUrl(), PRESIGN_URL_EXPIRATION_SECONDS);
        return buildPreviewResp(previewCache, cachedUrl, voiceId);
    }
    CosyVoiceTtsResult ttsResult = cosyVoiceClient.synthesize(buildTtsRequest(
            finalText,
            reqVO.getVoiceId(),
            reqVO.getModel(),
            reqVO.getSpeechRate(),
            reqVO.getVolume(),
            null,
            reqVO.getAudioFormat(),
            true
    ));
    String format = defaultFormat(ttsResult.getFormat(), reqVO.getAudioFormat());
    // voiceId was already resolved above; the original redundantly recomputed it here
    String objectName = buildFileName(voiceId, format);
    String fileUrl = fileApi.createFile(ttsResult.getAudio(), objectName, "voice/preview", resolveContentType(format));
    String presignUrl = fileApi.presignGetUrl(fileUrl, PRESIGN_URL_EXPIRATION_SECONDS);
    PreviewCacheEntry entry = new PreviewCacheEntry(fileUrl, format, ttsResult.getSampleRate(), ttsResult.getRequestId());
    savePreviewCache(cacheKey, entry);
    return buildPreviewResp(entry, presignUrl, voiceId);
}
/**
 * Maps the individual synthesis parameters onto a CosyVoice client request.
 * Null values are passed through; the client applies its own defaults.
 */
private CosyVoiceTtsRequest buildTtsRequest(String text,
                                            String voiceId,
                                            String model,
                                            Float speechRate,
                                            Float volume,
                                            Integer sampleRate,
                                            String audioFormat,
                                            boolean preview) {
    var builder = CosyVoiceTtsRequest.builder();
    builder.text(text);
    builder.voiceId(voiceId);
    builder.model(model);
    builder.speechRate(speechRate);
    builder.volume(volume);
    builder.sampleRate(sampleRate);
    builder.audioFormat(audioFormat);
    builder.preview(preview);
    return builder.build();
}
/**
 * Picks the audio format by preference: server-reported format, then the
 * requested format, then the configured default.
 */
private String defaultFormat(String responseFormat, String requestFormat) {
    if (StrUtil.isNotBlank(responseFormat)) {
        return responseFormat;
    }
    if (StrUtil.isNotBlank(requestFormat)) {
        return requestFormat;
    }
    return cosyVoiceProperties.getAudioFormat();
}
/**
 * Builds an object name like "{voiceId}-{millis}.{format}", keeping only
 * characters that are safe in OSS object names.
 */
private String buildFileName(String voiceId, String format) {
    String safeVoice = StrUtil.blankToDefault(voiceId, "voice")
            .replaceAll("[^a-zA-Z0-9_-]", "");
    // A voiceId made up entirely of stripped characters (e.g. CJK) would otherwise
    // yield a name starting with a bare "-"
    if (safeVoice.isEmpty()) {
        safeVoice = "voice";
    }
    return safeVoice + "-" + System.currentTimeMillis() + "." + format;
}
/**
 * Resolves the MIME type for an audio format; unknown or null formats fall
 * back to "audio/mpeg".
 */
private String resolveContentType(String format) {
    if (format == null) {
        return "audio/mpeg";
    }
    return switch (format.toLowerCase()) {
        case "wav" -> "audio/wav";
        case "flac" -> "audio/flac";
        // "mp3" and anything unrecognized map to MPEG audio
        default -> "audio/mpeg";
    };
}
/**
 * Builds the text to synthesize from the recognized transcription and the
 * user's manual input, transcription first, joined by a newline.
 *
 * @param allowFallback when true (preview flow), blank input falls back to the
 *                      configured sample sentence instead of failing
 */
private String determineSynthesisText(String transcriptionText, String inputText, boolean allowFallback) {
    String first = StrUtil.isNotBlank(transcriptionText) ? transcriptionText.trim() : "";
    String second = StrUtil.isNotBlank(inputText) ? inputText.trim() : "";
    String combined = (!first.isEmpty() && !second.isEmpty())
            ? first + "\n" + second
            : first + second;
    if (!combined.isEmpty()) {
        return combined;
    }
    if (allowFallback) {
        return cosyVoiceProperties.getPreviewText();
    }
    throw exception(VOICE_TTS_FAILED, "请提供需要合成的文本内容");
}
/**
 * Prefixes the text with an emotion tag for the TTS prompt. Blank text, blank
 * emotion, or "neutral" leave the text untouched.
 */
private String appendEmotion(String text, String emotion) {
    if (StrUtil.isBlank(text)) {
        return text;
    }
    if (StrUtil.isBlank(emotion) || "neutral".equalsIgnoreCase(emotion)) {
        return text;
    }
    // Map known English emotion codes to their Chinese prompt labels;
    // unknown codes are passed through verbatim
    String emotionLabel = switch (emotion.toLowerCase()) {
        case "happy" -> "高兴";
        case "angry" -> "愤怒";
        case "sad" -> "悲伤";
        case "scared" -> "害怕";
        case "disgusted" -> "厌恶";
        case "surprised" -> "惊讶";
        default -> emotion;
    };
    // NOTE(review): the empty "" below looks like a closing "】" lost in transit —
    // confirm the intended tag format is 【情感:...】
    return "【情感:" + emotionLabel + "" + text;
}
/**
 * Derives a Redis cache key from every parameter that influences the produced
 * audio. The payload is SHA-256 hashed so keys stay short and free of
 * characters that are awkward in Redis.
 */
private String buildCacheKey(String prefix,
                             String voiceId,
                             String fileUrl,
                             String text,
                             Float speechRate,
                             Float volume,
                             String emotion,
                             String audioFormat,
                             Integer sampleRate) {
    // Identify the voice by its id, or by the source audio URL for clone requests
    String identifier;
    if (StrUtil.isNotBlank(voiceId)) {
        identifier = voiceId;
    } else {
        identifier = StrUtil.blankToDefault(fileUrl, "no-voice");
    }
    // Null parameters are normalized to their effective defaults
    Object rate = speechRate != null ? speechRate : "1.0";
    Object vol = volume != null ? volume : "0";
    Object sample = sampleRate != null ? sampleRate : cosyVoiceProperties.getSampleRate();
    String payload = StrUtil.join("|",
            identifier,
            text,
            rate,
            vol,
            StrUtil.blankToDefault(emotion, "neutral"),
            StrUtil.blankToDefault(audioFormat, cosyVoiceProperties.getAudioFormat()),
            sample);
    return prefix + cn.hutool.crypto.SecureUtil.sha256(payload);
}
/**
 * Reads a preview cache entry; misses and Redis/parse errors are both treated
 * as "not cached".
 */
private PreviewCacheEntry getPreviewCache(String key) {
    try {
        String json = stringRedisTemplate.opsForValue().get(key);
        return StrUtil.isBlank(json) ? null : JSONUtil.toBean(json, PreviewCacheEntry.class);
    } catch (Exception ex) {
        log.warn("[previewVoice][cache read failed][key={}]", key, ex);
        return null;
    }
}
/**
 * Writes a preview cache entry with a TTL. Best effort: a failed cache write
 * must never break the user-facing call.
 */
private void savePreviewCache(String key, PreviewCacheEntry entry) {
    try {
        String json = JSONUtil.toJsonStr(entry);
        stringRedisTemplate.opsForValue()
                .set(key, json, PREVIEW_CACHE_TTL_SECONDS, TimeUnit.SECONDS);
    } catch (Exception ex) {
        log.warn("[previewVoice][cache write failed][key={}]", key, ex);
    }
}
/**
 * Reads a synthesis cache entry; misses and Redis/parse errors are both
 * treated as "not cached".
 */
private SynthCacheEntry getSynthCache(String key) {
    try {
        String json = stringRedisTemplate.opsForValue().get(key);
        return StrUtil.isBlank(json) ? null : JSONUtil.toBean(json, SynthCacheEntry.class);
    } catch (Exception ex) {
        log.warn("[synthesizeVoice][cache read failed][key={}]", key, ex);
        return null;
    }
}
/**
 * Writes a synthesis cache entry with a TTL. Best effort: a failed cache write
 * must never break the user-facing call.
 */
private void saveSynthCache(String key, SynthCacheEntry entry) {
    try {
        String json = JSONUtil.toJsonStr(entry);
        stringRedisTemplate.opsForValue()
                .set(key, json, SYNTH_CACHE_TTL_SECONDS, TimeUnit.SECONDS);
    } catch (Exception ex) {
        log.warn("[synthesizeVoice][cache write failed][key={}]", key, ex);
    }
}
/**
 * Builds a synthesis response from a cache hit. The cached audio is decoded
 * and uploaded as a NEW user file, so each request still hands back a file
 * owned by the caller; only the CosyVoice call is skipped.
 */
private AppTikVoiceTtsRespVO buildSynthResponseFromCache(AppTikVoiceTtsReqVO reqVO, SynthCacheEntry cache) {
    byte[] audioBytes = Base64.getDecoder().decode(cache.getAudioBase64());
    String format = defaultFormat(cache.getFormat(), reqVO.getAudioFormat());
    String voiceId = StrUtil.blankToDefault(reqVO.getVoiceId(), cache.getVoiceId());
    Long fileId = tikUserFileService.uploadFile(
            new ByteArrayMultipartFile(
                    "file",
                    buildFileName(voiceId, format),
                    resolveContentType(format),
                    audioBytes),
            "audio",
            null);
    AppTikVoiceTtsRespVO respVO = new AppTikVoiceTtsRespVO();
    respVO.setFileId(fileId);
    respVO.setAudioUrl(tikUserFileService.getAudioPlayUrl(fileId));
    respVO.setFormat(format);
    respVO.setSampleRate(cache.getSampleRate());
    respVO.setRequestId(cache.getRequestId());
    respVO.setVoiceId(voiceId);
    return respVO;
}
/**
 * Assembles the preview payload from a cache entry plus a freshly signed URL.
 */
private AppTikVoicePreviewRespVO buildPreviewResp(PreviewCacheEntry entry, String presignUrl, String voiceId) {
    AppTikVoicePreviewRespVO resp = new AppTikVoicePreviewRespVO();
    resp.setVoiceId(voiceId);
    resp.setAudioUrl(presignUrl);
    resp.setFormat(entry.getFormat());
    resp.setSampleRate(entry.getSampleRate());
    resp.setRequestId(entry.getRequestId());
    return resp;
}
/**
 * Redis cache payload for preview results: only the OSS location of the
 * rendered audio is cached. Serialized via hutool {@code JSONUtil}; the
 * no-arg constructor and the setters below are required so deserialization
 * can repopulate the bean regardless of whether field access is supported.
 */
private static class PreviewCacheEntry {

    private String fileUrl;     // permanent OSS path of the preview audio
    private String format;      // audio container format, e.g. "wav"
    private Integer sampleRate; // sample rate reported by CosyVoice
    private String requestId;   // upstream CosyVoice request id

    public PreviewCacheEntry() {}

    public PreviewCacheEntry(String fileUrl, String format, Integer sampleRate, String requestId) {
        this.fileUrl = fileUrl;
        this.format = format;
        this.sampleRate = sampleRate;
        this.requestId = requestId;
    }

    public String getFileUrl() {
        return fileUrl;
    }

    public void setFileUrl(String fileUrl) {
        this.fileUrl = fileUrl;
    }

    public String getFormat() {
        return format;
    }

    public void setFormat(String format) {
        this.format = format;
    }

    public Integer getSampleRate() {
        return sampleRate;
    }

    public void setSampleRate(Integer sampleRate) {
        this.sampleRate = sampleRate;
    }

    public String getRequestId() {
        return requestId;
    }

    public void setRequestId(String requestId) {
        this.requestId = requestId;
    }
}
/**
 * Redis cache payload for full synthesis results: the audio itself is kept
 * base64-encoded so a cache hit can skip the CosyVoice call entirely.
 * Serialized via hutool {@code JSONUtil}; the no-arg constructor and the
 * setters below are required so deserialization can repopulate the bean
 * regardless of whether field access is supported.
 */
private static class SynthCacheEntry {

    private String audioBase64; // base64-encoded audio bytes
    private String format;      // audio container format, e.g. "wav"
    private Integer sampleRate; // sample rate reported by CosyVoice
    private String requestId;   // upstream CosyVoice request id
    private String voiceId;     // voice the audio was rendered with

    public SynthCacheEntry() {}

    public SynthCacheEntry(String audioBase64, String format, Integer sampleRate, String requestId, String voiceId) {
        this.audioBase64 = audioBase64;
        this.format = format;
        this.sampleRate = sampleRate;
        this.requestId = requestId;
        this.voiceId = voiceId;
    }

    public String getAudioBase64() {
        return audioBase64;
    }

    public void setAudioBase64(String audioBase64) {
        this.audioBase64 = audioBase64;
    }

    public String getFormat() {
        return format;
    }

    public void setFormat(String format) {
        this.format = format;
    }

    public Integer getSampleRate() {
        return sampleRate;
    }

    public void setSampleRate(Integer sampleRate) {
        this.sampleRate = sampleRate;
    }

    public String getRequestId() {
        return requestId;
    }

    public void setRequestId(String requestId) {
        this.requestId = requestId;
    }

    public String getVoiceId() {
        return voiceId;
    }

    public void setVoiceId(String voiceId) {
        this.voiceId = voiceId;
    }
}
/**
 * Runs speech recognition for a voice and stores the recognized text on the
 * record. All failures are logged and swallowed so the caller is unaffected.
 * <p>
 * NOTE(review): this method is invoked from within the same class
 * (createVoice / transcribeVoice), so Spring's {@code @Async} proxy is
 * bypassed and those calls run synchronously on the caller's thread —
 * confirm whether it should live in a separate bean or be called via the
 * proxy to actually run asynchronously.
 *
 * @param voiceId voice record id to update with the transcription
 * @param fileUrl presigned URL of the audio file handed to the ASR service
 */
@Async
public void asyncTranscribeVoice(Long voiceId, String fileUrl) {
    try {
        log.info("[asyncTranscribeVoice][开始识别,配音编号({})文件URL({})]", voiceId, fileUrl);
        // videoToCharacters2 accepts a batch of URLs; a single file is submitted here
        Object result = tikHupService.videoToCharacters2(Collections.singletonList(fileUrl));
        // Parse the recognition result into plain text
        String transcription = extractTranscription(result);
        if (StrUtil.isNotBlank(transcription)) {
            // Persist only the transcription field of the voice record
            TikUserVoiceDO updateObj = new TikUserVoiceDO()
                    .setId(voiceId)
                    .setTranscription(transcription);
            voiceMapper.updateById(updateObj);
            log.info("[asyncTranscribeVoice][识别成功,配音编号({}),文本长度({})]", voiceId, transcription.length());
        } else {
            log.warn("[asyncTranscribeVoice][识别结果为空,配音编号({}),返回码({})]",
                    voiceId, result instanceof CommonResult ? ((CommonResult<?>) result).getCode() : "未知");
        }
    } catch (Exception e) {
        // Best effort: recognition failures must not surface to the user flow
        log.error("[asyncTranscribeVoice][识别失败,配音编号({})文件URL({})]", voiceId, fileUrl, e);
    }
}
/**
 * Extracts plain transcription text from the ASR call's return value.
 * A failed {@code CommonResult} or any parse error yields {@code null};
 * otherwise the parsed text is returned, falling back to the payload's
 * {@code toString()} when no text could be parsed.
 *
 * @param result raw return value of {@code TikHupService.videoToCharacters*}
 * @return recognized text, or null when unavailable
 */
private String extractTranscription(Object result) {
    if (result == null) {
        return null;
    }
    try {
        Object payload = result;
        if (result instanceof CommonResult<?> commonResult) {
            // A failed wrapper carries no usable transcription
            if (!commonResult.isSuccess()) {
                log.warn("[extractTranscription][识别失败code({})msg({})]",
                        commonResult.getCode(), commonResult.getMsg());
                return null;
            }
            payload = commonResult.getData();
            if (payload == null) {
                return null;
            }
        }
        String parsed = parseTranscriptionText(payload);
        return StrUtil.isNotBlank(parsed) ? parsed : payload.toString();
    } catch (Exception e) {
        log.warn("[extractTranscription][解析识别结果失败]", e);
        return null;
    }
}
/** JSON field names known to carry transcription text fragments (truly immutable). */
private static final List<String> TRANSCRIPTION_TEXT_KEYS =
        List.of("text", "sentence", "result", "content", "transcript", "output_text", "display_text");
/**
 * Best-effort extraction of readable transcription text from an ASR payload.
 * Non-JSON input is returned verbatim. For JSON, text-bearing keys are
 * collected first; failing that, a Paraformer-style "results" array is
 * followed to its remote "transcription_url" and the referenced transcript
 * is downloaded. On parse failure the raw string is returned unchanged.
 * <p>
 * NOTE(review): dead ends inside the "results" branch return {@code null},
 * while a non-JSONObject root or a parse exception returns the raw string —
 * confirm this asymmetry is intentional.
 */
private String parseTranscriptionText(Object rawData) {
    if (rawData == null) {
        return null;
    }
    // Normalize the payload to a string so both String and bean inputs are handled
    String rawString = rawData instanceof String ? (String) rawData : JSONUtil.toJsonStr(rawData);
    if (StrUtil.isBlank(rawString)) {
        return null;
    }
    // Plain (non-JSON) text is already the transcription
    if (!JSONUtil.isTypeJSON(rawString)) {
        return rawString;
    }
    try {
        Object json = JSONUtil.parse(rawString);
        // First attempt: harvest known text-bearing keys anywhere in the tree
        String localText = extractTextFromJson(json);
        if (StrUtil.isNotBlank(localText)) {
            return localText;
        }
        if (json instanceof JSONObject jsonObject) {
            // Paraformer-style payload: the last element of "results" points at a
            // downloadable transcript
            JSONArray results = jsonObject.getJSONArray("results");
            if (CollUtil.isEmpty(results)) {
                return null;
            }
            Object lastObj = results.get(results.size() - 1);
            if (!(lastObj instanceof JSONObject lastResult)) {
                return null;
            }
            String transcriptionUrl = lastResult.getStr("transcription_url");
            if (StrUtil.isBlank(transcriptionUrl)) {
                return null;
            }
            StringBuilder builder = new StringBuilder();
            appendRemoteTranscription(builder, transcriptionUrl);
            return builder.length() > 0 ? builder.toString().trim() : null;
        }
    } catch (Exception e) {
        log.warn("[parseTranscriptionText][解析Paraformer结果失败]", e);
    }
    // Fallback: hand back the raw payload rather than losing it
    return rawString;
}
/**
 * Downloads a Paraformer-style remote transcript and appends any extracted
 * text to the builder. Blank URLs, empty downloads and text-less payloads are
 * all silently ignored.
 */
private void appendRemoteTranscription(StringBuilder builder, String transcriptionUrl) {
    if (StrUtil.isBlank(transcriptionUrl)) {
        return;
    }
    String body = fetchRemoteTranscription(transcriptionUrl);
    if (StrUtil.isBlank(body)) {
        return;
    }
    String text = extractTextFromJson(JSONUtil.parse(body));
    if (StrUtil.isNotBlank(text)) {
        appendLine(builder, text);
    }
}
/**
 * Collects all text fragments from a parsed JSON tree; returns null when the
 * tree is null or yields no text.
 */
private String extractTextFromJson(Object json) {
    if (json == null) {
        return null;
    }
    StringBuilder collected = new StringBuilder();
    collectTranscriptionText(json, collected);
    if (collected.length() == 0) {
        return null;
    }
    return collected.toString().trim();
}
/**
 * Downloads the remote transcript body. Best effort: failures are logged and
 * reported as "no content" (null), as is an empty body.
 */
private String fetchRemoteTranscription(String url) {
    try {
        String body = HttpUtil.get(url);
        return StrUtil.isNotBlank(body) ? body : null;
    } catch (Exception e) {
        log.warn("[fetchRemoteTranscription][下载转写文本失败url({})]", url, e);
        return null;
    }
}
/**
 * Depth-first walk over a parsed JSON tree, harvesting string values stored
 * under the known text-bearing keys into the builder.
 */
private void collectTranscriptionText(Object node, StringBuilder builder) {
    if (node instanceof JSONObject jsonObject) {
        jsonObject.forEach((key, value) -> {
            if (value instanceof CharSequence && TRANSCRIPTION_TEXT_KEYS.contains(key)) {
                appendLine(builder, value.toString());
            } else if (value instanceof JSONObject || value instanceof JSONArray) {
                collectTranscriptionText(value, builder);
            }
        });
    } else if (node instanceof JSONArray jsonArray) {
        jsonArray.forEach(item -> collectTranscriptionText(item, builder));
    }
}
/**
 * Appends a trimmed, non-blank fragment to the builder, separating fragments
 * with a newline.
 */
private void appendLine(StringBuilder builder, String line) {
    String trimmed = StrUtil.trim(line);
    if (StrUtil.isNotBlank(trimmed)) {
        if (builder.length() > 0) {
            builder.append('\n');
        }
        builder.append(trimmed);
    }
}
}

View File

@@ -0,0 +1,69 @@
package cn.iocoder.yudao.module.tik.voice.util;
import org.springframework.util.FileCopyUtils;
import org.springframework.web.multipart.MultipartFile;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
/**
 * In-memory {@link MultipartFile} used for service-internal uploads (e.g.
 * persisting TTS output) without going through an HTTP request.
 */
public class ByteArrayMultipartFile implements MultipartFile {

    private final String name;
    private final String originalFilename;
    private final String contentType;
    private final byte[] content;

    /**
     * @param name             form field name
     * @param originalFilename file name reported to the storage layer
     * @param contentType      MIME type, may be null
     * @param content          file bytes; null is treated as empty
     */
    public ByteArrayMultipartFile(String name, String originalFilename, String contentType, byte[] content) {
        this.name = name;
        this.originalFilename = originalFilename;
        this.contentType = contentType;
        // Defensive copy: the caller must not be able to mutate the stored bytes later
        this.content = content != null ? content.clone() : new byte[0];
    }

    @Override
    public String getName() {
        return name;
    }

    @Override
    public String getOriginalFilename() {
        return originalFilename;
    }

    @Override
    public String getContentType() {
        return contentType;
    }

    @Override
    public boolean isEmpty() {
        return content.length == 0;
    }

    @Override
    public long getSize() {
        return content.length;
    }

    @Override
    public byte[] getBytes() {
        // Return a copy so the internal buffer stays effectively immutable
        return content.clone();
    }

    @Override
    public InputStream getInputStream() {
        return new ByteArrayInputStream(content);
    }

    @Override
    public void transferTo(File dest) throws IOException {
        FileCopyUtils.copy(content, dest);
    }
}

View File

@@ -0,0 +1,37 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.Max;
import jakarta.validation.constraints.Min;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.Size;
import lombok.Data;
/**
 * Latentsync lip-sync task submission request VO.
 */
@Schema(description = "Latentsync 提交请求") // class-level @Schema for consistency with sibling VOs
@Data
public class AppTikLatentsyncSubmitReqVO {

    @Schema(description = "音频 URL需公网可访问", requiredMode = Schema.RequiredMode.REQUIRED,
            example = "https://example.com/audio.wav")
    @NotBlank(message = "音频地址不能为空")
    @Size(max = 1024, message = "音频地址长度不能超过 1024 字符")
    private String audioUrl;

    @Schema(description = "视频 URL需公网可访问", requiredMode = Schema.RequiredMode.REQUIRED,
            example = "https://example.com/video.mp4")
    @NotBlank(message = "视频地址不能为空")
    @Size(max = 1024, message = "视频地址长度不能超过 1024 字符")
    private String videoUrl;

    @Schema(description = "guidance_scale范围 1-2默认 1", example = "1")
    @Min(value = 1, message = "guidanceScale 不能小于 1")
    @Max(value = 2, message = "guidanceScale 不能大于 2")
    private Integer guidanceScale;

    @Schema(description = "随机种子(默认 8888", example = "8888")
    private Integer seed;
}

View File

@@ -0,0 +1,22 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
/**
 * Latentsync lip-sync task submission response VO.
 */
@Schema(description = "Latentsync 提交响应") // class-level @Schema for consistency with sibling VOs
@Data
public class AppTikLatentsyncSubmitRespVO {

    @Schema(description = "Latentsync 任务 ID", example = "8eed0b9b-6103-4357-a57b-9f135a8c3276")
    private String requestId;

    @Schema(description = "官方状态,如 IN_QUEUE、PROCESSING、SUCCEEDED", example = "IN_QUEUE")
    private String status;

    @Schema(description = "当前排队位置", example = "0")
    private Integer queuePosition;
}

View File

@@ -0,0 +1,38 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.NotBlank;
import jakarta.validation.constraints.NotNull;
import lombok.Data;
/**
 * Member App - create voice request VO.
 *
 * @author 芋道源码
 */
@Schema(description = "用户 App - 创建配音 Request VO")
@Data
public class AppTikUserVoiceCreateReqVO {

    @Schema(description = "配音名称", requiredMode = Schema.RequiredMode.REQUIRED, example = "我的配音")
    @NotBlank(message = "配音名称不能为空")
    private String name;

    // Must reference an already-uploaded file in the "voice" category
    @Schema(description = "音频文件编号(关联 infra_file.id", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
    @NotNull(message = "音频文件编号不能为空")
    private Long fileId;

    // When true, the service kicks off speech recognition right after creation
    @Schema(description = "是否自动识别", example = "false")
    private Boolean autoTranscribe;

    @Schema(description = "语言zh-CN-简体中文zh-TW-繁體中文en-US-English", example = "zh-CN")
    private String language;

    @Schema(description = "音色类型female-女声male-男声", example = "female")
    private String gender;

    @Schema(description = "备注", example = "这是一个测试配音")
    private String note;
}

View File

@@ -0,0 +1,23 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import cn.iocoder.yudao.framework.common.pojo.PageParam;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
/**
* 用户 App - 用户配音分页 Request VO
*
* @author 芋道源码
*/
@Schema(description = "用户 App - 用户配音分页 Request VO")
@Data
public class AppTikUserVoicePageReqVO extends PageParam {
@Schema(description = "用户编号(自动填充,无需传递)")
private Long userId;
@Schema(description = "配音名称(模糊查询)", example = "我的配音")
private String name;
}

View File

@@ -0,0 +1,48 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
import java.time.LocalDateTime;
/**
 * Member App - voice response VO.
 *
 * @author 芋道源码
 */
@Schema(description = "用户 App - 用户配音 Response VO")
@Data
public class AppTikUserVoiceRespVO {

    @Schema(description = "配音编号", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
    private Long id;

    @Schema(description = "配音名称", requiredMode = Schema.RequiredMode.REQUIRED, example = "我的配音")
    private String name;

    @Schema(description = "音频文件编号(关联 infra_file.id", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
    private Long fileId;

    // Populated with a short-lived presigned URL; not stored on the entity
    @Schema(description = "文件访问URL通过 file_id 关联查询获取)")
    private String fileUrl;

    // Null until recognition has run (or when it produced no text)
    @Schema(description = "语音识别内容", example = "这是识别出的文字内容")
    private String transcription;

    @Schema(description = "语言zh-CN-简体中文zh-TW-繁體中文en-US-English", example = "zh-CN")
    private String language;

    @Schema(description = "音色类型female-女声male-男声", example = "female")
    private String gender;

    @Schema(description = "备注", example = "这是一个测试配音")
    private String note;

    @Schema(description = "创建时间", requiredMode = Schema.RequiredMode.REQUIRED)
    private LocalDateTime createTime;

    @Schema(description = "更新时间", requiredMode = Schema.RequiredMode.REQUIRED)
    private LocalDateTime updateTime;
}

View File

@@ -0,0 +1,36 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.NotNull;
import lombok.Data;
/**
 * Member App - update voice request VO. All fields except {@code id} are
 * optional; the service applies only the ones that are provided.
 *
 * @author 芋道源码
 */
@Schema(description = "用户 App - 更新配音 Request VO")
@Data
public class AppTikUserVoiceUpdateReqVO {

    @Schema(description = "配音编号", requiredMode = Schema.RequiredMode.REQUIRED, example = "1")
    @NotNull(message = "配音编号不能为空")
    private Long id;

    @Schema(description = "配音名称", example = "我的配音")
    private String name;

    @Schema(description = "语言zh-CN-简体中文zh-TW-繁體中文en-US-English", example = "zh-CN")
    private String language;

    @Schema(description = "音色类型female-女声male-男声", example = "female")
    private String gender;

    @Schema(description = "备注", example = "这是一个测试配音")
    private String note;

    // Allows manual correction of the automatically recognized text
    @Schema(description = "识别内容", example = "识别文字,可手动编辑")
    private String transcription;
}

View File

@@ -0,0 +1,43 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.Size;
import lombok.Data;
/**
 * Voice audition ("试听") request VO. When both text fields are blank the
 * service falls back to a configured sample sentence.
 */
@Schema(description = "音色试听请求") // class-level @Schema for consistency with the response VO
@Data
public class AppTikVoicePreviewReqVO {

    @Schema(description = "输入文本")
    @Size(max = 4000, message = "输入文本不能超过 4000 个字符")
    private String inputText;

    @Schema(description = "识别文本,用于拼接")
    @Size(max = 4000, message = "识别文本不能超过 4000 个字符")
    private String transcriptionText;

    @Schema(description = "音色 IDCosyVoice voiceId")
    private String voiceId;

    @Schema(description = "音色源音频 OSS 地址(当没有 voiceId 时必传)")
    private String fileUrl;

    @Schema(description = "模型名称,默认 cosyvoice-v2")
    private String model;

    @Schema(description = "语速", example = "1.0")
    private Float speechRate;

    @Schema(description = "音量", example = "0")
    private Float volume;

    @Schema(description = "情感", example = "neutral")
    private String emotion;

    @Schema(description = "音频格式,默认 wav")
    private String audioFormat;
}

View File

@@ -0,0 +1,26 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
/**
 * Voice audition ("试听") response VO.
 */
@Data
@Schema(description = "音色试听响应")
public class AppTikVoicePreviewRespVO {

    // Short-lived presigned URL; expires and must not be persisted by clients
    @Schema(description = "音频播放地址(预签名 URL")
    private String audioUrl;

    @Schema(description = "音频格式", example = "wav")
    private String format;

    @Schema(description = "采样率", example = "24000")
    private Integer sampleRate;

    @Schema(description = "CosyVoice 请求ID")
    private String requestId;

    @Schema(description = "使用的音色 ID")
    private String voiceId;
}

View File

@@ -0,0 +1,46 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import jakarta.validation.constraints.Size;
import lombok.Data;
/**
 * Text-to-speech synthesis request VO. At least one of {@code inputText} /
 * {@code transcriptionText} must be non-blank; the service concatenates them.
 */
@Schema(description = "CosyVoice 文本转语音请求") // class-level @Schema for consistency with the response VO
@Data
public class AppTikVoiceTtsReqVO {

    @Schema(description = "输入文本")
    @Size(max = 4000, message = "输入文本不能超过 4000 个字符")
    private String inputText;

    @Schema(description = "识别文本,用于拼接")
    @Size(max = 4000, message = "识别文本不能超过 4000 个字符")
    private String transcriptionText;

    @Schema(description = "音色 IDCosyVoice voiceId", example = "cosyvoice-v2-myvoice-xxx")
    private String voiceId;

    @Schema(description = "音色源音频 OSS 地址(当没有 voiceId 时必传)")
    private String fileUrl;

    @Schema(description = "模型名称,默认 cosyvoice-v2", example = "cosyvoice-v3")
    private String model;

    @Schema(description = "语速,默认 1.0", example = "1.0")
    private Float speechRate;

    @Schema(description = "情感", example = "happy")
    private String emotion;

    @Schema(description = "音量调节范围 [-10,10]", example = "0")
    private Float volume;

    @Schema(description = "目标采样率,默认 24000")
    private Integer sampleRate;

    @Schema(description = "音频格式,默认 wav可选 mp3")
    private String audioFormat;
}

View File

@@ -0,0 +1,29 @@
package cn.iocoder.yudao.module.tik.voice.vo;
import io.swagger.v3.oas.annotations.media.Schema;
import lombok.Data;
/**
 * Text-to-speech synthesis response VO.
 */
@Data
@Schema(description = "CosyVoice 文本转语音响应")
public class AppTikVoiceTtsRespVO {

    // Id of the user-file record created for the synthesized audio
    @Schema(description = "用户文件编号", example = "1024")
    private Long fileId;

    // Short-lived presigned URL; expires and must not be persisted by clients
    @Schema(description = "音频播放地址(预签名 URL")
    private String audioUrl;

    @Schema(description = "音频格式", example = "mp3")
    private String format;

    @Schema(description = "采样率", example = "24000")
    private Integer sampleRate;

    @Schema(description = "CosyVoice 请求ID")
    private String requestId;

    @Schema(description = "使用的音色 ID")
    private String voiceId;
}