feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐
TTS 阶段将长文本按标点切分为短句,逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线,无 segments 时回退到字数权重估算。同时优化音频上传流程,支持分段音频独立上传 OSS 并在配音时按段映射时间线。
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
/**
|
||||
* Phase: tts — 语音合成
|
||||
* Phase: tts — 语音合成(逐句分句生成)
|
||||
*
|
||||
* 使用通义千问 TTS 生成旁白音频
|
||||
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。
|
||||
* 结果写入 item.segments[],实现字幕与语音精确对齐。
|
||||
*/
|
||||
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
|
||||
|
||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i]
|
||||
const idx = i + 1
|
||||
const fullText = item.script || item.text
|
||||
|
||||
try {
|
||||
const { filePath, duration } = await synthesize(item.script || item.text, {
|
||||
outputDir: audioDir,
|
||||
id: item.id || idx,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = Math.round(duration * 1000) / 1000
|
||||
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
|
||||
const sentences = splitTextIntoSentences(fullText)
|
||||
|
||||
if (sentences.length <= 1) {
|
||||
// 单句:不需要 segments,走原逻辑
|
||||
const { filePath, duration } = await synthesize(fullText, {
|
||||
outputDir: audioDir,
|
||||
id: item.id || idx,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = Math.round(duration * 1000) / 1000
|
||||
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
|
||||
} else {
|
||||
// 多句:逐句生成,写入 segments
|
||||
const segments = []
|
||||
let totalDuration = 0
|
||||
|
||||
for (let j = 0; j < sentences.length; j++) {
|
||||
const sentence = sentences[j]
|
||||
const segId = `${item.id || idx}_${j + 1}`
|
||||
const { filePath, duration } = await synthesize(sentence, {
|
||||
outputDir: audioDir,
|
||||
id: segId,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
segments.push({
|
||||
text: sentence,
|
||||
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
|
||||
duration: Math.round(duration * 1000) / 1000,
|
||||
})
|
||||
totalDuration += duration
|
||||
}
|
||||
|
||||
item.segments = segments
|
||||
item.audio = segments[0].audio
|
||||
item.audioDuration = Math.round(totalDuration * 1000) / 1000
|
||||
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
|
||||
}
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = `TTS失败: ${err.message}`
|
||||
|
||||
Reference in New Issue
Block a user