Files
video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js
sion123 9d19437a29 feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐
TTS 阶段将长文本按标点切分为短句,逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线,无 segments 时回退到字数权重估算。同时优化音频上传流程,支持分段音频独立上传 OSS 并在配音时按段映射时间线。
2026-05-01 14:41:28 +08:00

83 lines
2.9 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* Phase: tts — 语音合成(逐句分句生成)
*
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。
* 结果写入 item.segments[],实现字幕与语音精确对齐。
*/
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
async function phaseTts(manifest, manifestPath, options = {}) {
const dir = getManifestDir(manifestPath)
const audioDir = path.join(dir, 'audio')
ensureDir(audioDir)
const { synthesize } = require('../qwen-tts')
const items = manifest.items.filter(it =>
it.status === 'done' && (it.script || it.text) && !it.audio
)
if (items.length === 0) { log('tts', '无待处理 item跳过'); return }
log('tts', `${items.length}`)
for (let i = 0; i < items.length; i++) {
const item = items[i]
const idx = i + 1
const fullText = item.script || item.text
try {
const sentences = splitTextIntoSentences(fullText)
if (sentences.length <= 1) {
// 单句:不需要 segments走原逻辑
const { filePath, duration } = await synthesize(fullText, {
outputDir: audioDir,
id: item.id || idx,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
item.audioDuration = Math.round(duration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
} else {
// 多句:逐句生成,写入 segments
const segments = []
let totalDuration = 0
for (let j = 0; j < sentences.length; j++) {
const sentence = sentences[j]
const segId = `${item.id || idx}_${j + 1}`
const { filePath, duration } = await synthesize(sentence, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
segments.push({
text: sentence,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
duration: Math.round(duration * 1000) / 1000,
})
totalDuration += duration
}
item.segments = segments
item.audio = segments[0].audio
item.audioDuration = Math.round(totalDuration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
}
} catch (err) {
item.status = 'failed'
item.error = `TTS失败: ${err.message}`
log('tts', `[${idx}/${items.length}] 失败: ${err.message}`)
}
saveManifest(manifestPath, manifest)
}
}
// phaseTts is the only export of this module.
module.exports = { phaseTts }