feat(video-pipeline): 重构视频流水线,优化成片时间线规则和状态管理

- 引入 manifest.json 作为唯一状态源,所有子 Agent 操作回写 manifest
- 重构 timebuilder 逻辑,支持四种视频适配策略(加速/裁剪/放缓/画面停顿)
- 统一 TTS 阶段输出结构,单句和多句均写入 segments[]
- 重写字幕和配音生成,基于 segments 精确时长实现音画同步
- 新增 confirm 命令支持按 id 范围确认,上传阶段分离图片和视频
- 添加中间产物写入 output/ 目录的约束,清理废弃配置参数
This commit is contained in:
2026-05-02 00:14:40 +08:00
parent b4b92854db
commit 0998fd6ae1
14 changed files with 457 additions and 205 deletions

View File

@@ -2,7 +2,8 @@
* Phase: tts — 语音合成(逐句分句生成)
*
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。
* 结果写入 item.segments[],实现字幕与语音精确对齐。
* 统一写入 item.segments[],单句时数组仅 1 个元素。
* item.audio 指向第一段,item.audioDuration 为累计时长。
*/
const path = require('path')
@@ -29,47 +30,32 @@ async function phaseTts(manifest, manifestPath, options = {}) {
try {
const sentences = splitTextIntoSentences(fullText)
const segments = []
let totalDuration = 0
if (sentences.length <= 1) {
// 单句:不需要 segments,走原逻辑
const { filePath, duration } = await synthesize(fullText, {
for (let j = 0; j < sentences.length; j++) {
const sentence = sentences[j]
const segId = `${item.id || idx}_${j + 1}`
const { filePath, duration } = await synthesize(sentence, {
outputDir: audioDir,
id: item.id || idx,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
item.audioDuration = Math.round(duration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
} else {
// 多句:逐句生成,写入 segments
const segments = []
let totalDuration = 0
for (let j = 0; j < sentences.length; j++) {
const sentence = sentences[j]
const segId = `${item.id || idx}_${j + 1}`
const { filePath, duration } = await synthesize(sentence, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
segments.push({
text: sentence,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
duration: Math.round(duration * 1000) / 1000,
})
totalDuration += duration
}
item.segments = segments
item.audio = segments[0].audio
item.audioDuration = Math.round(totalDuration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
segments.push({
text: sentence,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
duration: Math.round(duration * 1000) / 1000,
})
totalDuration += duration
}
// 统一使用 segments 数组(单句 = 1 元素,多句 = N 元素)
item.segments = segments
item.audio = segments[0].audio
item.audioDuration = Math.round(totalDuration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
} catch (err) {
item.status = 'failed'
item.error = `TTS失败: ${err.message}`