feat(video-pipeline): 重构视频流水线，优化成片时间线规则和状态管理

- 引入 manifest.json 作为唯一状态源，所有子 Agent 操作回写 manifest
- 重构 timebuilder 逻辑，支持四种视频适配策略（加速/裁剪/放缓/画面停顿）
- 统一 TTS 阶段输出结构，单句和多句均写入 segments[]
- 重写字幕和配音生成，基于 segments 精确时长实现音画同步
- 新增 confirm 命令支持按 id 范围确认，上传阶段分离图片和视频
- 添加中间产物写入 output/ 目录的约束，清理废弃配置参数
2026-05-02 00:14:40 +08:00
parent b4b92854db
commit 0998fd6ae1
14 changed files with 457 additions and 205 deletions
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -2,7 +2,8 @@
 * Phase: tts — 语音合成（逐句分句生成）
 *
 * 将每个 item 的 script 按标点切分为短句，每句单独生成 TTS 音频。
- * 结果写入 item.segments[]，实现字幕与语音精确对齐。
+ * 统一写入 item.segments[]，单句时数组仅 1 个元素。
+ * item.audio 指向第一段，item.audioDuration 为累计时长。
 */

 const path = require('path')
@@ -29,47 +30,32 @@ async function phaseTts(manifest, manifestPath, options = {}) {

    try {
      const sentences = splitTextIntoSentences(fullText)
+      const segments = []
+      let totalDuration = 0

-      if (sentences.length <= 1) {
-        // 单句：不需要 segments，走原逻辑
-        const { filePath, duration } = await synthesize(fullText, {
+      for (let j = 0; j < sentences.length; j++) {
+        const sentence = sentences[j]
+        const segId = `${item.id || idx}_${j + 1}`
+        const { filePath, duration } = await synthesize(sentence, {
          outputDir: audioDir,
-          id: item.id || idx,
+          id: segId,
          voice: manifest.ttsVoice || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: manifest.ttsRate || undefined,
        })
-        item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
-        item.audioDuration = Math.round(duration * 1000) / 1000
-        log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
-      } else {
-        // 多句：逐句生成，写入 segments
-        const segments = []
-        let totalDuration = 0
-
-        for (let j = 0; j < sentences.length; j++) {
-          const sentence = sentences[j]
-          const segId = `${item.id || idx}_${j + 1}`
-          const { filePath, duration } = await synthesize(sentence, {
-            outputDir: audioDir,
-            id: segId,
-            voice: manifest.ttsVoice || undefined,
-            instruction: manifest.ttsInstruction || undefined,
-            rate: manifest.ttsRate || undefined,
-          })
-          segments.push({
-            text: sentence,
-            audio: path.relative(dir, filePath).replace(/\\/g, '/'),
-            duration: Math.round(duration * 1000) / 1000,
-          })
-          totalDuration += duration
-        }
-
-        item.segments = segments
-        item.audio = segments[0].audio
-        item.audioDuration = Math.round(totalDuration * 1000) / 1000
-        log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
+        segments.push({
+          text: sentence,
+          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
+          duration: Math.round(duration * 1000) / 1000,
+        })
+        totalDuration += duration
      }
+
+      // 统一使用 segments 数组（单句 = 1 元素，多句 = N 元素）
+      item.segments = segments
+      item.audio = segments[0].audio
+      item.audioDuration = Math.round(totalDuration * 1000) / 1000
+      log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
    } catch (err) {
      item.status = 'failed'
      item.error = `TTS失败: ${err.message}`