feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐

TTS 阶段将长文本按标点切分为短句，逐句生成音频，并将每句的文本、音频路径和时长记录到 `item.segments[]`；只有一句的文本不生成 segments，仍按原逻辑整段合成。assemble 阶段优先使用 segments 的精确时长分配字幕时间线，无 segments 时回退到字数权重估算。同时优化音频上传流程，支持分段音频独立上传 OSS 并在配音时按段映射时间线。
2026-05-01 14:41:28 +08:00
parent f5d47ec5db
commit 9d19437a29
4 changed files with 236 additions and 122 deletions
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -1,11 +1,12 @@
 /**
- * Phase: tts — 语音合成
+ * Phase: tts — 语音合成（逐句分句生成）
 *
- * 使用通义千问 TTS 生成旁白音频
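+ * Splits each item's text (item.script, falling back to item.text) into short sentences at punctuation and synthesizes TTS audio for each sentence separately.
+ * Multi-sentence items record per-sentence results in item.segments[] so subtitles can align precisely with the speech; single-sentence items produce no segments.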
 */

 const path = require('path')
-const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
+const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')

 async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
+    const fullText = item.script || item.text
+
    try {
-      const { filePath, duration } = await synthesize(item.script || item.text, {
-        outputDir: audioDir,
-        id: item.id || idx,
-        voice: manifest.ttsVoice || undefined,
-        instruction: manifest.ttsInstruction || undefined,
-        rate: manifest.ttsRate || undefined,
-      })
-      item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
-      item.audioDuration = Math.round(duration * 1000) / 1000
-      log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
+      const sentences = splitTextIntoSentences(fullText)
+
+      if (sentences.length <= 1) {
+        // 单句：不需要 segments，走原逻辑
+        const { filePath, duration } = await synthesize(fullText, {
+          outputDir: audioDir,
+          id: item.id || idx,
+          voice: manifest.ttsVoice || undefined,
+          instruction: manifest.ttsInstruction || undefined,
+          rate: manifest.ttsRate || undefined,
+        })
+        item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
+        item.audioDuration = Math.round(duration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
+      } else {
+        // 多句：逐句生成，写入 segments
+        const segments = []
+        let totalDuration = 0
+
+        for (let j = 0; j < sentences.length; j++) {
+          const sentence = sentences[j]
+          const segId = `${item.id || idx}_${j + 1}`
+          const { filePath, duration } = await synthesize(sentence, {
+            outputDir: audioDir,
+            id: segId,
+            voice: manifest.ttsVoice || undefined,
+            instruction: manifest.ttsInstruction || undefined,
+            rate: manifest.ttsRate || undefined,
+          })
+          segments.push({
+            text: sentence,
+            audio: path.relative(dir, filePath).replace(/\\/g, '/'),
+            duration: Math.round(duration * 1000) / 1000,
+          })
+          totalDuration += duration
+        }
+
+        item.segments = segments
+        item.audio = segments[0].audio
+        item.audioDuration = Math.round(totalDuration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
+      }
    } catch (err) {
      item.status = 'failed'
      item.error = `TTS失败: ${err.message}`