feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐

TTS 阶段将长文本按标点切分为短句，逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线，无 segments 时回退到字数权重估算。同时优化音频上传流程，支持分段音频独立上传 OSS 并在配音时按段映射时间线。
2026-05-01 14:41:28 +08:00
parent f5d47ec5db
commit 9d19437a29
4 changed files with 236 additions and 122 deletions
--- a/.claude/skills/video-from-script/scripts/capcut_assemble.js
+++ b/.claude/skills/video-from-script/scripts/capcut_assemble.js
@@ -17,6 +17,7 @@ const path = require('path')
 const fs = require('fs')
 const { execFile } = require('child_process')
 const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
+const { splitTextIntoSentences } = require('./lib/pipeline-utils')

 // ============================================================================
 // 配置
@@ -248,8 +249,8 @@ async function assemble(args) {
  // 用 ffprobe 测量实际音频/视频时长，替代 manifest 中的估计值
  let audioMeasured = 0, videoMeasured = 0
  for (const item of items) {
-    // 测量 TTS 音频实际时长
-    if (item.audio && !item.audio.startsWith('http')) {
+    // 测量 TTS 音频实际时长（有 segments 时跳过，audioDuration 已是精确累计值）
+    if (item.audio && !item.audio.startsWith('http') && !item.segments) {
      const audioPath = path.isAbsolute(item.audio)
        ? item.audio
        : path.resolve(inputDir, item.audio)
@@ -277,6 +278,9 @@ async function assemble(args) {
  const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
  const hasTTS = items.some(item => item.audio && item.audioDuration != null)

+  // -- 读取转场策略（在 addImages/addVideos 之前） --
+  const transitionConfig = loadTransitions(manifest)
+
  console.log(`\nCapCut 成片组装`)
  console.log(`  模式: ${mode}  画幅: ${format} (${width}x${height})`)
  console.log(`  时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`}  总时长: ${(totalDurationUs / US).toFixed(1)}s`)
@@ -285,7 +289,7 @@ async function assemble(args) {

  const steps = []
  if (mode === 'images') steps.push('upload')
-  steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
+  steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
  const totalSteps = steps.length
  let step = 0

@@ -371,10 +375,22 @@ async function assemble(args) {
    await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
  }

+  // -- 上传 TTS 音频到 OSS --
+  let audioUrls = {}
+  if (voiceover === 'true' && hasTTS) {
+    step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`)
+    try {
+      audioUrls = await batchUploadAudio(inputDir, items)
+      console.log(`   成功: ${Object.keys(audioUrls).length} 段音频\n`)
+    } catch (err) {
+      console.log(`   OSS 上传失败，将尝试本地路径: ${err.message}\n`)
+    }
+  }
+
  // -- 添加 TTS 配音 --
  step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
  if (voiceover === 'true' && hasTTS) {
-    await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
+    await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls)
  } else {
    console.log('   跳过（无 TTS 音频或未启用）')
  }
@@ -393,9 +409,6 @@ async function assemble(args) {
    console.log(`  字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
  }

-  // -- 读取转场策略 --
-  const transitionConfig = loadTransitions(manifest)
-
  // -- 添加字幕 --
  step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
  if (subtitles === 'true' && items.some(i => i.script || i.text)) {
@@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) {
 async function batchUploadAudio(inputDir, items) {
  const urls = {}
  for (const item of items) {
+    // 上传 segments 中的每段音频
+    if (item.segments && item.segments.length > 1) {
+      for (const seg of item.segments) {
+        if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
+        const filePath = path.isAbsolute(seg.audio)
+          ? seg.audio
+          : path.resolve(inputDir, seg.audio)
+        if (!fs.existsSync(filePath)) {
+          console.error(`   音频文件不存在: ${filePath}`)
+          continue
+        }
+        try {
+          urls[seg.audio] = await uploadAudioToOSS(filePath)
+          console.log(`   上传: ${path.basename(filePath)} -> OK`)
+        } catch (err) {
+          console.error(`   上传失败: ${path.basename(filePath)} - ${err.message}`)
+        }
+      }
+    }
+    // 上传 item.audio（单段或 segments 的第一段）
    if (!item.audio || item.audio.startsWith('http')) {
      if (item.audio) urls[item.audio] = item.audio
      continue
    }
-    // audio 可以是相对路径或绝对路径
+    if (urls[item.audio]) continue
    const filePath = path.isAbsolute(item.audio)
      ? item.audio
      : path.resolve(inputDir, item.audio)
-
    if (!fs.existsSync(filePath)) {
      console.error(`   音频文件不存在: ${filePath}`)
      continue
@@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) {
 // 添加 TTS 配音（每段音频按时间线排列）
 // ============================================================================

-async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) {
+async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
  // 收集音频
-  const audioItems = items.filter(item => item.audio)
+  const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
  if (audioItems.length === 0) {
    console.log('   无 TTS 音频文件，跳过')
    return
  }

  const audioInfos = []
+  const resolveAudio = (relPath) => {
+    if (relPath.startsWith('http')) return relPath
+    if (audioUrls[relPath]) return audioUrls[relPath]
+    return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
+  }

-  if (localAudio) {
-    // 本地模式：直接用本地路径，不上传 OSS
-    for (let i = 0; i < items.length; i++) {
-      const item = items[i]
-      if (!item.audio) continue
+  for (let i = 0; i < items.length; i++) {
+    const item = items[i]
+    const tl = timeline[i]
+    const segments = item.segments && item.segments.length > 1 ? item.segments : null

-      const filePath = item.audio.startsWith('http')
-        ? item.audio
-        : (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio))
+    if (segments) {
+      // 多段音频：按 segment 逐段添加，使用精确时长
+      const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
+      const tlDuration = tl.end - tl.start
+      let currentTime = tl.start

-      if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) {
-        console.error(`   音频文件不存在: ${filePath}`)
-        continue
+      for (let j = 0; j < segments.length; j++) {
+        const seg = segments[j]
+        const segDurUs = Math.round(seg.duration * US)
+        let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
+        if (j === segments.length - 1) duration = tl.end - currentTime
+        duration = Math.max(duration, 100000)
+
+        const audioUrl = resolveAudio(seg.audio)
+
+        audioInfos.push({
+          audio_url: audioUrl,
+          start: currentTime,
+          end: currentTime + duration,
+          duration,
+          volume: 1.0,
+        })
+        currentTime += duration
      }
+    } else if (item.audio) {
+      // 单段音频
+      const audioUrl = resolveAudio(item.audio)

-      const tl = timeline[i]
-      audioInfos.push({
-        audio_url: filePath,
-        start: tl.start,
-        end: tl.end,
-        duration: tl.duration,
-        volume: 1.0,
-      })
-    }
-  } else {
-    // 上传模式：先传 OSS 再用 URL
-    const audioUrls = await batchUploadAudio(inputDir, items)
-    for (let i = 0; i < items.length; i++) {
-      const item = items[i]
-      if (!item.audio) continue
-
-      const audioUrl = audioUrls[item.audio]
-      if (!audioUrl) continue
-
-      const tl = timeline[i]
      audioInfos.push({
        audio_url: audioUrl,
        start: tl.start,
@@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr
    draft_url: draftUrl,
    audio_infos: JSON.stringify(audioInfos),
  })
-  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`)
+  const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
+  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
 }

 // ============================================================================
@@ -793,40 +829,6 @@ function loadTransitions(manifest) {
 // 添加字幕（支持关键词高亮 + 账号字幕风格 + 分句切分）
 // ============================================================================

-/**
- * 按标点符号切分文本为短句（去除所有标点符号）
- */
-function splitTextIntoSentences(text) {
-  const sentenceEnders = /[。！？；]/
-  const clauseEnders = /[，：]/
-
-  const sentences = []
-  let current = ''
-  let chars = text.split('')
-
-  for (let i = 0; i < chars.length; i++) {
-    const char = chars[i]
-    current += char
-
-    if (sentenceEnders.test(char)) {
-      // 切分并去掉所有标点
-      sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-      current = ''
-    } else if (clauseEnders.test(char) && current.length > 8) {
-      // 切分并去掉所有标点
-      sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-      current = ''
-    }
-  }
-
-  // 处理剩余文本
-  if (current.trim()) {
-    sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-  }
-
-  return sentences
-}
-
 async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
  const captions = []

@@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
    const tl = timeline[i]

    if (split) {
-      // 分句模式：切分长文本
-      const sentences = splitTextIntoSentences(text)
-      if (sentences.length === 0) continue
+      // 分句模式：优先用 segments（TTS 逐句生成的精确时长），回退到字数估算
+      const segments = item.segments && item.segments.length > 1 ? item.segments : null

-      const totalDuration = tl.end - tl.start
+      if (segments) {
+        // 精确模式：用 segments 的实际音频时长
+        const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
+        const tlDuration = tl.end - tl.start
+        let currentTime = tl.start

-      // 按字数权重分配时间（改进版）
-      const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
-      let currentTime = tl.start
+        segments.forEach((seg, idx) => {
+          const segDurUs = Math.round(seg.duration * US)
+          // 按实际时长占比映射到时间线（处理 ffprobe 重新测量的差异）
+          let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
+          if (idx === segments.length - 1) {
+            duration = tl.end - currentTime
+          }
+          duration = Math.max(duration, 1000000)

-      sentences.forEach((sentence, idx) => {
-        // 按字数比例计算时长
-        const charRatio = sentence.length / totalChars
-        let duration = Math.round(totalDuration * charRatio)
+          const cap = {
+            start: currentTime,
+            end: currentTime + duration,
+            text: seg.text,
+            keyword: '',
+            keyword_color: '',
+          }

-        // 最后一句使用剩余全部时间（避免精度误差）
-        if (idx === sentences.length - 1) {
-          duration = tl.end - currentTime
-        }
+          if (inAnimation) cap.in_animation = inAnimation
+          if (outAnimation) cap.out_animation = outAnimation
+          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
+          if (outAnimDuration) cap.out_animation_duration = outAnimDuration

-        // 最小1秒，避免太短
-        duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒
+          captions.push(cap)
+          currentTime += duration
+        })
+      } else {
+        // 回退：字数权重估算
+        const sentences = splitTextIntoSentences(text)
+        if (sentences.length === 0) continue

-        const cap = {
-          start: currentTime,
-          end: currentTime + duration,
-          text: sentence,
-          keyword: '',
-          keyword_color: '',
-        }
+        const totalDuration = tl.end - tl.start
+        const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
+        let currentTime = tl.start

-        if (inAnimation) cap.in_animation = inAnimation
-        if (outAnimation) cap.out_animation = outAnimation
-        if (inAnimDuration) cap.in_animation_duration = inAnimDuration
-        if (outAnimDuration) cap.out_animation_duration = outAnimDuration
+        sentences.forEach((sentence, idx) => {
+          const charRatio = sentence.length / totalChars
+          let duration = Math.round(totalDuration * charRatio)

-        captions.push(cap)
-        currentTime += duration
-      })
+          if (idx === sentences.length - 1) {
+            duration = tl.end - currentTime
+          }
+
+          duration = Math.max(duration, 1000000)
+
+          const cap = {
+            start: currentTime,
+            end: currentTime + duration,
+            text: sentence,
+            keyword: '',
+            keyword_color: '',
+          }
+
+          if (inAnimation) cap.in_animation = inAnimation
+          if (outAnimation) cap.out_animation = outAnimation
+          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
+          if (outAnimDuration) cap.out_animation_duration = outAnimDuration
+
+          captions.push(cap)
+          currentTime += duration
+        })
+      }
    } else {
      // 原始模式：一句字幕
      const keyword = ''
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -1,11 +1,12 @@
 /**
- * Phase: tts — 语音合成
+ * Phase: tts — 语音合成（逐句分句生成）
 *
- * 使用通义千问 TTS 生成旁白音频
+ * 将每个 item 的 script 按标点切分为短句，每句单独生成 TTS 音频。
+ * 结果写入 item.segments[]，实现字幕与语音精确对齐。
 */

 const path = require('path')
-const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
+const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')

 async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
+    const fullText = item.script || item.text
+
    try {
-      const { filePath, duration } = await synthesize(item.script || item.text, {
-        outputDir: audioDir,
-        id: item.id || idx,
-        voice: manifest.ttsVoice || undefined,
-        instruction: manifest.ttsInstruction || undefined,
-        rate: manifest.ttsRate || undefined,
-      })
-      item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
-      item.audioDuration = Math.round(duration * 1000) / 1000
-      log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
+      const sentences = splitTextIntoSentences(fullText)
+
+      if (sentences.length <= 1) {
+        // 单句：不需要 segments，走原逻辑
+        const { filePath, duration } = await synthesize(fullText, {
+          outputDir: audioDir,
+          id: item.id || idx,
+          voice: manifest.ttsVoice || undefined,
+          instruction: manifest.ttsInstruction || undefined,
+          rate: manifest.ttsRate || undefined,
+        })
+        item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
+        item.audioDuration = Math.round(duration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
+      } else {
+        // 多句：逐句生成，写入 segments
+        const segments = []
+        let totalDuration = 0
+
+        for (let j = 0; j < sentences.length; j++) {
+          const sentence = sentences[j]
+          const segId = `${item.id || idx}_${j + 1}`
+          const { filePath, duration } = await synthesize(sentence, {
+            outputDir: audioDir,
+            id: segId,
+            voice: manifest.ttsVoice || undefined,
+            instruction: manifest.ttsInstruction || undefined,
+            rate: manifest.ttsRate || undefined,
+          })
+          segments.push({
+            text: sentence,
+            audio: path.relative(dir, filePath).replace(/\\/g, '/'),
+            duration: Math.round(duration * 1000) / 1000,
+          })
+          totalDuration += duration
+        }
+
+        item.segments = segments
+        item.audio = segments[0].audio
+        item.audioDuration = Math.round(totalDuration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
+      }
    } catch (err) {
      item.status = 'failed'
      item.error = `TTS失败: ${err.message}`
--- a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
+++ b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
@@ -160,6 +160,36 @@ function getManifestDir(manifestPath) {
  return path.dirname(path.resolve(manifestPath))
 }

+// ============================================================================
+// 文本切分
+// ============================================================================
+
+/**
+ * Split text into short sentences at Chinese punctuation and strip that punctuation.
+ * 。！？； always end a sentence; ，： end one only once the pending chunk is longer than 8.
+ * Chunks left empty after stripping (e.g. "！！") are dropped, so no empty sentence is returned.
+ * @param {string} text - Source text; null/undefined/'' yields [].
+ * @returns {string[]} Non-empty sentences in source order.
+ */
+function splitTextIntoSentences(text) {
+  const sentenceEnders = /[。！？；]/
+  const clauseEnders = /[，：]/
+  const sentences = []
+  let current = ''
+  // Strip punctuation, then keep the chunk only if something speakable remains.
+  const flush = () => {
+    const cleaned = current.replace(/[。！？；，：、]/g, '').trim()
+    if (cleaned) sentences.push(cleaned)
+    current = ''
+  }
+  for (const char of text || '') {
+    current += char
+    if (sentenceEnders.test(char) || (clauseEnders.test(char) && current.length > 8)) flush()
+  }
+  flush()
+  return sentences
+}
+
 // ============================================================================
 // Exports
 // ============================================================================
@@ -178,6 +208,7 @@ module.exports = {
  ensureDir,
  slugify,
  renameGeneratedFile,
+  splitTextIntoSentences,
  log,
  getManifestDir,
 }