feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐

TTS 阶段将长文本按标点切分为短句，逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线，无 segments 时回退到字数权重估算。同时优化音频上传流程，支持分段音频独立上传 OSS 并在配音时按段映射时间线。
2026-05-01 14:41:28 +08:00
parent f5d47ec5db
commit 9d19437a29
4 changed files with 236 additions and 122 deletions
--- a/.claude/skills/video-from-script/references/manifest-schema.md
+++ b/.claude/skills/video-from-script/references/manifest-schema.md
@@ -82,8 +82,9 @@ node scripts/pipeline.js validate --manifest <path>
 | `video` | 生成的视频路径 | videos |
 | `videoDuration` | 视频时长（秒），Grok=6, VEO=8 | videos |
 | `videoUrl` | 视频 OSS 公网 URL | videos |
-| `audio` | TTS 音频路径 | tts |
+| `audio` | TTS 音频路径（多句时指向第一段分句音频，完整音频由 `segments` 按顺序组成） | tts |
 | `audioDuration` | 音频时长（秒） | tts |
 | `segments` | 分句音频数组（仅多句时存在），见下方 | tts |
 ### Agent 审查时可操作
@@ -199,3 +200,17 @@ output/{account}_{YYYYMMDD}_{NNN}/
 ```
 slug 从 `shotDesc` 派生（slugify: 保留中文和字母数字，最多 20 字符）。
 ---
 ## segments[] 字段（TTS 分句）
 TTS 阶段自动生成。仅当 `script` 被切分为 2 句及以上时才写入。单句时不写 segments。
 | 字段 | 说明 |
 |------|------|
 | `text` | 分句文本（已去除标点） |
 | `audio` | 该句音频路径（相对 manifest） |
 | `duration` | 该句音频时长（秒） |
 `item.audio` 指向第一段分句音频（与 `segments[0].audio` 相同），`item.audioDuration` 为各段累计时长。assemble 阶段优先用 `segments` 的精确时长对齐字幕，无 segments 时回退到字数权重估算。
--- a/.claude/skills/video-from-script/scripts/capcut_assemble.js
+++ b/.claude/skills/video-from-script/scripts/capcut_assemble.js
@@ -17,6 +17,7 @@ const path = require('path')
 const fs = require('fs')
 const { execFile } = require('child_process')
 const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
 const { splitTextIntoSentences } = require('./lib/pipeline-utils')
 // ============================================================================
 // 配置
@@ -248,8 +249,8 @@ async function assemble(args) {
  // 用 ffprobe 测量实际音频/视频时长，替代 manifest 中的估计值
  let audioMeasured = 0, videoMeasured = 0
  for (const item of items) {
-    // 测量 TTS 音频实际时长
+    // 测量 TTS 音频实际时长（有 segments 时跳过，audioDuration 已是精确累计值）
-    if (item.audio && !item.audio.startsWith('http')) {
+    if (item.audio && !item.audio.startsWith('http') && !item.segments) {
      const audioPath = path.isAbsolute(item.audio)
        ? item.audio
        : path.resolve(inputDir, item.audio)
@@ -277,6 +278,9 @@ async function assemble(args) {
  const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
  const hasTTS = items.some(item => item.audio && item.audioDuration != null)
  // -- 读取转场策略（在 addImages/addVideos 之前） --
  const transitionConfig = loadTransitions(manifest)
  console.log(`\nCapCut 成片组装`)
  console.log(`  模式: ${mode}  画幅: ${format} (${width}x${height})`)
  console.log(`  时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`}  总时长: ${(totalDurationUs / US).toFixed(1)}s`)
@@ -285,7 +289,7 @@ async function assemble(args) {
  const steps = []
  if (mode === 'images') steps.push('upload')
-  steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
+  steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
  const totalSteps = steps.length
  let step = 0
@@ -371,10 +375,22 @@ async function assemble(args) {
    await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
  }
  // -- 上传 TTS 音频到 OSS --
  let audioUrls = {}
  if (voiceover === 'true' && hasTTS) {
    step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`)
    try {
      audioUrls = await batchUploadAudio(inputDir, items)
      console.log(`   成功: ${Object.keys(audioUrls).length} 段音频\n`)
    } catch (err) {
      console.log(`   OSS 上传失败，将尝试本地路径: ${err.message}\n`)
    }
  }
  // -- 添加 TTS 配音 --
  step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
  if (voiceover === 'true' && hasTTS) {
-    await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
+    await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls)
  } else {
    console.log('   跳过（无 TTS 音频或未启用）')
  }
@@ -393,9 +409,6 @@ async function assemble(args) {
    console.log(`  字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
  }
  // -- 读取转场策略 --
  const transitionConfig = loadTransitions(manifest)
  // -- 添加字幕 --
  step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
  if (subtitles === 'true' && items.some(i => i.script || i.text)) {
@@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) {
 async function batchUploadAudio(inputDir, items) {
  const urls = {}
  for (const item of items) {
    // 上传 segments 中的每段音频
    if (item.segments && item.segments.length > 1) {
      for (const seg of item.segments) {
        if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
        const filePath = path.isAbsolute(seg.audio)
          ? seg.audio
          : path.resolve(inputDir, seg.audio)
        if (!fs.existsSync(filePath)) {
          console.error(`   音频文件不存在: ${filePath}`)
          continue
        }
        try {
          urls[seg.audio] = await uploadAudioToOSS(filePath)
          console.log(`   上传: ${path.basename(filePath)} -> OK`)
        } catch (err) {
          console.error(`   上传失败: ${path.basename(filePath)} - ${err.message}`)
        }
      }
    }
    // 上传 item.audio（单段或 segments 的第一段）
    if (!item.audio || item.audio.startsWith('http')) {
      if (item.audio) urls[item.audio] = item.audio
      continue
    }
-    // audio 可以是相对路径或绝对路径
+    if (urls[item.audio]) continue
    const filePath = path.isAbsolute(item.audio)
      ? item.audio
      : path.resolve(inputDir, item.audio)
    if (!fs.existsSync(filePath)) {
      console.error(`   音频文件不存在: ${filePath}`)
      continue
@@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) {
 // 添加 TTS 配音（每段音频按时间线排列）
 // ============================================================================
-async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) {
+async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
  // 收集音频
-  const audioItems = items.filter(item => item.audio)
+  const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
  if (audioItems.length === 0) {
    console.log('   无 TTS 音频文件，跳过')
    return
  }
  const audioInfos = []
  const resolveAudio = (relPath) => {
    if (relPath.startsWith('http')) return relPath
    if (audioUrls[relPath]) return audioUrls[relPath]
    return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
  }
-  if (localAudio) {
+  for (let i = 0; i < items.length; i++) {
-    // 本地模式：直接用本地路径，不上传 OSS
+    const item = items[i]
-    for (let i = 0; i < items.length; i++) {
+    const tl = timeline[i]
-      const item = items[i]
+    const segments = item.segments && item.segments.length > 1 ? item.segments : null
      if (!item.audio) continue
-      const filePath = item.audio.startsWith('http')
+    if (segments) {
-        ? item.audio
+      // 多段音频：按 segment 逐段添加，使用精确时长
-        : (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio))
+      const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
      const tlDuration = tl.end - tl.start
      let currentTime = tl.start
-      if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) {
+      for (let j = 0; j < segments.length; j++) {
-        console.error(`   音频文件不存在: ${filePath}`)
+        const seg = segments[j]
-        continue
+        const segDurUs = Math.round(seg.duration * US)
        let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
        if (j === segments.length - 1) duration = tl.end - currentTime
        duration = Math.max(duration, 100000)
        const audioUrl = resolveAudio(seg.audio)
        audioInfos.push({
          audio_url: audioUrl,
          start: currentTime,
          end: currentTime + duration,
          duration,
          volume: 1.0,
        })
        currentTime += duration
      }
    } else if (item.audio) {
      // 单段音频
      const audioUrl = resolveAudio(item.audio)
      const tl = timeline[i]
      audioInfos.push({
        audio_url: filePath,
        start: tl.start,
        end: tl.end,
        duration: tl.duration,
        volume: 1.0,
      })
    }
  } else {
    // 上传模式：先传 OSS 再用 URL
    const audioUrls = await batchUploadAudio(inputDir, items)
    for (let i = 0; i < items.length; i++) {
      const item = items[i]
      if (!item.audio) continue
      const audioUrl = audioUrls[item.audio]
      if (!audioUrl) continue
      const tl = timeline[i]
      audioInfos.push({
        audio_url: audioUrl,
        start: tl.start,
@@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr
    draft_url: draftUrl,
    audio_infos: JSON.stringify(audioInfos),
  })
-  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`)
+  const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
 }
 // ============================================================================
@@ -793,40 +829,6 @@ function loadTransitions(manifest) {
 // 添加字幕（支持关键词高亮 + 账号字幕风格 + 分句切分）
 // ============================================================================
 /**
  * Split text into short sentences on full-width Chinese punctuation and
  * strip all such punctuation from the results.
  *
  * A sentence ender (。！？；) always closes the current sentence. A clause
  * ender (，：) closes it only when the accumulated piece, including the
  * punctuation mark itself, is longer than 8 UTF-16 units, so very short
  * clauses stay attached to the text that follows.
  *
  * Pieces that are empty after punctuation and surrounding whitespace are
  * removed (for example the second mark in "！！") are dropped, so callers
  * never receive an empty sentence.
  *
  * @param {string} text - Raw script text.
  * @returns {string[]} Non-empty sentences without punctuation; an empty
  *   array when `text` is not a string or contains no usable text.
  */
 function splitTextIntoSentences(text) {
  if (typeof text !== 'string') return []
  const sentenceEnders = /[。！？；]/
  const clauseEnders = /[，：]/
  const punctuation = /[。！？；，：、]/g
  const sentences = []
  let current = ''
  // Strip punctuation first, then trim, so whitespace that sat next to a
  // punctuation mark does not leak into the sentence text.
  const flush = () => {
    const cleaned = current.replace(punctuation, '').trim()
    if (cleaned) sentences.push(cleaned)
    current = ''
  }
  for (const char of text) {
    current += char
    const isSentenceEnd = sentenceEnders.test(char)
    const isLongClauseEnd = clauseEnders.test(char) && current.length > 8
    if (isSentenceEnd || isLongClauseEnd) flush()
  }
  // Whatever remains after the last punctuation mark is the final sentence.
  flush()
  return sentences
 }
 async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
  const captions = []
@@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
    const tl = timeline[i]
    if (split) {
-      // 分句模式：切分长文本
+      // 分句模式：优先用 segments（TTS 逐句生成的精确时长），回退到字数估算
-      const sentences = splitTextIntoSentences(text)
+      const segments = item.segments && item.segments.length > 1 ? item.segments : null
      if (sentences.length === 0) continue
-      const totalDuration = tl.end - tl.start
+      if (segments) {
        // 精确模式：用 segments 的实际音频时长
        const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
        const tlDuration = tl.end - tl.start
        let currentTime = tl.start
-      // 按字数权重分配时间（改进版）
+        segments.forEach((seg, idx) => {
-      const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
+          const segDurUs = Math.round(seg.duration * US)
-      let currentTime = tl.start
+          // 按实际时长占比映射到时间线（处理 ffprobe 重新测量的差异）
          let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
          if (idx === segments.length - 1) {
            duration = tl.end - currentTime
          }
          duration = Math.max(duration, 1000000)
-      sentences.forEach((sentence, idx) => {
+          const cap = {
-        // 按字数比例计算时长
+            start: currentTime,
-        const charRatio = sentence.length / totalChars
+            end: currentTime + duration,
-        let duration = Math.round(totalDuration * charRatio)
+            text: seg.text,
            keyword: '',
            keyword_color: '',
          }
-        // 最后一句使用剩余全部时间（避免精度误差）
+          if (inAnimation) cap.in_animation = inAnimation
-        if (idx === sentences.length - 1) {
+          if (outAnimation) cap.out_animation = outAnimation
-          duration = tl.end - currentTime
+          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
-        }
+          if (outAnimDuration) cap.out_animation_duration = outAnimDuration
-        // 最小1秒，避免太短
+          captions.push(cap)
-        duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒
+          currentTime += duration
        })
      } else {
        // 回退：字数权重估算
        const sentences = splitTextIntoSentences(text)
        if (sentences.length === 0) continue
-        const cap = {
+        const totalDuration = tl.end - tl.start
-          start: currentTime,
+        const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
-          end: currentTime + duration,
+        let currentTime = tl.start
          text: sentence,
          keyword: '',
          keyword_color: '',
        }
-        if (inAnimation) cap.in_animation = inAnimation
+        sentences.forEach((sentence, idx) => {
-        if (outAnimation) cap.out_animation = outAnimation
+          const charRatio = sentence.length / totalChars
-        if (inAnimDuration) cap.in_animation_duration = inAnimDuration
+          let duration = Math.round(totalDuration * charRatio)
        if (outAnimDuration) cap.out_animation_duration = outAnimDuration
-        captions.push(cap)
+          if (idx === sentences.length - 1) {
-        currentTime += duration
+            duration = tl.end - currentTime
-      })
+          }
          duration = Math.max(duration, 1000000)
          const cap = {
            start: currentTime,
            end: currentTime + duration,
            text: sentence,
            keyword: '',
            keyword_color: '',
          }
          if (inAnimation) cap.in_animation = inAnimation
          if (outAnimation) cap.out_animation = outAnimation
          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
          if (outAnimDuration) cap.out_animation_duration = outAnimDuration
          captions.push(cap)
          currentTime += duration
        })
      }
    } else {
      // 原始模式：一句字幕
      const keyword = ''
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -1,11 +1,12 @@
 /**
- * Phase: tts — 语音合成
+ * Phase: tts — 语音合成（逐句分句生成）
 *
- * 使用通义千问 TTS 生成旁白音频
+ * 将每个 item 的 script 按标点切分为短句，每句单独生成 TTS 音频。
 * 结果写入 item.segments[]，实现字幕与语音精确对齐。
 */
 const path = require('path')
-const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
+const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
 async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
    const fullText = item.script || item.text
    try {
-      const { filePath, duration } = await synthesize(item.script || item.text, {
+      const sentences = splitTextIntoSentences(fullText)
-        outputDir: audioDir,
+
-        id: item.id || idx,
+      if (sentences.length <= 1) {
-        voice: manifest.ttsVoice || undefined,
+        // 单句：不需要 segments，走原逻辑
-        instruction: manifest.ttsInstruction || undefined,
+        const { filePath, duration } = await synthesize(fullText, {
-        rate: manifest.ttsRate || undefined,
+          outputDir: audioDir,
-      })
+          id: item.id || idx,
-      item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
+          voice: manifest.ttsVoice || undefined,
-      item.audioDuration = Math.round(duration * 1000) / 1000
+          instruction: manifest.ttsInstruction || undefined,
-      log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
+          rate: manifest.ttsRate || undefined,
        })
        item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
        item.audioDuration = Math.round(duration * 1000) / 1000
        log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
      } else {
        // 多句：逐句生成，写入 segments
        const segments = []
        let totalDuration = 0
        for (let j = 0; j < sentences.length; j++) {
          const sentence = sentences[j]
          const segId = `${item.id || idx}_${j + 1}`
          const { filePath, duration } = await synthesize(sentence, {
            outputDir: audioDir,
            id: segId,
            voice: manifest.ttsVoice || undefined,
            instruction: manifest.ttsInstruction || undefined,
            rate: manifest.ttsRate || undefined,
          })
          segments.push({
            text: sentence,
            audio: path.relative(dir, filePath).replace(/\\/g, '/'),
            duration: Math.round(duration * 1000) / 1000,
          })
          totalDuration += duration
        }
        item.segments = segments
        item.audio = segments[0].audio
        item.audioDuration = Math.round(totalDuration * 1000) / 1000
        log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
      }
    } catch (err) {
      item.status = 'failed'
      item.error = `TTS失败: ${err.message}`
--- a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
+++ b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
@@ -160,6 +160,36 @@ function getManifestDir(manifestPath) {
  return path.dirname(path.resolve(manifestPath))
 }
 // ============================================================================
 // 文本切分
 // ============================================================================
 /**
  * Split text into short sentences on full-width Chinese punctuation and
  * strip all such punctuation from the results.
  *
  * A sentence ender (。！？；) always closes the current sentence. A clause
  * ender (，：) closes it only when the accumulated piece, including the
  * punctuation mark itself, is longer than 8 UTF-16 units, so very short
  * clauses stay attached to the text that follows.
  *
  * Pieces that end up empty once punctuation and surrounding whitespace are
  * removed (for example the second mark in "？！") are skipped, so callers
  * never receive an empty sentence.
  *
  * @param {string} text - Raw script text.
  * @returns {string[]} Non-empty sentences without punctuation; an empty
  *   array when `text` is not a string or contains no usable text.
  */
 function splitTextIntoSentences(text) {
  if (typeof text !== 'string') return []
  const sentenceEnders = /[。！？；]/
  const clauseEnders = /[，：]/
  const punctuation = /[。！？；，：、]/g
  const sentences = []
  let current = ''
  // Strip punctuation before trimming so whitespace adjacent to a removed
  // mark is not kept, and drop pieces that contained only punctuation.
  const pushCurrent = () => {
    const cleaned = current.replace(punctuation, '').trim()
    if (cleaned) sentences.push(cleaned)
    current = ''
  }
  for (const char of text) {
    current += char
    if (sentenceEnders.test(char)) {
      pushCurrent()
    } else if (clauseEnders.test(char) && current.length > 8) {
      pushCurrent()
    }
  }
  // Text after the last punctuation mark forms the final sentence.
  pushCurrent()
  return sentences
 }
 // ============================================================================
 // Exports
 // ============================================================================
@@ -178,6 +208,7 @@ module.exports = {
  ensureDir,
  slugify,
  renameGeneratedFile,
  splitTextIntoSentences,
  log,
  getManifestDir,
 }