From 9d19437a292ab39b9cddb995a4641c4a3ec943b6 Mon Sep 17 00:00:00 2001
From: sion123 <450702724@qq.com>
Date: Fri, 1 May 2026 14:41:28 +0800
Subject: [PATCH] =?UTF-8?q?feat(video-pipeline):=20=E5=AE=9E=E7=8E=B0=20TT?=
 =?UTF-8?q?S=20=E9=80=90=E5=8F=A5=E5=88=86=E5=8F=A5=E7=94=9F=E6=88=90?=
 =?UTF-8?q?=E4=B8=8E=E5=AD=97=E5=B9=95=E7=B2=BE=E7=A1=AE=E5=AF=B9=E9=BD=90?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

TTS 阶段将长文本按标点切分为短句，逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线，无 segments 时回退到字数权重估算。同时优化音频上传流程，支持分段音频独立上传 OSS 并在配音时按段映射时间线。
---
 .../references/manifest-schema.md             |  17 +-
 .../scripts/capcut_assemble.js                | 249 ++++++++++--------
 .../scripts/lib/phase-tts.js                  |  61 ++++-
 .../scripts/lib/pipeline-utils.js             |  31 +++
 4 files changed, 236 insertions(+), 122 deletions(-)
diff --git a/.claude/skills/video-from-script/references/manifest-schema.md b/.claude/skills/video-from-script/references/manifest-schema.md
index f9466a9..d554389 100644
--- a/.claude/skills/video-from-script/references/manifest-schema.md
+++ b/.claude/skills/video-from-script/references/manifest-schema.md
@@ -82,8 +82,9 @@ node scripts/pipeline.js validate --manifest <path>
 | `video` | 生成的视频路径 | videos |
 | `videoDuration` | 视频时长（秒），Grok=6, VEO=8 | videos |
 | `videoUrl` | 视频 OSS 公网 URL | videos |
-| `audio` | TTS 音频路径 | tts |
+| `audio` | TTS 音频路径（多句时指向第一段分句音频，即 `segments[0].audio`，不生成合并文件） | tts |
 | `audioDuration` | 音频时长（秒） | tts |
+| `segments` | 分句音频数组（仅多句时存在），见下方 | tts |
 
 ### Agent 审查时可操作
 
@@ -199,3 +200,17 @@ output/{account}_{YYYYMMDD}_{NNN}/
 ```
 
 slug 从 `shotDesc` 派生（slugify: 保留中文和字母数字，最多 20 字符）。
+
+---
+
+## segments[] 字段（TTS 分句）
+
+TTS 阶段自动生成。仅当 `script` 被切分为 2 句及以上时才写入。单句时不写 segments。
+
+| 字段 | 说明 |
+|------|------|
+| `text` | 分句文本（已去除中文标点 `。！？；，：、`） |
+| `audio` | 该句音频路径（相对 manifest） |
+| `duration` | 该句音频时长（秒） |
+
+`item.audio` 指向第一段分句的音频（即 `segments[0].audio`，TTS 阶段不生成合并后的完整音频文件），`item.audioDuration` 为各段累计时长。assemble 阶段在分句字幕模式下、且 `segments` 多于 1 段时，优先用 `segments` 的精确时长对齐字幕；无 segments 时回退到字数权重估算。
diff --git a/.claude/skills/video-from-script/scripts/capcut_assemble.js b/.claude/skills/video-from-script/scripts/capcut_assemble.js
index 9a5f8d6..6b2c989 100644
--- a/.claude/skills/video-from-script/scripts/capcut_assemble.js
+++ b/.claude/skills/video-from-script/scripts/capcut_assemble.js
@@ -17,6 +17,7 @@ const path = require('path')
 const fs = require('fs')
 const { execFile } = require('child_process')
 const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
+const { splitTextIntoSentences } = require('./lib/pipeline-utils')
 
 // ============================================================================
 // 配置
@@ -248,8 +249,8 @@ async function assemble(args) {
   // 用 ffprobe 测量实际音频/视频时长，替代 manifest 中的估计值
   let audioMeasured = 0, videoMeasured = 0
   for (const item of items) {
-    // 测量 TTS 音频实际时长
-    if (item.audio && !item.audio.startsWith('http')) {
+    // 测量 TTS 音频实际时长（有 segments 时跳过，audioDuration 已是精确累计值）
+    if (item.audio && !item.audio.startsWith('http') && !item.segments) {
       const audioPath = path.isAbsolute(item.audio)
         ? item.audio
         : path.resolve(inputDir, item.audio)
@@ -277,6 +278,9 @@ async function assemble(args) {
   const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
   const hasTTS = items.some(item => item.audio && item.audioDuration != null)
 
+  // -- 读取转场策略（在 addImages/addVideos 之前） --
+  const transitionConfig = loadTransitions(manifest)
+
   console.log(`\nCapCut 成片组装`)
   console.log(`  模式: ${mode}  画幅: ${format} (${width}x${height})`)
   console.log(`  时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`}  总时长: ${(totalDurationUs / US).toFixed(1)}s`)
@@ -285,7 +289,7 @@ async function assemble(args) {
 
   const steps = []
   if (mode === 'images') steps.push('upload')
-  steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
+  steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
   const totalSteps = steps.length
   let step = 0
 
@@ -371,10 +375,22 @@ async function assemble(args) {
     await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
   }
 
+  // -- 上传 TTS 音频到 OSS --
+  let audioUrls = {}
+  if (voiceover === 'true' && hasTTS) {
+    step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`)
+    try {
+      audioUrls = await batchUploadAudio(inputDir, items)
+      console.log(`   成功: ${Object.keys(audioUrls).length} 段音频\n`)
+    } catch (err) {
+      console.log(`   OSS 上传失败，将尝试本地路径: ${err.message}\n`)
+    }
+  }
+
   // -- 添加 TTS 配音 --
   step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
   if (voiceover === 'true' && hasTTS) {
-    await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
+    await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls)
   } else {
     console.log('   跳过（无 TTS 音频或未启用）')
   }
@@ -393,9 +409,6 @@ async function assemble(args) {
     console.log(`  字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
   }
 
-  // -- 读取转场策略 --
-  const transitionConfig = loadTransitions(manifest)
-
   // -- 添加字幕 --
   step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
   if (subtitles === 'true' && items.some(i => i.script || i.text)) {
@@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) {
 async function batchUploadAudio(inputDir, items) {
   const urls = {}
   for (const item of items) {
+    // 上传 segments 中的每段音频
+    if (item.segments && item.segments.length > 1) {
+      for (const seg of item.segments) {
+        if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
+        const filePath = path.isAbsolute(seg.audio)
+          ? seg.audio
+          : path.resolve(inputDir, seg.audio)
+        if (!fs.existsSync(filePath)) {
+          console.error(`   音频文件不存在: ${filePath}`)
+          continue
+        }
+        try {
+          urls[seg.audio] = await uploadAudioToOSS(filePath)
+          console.log(`   上传: ${path.basename(filePath)} -> OK`)
+        } catch (err) {
+          console.error(`   上传失败: ${path.basename(filePath)} - ${err.message}`)
+        }
+      }
+    }
+    // 上传 item.audio（单段或 segments 的第一段）
     if (!item.audio || item.audio.startsWith('http')) {
       if (item.audio) urls[item.audio] = item.audio
       continue
     }
-    // audio 可以是相对路径或绝对路径
+    if (urls[item.audio]) continue
     const filePath = path.isAbsolute(item.audio)
       ? item.audio
       : path.resolve(inputDir, item.audio)
-
     if (!fs.existsSync(filePath)) {
       console.error(`   音频文件不存在: ${filePath}`)
       continue
@@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) {
 // 添加 TTS 配音（每段音频按时间线排列）
 // ============================================================================
 
-async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) {
+async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
   // 收集音频
-  const audioItems = items.filter(item => item.audio)
+  const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
   if (audioItems.length === 0) {
     console.log('   无 TTS 音频文件，跳过')
     return
   }
 
   const audioInfos = []
+  const resolveAudio = (relPath) => {
+    if (relPath.startsWith('http')) return relPath
+    if (audioUrls[relPath]) return audioUrls[relPath]
+    return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
+  }
 
-  if (localAudio) {
-    // 本地模式：直接用本地路径，不上传 OSS
-    for (let i = 0; i < items.length; i++) {
-      const item = items[i]
-      if (!item.audio) continue
+  for (let i = 0; i < items.length; i++) {
+    const item = items[i]
+    const tl = timeline[i]
+    const segments = item.segments && item.segments.length > 1 ? item.segments : null
 
-      const filePath = item.audio.startsWith('http')
-        ? item.audio
-        : (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio))
+    if (segments) {
+      // 多段音频：按 segment 逐段添加，使用精确时长
+      const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
+      const tlDuration = tl.end - tl.start
+      let currentTime = tl.start
 
-      if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) {
-        console.error(`   音频文件不存在: ${filePath}`)
-        continue
+      for (let j = 0; j < segments.length; j++) {
+        const seg = segments[j]
+        const segDurUs = Math.round(seg.duration * US)
+        let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
+        if (j === segments.length - 1) duration = tl.end - currentTime
+        duration = Math.max(duration, 100000)
+
+        const audioUrl = resolveAudio(seg.audio)
+
+        audioInfos.push({
+          audio_url: audioUrl,
+          start: currentTime,
+          end: currentTime + duration,
+          duration,
+          volume: 1.0,
+        })
+        currentTime += duration
       }
+    } else if (item.audio) {
+      // 单段音频
+      const audioUrl = resolveAudio(item.audio)
 
-      const tl = timeline[i]
-      audioInfos.push({
-        audio_url: filePath,
-        start: tl.start,
-        end: tl.end,
-        duration: tl.duration,
-        volume: 1.0,
-      })
-    }
-  } else {
-    // 上传模式：先传 OSS 再用 URL
-    const audioUrls = await batchUploadAudio(inputDir, items)
-    for (let i = 0; i < items.length; i++) {
-      const item = items[i]
-      if (!item.audio) continue
-
-      const audioUrl = audioUrls[item.audio]
-      if (!audioUrl) continue
-
-      const tl = timeline[i]
       audioInfos.push({
         audio_url: audioUrl,
         start: tl.start,
@@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr
     draft_url: draftUrl,
     audio_infos: JSON.stringify(audioInfos),
   })
-  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`)
+  const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
+  console.log(`   已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
 }
 
 // ============================================================================
@@ -793,40 +829,6 @@ function loadTransitions(manifest) {
 // 添加字幕（支持关键词高亮 + 账号字幕风格 + 分句切分）
 // ============================================================================
 
-/**
- * 按标点符号切分文本为短句（去除所有标点符号）
- */
-function splitTextIntoSentences(text) {
-  const sentenceEnders = /[。！？；]/
-  const clauseEnders = /[，：]/
-
-  const sentences = []
-  let current = ''
-  let chars = text.split('')
-
-  for (let i = 0; i < chars.length; i++) {
-    const char = chars[i]
-    current += char
-
-    if (sentenceEnders.test(char)) {
-      // 切分并去掉所有标点
-      sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-      current = ''
-    } else if (clauseEnders.test(char) && current.length > 8) {
-      // 切分并去掉所有标点
-      sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-      current = ''
-    }
-  }
-
-  // 处理剩余文本
-  if (current.trim()) {
-    sentences.push(current.trim().replace(/[。！？；，：、]/g, ''))
-  }
-
-  return sentences
-}
-
 async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
   const captions = []
 
@@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
     const tl = timeline[i]
 
     if (split) {
-      // 分句模式：切分长文本
-      const sentences = splitTextIntoSentences(text)
-      if (sentences.length === 0) continue
+      // 分句模式：优先用 segments（TTS 逐句生成的精确时长），回退到字数估算
+      const segments = item.segments && item.segments.length > 1 ? item.segments : null
 
-      const totalDuration = tl.end - tl.start
+      if (segments) {
+        // 精确模式：用 segments 的实际音频时长
+        const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
+        const tlDuration = tl.end - tl.start
+        let currentTime = tl.start
 
-      // 按字数权重分配时间（改进版）
-      const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
-      let currentTime = tl.start
+        segments.forEach((seg, idx) => {
+          const segDurUs = Math.round(seg.duration * US)
+          // 按实际时长占比映射到时间线（处理 ffprobe 重新测量的差异）
+          let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
+          if (idx === segments.length - 1) {
+            duration = tl.end - currentTime
+          }
+          duration = Math.max(duration, 1000000)
 
-      sentences.forEach((sentence, idx) => {
-        // 按字数比例计算时长
-        const charRatio = sentence.length / totalChars
-        let duration = Math.round(totalDuration * charRatio)
+          const cap = {
+            start: currentTime,
+            end: currentTime + duration,
+            text: seg.text,
+            keyword: '',
+            keyword_color: '',
+          }
 
-        // 最后一句使用剩余全部时间（避免精度误差）
-        if (idx === sentences.length - 1) {
-          duration = tl.end - currentTime
-        }
+          if (inAnimation) cap.in_animation = inAnimation
+          if (outAnimation) cap.out_animation = outAnimation
+          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
+          if (outAnimDuration) cap.out_animation_duration = outAnimDuration
 
-        // 最小1秒，避免太短
-        duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒
+          captions.push(cap)
+          currentTime += duration
+        })
+      } else {
+        // 回退：字数权重估算
+        const sentences = splitTextIntoSentences(text)
+        if (sentences.length === 0) continue
 
-        const cap = {
-          start: currentTime,
-          end: currentTime + duration,
-          text: sentence,
-          keyword: '',
-          keyword_color: '',
-        }
+        const totalDuration = tl.end - tl.start
+        const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
+        let currentTime = tl.start
 
-        if (inAnimation) cap.in_animation = inAnimation
-        if (outAnimation) cap.out_animation = outAnimation
-        if (inAnimDuration) cap.in_animation_duration = inAnimDuration
-        if (outAnimDuration) cap.out_animation_duration = outAnimDuration
+        sentences.forEach((sentence, idx) => {
+          const charRatio = sentence.length / totalChars
+          let duration = Math.round(totalDuration * charRatio)
 
-        captions.push(cap)
-        currentTime += duration
-      })
+          if (idx === sentences.length - 1) {
+            duration = tl.end - currentTime
+          }
+
+          duration = Math.max(duration, 1000000)
+
+          const cap = {
+            start: currentTime,
+            end: currentTime + duration,
+            text: sentence,
+            keyword: '',
+            keyword_color: '',
+          }
+
+          if (inAnimation) cap.in_animation = inAnimation
+          if (outAnimation) cap.out_animation = outAnimation
+          if (inAnimDuration) cap.in_animation_duration = inAnimDuration
+          if (outAnimDuration) cap.out_animation_duration = outAnimDuration
+
+          captions.push(cap)
+          currentTime += duration
+        })
+      }
     } else {
       // 原始模式：一句字幕
       const keyword = ''
diff --git a/.claude/skills/video-from-script/scripts/lib/phase-tts.js b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
index 97e5f14..16b9c85 100644
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -1,11 +1,12 @@
 /**
- * Phase: tts — 语音合成
+ * Phase: tts — 语音合成（逐句分句生成）
  *
- * 使用通义千问 TTS 生成旁白音频
+ * 将每个 item 的 script 按标点切分为短句，每句单独生成 TTS 音频。
+ * 结果写入 item.segments[]，实现字幕与语音精确对齐。
  */
 
 const path = require('path')
-const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
+const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
 
 async function phaseTts(manifest, manifestPath, options = {}) {
   const dir = getManifestDir(manifestPath)
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
   for (let i = 0; i < items.length; i++) {
     const item = items[i]
     const idx = i + 1
+    const fullText = item.script || item.text
+
     try {
-      const { filePath, duration } = await synthesize(item.script || item.text, {
-        outputDir: audioDir,
-        id: item.id || idx,
-        voice: manifest.ttsVoice || undefined,
-        instruction: manifest.ttsInstruction || undefined,
-        rate: manifest.ttsRate || undefined,
-      })
-      item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
-      item.audioDuration = Math.round(duration * 1000) / 1000
-      log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
+      const sentences = splitTextIntoSentences(fullText)
+
+      if (sentences.length <= 1) {
+        // 单句：不需要 segments，走原逻辑
+        const { filePath, duration } = await synthesize(fullText, {
+          outputDir: audioDir,
+          id: item.id || idx,
+          voice: manifest.ttsVoice || undefined,
+          instruction: manifest.ttsInstruction || undefined,
+          rate: manifest.ttsRate || undefined,
+        })
+        item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
+        item.audioDuration = Math.round(duration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
+      } else {
+        // 多句：逐句生成，写入 segments
+        const segments = []
+        let totalDuration = 0
+
+        for (let j = 0; j < sentences.length; j++) {
+          const sentence = sentences[j]
+          const segId = `${item.id || idx}_${j + 1}`
+          const { filePath, duration } = await synthesize(sentence, {
+            outputDir: audioDir,
+            id: segId,
+            voice: manifest.ttsVoice || undefined,
+            instruction: manifest.ttsInstruction || undefined,
+            rate: manifest.ttsRate || undefined,
+          })
+          segments.push({
+            text: sentence,
+            audio: path.relative(dir, filePath).replace(/\\/g, '/'),
+            duration: Math.round(duration * 1000) / 1000,
+          })
+          totalDuration += duration
+        }
+
+        item.segments = segments
+        item.audio = segments[0].audio
+        item.audioDuration = Math.round(totalDuration * 1000) / 1000
+        log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
+      }
     } catch (err) {
       item.status = 'failed'
       item.error = `TTS失败: ${err.message}`
diff --git a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
index be73053..df2d756 100644
--- a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
+++ b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js
@@ -160,6 +160,36 @@ function getManifestDir(manifestPath) {
   return path.dirname(path.resolve(manifestPath))
 }
 
+// ============================================================================
+// 文本切分
+// ============================================================================
+
+/**
+ * Split text at Chinese punctuation into punctuation-free sentences.
+ * @param {string} text - raw script text; null/undefined/'' yields [].
+ * @returns {string[]} non-empty sentences, in their original order.
+ */
+function splitTextIntoSentences(text) {
+  const sentenceEnders = /[。！？；]/
+  const clauseEnders = /[，：]/
+  const allPunctuation = /[。！？；，：、]/g
+  const sentences = []
+  let current = ''
+  // Keep a piece only if text remains after stripping punctuation, so runs like "！！" never yield an empty sentence.
+  const flush = () => {
+    const cleaned = current.replace(allPunctuation, '').trim()
+    if (cleaned) sentences.push(cleaned)
+    current = ''
+  }
+  for (const char of text || '') {
+    current += char
+    // Always cut at a sentence ender; cut at a clause ender only once the buffer (punctuation included) exceeds 8 chars.
+    if (sentenceEnders.test(char) || (clauseEnders.test(char) && current.length > 8)) flush()
+  }
+  flush() // trailing text without closing punctuation
+  return sentences
+}
+
 // ============================================================================
 // Exports
 // ============================================================================
@@ -178,6 +208,7 @@ module.exports = {
   ensureDir,
   slugify,
   renameGeneratedFile,
+  splitTextIntoSentences,
   log,
   getManifestDir,
 }