refactor(video-pipeline): 移除 segments 机制,改为整段音频合成

移除 TTS 阶段逐句切分及 segments 数组逻辑,统一为整段音频合成。
CapCut 字幕切分由组装阶段按字符比例分配,简化音频上传、
时间线构建和字幕生成流程,减少冗余处理分支。
This commit is contained in:
2026-05-02 02:31:55 +08:00
parent ac753ef367
commit 6097a809bf
9 changed files with 95 additions and 244 deletions

View File

@@ -65,24 +65,6 @@ async function batchUploadToOSS(inputDir, files, concurrency = 3) {
async function batchUploadAudio(inputDir, items) {
const urls = {}
for (const item of items) {
if (item.segments && item.segments.length > 0) {
for (const seg of item.segments) {
if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
const filePath = path.isAbsolute(seg.audio)
? seg.audio
: path.resolve(inputDir, seg.audio)
if (!fs.existsSync(filePath)) {
console.error(` 音频文件不存在: ${filePath}`)
continue
}
try {
urls[seg.audio] = await uploadToOSS(filePath)
console.log(` 上传: ${path.basename(filePath)} -> OK`)
} catch (err) {
console.error(` 上传失败: ${path.basename(filePath)} - ${err.message}`)
}
}
}
if (!item.audio || item.audio.startsWith('http')) {
if (item.audio) urls[item.audio] = item.audio
continue
@@ -174,17 +156,7 @@ async function assemble(args) {
// ffprobe 测量实际时长
let audioMeasured = 0, videoMeasured = 0
for (const item of items) {
if (item.segments && item.segments.length > 0) {
for (const seg of item.segments) {
if (!seg.audio || seg.audio.startsWith('http')) continue
const audioPath = path.isAbsolute(seg.audio)
? seg.audio
: path.resolve(inputDir, seg.audio)
if (!fs.existsSync(audioPath)) continue
const actualDur = await getAudioDurationSec(audioPath)
if (actualDur != null) { seg.duration = actualDur; audioMeasured++ }
}
} else if (item.audio && !item.audio.startsWith('http')) {
if (item.audio && !item.audio.startsWith('http')) {
const audioPath = path.isAbsolute(item.audio)
? item.audio
: path.resolve(inputDir, item.audio)
@@ -216,9 +188,7 @@ async function assemble(args) {
const item = items[i]
const tl = timeline[i]
if (tl.skip) { console.log(` [${i + 1}] 跳过(无音频)`); continue }
const audioDur = item.segments
? item.segments.reduce((s, seg) => s + (seg.duration || 0), 0)
: (item.audioDuration || 0)
const audioDur = item.audioDuration || 0
const slotDur = tl.duration / US
const diff = slotDur - audioDur
const videoDur = (item.videoDuration || 0)
@@ -341,14 +311,6 @@ async function assemble(args) {
item.audio = audioUrls[item.audio]
changed = true
}
if (item.segments) {
for (const seg of item.segments) {
if (seg.audio && audioUrls[seg.audio]) {
seg.audio = audioUrls[seg.audio]
changed = true
}
}
}
}
if (changed) saveManifest(manifestFile, manifest)
}

View File

@@ -23,12 +23,7 @@ const { US } = require('./capcut-api')
function buildTimeline(items) {
let offset = 0
return items.map(item => {
let audioDur
if (item.segments && item.segments.length > 0) {
audioDur = item.segments.reduce((sum, s) => sum + (s.duration || 0), 0) * US
} else {
audioDur = (item.audioDuration != null) ? item.audioDuration * US : 0
}
const audioDur = (item.audioDuration != null) ? item.audioDuration * US : 0
const videoDur = (item.videoDuration != null) ? item.videoDuration * US : 0
const hasVideo = !!(item.video || item.videoUrl || item.url)

View File

@@ -308,7 +308,7 @@ async function addVideos(draftUrl, inputDir, items, timeline, width, height, tra
// ============================================================================
async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
const audioItems = items.filter(item => item.audio)
if (audioItems.length === 0) {
console.log(' 无 TTS 音频文件,跳过')
return
@@ -325,25 +325,7 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {})
const item = items[i]
const tl = timeline[i]
if (item.segments && item.segments.length > 0) {
let currentTime = tl.start
for (let si = 0; si < item.segments.length; si++) {
const seg = item.segments[si]
const audioUrl = resolveAudio(seg.audio)
const segDurUs = (seg.duration || 0) * US
if (segDurUs <= 0) continue
const isLast = si === item.segments.length - 1
const endTime = isLast ? tl.end : currentTime + segDurUs
audioInfos.push({
audio_url: audioUrl,
start: currentTime,
end: endTime,
duration: endTime - currentTime,
volume: 1.0,
})
currentTime = endTime
}
} else if (item.audio) {
if (item.audio) {
const audioUrl = resolveAudio(item.audio)
const audioDurUs = item.audioDuration ? item.audioDuration * US : tl.duration
@@ -421,48 +403,33 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
const tl = timeline[i]
if (split) {
if (item.segments && item.segments.length > 0) {
let currentTime = tl.start
for (let si = 0; si < item.segments.length; si++) {
const seg = item.segments[si]
const segDurUs = (seg.duration || 0) * US
if (segDurUs <= 0) continue
const isLast = si === item.segments.length - 1
const endTime = isLast ? tl.end : currentTime + segDurUs
const cap = { start: currentTime, end: endTime, text: seg.text }
applyAnimationProps(cap, animStyle)
captions.push(cap)
currentTime = endTime
const sentences = splitTextIntoSentences(text)
if (sentences.length === 0) continue
const totalDuration = tl.end - tl.start
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
let currentTime = tl.start
sentences.forEach((sentence, idx) => {
const charRatio = sentence.length / totalChars
let duration = Math.round(totalDuration * charRatio)
if (idx === sentences.length - 1) {
duration = tl.end - currentTime
}
} else {
const sentences = splitTextIntoSentences(text)
if (sentences.length === 0) continue
const totalDuration = tl.end - tl.start
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
let currentTime = tl.start
duration = Math.max(duration, 500000)
sentences.forEach((sentence, idx) => {
const charRatio = sentence.length / totalChars
let duration = Math.round(totalDuration * charRatio)
const cap = {
start: currentTime,
end: currentTime + duration,
text: sentence,
}
if (idx === sentences.length - 1) {
duration = tl.end - currentTime
}
duration = Math.max(duration, 500000)
const cap = {
start: currentTime,
end: currentTime + duration,
text: sentence,
}
applyAnimationProps(cap, animStyle)
captions.push(cap)
currentTime += duration
})
}
applyAnimationProps(cap, animStyle)
captions.push(cap)
currentTime += duration
})
} else {
const cap = {
start: tl.start,

View File

@@ -28,7 +28,7 @@ async function phaseAssemble(manifest, manifestPath, options) {
manifest: manifestPath,
mode,
format: manifest.format || accountConfig.defaultFormat || '9:16',
subtitles: mode === 'images' ? 'true' : 'false',
subtitles: 'true',
voiceover: manifest.items.some(it => it.audio) ? 'true' : 'false',
animation: capcutConfig.animation || '渐显+放大',
}

View File

@@ -1,13 +1,13 @@
/**
* Phase: tts — 语音合成(逐句分句生成)
* Phase: tts — 语音合成(整段合成)
*
* 每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频
* 统一写入 item.segments[],单句时数组仅 1 个元素
* item.audio 指向第一段,item.audioDuration 为累计时长
* 每个 item 的 script 整段合成一个音频文件,保留自然语调
* item.audio 指向完整音频,item.audioDuration 为总时长
* 字幕切分由组装阶段按字符比例分配,不在 TTS 阶段处理
*/
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
async function phaseTts(manifest, manifestPath, options = {}) {
const dir = getManifestDir(manifestPath)
@@ -29,33 +29,18 @@ async function phaseTts(manifest, manifestPath, options = {}) {
const fullText = item.script || item.text
try {
const sentences = splitTextIntoSentences(fullText)
const segments = []
let totalDuration = 0
const { filePath, duration } = await synthesize(fullText, {
outputDir: audioDir,
id: String(item.id || idx),
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
for (let j = 0; j < sentences.length; j++) {
const sentence = sentences[j]
const segId = `${item.id || idx}_${j + 1}`
const { filePath, duration } = await synthesize(sentence, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
segments.push({
text: sentence,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
duration: Math.round(duration * 1000) / 1000,
})
totalDuration += duration
}
// 统一使用 segments 数组(单句 = 1 元素,多句 = N 元素)
item.segments = segments
item.audio = segments[0].audio
item.audioDuration = Math.round(totalDuration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
const totalDuration = Math.round(duration * 1000) / 1000
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
item.audioDuration = totalDuration
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
} catch (err) {
item.status = 'failed'
item.error = `TTS失败: ${err.message}`

View File

@@ -165,8 +165,8 @@ function getManifestDir(manifestPath) {
// ============================================================================
function splitTextIntoSentences(text) {
const sentenceEnders = /[。!?;]/
const clauseEnders = /[]/
// 在句号、感叹号、分号、逗号处断句——它们是口播语音的天然呼吸点。
const sentenceEnders = /[。!;,]/
const sentences = []
let current = ''
@@ -175,16 +175,13 @@ function splitTextIntoSentences(text) {
current += char
if (sentenceEnders.test(char)) {
sentences.push(current.trim().replace(/[。!]/g, ''))
current = ''
} else if (clauseEnders.test(char) && current.length > 8) {
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
sentences.push(current.trim().replace(/[。!;,:?、——…]/g, ''))
current = ''
}
}
if (current.trim()) {
sentences.push(current.trim().replace(/[。!]/g, ''))
sentences.push(current.trim().replace(/[。!;,:?、——…]/g, ''))
}
return sentences