feat(skills): 完善视频生产 pipeline 及新增健身跟练账号

- SKILL.md: 新增工作流阶段定义、质量卡点、分镜规则
- manifest-schema.md: 补充完整字段规范及类型定义
- phase-tts.js: 优化 TTS 合成长逻辑，添加进度追踪
- capcut-tracks.js: 扩展轨道构建能力，支持更多元素类型
- capcut-timeline.js: 改进时间线生成，支持淡入淡出
- capcut_assemble.js: 新增 assemble 阶段完整实现
- cmd-init.js: 完善 init 命令逻辑
- qwen-tts.js: 调整超时配置
- accounts/禁忌帝王学: 更新拆分/图像/台词提示词
- accounts/健身跟练: 新增账号含 account.json 及全套提示词模板
- 新增 workflow-issues-20260501.md 参考文档

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 22:53:37 +08:00
parent e6daf7a8d8
commit 6eec0e8889
28 changed files with 2199 additions and 253 deletions
--- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js
+++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js
@@ -1,13 +1,100 @@
 /**
- * Phase: tts — 语音合成（整段合成）
+ * Phase: tts — 语音合成（先分段，后合成）
 *
- * 每个 item 的 script 整段合成一个音频文件，保留自然语调。
- * item.audio 指向完整音频，item.audioDuration 为总时长。
- * 字幕切分由组装阶段按字符比例分配，不在 TTS 阶段处理。
+ * 核心变化：音频分段优先于生图。
+ *
+ * 1. 在生成图片之前，先将文案按语义断点切分为多个音频片段
+ * 2. 每个片段时长 < videoModel 固定时长（Kling=6s）
+ * 3. 逐段合成，记录实测时长，写入 manifest.segments[]
+ * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
+ * 5. manifest.items[n].audioDuration = 片段总和（供 assemble 计算 ratio）
+ *
+ * 流程顺序变为：tts → images → upload → videos → assemble
 */

 const path = require('path')
-const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
+const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
+
+/**
+ * Split a script into audio segments at semantic break points.
+ *
+ * Sentences from `splitTextIntoSentences` are packed greedily so that each segment's
+ * estimated duration (`chars / charsPerSec`) stays within `videoDur`. Text that is too
+ * long on its own is cut into as many chunks as needed, preferring a full-width comma
+ * near an even split point, so no chunk exceeds the limit and no text is dropped.
+ *
+ * @param {string} text - Full script text.
+ * @param {number} videoDur - Fixed clip length of the video model in seconds, e.g. 6.
+ * @param {number} [charsPerSec=5] - Assumed speaking rate in characters per second.
+ * @returns {Array<{text: string, estimatedDuration: number}>} Segments in order; [] for blank text.
+ * @throws {RangeError} If `videoDur` or `charsPerSec` is not a positive number.
+ */
+function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
+  if (!(videoDur > 0) || !(charsPerSec > 0)) {
+    throw new RangeError(`videoDur and charsPerSec must be positive, got ${videoDur} / ${charsPerSec}`)
+  }
+  if (typeof text !== 'string' || text.trim() === '') return []
+
+  // Longest segment, in characters, whose estimate still fits into one clip.
+  const maxChars = Math.max(1, Math.floor(videoDur * charsPerSec))
+  const endsWithStop = /[。！？；.!?;…]\s*$/
+  const toSegment = str => ({ text: str, estimatedDuration: str.length / charsPerSec })
+
+  // Cut an over-long string into pieces of at most maxChars characters.
+  const chunkOversized = str => {
+    const chunks = []
+    let rest = str
+    while (rest.length > maxChars) {
+      // Aim for an even split, then snap to a nearby comma that keeps the piece in range.
+      const target = Math.ceil(rest.length / Math.ceil(rest.length / maxChars))
+      const ahead = rest.indexOf('，', target - 1)
+      const behind = rest.lastIndexOf('，', Math.max(0, target - 2))
+      let cut = target
+      if (ahead !== -1 && ahead < maxChars) {
+        cut = ahead + 1
+      } else if (behind + 1 >= Math.ceil(target / 2)) {
+        cut = behind + 1
+      }
+      chunks.push(rest.slice(0, cut))
+      rest = rest.slice(cut)
+    }
+    if (rest) chunks.push(rest)
+    return chunks
+  }
+
+  const sentences = splitTextIntoSentences(text).filter(s => s.trim() !== '')
+  if (sentences.length <= 1) {
+    // No usable sentence breaks: keep the text verbatim, chunked only when too long.
+    return chunkOversized(text).map(toSegment)
+  }
+
+  const result = []
+  let currentText = ''
+  let currentEstDur = 0
+  const flush = () => {
+    if (currentText) result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
+    currentText = ''
+    currentEstDur = 0
+  }
+
+  for (const sentence of sentences) {
+    let tail = sentence
+    if (sentence.length / charsPerSec > videoDur) {
+      // Over-long sentence: keep the pending segment, emit all chunks but the (still open) last.
+      flush()
+      const chunks = chunkOversized(sentence)
+      tail = chunks.pop()
+      result.push(...chunks.map(toSegment))
+    } else if (currentEstDur + sentence.length / charsPerSec > videoDur) {
+      flush()
+    }
+    // The helper is assumed to strip sentence terminators; re-add one only if missing.
+    currentText += endsWithStop.test(tail) ? tail : `${tail}。`
+    currentEstDur += tail.length / charsPerSec
+  }
+  flush()
+  return result
+}

 async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
@@ -16,38 +103,89 @@ async function phaseTts(manifest, manifestPath, options = {}) {

  const { synthesize } = require('../qwen-tts')

-  const items = manifest.items.filter(it =>
-    it.status === 'done' && (it.script || it.text) && !it.audio
-  )
-  if (items.length === 0) { log('tts', '无待处理 item，跳过'); return }
+  const videoDur = manifest.estimatedVideoDuration || 6
+  const ttsRate = manifest.ttsRate || 1.15

-  log('tts', `共 ${items.length} 段`)
+  const items = manifest.items.filter(it =>
+    (it.script || it.text) && !it.audio
+  )
+  if (items.length === 0) { log('tts', '无待处理 item（已合成），跳过'); return }
+
+  log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
-    const fullText = item.script || item.text
+    const fullText = (item.script || item.text).trim()

-    try {
-      const { filePath, duration } = await synthesize(fullText, {
-        outputDir: audioDir,
-        id: String(item.id || idx),
-        voice: manifest.ttsVoice || undefined,
-        instruction: manifest.ttsInstruction || undefined,
-        rate: manifest.ttsRate || undefined,
-      })
-
-      const totalDuration = Math.round(duration * 1000) / 1000
-      item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
-      item.audioDuration = totalDuration
-      log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
-    } catch (err) {
-      item.status = 'failed'
-      item.error = `TTS失败: ${err.message}`
-      log('tts', `[${idx}/${items.length}] 失败: ${err.message}`)
+    // Step 1: 计算音频分段
+    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
+    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
+    for (const seg of rawSegments) {
+      log('tts', `        分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }
+
+    // Step 2: 逐段合成
+    const segments = []
+    let globalOffset = 0
+
+    for (let j = 0; j < rawSegments.length; j++) {
+      const segInput = rawSegments[j]
+      const segId = `${item.id}_${j + 1}`
+
+      try {
+        const { filePath, duration: realDuration } = await synthesize(segInput.text, {
+          outputDir: audioDir,
+          id: segId,
+          voice: manifest.ttsVoice || undefined,
+          instruction: manifest.ttsInstruction || undefined,
+          rate: ttsRate,
+        })
+
+        const segment = {
+          id: segId,
+          text: segInput.text,
+          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
+          estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
+          duration: Math.round(realDuration * 1000) / 1000,
+          startOffset: Math.round(globalOffset * 1000) / 1000,
+        }
+        segments.push(segment)
+        globalOffset += realDuration
+
+        log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
+      } catch (err) {
+        log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
+        segments.push({
+          id: segId,
+          text: segInput.text,
+          audio: '',
+          estimatedDuration: segInput.estimatedDuration,
+          duration: 0,
+          startOffset: globalOffset,
+          error: err.message,
+        })
+        globalOffset += segInput.estimatedDuration
+      }
+    }
+
+    // Step 3: 汇总到 item
+    const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
+    item.segments = segments
+    item.audio = segments[0]?.audio || ''
+    item.audioDuration = totalAudioDuration
+    item.segmentCount = segments.length
+
+    // Step 4: 时长合规诊断
+    const ratio = videoDur / totalAudioDuration
+    if (ratio < 0.9) {
+      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)，ratio=${ratio.toFixed(2)}，assemble 将截断`
+    }
+
+    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
+
    saveManifest(manifestPath, manifest)
  }
 }

-module.exports = { phaseTts }
+module.exports = { phaseTts, splitIntoAudioSegments }