video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js

/**
 * Phase: tts — 语音合成（先分段，后合成）
 *
 * 核心变化：音频分段优先于生图。
 *
 * 1. 在生成图片之前，先将文案按语义断点切分为多个音频片段
 * 2. 每个片段时长 < videoModel 固定时长（Kling=6s）
 * 3. 逐段合成，记录实测时长，写入 manifest.items[n].segments[]
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = 片段总和（供 assemble 计算 ratio）
 *
 * 流程顺序变为：tts → images → upload → videos → assemble
 */

const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')

/**
 * Recursively cut `text` in two until every piece's estimated duration
 * (`length / charsPerSec`) fits within `videoDur`.
 *
 * Each cut is made right after the first '，' found at or past the midpoint
 * when `preferComma` is set (provided that leaves a non-empty tail);
 * otherwise the cut is made exactly at the midpoint.
 *
 * @param {string} text - Text to split.
 * @param {number} videoDur - Maximum estimated duration of one piece, in seconds.
 * @param {number} charsPerSec - Assumed speaking rate, in characters per second.
 * @param {boolean} preferComma - Prefer cutting right after a full-width comma.
 * @returns {string[]} Pieces in original order; joined together they equal `text`.
 */
function splitOversizedText(text, videoDur, charsPerSec, preferComma) {
  // A single character cannot be split any further, even if it is still "too long".
  if (text.length <= 1 || text.length / charsPerSec <= videoDur) {
    return [text]
  }

  const mid = Math.floor(text.length / 2)
  let cut = mid
  if (preferComma) {
    const commaIdx = text.indexOf('，', mid)
    // Ignore a comma that is the last character: cutting there would yield an empty piece.
    if (commaIdx !== -1 && commaIdx + 1 < text.length) {
      cut = commaIdx + 1
    }
  }

  // `cut` is always in [1, length - 1], so both halves shrink and the recursion terminates.
  return [
    ...splitOversizedText(text.slice(0, cut), videoDur, charsPerSec, preferComma),
    ...splitOversizedText(text.slice(cut), videoDur, charsPerSec, preferComma),
  ]
}

/**
 * Split a script into audio segments at semantic break points so that each
 * segment's estimated duration stays within the video model's fixed clip length.
 *
 * - If `splitTextIntoSentences` yields at most one sentence, the raw text is
 *   returned as-is when it fits, otherwise it is split (preferring a cut after
 *   a '，' near the middle) until every piece fits.
 * - Otherwise sentences are packed greedily: consecutive sentences are merged
 *   while their summed estimate stays <= `videoDur`. Each sentence is
 *   re-terminated with '。' (this assumes `splitTextIntoSentences` strips the
 *   terminators — TODO confirm); the appended '。' is not counted in the estimate.
 *   A sentence that is too long on its own is cut into pieces that fit; the
 *   last piece stays open so following short sentences can still be merged into it.
 *
 * `videoDur` and `charsPerSec` are expected to be positive numbers.
 *
 * @param {string} text - Full script text.
 * @param {number} videoDur - Fixed clip duration of the video model in seconds, e.g. 6.
 * @param {number} [charsPerSec=5] - Assumed speaking rate in characters per second.
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in reading order.
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const toSegment = (piece) => ({ text: piece, estimatedDuration: piece.length / charsPerSec })

  // Prefer natural break points (sentence terminators).
  const sentences = splitTextIntoSentences(text)
  if (sentences.length <= 1) {
    // No natural break point: keep the text whole if it fits, otherwise split it.
    return splitOversizedText(text, videoDur, charsPerSec, true).map(toSegment)
  }

  const result = []
  let currentText = ''
  let currentEstDur = 0

  for (const sentence of sentences) {
    const sentenceEstDur = sentence.length / charsPerSec

    if (currentEstDur + sentenceEstDur <= videoDur) {
      // Still fits: merge into the segment being built.
      currentText += `${sentence}。`
      currentEstDur += sentenceEstDur
      continue
    }

    // Flush the segment built so far. It must be kept: the original code popped
    // it again when the next sentence was over-long, silently dropping its text.
    if (currentText) {
      result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
    }

    // Yields [sentence] unchanged when the sentence fits on its own.
    const pieces = splitOversizedText(sentence, videoDur, charsPerSec, false)
    const tail = pieces.pop()
    for (const piece of pieces) {
      result.push(toSegment(piece))
    }
    currentText = `${tail}。`
    currentEstDur = tail.length / charsPerSec
  }

  if (currentText) {
    result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
  }

  return result
}

async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)

  const { synthesize } = require('../qwen-tts')

  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15

  const items = manifest.items.filter(it =>
    (it.script || it.text) && !it.audio
  )
  if (items.length === 0) { log('tts', '无待处理 item（已合成），跳过'); return }

  log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
    const fullText = (item.script || item.text).trim()

    // Step 1: 计算音频分段
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
    for (const seg of rawSegments) {
      log('tts', `        分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }

    // Step 2: 逐段合成
    const segments = []
    let globalOffset = 0

    for (let j = 0; j < rawSegments.length; j++) {
      const segInput = rawSegments[j]
      const segId = `${item.id}_${j + 1}`

      try {
        const { filePath, duration: realDuration } = await synthesize(segInput.text, {
          outputDir: audioDir,
          id: segId,
          voice: manifest.ttsVoice || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: ttsRate,
        })

        const segment = {
          id: segId,
          text: segInput.text,
          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
          estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
          duration: Math.round(realDuration * 1000) / 1000,
          startOffset: Math.round(globalOffset * 1000) / 1000,
        }
        segments.push(segment)
        globalOffset += realDuration

        log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
      } catch (err) {
        log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          estimatedDuration: segInput.estimatedDuration,
          duration: 0,
          startOffset: globalOffset,
          error: err.message,
        })
        globalOffset += segInput.estimatedDuration
      }
    }

    // Step 3: 汇总到 item
    const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
    item.segments = segments
    item.audio = segments[0]?.audio || ''
    item.audioDuration = totalAudioDuration
    item.segmentCount = segments.length

    // Step 4: 时长合规诊断
    const ratio = videoDur / totalAudioDuration
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)，ratio=${ratio.toFixed(2)}，assemble 将截断`
    }

    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)

    saveManifest(manifestPath, manifest)
  }
}

// Public API: the TTS pipeline phase and the text-splitting helper it uses.
module.exports = { phaseTts, splitIntoAudioSegments }