video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js

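/**
 * Phase: tts — speech synthesis (split first, then synthesize)
 *
 * Key change: audio segmentation now happens before image generation.
 *
 * 1. Before any image is generated, each item's script is split into several
 *    audio segments at punctuation breakpoints.
 * 2. Each segment aims to stay within the video model's fixed clip length
 *    (Kling = 6s).
 * 3. Segments are synthesized one by one; the measured duration of each is
 *    recorded in manifest.items[n].segments[].
 * 4. manifest.items[n].segments =
 *    [{id, text, audio, estimatedDuration, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = total of the segment durations
 *    (used by assemble to compute the ratio).
 *
 * Pipeline order becomes: tts → images → upload → videos → assemble
 */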

const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')

/**
 * Split narration text into audio segments at punctuation breakpoints.
 *
 * Every returned segment has an estimated duration (characters / charsPerSec)
 * of at most `videoDur`. Adjacent clauses are merged greedily so that segments
 * end up as close to `videoDur` as possible. A clause (or a text without any
 * breakpoint) that is too long for one segment is cut into the smallest number
 * of near-equal pieces that each fit, so the budget holds for any input length.
 *
 * Length is measured in UTF-16 code units (`String.prototype.length`).
 *
 * @param {string} text - Full narration text.
 * @param {number} videoDur - Fixed clip length of the video model in seconds, e.g. 6.
 * @param {number} [charsPerSec=5] - Speaking rate in characters per second.
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in
 *   original order; their texts concatenate back to `text`.
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const toSegment = (chunk) => ({
    text: chunk,
    estimatedDuration: chunk.length / charsPerSec,
  })

  // The whole text already fits into a single clip.
  if (text.length / charsPerSec <= videoDur) {
    return [toSegment(text)]
  }

  // Use an integer character budget rather than summing fractional durations,
  // so accumulated floating-point error can never push a segment over the limit.
  // The budget is clamped to at least one character so splitting always terminates.
  const rawBudget = Math.floor(videoDur * charsPerSec + 1e-9)
  const maxChars = rawBudget >= 1 ? rawBudget : 1

  // Cut an over-long chunk into the minimum number of near-equal pieces that
  // each fit the budget. For two pieces the cut is at floor(length / 2).
  const splitEvenly = (chunk) => {
    const count = Math.ceil(chunk.length / maxChars)
    const pieces = []
    for (let k = 0; k < count; k++) {
      const from = Math.floor((k * chunk.length) / count)
      const to = Math.floor(((k + 1) * chunk.length) / count)
      pieces.push(chunk.slice(from, to))
    }
    return pieces
  }

  // Split after each breakpoint, keeping the original punctuation attached
  // to the clause it terminates.
  const breakPattern = /[。！；，]/
  const rawParts = []
  let lastIdx = 0
  for (let i = 0; i < text.length; i++) {
    if (breakPattern.test(text[i])) {
      rawParts.push(text.slice(lastIdx, i + 1))
      lastIdx = i + 1
    }
  }
  if (lastIdx < text.length) {
    rawParts.push(text.slice(lastIdx))
  }

  // Greedily merge clauses while the running segment stays within the budget.
  const result = []
  let current = ''
  const flush = () => {
    if (current) {
      result.push(toSegment(current))
      current = ''
    }
  }

  for (const part of rawParts) {
    if (current.length + part.length <= maxChars) {
      current += part
      continue
    }

    flush()

    if (part.length <= maxChars) {
      current = part
      continue
    }

    // A single clause exceeds the budget (this also covers text without any
    // breakpoint). Emit all pieces but the last; the last stays open so that
    // following short clauses can still be merged into it.
    const pieces = splitEvenly(part)
    for (const piece of pieces.slice(0, -1)) {
      result.push(toSegment(piece))
    }
    current = pieces[pieces.length - 1]
  }
  flush()

  return result
}

/**
 * Phase "tts": synthesize speech for every manifest item that has text
 * (`script` or `text`) but no `audio` yet.
 *
 * For each such item the text is split with `splitIntoAudioSegments`, every
 * segment is synthesized in order, and the item is updated in place with
 * `segments`, `audio` (path of the first segment), `audioDuration` and
 * `segmentCount`. A segment whose synthesis fails is still recorded (with
 * `audio: ''`, `duration: 0` and an `error` message) and advances the timeline
 * by its estimated duration. The manifest is saved after each item.
 *
 * @param {object} manifest - Pipeline manifest; mutated in place.
 * @param {string} manifestPath - Path of the manifest file; audio is written
 *   to an `audio` directory next to it.
 * @param {object} [options] - Currently unused.
 * @returns {Promise<void>}
 */
async function phaseTts(manifest, manifestPath, options = {}) {
  // Durations and offsets are stored with millisecond precision.
  const roundMs = (seconds) => Math.round(seconds * 1000) / 1000

  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)

  // Loaded lazily so the TTS client is only required when this phase runs.
  const { synthesize } = require('../qwen-tts')

  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15

  const items = manifest.items.filter(it =>
    (it.script || it.text) && !it.audio
  )
  if (items.length === 0) { log('tts', '无待处理 item（已合成），跳过'); return }

  log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
    const fullText = (item.script || item.text).trim()

    // Step 1: plan the audio segments.
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
    for (const seg of rawSegments) {
      log('tts', `        分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }

    // Step 2: synthesize segment by segment.
    const segments = []
    let globalOffset = 0

    for (let j = 0; j < rawSegments.length; j++) {
      const segInput = rawSegments[j]
      const segId = `${item.id}_${j + 1}`

      try {
        const { filePath, duration: realDuration } = await synthesize(segInput.text, {
          outputDir: audioDir,
          id: segId,
          voice: manifest.ttsVoice || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: ttsRate,
        })

        // Validate before recording anything: a missing or non-numeric duration
        // would otherwise poison the offsets with NaN and, by throwing after the
        // push, make the catch branch record the same segment a second time.
        if (typeof realDuration !== 'number' || !Number.isFinite(realDuration)) {
          throw new Error(`synthesize returned an invalid duration: ${realDuration}`)
        }

        const segment = {
          id: segId,
          text: segInput.text,
          // Stored relative to the manifest directory, with forward slashes.
          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
          estimatedDuration: roundMs(segInput.estimatedDuration),
          duration: roundMs(realDuration),
          startOffset: roundMs(globalOffset),
        }
        segments.push(segment)
        globalOffset += realDuration

        log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
      } catch (err) {
        log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
        // Keep a placeholder so later segments keep their position; the timeline
        // advances by the estimate because no measured duration exists.
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          estimatedDuration: roundMs(segInput.estimatedDuration),
          duration: 0,
          startOffset: roundMs(globalOffset),
          error: err.message,
        })
        globalOffset += segInput.estimatedDuration
      }
    }

    // Step 3: roll the results up onto the item.
    const totalAudioDuration = roundMs(globalOffset)
    item.segments = segments
    item.audio = segments[0]?.audio || ''
    item.audioDuration = totalAudioDuration
    item.segmentCount = segments.length

    // Step 4: duration check — flag items whose audio is notably longer than the clip.
    const ratio = videoDur / totalAudioDuration
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)，ratio=${ratio.toFixed(2)}，assemble 将截断`
    }

    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)

    // Persist after every item so finished work survives a later failure.
    saveManifest(manifestPath, manifest)
  }
}

// Exports the phase entry point and the text-splitting helper.
module.exports = { phaseTts, splitIntoAudioSegments }