Files
video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js
lc 1e110219ff feat: MiniMax TTS集成、口播批量剪辑流水线、执黑先行二号账号
- 新增 minimax-tts.js 和 minimax-voice-clone.js 脚本
- 新增口播批量剪辑流水线 (口播_assemble.js, 口播_pipeline.js)
- 更新 video-from-script 各阶段脚本 (kling, images, tts, videos)
- 新增执黑先行二号-风格延伸账号
- 更新执黑先行 account.json 配置
- 替换 ugc_product_seeding 参考图
- 更新 CLAUDE.md 和依赖配置

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-24 20:05:56 +08:00

215 lines
7.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
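/**
 * Phase: tts — speech synthesis (segment first, then synthesize)
 *
 * Key change: audio segmentation now happens before image generation.
 *
 * 1. Before any images are generated, the script text is split into several audio segments at semantic break points
 * 2. Each segment's duration is < the video model's fixed clip duration (Kling = 6s)
 * 3. Segments are synthesized one by one; the measured duration is recorded and written to manifest.segments[]
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = sum of the segment durations (used by assemble to compute the ratio)
 *
 * The pipeline order becomes: tts → images → upload → videos → assemble
 */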
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
/**
 * Split narration text into audio segments at punctuation break points.
 *
 * Durations are estimated as `length / charsPerSec`, where length is counted
 * in UTF-16 code units. Text whose estimate already fits within `videoDur` is
 * returned as a single segment. Otherwise the text is cut after each break
 * character (the punctuation stays attached to the clause it ends), adjacent
 * clauses are greedily merged while the merged estimate stays within
 * `videoDur`, and a clause that is too long on its own is divided into the
 * smallest number of near-equal pieces that each fit.
 *
 * @param {string} text - Full narration text.
 * @param {number} videoDur - Fixed clip duration of the video model in seconds (e.g. 6).
 * @param {number} [charsPerSec=5] - Assumed speaking rate in characters per second.
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in
 *   original order; concatenating their `text` fields reproduces `text`.
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const toSegment = (chunk) => ({
    text: chunk,
    estimatedDuration: chunk.length / charsPerSec,
  })

  if (text.length / charsPerSec <= videoDur) {
    return [toSegment(text)]
  }

  // Longest chunk (in characters) whose estimate still fits in one clip.
  // Clamped to 1 so that degenerate inputs still make progress.
  const charBudget = Math.floor(videoDur * charsPerSec)
  const maxChars = charBudget >= 1 ? charBudget : 1

  // Break after full-width and ASCII clause/sentence punctuation alike
  const breakChars = new Set('。！？；，!?;,')
  const clauses = []
  let clauseStart = 0
  for (let i = 0; i < text.length; i++) {
    if (breakChars.has(text[i])) {
      clauses.push(text.slice(clauseStart, i + 1))
      clauseStart = i + 1
    }
  }
  if (clauseStart < text.length) {
    clauses.push(text.slice(clauseStart))
  }

  const segments = []
  let pending = ''
  const flushPending = () => {
    if (pending) {
      segments.push(toSegment(pending))
      pending = ''
    }
  }

  for (const clause of clauses) {
    // Greedy merge: keep appending clauses while the result fits in one clip
    if (pending.length + clause.length <= maxChars) {
      pending += clause
      continue
    }
    flushPending()
    if (clause.length <= maxChars) {
      pending = clause
      continue
    }
    // Oversized clause with no usable break point: cut it into the fewest
    // near-equal pieces that each fit. A plain halving is not enough once the
    // clause exceeds twice the budget.
    const pieceCount = Math.ceil(clause.length / maxChars)
    for (let k = 0; k < pieceCount; k++) {
      const from = Math.floor((k * clause.length) / pieceCount)
      const to = Math.floor(((k + 1) * clause.length) / pieceCount)
      const piece = clause.slice(from, to)
      if (k < pieceCount - 1) {
        segments.push(toSegment(piece))
      } else {
        // The tail stays pending so following short clauses can merge into it
        pending = piece
      }
    }
  }
  flushPending()
  return segments
}
// Maximum number of synthesis attempts per segment
const TTS_MAX_ATTEMPTS = 3
// Base delay of the exponential backoff between attempts (3s, then 6s)
const TTS_RETRY_BASE_DELAY_MS = 3000

/**
 * Round a value in seconds to millisecond precision for storage in the manifest.
 *
 * @param {number} seconds
 * @returns {number}
 */
function roundToMillis(seconds) {
  return Math.round(seconds * 1000) / 1000
}

/**
 * Call `synthesize`, retrying with exponential backoff when it rejects.
 *
 * @param {Function} synthesize - Engine function invoked as `synthesize(text, options)`.
 * @param {string} text - Segment text to synthesize.
 * @param {object} synthOptions - Options object passed through to `synthesize`.
 * @param {string} label - Log prefix identifying the item and segment.
 * @returns {Promise<{result: any, error: any}>} `result` is whatever
 *   `synthesize` resolved with (null if every attempt rejected); `error` is the
 *   last rejection reason, or null.
 */
async function synthesizeWithRetry(synthesize, text, synthOptions, label) {
  let lastErr = null
  for (let attempt = 1; attempt <= TTS_MAX_ATTEMPTS; attempt++) {
    try {
      const result = await synthesize(text, synthOptions)
      return { result, error: null }
    } catch (e) {
      lastErr = e
      if (attempt < TTS_MAX_ATTEMPTS) {
        const delay = Math.pow(2, attempt - 1) * TTS_RETRY_BASE_DELAY_MS
        log('tts', `${label} 重试 ${attempt}/${TTS_MAX_ATTEMPTS}, ${delay / 1000}s 后重试...`)
        await new Promise(r => setTimeout(r, delay))
      }
    }
  }
  return { result: null, error: lastErr }
}

/**
 * TTS phase: split each pending item's script into segments, synthesize them
 * one by one, and record the results on the item.
 *
 * An item is pending when it has `script` or `text` and no `audio` yet. For
 * each pending item this writes `segments`, `audio` (path of the first
 * segment), `audioDuration`, and `segmentCount`, then saves the manifest, so
 * progress is persisted item by item. A segment that still fails after all
 * attempts is recorded with an empty `audio`, `duration: 0` and an `error`
 * message; its estimated duration is used to advance the running offset.
 *
 * @param {object} manifest - Pipeline manifest; reads `items`, `ttsEngine`,
 *   `estimatedVideoDuration`, `ttsRate` and the `tts*` voice settings.
 * @param {string} manifestPath - Path of the manifest file; audio is written
 *   to an `audio` directory next to it.
 * @param {object} [options] - Currently unused by this phase.
 * @returns {Promise<void>}
 * @throws {Error} When there is nothing to process and no item has audio.
 */
async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)

  // Anything other than 'minimax' falls back to the qwen engine
  const engine = manifest.ttsEngine || 'qwen'
  const { synthesize } = engine === 'minimax' ? require('../minimax-tts') : require('../qwen-tts')
  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15

  const items = manifest.items.filter(it =>
    (it.script || it.text) && !it.audio
  )

  if (items.length === 0) {
    const total = manifest.items.length
    const withAudio = manifest.items.filter(it => it.audio).length
    const withScript = manifest.items.filter(it => it.script || it.text).length
    // Nothing pending AND nothing already synthesized means the manifest is broken
    if (withAudio === 0) {
      console.error("\n" + "=".repeat(60))
      console.error("❌ [tts] 严重错误:没有任何待处理的配音项,且 manifest 中也没有已生成的音频!")
      console.error("=".repeat(60))
      console.error(` 总数: ${total} | 有script: ${withScript} | 已有audio: ${withAudio}`)
      if (withScript === 0) console.error(" 根因: 所有 item 都缺少 script 文本 — 分镜文件有问题")
      console.error("=".repeat(60) + "\n")
      throw new Error(`TTS 阶段中断: ${total} 个 item 均无音频且无待处理项 (script=${withScript}/${total})`)
    }
    log('tts', '所有音频已合成,跳过')
    return
  }

  log('tts', `${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  // Manifest-wide voice settings are identical for every segment; build them once
  const voiceOptions = {
    voice: manifest.ttsVoice || undefined,
    model: manifest.ttsModel || undefined,
    instruction: manifest.ttsInstruction || undefined,
    rate: ttsRate,
    emotion: manifest.ttsEmotion || undefined,
    languageBoost: manifest.ttsLanguageBoost || undefined,
    pitch: manifest.ttsPitch ?? undefined,
  }

  for (const [i, item] of items.entries()) {
    const idx = i + 1
    const fullText = (item.script || item.text).trim()

    // Step 1: compute the audio segments
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length}`)
    for (const seg of rawSegments) {
      log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }

    // Step 2: synthesize segment by segment
    const segments = []
    let globalOffset = 0
    for (let j = 0; j < rawSegments.length; j++) {
      const segInput = rawSegments[j]
      const segId = `${item.id}_${j + 1}`
      const label = `[${idx}/${items.length}] 段${j + 1}`

      const { result: synthResult, error: lastErr } = await synthesizeWithRetry(
        synthesize,
        segInput.text,
        { outputDir: audioDir, id: segId, ...voiceOptions },
        label,
      )

      if (synthResult) {
        const { filePath, duration: realDuration } = synthResult
        segments.push({
          id: segId,
          text: segInput.text,
          // Stored relative to the manifest directory, always with forward slashes
          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
          estimatedDuration: roundToMillis(segInput.estimatedDuration),
          duration: roundToMillis(realDuration),
          startOffset: roundToMillis(globalOffset),
        })
        globalOffset += realDuration
        log('tts', `${label}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
      } else {
        log('tts', `${label} 合成失败(重试${TTS_MAX_ATTEMPTS}次后): ${lastErr?.message || '未知错误'}`)
        // Keep a placeholder so segment indexes and offsets stay aligned;
        // values are rounded exactly like those of successful segments
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          estimatedDuration: roundToMillis(segInput.estimatedDuration),
          duration: 0,
          startOffset: roundToMillis(globalOffset),
          error: lastErr?.message || '未知错误',
        })
        globalOffset += segInput.estimatedDuration
      }
    }

    // Step 3: roll the segment results up onto the item
    const totalAudioDuration = roundToMillis(globalOffset)
    item.segments = segments
    item.audio = segments[0]?.audio || ''
    item.audioDuration = totalAudioDuration
    item.segmentCount = segments.length

    // Step 4: duration compliance check
    const ratio = videoDur / totalAudioDuration
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)ratio=${ratio.toFixed(2)}assemble 将截断`
    } else {
      // Drop a warning left over from an earlier run that no longer applies
      delete item._timelineWarning
    }

    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
    // Persist after every item so an interrupted run keeps finished work
    saveManifest(manifestPath, manifest)
  }
}
// Public API: the TTS phase entry point and the text splitter it calls
module.exports = { phaseTts, splitIntoAudioSegments }