- 新增 minimax-tts.js 和 minimax-voice-clone.js 脚本 - 新增口播批量剪辑流水线 (口播_assemble.js, 口播_pipeline.js) - 更新 video-from-script 各阶段脚本 (kling, images, tts, videos) - 新增执黑先行二号-风格延伸账号 - 更新执黑先行 account.json 配置 - 替换 ugc_product_seeding 参考图 - 更新 CLAUDE.md 和依赖配置 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
215 lines
7.8 KiB
JavaScript
215 lines
7.8 KiB
JavaScript
/**
|
||
* Phase: tts — 语音合成(先分段,后合成)
|
||
*
|
||
* 核心变化:音频分段优先于生图。
|
||
*
|
||
* 1. 在生成图片之前,先将文案按语义断点切分为多个音频片段
|
||
* 2. 每个片段时长 < videoModel 固定时长(Kling=6s)
|
||
* 3. 逐段合成,记录实测时长,写入 manifest.items[n].segments[]
|
||
* 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
|
||
* 5. manifest.items[n].audioDuration = 片段总和(供 assemble 计算 ratio)
|
||
*
|
||
* 流程顺序变为:tts → images → upload → videos → assemble
|
||
*/
|
||
|
||
const path = require('path')
|
||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||
|
||
/**
 * Split a script into audio segments at punctuation break points.
 *
 * The text is first cut after every break character (the punctuation stays
 * attached to the preceding part), then adjacent parts are merged greedily
 * while the merged estimate still fits into one video clip. A single part
 * that cannot fit on its own is cut into the fewest near-equal chunks that
 * do fit, so no segment exceeds `videoDur` as long as `videoDur` allows at
 * least one character per segment.
 *
 * Durations are estimates only: character count divided by `charsPerSec`.
 *
 * @param {string} text - Full script text.
 * @param {number} videoDur - Fixed clip length of the video model in seconds (e.g. 6).
 * @param {number} [charsPerSec=5] - Assumed speaking rate in characters per second.
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in
 *   original order; concatenating their `text` fields reproduces `text`.
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const estimate = (chars) => chars / charsPerSec
  const toSegment = (segText) => ({ text: segText, estimatedDuration: estimate(segText.length) })

  // Whole text already fits into a single clip.
  if (estimate(text.length) <= videoDur) {
    return [toSegment(text)]
  }

  // Break after 。 ! ; , — both full-width and ASCII forms are accepted.
  // Escapes are used so the full-width characters survive re-encoding.
  const breakPattern = /[\u3002\uFF01\uFF1B\uFF0C!;,]/

  // Cut after each break character, keeping the punctuation in place.
  const rawParts = []
  let partStart = 0
  for (let i = 0; i < text.length; i++) {
    if (breakPattern.test(text[i])) {
      rawParts.push(text.slice(partStart, i + 1))
      partStart = i + 1
    }
  }
  if (partStart < text.length) {
    rawParts.push(text.slice(partStart))
  }

  // Largest whole number of characters that fits into one clip (at least 1,
  // so a degenerate videoDur cannot produce a zero or infinite chunk count).
  const maxChars = Math.max(1, Math.floor(videoDur * charsPerSec))

  // Cut an over-long run into the fewest near-equal chunks that each fit.
  // With two chunks the boundary is floor(len / 2), matching a plain halving.
  const chunkEvenly = (run) => {
    const count = Math.max(2, Math.ceil(run.length / maxChars))
    const chunks = []
    for (let k = 0; k < count; k++) {
      const from = Math.floor((run.length * k) / count)
      const to = Math.floor((run.length * (k + 1)) / count)
      // Skip empty slices (only possible when the run is shorter than count).
      if (to > from) chunks.push(run.slice(from, to))
    }
    return chunks
  }

  const result = []
  let current = ''
  const flush = () => {
    if (current) {
      result.push(toSegment(current))
      current = ''
    }
  }

  for (const part of rawParts) {
    // Fit check uses character counts rather than a running float sum, so
    // accumulated rounding error cannot push an exact fit over the limit.
    if (estimate(current.length + part.length) <= videoDur) {
      current += part
      continue
    }

    flush()

    if (estimate(part.length) > videoDur) {
      // No usable break point inside this part: force-split it. The last
      // chunk stays open so following short parts can still be merged in.
      const chunks = chunkEvenly(part)
      current = chunks.pop() || ''
      for (const chunk of chunks) {
        result.push(toSegment(chunk))
      }
    } else {
      current = part
    }
  }
  flush()

  return result
}
|
||
|
||
/**
 * TTS phase: split each pending item's script into segments, synthesize every
 * segment to an audio file, and record the results on the manifest item.
 *
 * An item is pending when it has `script` or `text` and no `audio` yet.
 * For each pending item this writes `segments`, `audio` (first segment's
 * file), `audioDuration`, `segmentCount` and, when the clip is noticeably
 * shorter than the audio, `_timelineWarning`; the manifest is saved after
 * every item.
 *
 * @param {object} manifest - Pipeline manifest; mutated in place.
 * @param {string} manifestPath - Path of the manifest file; audio is written
 *   to an `audio` directory under the manifest directory.
 * @param {object} [options={}] - Currently not read by this function.
 * @returns {Promise<void>}
 * @throws {Error} When nothing is pending and no item has audio at all.
 */
async function phaseTts(manifest, manifestPath, options = {}) {
  const roundMs = (seconds) => Math.round(seconds * 1000) / 1000
  const hasText = (entry) => entry.script || entry.text

  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)

  // Pick the synthesis backend; anything other than 'minimax' uses qwen.
  let ttsModule
  if ((manifest.ttsEngine || 'qwen') === 'minimax') {
    ttsModule = require('../minimax-tts')
  } else {
    ttsModule = require('../qwen-tts')
  }
  const { synthesize } = ttsModule

  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15

  const pending = manifest.items.filter((entry) => hasText(entry) && !entry.audio)

  if (pending.length === 0) {
    const total = manifest.items.length
    const withAudio = manifest.items.filter((entry) => entry.audio).length
    const withScript = manifest.items.filter((entry) => hasText(entry)).length

    // Some audio already exists: nothing left to do for this phase.
    if (withAudio !== 0) {
      log('tts', '所有音频已合成,跳过')
      return
    }

    // Nothing pending and nothing produced: the manifest itself is unusable.
    const rule = "=".repeat(60)
    console.error("\n" + rule)
    console.error("❌ [tts] 严重错误:没有任何待处理的配音项,且 manifest 中也没有已生成的音频!")
    console.error(rule)
    console.error(` 总数: ${total} | 有script: ${withScript} | 已有audio: ${withAudio}`)
    if (withScript === 0) console.error(" 根因: 所有 item 都缺少 script 文本 — 分镜文件有问题")
    console.error(rule + "\n")
    throw new Error(`TTS 阶段中断: ${total} 个 item 均无音频且无待处理项 (script=${withScript}/${total})`)
  }

  log('tts', `共 ${pending.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  // Up to three attempts per segment, waiting 3s then 6s between attempts.
  // Resolves with the synthesis result (or null) and the last error seen.
  const synthesizeWithRetry = async (segText, segId, label) => {
    let lastErr = null
    for (let attempt = 0; attempt < 3; attempt++) {
      try {
        const synthResult = await synthesize(segText, {
          outputDir: audioDir,
          id: segId,
          voice: manifest.ttsVoice || undefined,
          model: manifest.ttsModel || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: ttsRate,
          emotion: manifest.ttsEmotion || undefined,
          languageBoost: manifest.ttsLanguageBoost || undefined,
          pitch: manifest.ttsPitch ?? undefined,
        })
        return { synthResult, lastErr }
      } catch (e) {
        lastErr = e
        if (attempt < 2) {
          const delay = 3000 * 2 ** attempt
          log('tts', `${label} 重试 ${attempt + 1}/3, ${delay / 1000}s 后重试...`)
          await new Promise((resolve) => setTimeout(resolve, delay))
        }
      }
    }
    return { synthResult: null, lastErr }
  }

  for (const [i, item] of pending.entries()) {
    const progress = `[${i + 1}/${pending.length}]`
    const fullText = (item.script || item.text).trim()

    // Step 1: plan the segments from the text alone.
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `${progress} 原始分段: ${rawSegments.length} 段`)
    rawSegments.forEach((planned) => {
      log('tts', ` 分段估算: ${planned.estimatedDuration.toFixed(2)}s / ${planned.text.slice(0, 20)}...`)
    })

    // Step 2: synthesize each segment in order, tracking the running offset.
    const segments = []
    let globalOffset = 0

    for (const [j, segInput] of rawSegments.entries()) {
      const segId = `${item.id}_${j + 1}`
      const label = `${progress} 段${j + 1}`

      const { synthResult, lastErr } = await synthesizeWithRetry(segInput.text, segId, label)

      if (!synthResult) {
        // Keep a placeholder so the segment list stays aligned with the text;
        // the estimate stands in for the missing duration on the timeline.
        log('tts', `${label} 合成失败(重试3次后): ${lastErr?.message || '未知错误'}`)
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          estimatedDuration: segInput.estimatedDuration,
          duration: 0,
          startOffset: globalOffset,
          error: lastErr?.message || '未知错误',
        })
        globalOffset += segInput.estimatedDuration
        continue
      }

      const { filePath, duration: realDuration } = synthResult
      segments.push({
        id: segId,
        text: segInput.text,
        // Stored relative to the manifest directory, with forward slashes.
        audio: path.relative(dir, filePath).replace(/\\/g, '/'),
        estimatedDuration: roundMs(segInput.estimatedDuration),
        duration: roundMs(realDuration),
        startOffset: roundMs(globalOffset),
      })
      globalOffset += realDuration

      log('tts', `${label}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
    }

    // Step 3: roll the per-segment results up onto the item.
    const totalAudioDuration = roundMs(globalOffset)
    Object.assign(item, {
      segments,
      audio: segments[0]?.audio || '',
      audioDuration: totalAudioDuration,
      segmentCount: segments.length,
    })

    // Step 4: flag items whose audio runs clearly longer than one clip.
    const ratio = videoDur / totalAudioDuration
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s),ratio=${ratio.toFixed(2)},assemble 将截断`
    }

    log('tts', `${progress} 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)

    // Persist after every item so finished work is on disk before the next one starts.
    saveManifest(manifestPath, manifest)
  }
}
|
||
|
||
// Exports the TTS phase runner and the text splitter it uses.
module.exports = { phaseTts, splitIntoAudioSegments }
|