Files
video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js

178 lines
6.1 KiB
JavaScript
Raw Normal View History

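/**
 * Phase: tts — speech synthesis (segment first, then synthesize)
 *
 * Key change: audio segmentation now takes priority over image generation.
 *
 * 1. Before any image is generated, the copy is split at semantic break points into several audio segments.
 * 2. Each segment's duration must stay below the video model's fixed clip length (Kling = 6s).
 * 3. Segments are synthesized one by one; the measured duration of each is recorded in manifest.items[n].segments[].
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = sum of the segments (used by assemble to compute the ratio).
 *
 * The pipeline order becomes: tts → images → upload → videos → assemble
 */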
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
/**
 * Cut `text` into the fewest near-equal pieces of at most `maxChars` UTF-16 code units each.
 *
 * Used only as a last resort for runs that contain no usable punctuation.
 *
 * @param {string} text - Non-empty run of text that is too long for one segment
 * @param {number} maxChars - Maximum length of a piece; values below 1 are treated as 1
 * @returns {string[]} Pieces in order; joined together they reproduce `text`
 */
function chunkEvenly(text, maxChars) {
  const limit = Math.max(1, maxChars)
  const count = Math.ceil(text.length / limit)
  const base = Math.floor(text.length / count)
  const extra = text.length % count
  const pieces = []
  let start = 0
  for (let k = 0; k < count; k++) {
    // The trailing `extra` pieces take the spare characters, so a two-way cut
    // keeps the shorter half first (floor(length / 2)).
    const size = k < count - extra ? base : base + 1
    pieces.push(text.slice(start, start + size))
    start += size
  }
  return pieces
}

/**
 * Split copy into audio segments at punctuation marks.
 *
 * Each segment's estimated duration (length / charsPerSec) is kept at or below
 * `videoDur`. Neighbouring clauses are merged greedily so segments get as close
 * to that limit as possible. Punctuation is kept at the end of the clause it
 * terminates, so joining all segment texts reproduces `text` exactly.
 *
 * A single clause that is still too long is cut into equal-sized pieces that
 * each fit the limit, however many pieces that takes.
 *
 * @param {string} text - Full copy to split
 * @param {number} videoDur - Fixed clip length of the video model, in seconds (e.g. 6)
 * @param {number} [charsPerSec=5] - Speaking rate used for the estimate, in characters per second
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in reading order
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const estimatedTotal = text.length / charsPerSec
  if (estimatedTotal <= videoDur) {
    return [{ text, estimatedDuration: estimatedTotal }]
  }

  // Break after sentence/clause punctuation, full-width (。!?;,) and ASCII (! ? ; ,).
  // Written with escapes so the pattern survives re-encoding of this file.
  const breakPattern = /[\u3002\uFF01\uFF1F\uFF1B\uFF0C!?;,]/
  const rawParts = []
  let lastIdx = 0
  for (let i = 0; i < text.length; i++) {
    if (breakPattern.test(text[i])) {
      rawParts.push(text.slice(lastIdx, i + 1))
      lastIdx = i + 1
    }
  }
  if (lastIdx < text.length) {
    rawParts.push(text.slice(lastIdx))
  }

  // Budget in whole characters rather than summed float durations, so rounding
  // drift cannot reject a merge that lands exactly on the limit. A budget below
  // one character (or an unusable one) is clamped to 1 so the loop always advances.
  const maxChars = Math.max(1, Math.floor(videoDur * charsPerSec) || 0)
  const toSegment = (segText) => ({
    text: segText,
    estimatedDuration: segText.length / charsPerSec,
  })

  const result = []
  let current = ''
  for (const part of rawParts) {
    if (current.length + part.length <= maxChars) {
      current += part
      continue
    }
    if (current) result.push(toSegment(current))
    if (part.length <= maxChars) {
      current = part
      continue
    }
    // The clause alone exceeds the limit: cut it evenly. The last piece stays
    // open so that following short clauses can still be merged into it.
    const pieces = chunkEvenly(part, maxChars)
    current = pieces.pop()
    for (const piece of pieces) result.push(toSegment(piece))
  }
  if (current) result.push(toSegment(current))
  return result
}
async function phaseTts(manifest, manifestPath, options = {}) {
const dir = getManifestDir(manifestPath)
const audioDir = path.join(dir, 'audio')
ensureDir(audioDir)
const { synthesize } = require('../qwen-tts')
const videoDur = manifest.estimatedVideoDuration || 6
const ttsRate = manifest.ttsRate || 1.15
const items = manifest.items.filter(it =>
(it.script || it.text) && !it.audio
)
if (items.length === 0) { log('tts', '无待处理 item已合成跳过'); return }
log('tts', `${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)
for (let i = 0; i < items.length; i++) {
const item = items[i]
const idx = i + 1
const fullText = (item.script || item.text).trim()
// Step 1: 计算音频分段
const rawSegments = splitIntoAudioSegments(fullText, videoDur)
log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length}`)
for (const seg of rawSegments) {
log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
}
// Step 2: 逐段合成
const segments = []
let globalOffset = 0
for (let j = 0; j < rawSegments.length; j++) {
const segInput = rawSegments[j]
const segId = `${item.id}_${j + 1}`
try {
const { filePath, duration: realDuration } = await synthesize(segInput.text, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: ttsRate,
})
const segment = {
id: segId,
text: segInput.text,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
duration: Math.round(realDuration * 1000) / 1000,
startOffset: Math.round(globalOffset * 1000) / 1000,
}
segments.push(segment)
globalOffset += realDuration
log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
} catch (err) {
log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
segments.push({
id: segId,
text: segInput.text,
audio: '',
estimatedDuration: segInput.estimatedDuration,
duration: 0,
startOffset: globalOffset,
error: err.message,
})
globalOffset += segInput.estimatedDuration
}
}
// Step 3: 汇总到 item
const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
item.segments = segments
item.audio = segments[0]?.audio || ''
item.audioDuration = totalAudioDuration
item.segmentCount = segments.length
// Step 4: 时长合规诊断
const ratio = videoDur / totalAudioDuration
if (ratio < 0.9) {
item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)ratio=${ratio.toFixed(2)}assemble 将截断`
}
log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
saveManifest(manifestPath, manifest)
}
}
// Public API of this module: the tts phase entry point and the text-splitting helper.
module.exports = { phaseTts, splitIntoAudioSegments }