Files
video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js
sion123 b309f54430 feat(capcut): 优化音频/字幕添加策略并重构语音切分逻辑
- 音频和字幕 API 调用改为先批量添加,批量失败时逐个兜底
- 重写 `splitIntoAudioSegments`,基于原始标点保留切分,合并短片段
- `qwen-tts.js` 补充中文逗号作为句末标点判断
2026-05-06 23:21:40 +08:00

178 lines
6.1 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
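/**
 * Phase: tts — speech synthesis (segment first, then synthesize)
 *
 * Key change: audio segmentation now takes place before image generation.
 *
 * 1. Before any image is generated, split the script at semantic break points into several audio segments
 * 2. Each segment is shorter than the video model's fixed clip duration (Kling = 6s)
 * 3. Synthesize segment by segment, record the measured duration, and write it to manifest.segments[]
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = sum of the segments (used by assemble to compute the ratio)
 *
 * The pipeline order becomes: tts → images → upload → videos → assemble
 */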
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
// Characters after which an audio segment may end. Written with escapes so the
// set is unambiguous: ideographic full stop, plus ASCII and full-width forms of
// the exclamation mark, semicolon, comma and question mark.
const SEGMENT_BREAK_PATTERN = /[\u3002!\uFF01;\uFF1B,\uFF0C?\uFF1F]/

/**
 * Cut `text` into near-equal chunks of at most `maxChars` UTF-16 code units.
 * A cut is never placed between the two halves of a surrogate pair.
 *
 * @param {string} text - text with no usable punctuation break
 * @param {number} maxChars - per-chunk limit (>= 1)
 * @returns {string[]} chunks whose concatenation equals `text`
 */
function hardSplitText(text, maxChars) {
  const chunks = []
  let start = 0
  while (start < text.length) {
    const remaining = text.length - start
    // Recomputed every round so a boundary nudged by the surrogate check
    // below can never push a later chunk over the limit.
    const chunkCount = Math.max(1, Math.ceil(remaining / maxChars))
    let end = start + Math.floor(remaining / chunkCount)
    if (end < text.length) {
      const before = text.charCodeAt(end - 1)
      const after = text.charCodeAt(end)
      const splitsPair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff
      if (splitsPair) {
        // Prefer shrinking the chunk; only grow it when it would become empty.
        end = end - 1 > start ? end - 1 : end + 1
      }
    }
    chunks.push(text.slice(start, end))
    start = end
  }
  return chunks
}

/**
 * Split a script into audio segments at punctuation break points.
 *
 * Text that already fits within `videoDur` is returned as a single segment.
 * Otherwise the text is cut after each break character (the punctuation stays
 * attached to the preceding text), adjacent pieces are merged greedily while
 * the merged estimate stays <= `videoDur`, and any single piece that is still
 * too long is hard-split into as many near-equal chunks as needed. The
 * concatenation of all returned `text` values equals the input.
 *
 * Durations are estimated as `text.length / charsPerSec`. If the limit works
 * out to less than one character (or is not a number) it is clamped to one
 * character per segment so the function always terminates.
 *
 * @param {string} text - full script text
 * @param {number} videoDur - upper bound for one segment's estimated duration, in seconds (e.g. 6)
 * @param {number} charsPerSec - speaking speed in characters per second (default 5)
 * @returns {Array<{text: string, estimatedDuration: number}>}
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const toSegment = (segText) => ({
    text: segText,
    estimatedDuration: segText.length / charsPerSec,
  })

  if (text.length / charsPerSec <= videoDur) {
    return [toSegment(text)]
  }

  // Work in whole characters: comparing integer lengths avoids the rounding
  // drift that accumulates when fractional durations are summed.
  let maxChars = Math.floor(videoDur * charsPerSec)
  if (!(maxChars >= 1)) {
    maxChars = 1
  } else if (maxChars / charsPerSec > videoDur) {
    maxChars = Math.max(1, maxChars - 1)
  } else if ((maxChars + 1) / charsPerSec <= videoDur) {
    maxChars += 1
  }

  // Cut after every break character, keeping the original punctuation.
  const parts = []
  let partStart = 0
  for (let i = 0; i < text.length; i++) {
    if (SEGMENT_BREAK_PATTERN.test(text[i])) {
      parts.push(text.slice(partStart, i + 1))
      partStart = i + 1
    }
  }
  if (partStart < text.length) {
    parts.push(text.slice(partStart))
  }

  // Greedily merge short pieces; hard-split pieces that cannot fit alone.
  const result = []
  let current = ''
  for (const part of parts) {
    if (current.length + part.length <= maxChars) {
      current += part
      continue
    }
    if (current) result.push(toSegment(current))
    if (part.length <= maxChars) {
      current = part
      continue
    }
    const chunks = hardSplitText(part, maxChars)
    // The last chunk stays open so following short pieces can merge into it.
    current = chunks.pop()
    for (const chunk of chunks) result.push(toSegment(chunk))
  }
  if (current) result.push(toSegment(current))
  return result
}
/**
 * Pipeline phase "tts": split each pending item's script into audio segments
 * and synthesize them one by one.
 *
 * An item is pending when it has `script` or `text` and no `audio` yet. For
 * each pending item this writes `segments`, `audio` (the first segment's
 * audio path), `audioDuration`, `segmentCount` and, when the total audio is
 * notably longer than the video duration, `_timelineWarning`. The manifest is
 * saved after every item.
 *
 * A segment whose synthesis fails is recorded with `audio: ''`, `duration: 0`
 * and an `error` message; its estimated duration still advances the running
 * offset so later segments keep their place on the timeline.
 *
 * NOTE(review): because `item.audio` is taken from the first segment only, an
 * item whose later segments failed is still skipped by the `!it.audio` filter
 * on the next run — confirm this is intended.
 *
 * @param {object} manifest - pipeline manifest; reads `items`, `estimatedVideoDuration`,
 *   `ttsRate`, `ttsVoice` and `ttsInstruction`
 * @param {string} manifestPath - path of the manifest file; audio is written to an
 *   `audio` directory under the directory returned by `getManifestDir`
 * @param {object} [options] - currently unused
 * @returns {Promise<void>}
 */
async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)
  // Resolved at call time rather than at module load.
  const { synthesize } = require('../qwen-tts')
  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15
  // All durations and offsets stored in the manifest use millisecond precision.
  const roundMs = (seconds) => Math.round(seconds * 1000) / 1000

  const items = manifest.items.filter(it =>
    (it.script || it.text) && !it.audio
  )
  if (items.length === 0) { log('tts', '无待处理 item已合成跳过'); return }
  log('tts', `${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
    const fullText = (item.script || item.text).trim()

    // Step 1: work out the audio segments
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length}`)
    for (const seg of rawSegments) {
      log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }

    // Step 2: synthesize segment by segment
    const segments = []
    let globalOffset = 0
    for (let j = 0; j < rawSegments.length; j++) {
      const segInput = rawSegments[j]
      const segId = `${item.id}_${j + 1}`
      try {
        const { filePath, duration: realDuration } = await synthesize(segInput.text, {
          outputDir: audioDir,
          id: segId,
          voice: manifest.ttsVoice || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: ttsRate,
        })
        // Reject a malformed result before anything is recorded; otherwise a
        // NaN would poison every later offset and the segment would end up
        // in the list twice (once here, once in the catch branch).
        if (typeof realDuration !== 'number' || !Number.isFinite(realDuration)) {
          throw new Error(`invalid TTS duration: ${realDuration}`)
        }
        const segment = {
          id: segId,
          text: segInput.text,
          // Stored relative to the manifest directory, with forward slashes.
          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
          estimatedDuration: roundMs(segInput.estimatedDuration),
          duration: roundMs(realDuration),
          startOffset: roundMs(globalOffset),
        }
        segments.push(segment)
        globalOffset += realDuration
        log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
      } catch (err) {
        log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          // Rounded the same way as in the success branch.
          estimatedDuration: roundMs(segInput.estimatedDuration),
          duration: 0,
          startOffset: roundMs(globalOffset),
          error: err.message,
        })
        // Keep the timeline slot of the failed segment using its estimate.
        globalOffset += segInput.estimatedDuration
      }
    }

    // Step 3: roll the results up onto the item
    const totalAudioDuration = roundMs(globalOffset)
    item.segments = segments
    item.audio = segments[0]?.audio || ''
    item.audioDuration = totalAudioDuration
    item.segmentCount = segments.length

    // Step 4: duration sanity check
    const ratio = videoDur / totalAudioDuration
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)ratio=${ratio.toFixed(2)}assemble 将截断`
    }
    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
    // Persist after every item so finished work survives a later failure.
    saveManifest(manifestPath, manifest)
  }
}
// Public API: the TTS phase entry point and the segment splitter it uses.
module.exports = { phaseTts, splitIntoAudioSegments }