/**
 * Phase: tts — speech synthesis (segment first, then synthesize).
 *
 * Key change: audio segmentation happens BEFORE image generation.
 *
 * 1. Before generating images, split the script at semantic break points
 *    into multiple audio segments.
 * 2. Each segment's estimated duration must be < the video model's fixed
 *    clip length (e.g. Kling = 6s).
 * 3. Synthesize segment by segment, record the measured duration, and write
 *    it into manifest.segments[].
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = sum of segment durations (used by
 *    the assemble phase to compute the ratio).
 *
 * Pipeline order becomes: tts → images → upload → videos → assemble
 */
const path = require('path')
const {
  saveManifest,
  ensureDir,
  log,
  getManifestDir,
  splitTextIntoSentences,
} = require('./pipeline-utils')

/**
 * Recursively halve a sentence until every piece fits within videoDur.
 *
 * The original code halved only once, so a sentence longer than
 * 2 × videoDur × charsPerSec still produced over-long segments, violating
 * the "every segment < videoDur" contract. Recursion guarantees the bound.
 *
 * @param {string} sentence - Sentence to split (no trailing delimiter).
 * @param {number} videoDur - Maximum allowed estimated duration (seconds).
 * @param {number} charsPerSec - Speech rate (characters per second).
 * @returns {string[]} Pieces whose estimated durations are each <= videoDur.
 */
function _halveUntilFits(sentence, videoDur, charsPerSec) {
  if (sentence.length / charsPerSec <= videoDur) return [sentence]
  const half = Math.floor(sentence.length / 2)
  return [
    ..._halveUntilFits(sentence.slice(0, half), videoDur, charsPerSec),
    ..._halveUntilFits(sentence.slice(half), videoDur, charsPerSec),
  ]
}

/**
 * Split the script text into audio segments at semantic break points.
 * Each segment's estimated duration must be < videoDur, and ideally close
 * to it (a ratio near 1.0 is best).
 *
 * BUG FIX vs. the previous version: when a single sentence exceeded
 * videoDur, the old code called `result.pop()` after having just pushed the
 * accumulated segment — deleting that valid segment instead of the overlong
 * sentence (which had never been pushed). Accumulated text was lost.
 *
 * @param {string} text - Full script text.
 * @param {number} videoDur - Video model's fixed clip length in seconds (e.g. 6).
 * @param {number} charsPerSec - Speech rate (chars/second), default 5.
 * @returns {Array<{text: string, estimatedDuration: number}>}
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  // Prefer natural break points (。/!/; — as defined by splitTextIntoSentences).
  const naturalBreaks = splitTextIntoSentences(text)

  if (naturalBreaks.length <= 1) {
    // No natural break point: fall back to positional splitting.
    const chars = text.length
    const estimatedTotal = chars / charsPerSec
    if (estimatedTotal <= videoDur) {
      // The whole text fits in one segment.
      return [{ text, estimatedDuration: estimatedTotal }]
    }
    // Cannot fit in one segment: try cutting at a comma past the midpoint.
    const mid = Math.floor(chars / 2)
    const breakIdx = text.indexOf(',', mid)
    if (breakIdx > 0) {
      return [
        {
          text: text.slice(0, breakIdx + 1),
          estimatedDuration: (breakIdx + 1) / charsPerSec,
        },
        {
          text: text.slice(breakIdx + 1),
          estimatedDuration: (chars - breakIdx - 1) / charsPerSec,
        },
      ]
    }
    // No comma either: force a split at the character midpoint.
    const halfChars = Math.floor(chars / 2)
    return [
      {
        text: text.slice(0, halfChars),
        estimatedDuration: halfChars / charsPerSec,
      },
      {
        text: text.slice(halfChars),
        estimatedDuration: (chars - halfChars) / charsPerSec,
      },
    ]
  }

  // Multiple natural sentences: merge short ones, split overlong ones.
  const result = []
  let currentText = ''
  let currentEstDur = 0

  for (const sentence of naturalBreaks) {
    const sentenceEstDur = sentence.length / charsPerSec

    if (currentEstDur + sentenceEstDur <= videoDur) {
      // Sentence still fits: merge it into the current segment.
      currentText += sentence + '。'
      currentEstDur += sentenceEstDur
      continue
    }

    // Current segment is full: flush it before starting a new one.
    if (currentText) {
      result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
    }

    if (sentenceEstDur > videoDur) {
      // The sentence alone exceeds videoDur: halve recursively until every
      // piece fits. All but the last piece become closed segments; the last
      // piece stays open so following short sentences can merge into it.
      const pieces = _halveUntilFits(sentence, videoDur, charsPerSec)
      const lastPiece = pieces.pop()
      for (const piece of pieces) {
        result.push({
          text: piece,
          estimatedDuration: piece.length / charsPerSec,
        })
      }
      currentText = lastPiece + '。'
      currentEstDur = lastPiece.length / charsPerSec
    } else {
      // Start a fresh segment with this sentence.
      currentText = sentence + '。'
      currentEstDur = sentenceEstDur
    }
  }

  // Flush the trailing segment, if any.
  if (currentText) {
    result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
  }
  return result
}

/**
 * TTS phase entry point: segment each item's script, synthesize every
 * segment via qwen-tts, and record timings back into the manifest.
 *
 * Items that already have `audio` set are skipped (resumable). The manifest
 * is saved after each item so progress survives interruption.
 *
 * @param {object} manifest - Pipeline manifest; mutated in place.
 * @param {string} manifestPath - Path to the manifest file on disk.
 * @param {object} [options] - Reserved; currently unused.
 */
async function phaseTts(manifest, manifestPath, options = {}) {
  const dir = getManifestDir(manifestPath)
  const audioDir = path.join(dir, 'audio')
  ensureDir(audioDir)

  // Lazy-require so the module loads even when qwen-tts is unavailable
  // in phases that never reach TTS.
  const { synthesize } = require('../qwen-tts')

  const videoDur = manifest.estimatedVideoDuration || 6
  const ttsRate = manifest.ttsRate || 1.15

  // Only items with a script and no synthesized audio yet.
  const items = manifest.items.filter(
    it => (it.script || it.text) && !it.audio
  )
  if (items.length === 0) {
    log('tts', '无待处理 item(已合成),跳过')
    return
  }

  log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)

  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const idx = i + 1
    const fullText = (item.script || item.text).trim()

    // Step 1: compute audio segmentation from the estimated speech rate.
    const rawSegments = splitIntoAudioSegments(fullText, videoDur)
    log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
    for (const seg of rawSegments) {
      log('tts', `  分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
    }

    // Step 2: synthesize segment by segment, tracking the running offset.
    const segments = []
    let globalOffset = 0

    for (let j = 0; j < rawSegments.length; j++) {
      const segInput = rawSegments[j]
      const segId = `${item.id}_${j + 1}`

      try {
        const { filePath, duration: realDuration } = await synthesize(segInput.text, {
          outputDir: audioDir,
          id: segId,
          voice: manifest.ttsVoice || undefined,
          instruction: manifest.ttsInstruction || undefined,
          rate: ttsRate,
        })

        const segment = {
          id: segId,
          text: segInput.text,
          // Store a POSIX-style relative path so the manifest is portable.
          audio: path.relative(dir, filePath).replace(/\\/g, '/'),
          estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
          duration: Math.round(realDuration * 1000) / 1000,
          startOffset: Math.round(globalOffset * 1000) / 1000,
        }
        segments.push(segment)
        globalOffset += realDuration

        log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
      } catch (err) {
        // Record the failure but keep going: later phases can detect the
        // empty `audio` / `error` fields. Advance the offset by the estimate
        // so subsequent startOffsets stay roughly aligned.
        log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
        segments.push({
          id: segId,
          text: segInput.text,
          audio: '',
          estimatedDuration: segInput.estimatedDuration,
          duration: 0,
          startOffset: globalOffset,
          error: err.message,
        })
        globalOffset += segInput.estimatedDuration
      }
    }

    // Step 3: aggregate segment results onto the item.
    const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
    item.segments = segments
    item.audio = segments[0]?.audio || ''
    item.audioDuration = totalAudioDuration
    item.segmentCount = segments.length

    // Step 4: duration-compliance diagnostics.
    // Guard against division by zero when every segment failed (total = 0);
    // ratio 0 still triggers the truncation warning below, which is correct.
    const ratio = totalAudioDuration > 0 ? videoDur / totalAudioDuration : 0
    if (ratio < 0.9) {
      item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s),ratio=${ratio.toFixed(2)},assemble 将截断`
    }

    log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)

    // Persist progress after every item so the phase is resumable.
    saveManifest(manifestPath, manifest)
  }
}

module.exports = { phaseTts, splitIntoAudioSegments }