Files
video-create/.claude/skills/video-from-script/scripts/lib/phase-tts.js
lc 6eec0e8889 feat(skills): 完善视频生产 pipeline 及新增健身跟练账号
- SKILL.md: 新增工作流阶段定义、质量卡点、分镜规则
- manifest-schema.md: 补充完整字段规范及类型定义
- phase-tts.js: 优化 TTS 合成长逻辑,添加进度追踪
- capcut-tracks.js: 扩展轨道构建能力,支持更多元素类型
- capcut-timeline.js: 改进时间线生成,支持淡入淡出
- capcut_assemble.js: 新增 assemble 阶段完整实现
- cmd-init.js: 完善 init 命令逻辑
- qwen-tts.js: 调整超时配置
- accounts/禁忌帝王学: 更新拆分/图像/台词提示词
- accounts/健身跟练: 新增账号含 account.json 及全套提示词模板
- 新增 workflow-issues-20260501.md 参考文档

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 22:53:37 +08:00

192 lines
6.8 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
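/**
 * Phase: tts — speech synthesis (segment first, then synthesize).
 *
 * Key change: audio segmentation now happens before image generation.
 *
 * 1. Before any image is generated, the script is split into audio segments at semantic break points.
 * 2. Each segment's duration is < the video model's fixed clip duration (Kling = 6s).
 * 3. Segments are synthesized one by one; the measured duration is recorded in manifest.items[n].segments[].
 * 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
 * 5. manifest.items[n].audioDuration = sum of the segment durations (used by assemble to compute the ratio).
 *
 * The pipeline order becomes: tts → images → upload → videos → assemble
 */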
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
/**
 * Choose where to cut `text` in two.
 *
 * Prefers the position right after the comma (full-width U+FF0C or ASCII)
 * closest to the midpoint; a forward match wins a tie. Falls back to the plain
 * midpoint when the text has no usable comma.
 *
 * For any text of length >= 2 the returned index leaves both halves non-empty,
 * which is what lets the recursive splitter terminate.
 *
 * @param {string} text - Text to cut (length >= 2).
 * @returns {number} Index such that `text.slice(0, idx)` / `text.slice(idx)` are the halves.
 */
function findSoftCut(text) {
  const softBreakChars = '\uFF0C,'
  const mid = Math.floor(text.length / 2)
  for (let d = 0; d <= mid; d++) {
    const fwd = mid + d
    // `fwd < text.length - 1`: a comma in the last position would leave an empty right half.
    if (fwd < text.length - 1 && softBreakChars.includes(text[fwd])) return fwd + 1
    const back = mid - 1 - d
    if (back >= 0 && softBreakChars.includes(text[back])) return back + 1
  }
  return mid
}

/**
 * Recursively halve `text` until every piece's estimated duration is <= `maxDur`
 * (or the piece is a single character and cannot be split further).
 *
 * @param {string} text - Text to split.
 * @param {number} maxDur - Maximum estimated duration per piece, in seconds.
 * @param {number} charsPerSec - Speaking rate used for the estimate.
 * @returns {Array<{text: string, estimatedDuration: number}>} Pieces in original order.
 */
function splitToFit(text, maxDur, charsPerSec) {
  const estimatedDuration = text.length / charsPerSec
  if (estimatedDuration <= maxDur || text.length < 2) {
    return [{ text, estimatedDuration }]
  }
  const cut = findSoftCut(text)
  return [
    ...splitToFit(text.slice(0, cut), maxDur, charsPerSec),
    ...splitToFit(text.slice(cut), maxDur, charsPerSec),
  ]
}

/**
 * Split a script into audio segments at semantic break points.
 *
 * Durations are estimates: `text.length / charsPerSec`. Consecutive sentences
 * are merged greedily while the merged estimate stays <= `videoDur`; a sentence
 * (or a script without sentence breaks) that is too long on its own is cut
 * repeatedly, preferably right after a comma near its middle, until every piece
 * fits. No text is dropped: the pieces always cover the whole input.
 *
 * In the multi-sentence path every sentence is re-terminated with '。' (this
 * presumably restores punctuation removed by `splitTextIntoSentences` — confirm
 * against pipeline-utils); the appended character is not counted in the estimate.
 *
 * @param {string} text - Full script text.
 * @param {number} videoDur - Fixed clip duration of the video model, in seconds (e.g. 6).
 * @param {number} [charsPerSec=5] - Speaking rate in characters per second.
 * @returns {Array<{text: string, estimatedDuration: number}>} Segments in reading order.
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  // Prefer natural sentence boundaries.
  const sentences = splitTextIntoSentences(text)
  if (sentences.length <= 1) {
    // No sentence boundary: keep the text whole if it fits, otherwise cut it down.
    return splitToFit(text, videoDur, charsPerSec)
  }

  const result = []
  let currentText = ''
  let currentEstDur = 0
  const flush = () => {
    if (currentText) {
      result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
    }
    currentText = ''
    currentEstDur = 0
  }

  for (const sentence of sentences) {
    const sentenceEstDur = sentence.length / charsPerSec
    if (currentEstDur + sentenceEstDur <= videoDur) {
      // Still fits: merge into the segment being built.
      currentText += `${sentence}。`
      currentEstDur += sentenceEstDur
      continue
    }
    // Close the segment built so far; it is kept, never discarded.
    flush()
    // A sentence that fits yields exactly one piece; an over-long one yields several.
    const pieces = splitToFit(sentence, videoDur, charsPerSec)
    const last = pieces.pop()
    result.push(...pieces)
    // The final piece stays open so following short sentences can merge into it.
    currentText = `${last.text}。`
    currentEstDur = last.estimatedDuration
  }
  flush()
  return result
}
async function phaseTts(manifest, manifestPath, options = {}) {
const dir = getManifestDir(manifestPath)
const audioDir = path.join(dir, 'audio')
ensureDir(audioDir)
const { synthesize } = require('../qwen-tts')
const videoDur = manifest.estimatedVideoDuration || 6
const ttsRate = manifest.ttsRate || 1.15
const items = manifest.items.filter(it =>
(it.script || it.text) && !it.audio
)
if (items.length === 0) { log('tts', '无待处理 item已合成跳过'); return }
log('tts', `${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)
for (let i = 0; i < items.length; i++) {
const item = items[i]
const idx = i + 1
const fullText = (item.script || item.text).trim()
// Step 1: 计算音频分段
const rawSegments = splitIntoAudioSegments(fullText, videoDur)
log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length}`)
for (const seg of rawSegments) {
log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
}
// Step 2: 逐段合成
const segments = []
let globalOffset = 0
for (let j = 0; j < rawSegments.length; j++) {
const segInput = rawSegments[j]
const segId = `${item.id}_${j + 1}`
try {
const { filePath, duration: realDuration } = await synthesize(segInput.text, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: ttsRate,
})
const segment = {
id: segId,
text: segInput.text,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
duration: Math.round(realDuration * 1000) / 1000,
startOffset: Math.round(globalOffset * 1000) / 1000,
}
segments.push(segment)
globalOffset += realDuration
log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
} catch (err) {
log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
segments.push({
id: segId,
text: segInput.text,
audio: '',
estimatedDuration: segInput.estimatedDuration,
duration: 0,
startOffset: globalOffset,
error: err.message,
})
globalOffset += segInput.estimatedDuration
}
}
// Step 3: 汇总到 item
const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
item.segments = segments
item.audio = segments[0]?.audio || ''
item.audioDuration = totalAudioDuration
item.segmentCount = segments.length
// Step 4: 时长合规诊断
const ratio = videoDur / totalAudioDuration
if (ratio < 0.9) {
item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s)ratio=${ratio.toFixed(2)}assemble 将截断`
}
log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
saveManifest(manifestPath, manifest)
}
}
// Public API: the phase entry point plus the segmentation helper it calls.
module.exports = { phaseTts, splitIntoAudioSegments }