feat(skills): 完善视频生产 pipeline 及新增健身跟练账号
- SKILL.md: 新增工作流阶段定义、质量卡点、分镜规则 - manifest-schema.md: 补充完整字段规范及类型定义 - phase-tts.js: 优化 TTS 合成长逻辑,添加进度追踪 - capcut-tracks.js: 扩展轨道构建能力,支持更多元素类型 - capcut-timeline.js: 改进时间线生成,支持淡入淡出 - capcut_assemble.js: 新增 assemble 阶段完整实现 - cmd-init.js: 完善 init 命令逻辑 - qwen-tts.js: 调整超时配置 - accounts/禁忌帝王学: 更新拆分/图像/台词提示词 - accounts/健身跟练: 新增账号含 account.json 及全套提示词模板 - 新增 workflow-issues-20260501.md 参考文档 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,13 +1,100 @@
|
||||
/**
|
||||
* Phase: tts — 语音合成(整段合成)
|
||||
* Phase: tts — 语音合成(先分段,后合成)
|
||||
*
|
||||
* 每个 item 的 script 整段合成一个音频文件,保留自然语调。
|
||||
* item.audio 指向完整音频,item.audioDuration 为总时长。
|
||||
* 字幕切分由组装阶段按字符比例分配,不在 TTS 阶段处理。
|
||||
* 核心变化:音频分段优先于生图。
|
||||
*
|
||||
* 1. 在生成图片之前,先将文案按语义断点切分为多个音频片段
|
||||
* 2. 每个片段时长 < videoModel 固定时长(Kling=6s)
|
||||
 * 3. 逐段合成,记录实测时长,写入 manifest.items[n].segments[]
|
||||
* 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
|
||||
* 5. manifest.items[n].audioDuration = 片段总和(供 assemble 计算 ratio)
|
||||
*
|
||||
* 流程顺序变为:tts → images → upload → videos → assemble
|
||||
*/
|
||||
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
|
||||
|
||||
/**
 * Pick the index at which to cut `text` so the head piece is at most
 * `maxChars` long, preferring a cut right after a comma close to `target`.
 *
 * @param {string} text - Text that is longer than `maxChars`
 * @param {number} target - Ideal head length (1 <= target <= maxChars)
 * @param {number} maxChars - Hard upper bound for the head length
 * @returns {number} Head length to slice off (always >= 1)
 */
function findSegmentCutPoint(text, target, maxChars) {
  const commas = ',,'
  // Never accept a comma that would leave a tiny head fragment.
  const minCut = Math.ceil(target / 2)
  for (let d = 0; target - d >= minCut || target + d <= maxChars; d++) {
    const after = target + d
    if (after <= maxChars && commas.includes(text[after - 1])) return after
    const before = target - d
    if (before >= minCut && commas.includes(text[before - 1])) return before
  }
  return target
}

/**
 * Cut a text whose estimated duration exceeds `videoDur` into as many pieces
 * as needed so that every piece fits, keeping the pieces balanced in length
 * and cutting after a comma when one is close to the balanced cut point.
 *
 * @param {string} text - Text to cut
 * @param {number} videoDur - Maximum duration of one piece, in seconds
 * @param {number} charsPerSec - Speaking rate used for the estimate
 * @returns {string[]} Non-empty pieces whose concatenation equals `text`
 */
function splitOverlongText(text, videoDur, charsPerSec) {
  const maxChars = Math.max(1, Math.floor(videoDur * charsPerSec))
  const pieces = []
  let rest = text
  while (rest.length > maxChars) {
    const piecesNeeded = Math.ceil(rest.length / maxChars)
    const target = Math.ceil(rest.length / piecesNeeded)
    let cut = findSegmentCutPoint(rest, target, maxChars)
    // Do not cut between the two halves of a surrogate pair.
    const lastUnit = rest.charCodeAt(cut - 1)
    if (cut > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) cut -= 1
    pieces.push(rest.slice(0, cut))
    rest = rest.slice(cut)
  }
  if (rest) pieces.push(rest)
  return pieces
}

/**
 * Split a script into audio segments at semantic break points.
 *
 * Sentences returned by `splitTextIntoSentences` are merged greedily while
 * the estimated duration stays within `videoDur`. Text that cannot fit in one
 * segment (a single over-long sentence, or a script with no sentence break)
 * is cut into as many pieces as required, so every returned segment has
 * `estimatedDuration <= videoDur`.
 *
 * In the multi-sentence path a '。' is appended after each sentence, as the
 * merged text is rebuilt from the split sentences; that appended character is
 * not counted in the estimate.
 *
 * @param {string} text - Full script text
 * @param {number} videoDur - Fixed duration of the video model in seconds, e.g. 6
 * @param {number} charsPerSec - Speaking rate in characters per second (default 5)
 * @returns {Array<{text: string, estimatedDuration: number}>}
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const toSegment = (segmentText) => ({
    text: segmentText,
    estimatedDuration: segmentText.length / charsPerSec,
  })

  const sentences = splitTextIntoSentences(text)

  // No natural sentence break: work on the raw text.
  if (sentences.length <= 1) {
    if (text.length / charsPerSec <= videoDur) {
      return [toSegment(text)]
    }
    return splitOverlongText(text, videoDur, charsPerSec).map(toSegment)
  }

  const result = []
  let currentText = ''
  let currentEstDur = 0

  // Emit the pending merged segment, if any, and reset the accumulator.
  const flush = () => {
    if (currentText) {
      result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
    }
    currentText = ''
    currentEstDur = 0
  }

  for (const sentence of sentences) {
    // Skip blanks so no stray '。' is emitted.
    if (!sentence || !sentence.trim()) continue

    const sentenceEstDur = sentence.length / charsPerSec

    if (sentenceEstDur > videoDur) {
      // The sentence alone is too long. Keep what was accumulated before it
      // (previously this segment was dropped), then chunk the sentence.
      flush()
      const pieces = splitOverlongText(sentence, videoDur, charsPerSec)
      for (let k = 0; k < pieces.length - 1; k++) {
        result.push(toSegment(pieces[k]))
      }
      // The tail stays open so short following sentences can merge into it.
      const tail = pieces[pieces.length - 1]
      currentText = tail + '。'
      currentEstDur = tail.length / charsPerSec
      continue
    }

    if (currentEstDur + sentenceEstDur > videoDur) {
      flush()
    }
    currentText += sentence + '。'
    currentEstDur += sentenceEstDur
  }

  flush()
  return result
}
|
||||
|
||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
@@ -16,38 +103,89 @@ async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
|
||||
const { synthesize } = require('../qwen-tts')
|
||||
|
||||
const items = manifest.items.filter(it =>
|
||||
it.status === 'done' && (it.script || it.text) && !it.audio
|
||||
)
|
||||
if (items.length === 0) { log('tts', '无待处理 item,跳过'); return }
|
||||
const videoDur = manifest.estimatedVideoDuration || 6
|
||||
const ttsRate = manifest.ttsRate || 1.15
|
||||
|
||||
log('tts', `共 ${items.length} 段`)
|
||||
const items = manifest.items.filter(it =>
|
||||
(it.script || it.text) && !it.audio
|
||||
)
|
||||
if (items.length === 0) { log('tts', '无待处理 item(已合成),跳过'); return }
|
||||
|
||||
log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i]
|
||||
const idx = i + 1
|
||||
const fullText = item.script || item.text
|
||||
const fullText = (item.script || item.text).trim()
|
||||
|
||||
try {
|
||||
const { filePath, duration } = await synthesize(fullText, {
|
||||
outputDir: audioDir,
|
||||
id: String(item.id || idx),
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
|
||||
const totalDuration = Math.round(duration * 1000) / 1000
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = totalDuration
|
||||
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = `TTS失败: ${err.message}`
|
||||
log('tts', `[${idx}/${items.length}] 失败: ${err.message}`)
|
||||
// Step 1: 计算音频分段
|
||||
const rawSegments = splitIntoAudioSegments(fullText, videoDur)
|
||||
log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
|
||||
for (const seg of rawSegments) {
|
||||
log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
|
||||
}
|
||||
|
||||
// Step 2: 逐段合成
|
||||
const segments = []
|
||||
let globalOffset = 0
|
||||
|
||||
for (let j = 0; j < rawSegments.length; j++) {
|
||||
const segInput = rawSegments[j]
|
||||
const segId = `${item.id}_${j + 1}`
|
||||
|
||||
try {
|
||||
const { filePath, duration: realDuration } = await synthesize(segInput.text, {
|
||||
outputDir: audioDir,
|
||||
id: segId,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: ttsRate,
|
||||
})
|
||||
|
||||
const segment = {
|
||||
id: segId,
|
||||
text: segInput.text,
|
||||
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
|
||||
estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
|
||||
duration: Math.round(realDuration * 1000) / 1000,
|
||||
startOffset: Math.round(globalOffset * 1000) / 1000,
|
||||
}
|
||||
segments.push(segment)
|
||||
globalOffset += realDuration
|
||||
|
||||
log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
|
||||
} catch (err) {
|
||||
log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
|
||||
segments.push({
|
||||
id: segId,
|
||||
text: segInput.text,
|
||||
audio: '',
|
||||
estimatedDuration: segInput.estimatedDuration,
|
||||
duration: 0,
|
||||
startOffset: globalOffset,
|
||||
error: err.message,
|
||||
})
|
||||
globalOffset += segInput.estimatedDuration
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: 汇总到 item
|
||||
const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
|
||||
item.segments = segments
|
||||
item.audio = segments[0]?.audio || ''
|
||||
item.audioDuration = totalAudioDuration
|
||||
item.segmentCount = segments.length
|
||||
|
||||
// Step 4: 时长合规诊断
|
||||
const ratio = videoDur / totalAudioDuration
|
||||
if (ratio < 0.9) {
|
||||
item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s),ratio=${ratio.toFixed(2)},assemble 将截断`
|
||||
}
|
||||
|
||||
log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
|
||||
|
||||
saveManifest(manifestPath, manifest)
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { phaseTts }
|
||||
module.exports = { phaseTts, splitIntoAudioSegments }
|
||||
|
||||
Reference in New Issue
Block a user