feat(skills): 完善视频生产 pipeline 及新增健身跟练账号
- SKILL.md: 新增工作流阶段定义、质量卡点、分镜规则
- manifest-schema.md: 补充完整字段规范及类型定义
- phase-tts.js: 优化 TTS 合成长逻辑,添加进度追踪
- capcut-tracks.js: 扩展轨道构建能力,支持更多元素类型
- capcut-timeline.js: 改进时间线生成,支持淡入淡出
- capcut_assemble.js: 新增 assemble 阶段完整实现
- cmd-init.js: 完善 init 命令逻辑
- qwen-tts.js: 调整超时配置
- accounts/禁忌帝王学: 更新拆分/图像/台词提示词
- accounts/健身跟练: 新增账号含 account.json 及全套提示词模板
- 新增 workflow-issues-20260501.md 参考文档

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -3,12 +3,15 @@
|
||||
*
|
||||
* 核心算法模块。纯函数 + ffmpeg,自包含可测试。
|
||||
*
|
||||
* 规则:
|
||||
* 铁律(固化,不可绕过):
|
||||
* 音频:生成后不可调速(TTS=1.15x,CapCut无speed字段)
|
||||
* 视频:始终配合音频时长(只允许加速/截断,不允许慢放/冻结)
|
||||
*
|
||||
* 时间线规则:
|
||||
* 图片模式: TTS 音频时长 = 画面时长,无音频 = 跳过
|
||||
* 视频模式: TTS 为主轴,视频通过策略适配
|
||||
* 视频比音频长 → 加速(≤2x) / 裁剪(>2x)
|
||||
* 视频比音频短 → 放缓(≥0.5x) / 画面停顿(<0.5x)
|
||||
* 所有策略失败 → 兜底截断
|
||||
* 视频比音频短 → 禁止!应在分镜阶段拆分 shot,不允许慢放/冻结补齐
|
||||
*/
|
||||
|
||||
const fs = require('fs')
|
||||
@@ -20,6 +23,20 @@ const { US } = require('./capcut-api')
|
||||
// 时间线构建
|
||||
// ============================================================================
|
||||
|
||||
/**
|
||||
* 构建时间线条目
|
||||
*
|
||||
* @param {Array} items - manifest items
|
||||
* @returns {Array} timeline entries
|
||||
*
|
||||
* 策略选择(固化,按 ratio = videoDur / audioDur):
|
||||
* ≥ 1.1, ≤ 2 → speed_up (视频加速追上音频,最优)
|
||||
* > 2 → trim (视频截断至音频时长)
|
||||
* 0.9 ~ 1.1 → none (接近匹配,无需调整)
|
||||
* < 0.9 → 禁止!音频时长超过视频,分镜阶段未正确拆分 shot
|
||||
*
|
||||
* 铁律:不允许 slow_down / freeze,不允许音频调速
|
||||
*/
|
||||
function buildTimeline(items) {
|
||||
let offset = 0
|
||||
return items.map(item => {
|
||||
@@ -46,7 +63,7 @@ function buildTimeline(items) {
|
||||
return entry
|
||||
}
|
||||
|
||||
// 视频模式:策略选择
|
||||
// 视频模式:策略选择(铁律:不允许音频>视频)
|
||||
const ratio = videoDur / audioDur
|
||||
|
||||
if (ratio > 1.1) {
|
||||
@@ -59,23 +76,25 @@ function buildTimeline(items) {
|
||||
offset += dur
|
||||
return entry
|
||||
}
|
||||
} else if (ratio < 0.9) {
|
||||
if (ratio >= 0.5) {
|
||||
const entry = { start: offset, end: offset + dur, duration: dur, speed: ratio, strategy: 'slow_down' }
|
||||
offset += dur
|
||||
return entry
|
||||
} else {
|
||||
const entry = {
|
||||
start: offset, end: offset + dur, duration: dur, speed: 1,
|
||||
strategy: 'freeze', freezeExtra: dur - videoDur,
|
||||
}
|
||||
offset += dur
|
||||
return entry
|
||||
}
|
||||
} else {
|
||||
} else if (ratio >= 0.9) {
|
||||
// 0.9 ~ 1.1:无需调整
|
||||
const entry = { start: offset, end: offset + dur, duration: dur, speed: 1, strategy: 'none' }
|
||||
offset += dur
|
||||
return entry
|
||||
} else {
|
||||
// ratio < 0.9:音频时长超过视频!
|
||||
// 铁律禁止:不允许慢放/冻结/拼接补齐。此情况应在分镜阶段拆分 shot。
|
||||
// 强制截断并打印错误标记,由主 Agent 上报给用户/打回分镜重做。
|
||||
const entry = {
|
||||
start: offset, end: offset + dur, duration: dur, speed: 1,
|
||||
strategy: 'FORBIDDEN_audio_gt_video',
|
||||
ratio: parseFloat(ratio.toFixed(3)),
|
||||
videoDur: parseFloat((videoDur / US).toFixed(2)),
|
||||
audioDur: parseFloat((audioDur / US).toFixed(2)),
|
||||
error: '音频时长(' + (audioDur / US).toFixed(2) + 's) > 视频时长(' + (videoDur / US).toFixed(2) + 's),分镜阶段 shot 未正确拆分,请打回重新切割',
|
||||
}
|
||||
offset += dur
|
||||
return entry
|
||||
}
|
||||
})
|
||||
}
|
||||
@@ -87,16 +106,18 @@ function buildTimeline(items) {
|
||||
/**
|
||||
* ffmpeg 视频调整:根据策略适配音频时长
|
||||
*
|
||||
* 策略(按 ratio = videoDur / audioDur 选择):
|
||||
* speed_up (ratio > 1.1, ≤2x) → setpts 压缩时间(加速)
|
||||
* trim (ratio > 2x) → 截断到目标时长
|
||||
* slow_down (ratio < 0.9, ≥0.5x) → setpts 拉长时间(慢放)
|
||||
* freeze (ratio < 0.5x) → 视频原速 + 最后一帧冻结补时长
|
||||
* 允许策略(按 ratio = videoDur / audioDur 选择):
|
||||
* speed_up (ratio > 1.1, ≤2x) → setpts 压缩时间(加速),最优
|
||||
* trim (ratio > 2x) → 截断到目标时长,次选
|
||||
* none (0.9~1.1) → 无需调整
|
||||
*
|
||||
* 禁止策略(已删除):
|
||||
* slow_down (ratio < 0.9) → ❌ 音频不可调速!
|
||||
* freeze (ratio < 0.5) → ❌ 不允许冻结帧补齐!
|
||||
*
|
||||
* 所有策略失败后兜底:截断到目标时长
|
||||
*/
|
||||
async function adjustVideoSpeed(videoPath, targetDurationSec, strategy = 'none', speed = 1, freezeExtraUs = 0) {
|
||||
async function adjustVideoSpeed(videoPath, targetDurationSec, strategy = 'none', speed = 1) {
|
||||
if (!fs.existsSync(videoPath)) return videoPath
|
||||
if (strategy === 'none') return videoPath
|
||||
|
||||
@@ -150,72 +171,9 @@ async function adjustVideoSpeed(videoPath, targetDurationSec, strategy = 'none',
|
||||
console.log(` 加速: ${videoDur.toFixed(1)}s → ${targetDurationSec.toFixed(1)}s (${speedVal}x)`)
|
||||
resolve(outPath)
|
||||
})
|
||||
} else if (strategy === 'slow_down') {
|
||||
const factor = (1 / speed).toFixed(3)
|
||||
execFile('ffmpeg', [
|
||||
'-y', '-i', videoPath,
|
||||
'-filter_complex', `setpts=PTS*${factor}`,
|
||||
'-an',
|
||||
outPath
|
||||
], { timeout: 30000 }, (err) => {
|
||||
if (err) {
|
||||
console.log(` 放缓失败,兜底截断: ${err.message}`)
|
||||
fallbackTrim(resolve)
|
||||
return
|
||||
}
|
||||
console.log(` 放缓: ${videoDur.toFixed(1)}s → ${targetDurationSec.toFixed(1)}s (${speed.toFixed(2)}x speed)`)
|
||||
resolve(outPath)
|
||||
})
|
||||
} else if (strategy === 'freeze') {
|
||||
const freezeSec = freezeExtraUs / US
|
||||
execFile('ffmpeg', [
|
||||
'-y', '-i', videoPath,
|
||||
'-filter_complex', `tpad=stop=-1:stop_duration=${freezeSec.toFixed(3)}`,
|
||||
'-an',
|
||||
outPath
|
||||
], { timeout: 30000 }, (err) => {
|
||||
if (err) {
|
||||
console.log(` tpad freeze 失败,尝试 concat 方案: ${err.message}`)
|
||||
const lastFrame = videoPath.replace(/(\.\w+)$/, '_lastframe.png')
|
||||
const frozenVideo = videoPath.replace(/(\.\w+)$/, '_frozen.mp4')
|
||||
execFile('ffmpeg', [
|
||||
'-y', '-sseof', '-0.1', '-i', videoPath,
|
||||
'-frames:v', '1', lastFrame
|
||||
], { timeout: 10000 }, (err2) => {
|
||||
if (err2) { console.log(` concat 方案也失败,兜底截断`); fallbackTrim(resolve); return }
|
||||
execFile('ffmpeg', [
|
||||
'-y', '-loop', '1', '-i', lastFrame,
|
||||
'-t', String(freezeSec.toFixed(3)),
|
||||
'-pix_fmt', 'yuv420p',
|
||||
'-vf', 'scale=trunc(iw/2)*2:trunc(ih/2)*2',
|
||||
frozenVideo
|
||||
], { timeout: 15000 }, (err3) => {
|
||||
if (err3) {
|
||||
try { fs.unlinkSync(lastFrame) } catch (_) {}
|
||||
console.log(` 冻结帧视频生成失败,兜底截断`)
|
||||
fallbackTrim(resolve)
|
||||
return
|
||||
}
|
||||
const concatList = path.join(path.dirname(videoPath), '_freeze_concat.txt')
|
||||
fs.writeFileSync(concatList, `file '${videoPath}'\nfile '${frozenVideo}'\n`)
|
||||
execFile('ffmpeg', [
|
||||
'-y', '-f', 'concat', '-safe', '0', '-i', concatList,
|
||||
'-c', 'copy', outPath
|
||||
], { timeout: 30000 }, (err4) => {
|
||||
try { fs.unlinkSync(lastFrame); fs.unlinkSync(frozenVideo); fs.unlinkSync(concatList) } catch (_) {}
|
||||
if (err4) { console.log(` 拼接失败,兜底截断`); fallbackTrim(resolve); return }
|
||||
console.log(` 画面停顿: ${videoDur.toFixed(1)}s + 冻结 ${freezeSec.toFixed(1)}s = ${targetDurationSec.toFixed(1)}s`)
|
||||
resolve(outPath)
|
||||
})
|
||||
})
|
||||
})
|
||||
return
|
||||
}
|
||||
console.log(` 画面停顿: ${videoDur.toFixed(1)}s + 冻结 ${freezeSec.toFixed(1)}s = ${targetDurationSec.toFixed(1)}s`)
|
||||
resolve(outPath)
|
||||
})
|
||||
} else {
|
||||
resolve(videoPath)
|
||||
// 未知策略,兜底截断
|
||||
fallbackTrim(resolve)
|
||||
}
|
||||
})
|
||||
})
|
||||
|
||||
@@ -3,6 +3,10 @@
|
||||
*
|
||||
* 所有 add* 函数 + 转场策略 + 账号配置读取。
|
||||
* Agent 修改字幕风格、Ken Burns、转场、特效等只需关注此文件。
|
||||
*
|
||||
* 音频策略(固化铁律):
|
||||
* - 音频由 TTS 1.15x 生成,导入 CapCut 时无 speed 字段(不可调速)
|
||||
* - 每个 item 的 segments[] 逐段添加,各段 start 按 startOffset 精确对齐
|
||||
*/
|
||||
|
||||
const path = require('path')
|
||||
@@ -303,33 +307,233 @@ async function addVideos(draftUrl, inputDir, items, timeline, width, height, tra
|
||||
return allSegmentIds
|
||||
}
|
||||
|
||||
// ============================================================================
// Write segments onto the video track timeline (slots)
// Background (per the original author's note): add_videos only registers the
// videos in the material library and does not place them on the timeline, so
// this function is called after add_videos succeeds.
// ============================================================================

/**
 * Place each item's video segment onto the first video track of the draft.
 *
 * Flow:
 *   1. Fetch the draft through the `get_draft` API to find the first track of
 *      type 'video'. If that call fails, delegate to addSlotsLocally().
 *   2. Build one slot per item that carries `segmentId` / `_segmentId` and has
 *      a matching timeline entry.
 *   3. Send the slots through the `add_slots` API; if that call fails, fall
 *      back to addSlotsLocally() using the track id found in step 1.
 *
 * @param {string} draftUrl - draft URL passed to the CapCut API
 * @param {Array<Object>} items - manifest items (read: segmentId, _segmentId)
 * @param {Array<Object>} timeline - timeline entries, index-aligned with items
 *   (read: start, duration)
 * @returns {Promise<void>}
 */
async function addSlots(draftUrl, items, timeline) {
  const { api: capcutApi, US } = require('./capcut-api')

  // Look up the draft so we know which track the slots belong to.
  let draftData
  try {
    draftData = (await capcutApi('get_draft', { draft_url: draftUrl })).data || {}
  } catch (err) {
    // get_draft unavailable: let the local writer locate the draft on disk.
    console.log(' get_draft 不可用,切换本地写入模式')
    return addSlotsLocally(draftUrl, items, timeline)
  }

  const videoTrack = (draftData.tracks || []).find(t => t.type === 'video')
  if (!videoTrack) {
    console.log(' 未找到 video track,跳过 slot 写入')
    return
  }

  // Build the slot payload through the shared helper so the API path and the
  // local path always emit the same slot shape.
  const slots = []
  for (let i = 0; i < items.length; i++) {
    const segId = items[i].segmentId || items[i]._segmentId
    if (!segId) continue

    const tl = timeline[i]
    if (!tl) {
      // A missing timeline entry would otherwise crash on tl.start.
      console.log(` items[${i}] 缺少 timeline 条目,跳过该 slot`)
      continue
    }
    slots.push(buildSlot(segId, videoTrack.id, i, tl, US))
  }

  if (slots.length === 0) {
    console.log(' 无有效 slot 数据,跳过')
    return
  }

  try {
    await capcutApi('add_slots', {
      draft_url: draftUrl,
      slots: JSON.stringify(slots),
    })
    console.log(` 已写入 ${slots.length} 个 slot 到视频轨道`)
  } catch (err) {
    // API rejected or does not support add_slots: degrade to the local file.
    console.log(` add_slots API 不可用: ${err.message},降级为本地写入`)
    await addSlotsLocally(draftUrl, items, timeline, videoTrack.id)
  }
}
|
||||
|
||||
/**
 * Write video slots straight into the local `draft_content.json`.
 *
 * The draft directory is `<getConfig().jianyingDraftPath>/<draftId>`. The
 * draft id comes from `options.draftId` when given, otherwise from the
 * `draft_id` query parameter of `draftUrl`. Every precondition failure
 * (unparseable URL, missing config, missing or unreadable draft, no video
 * track) is logged and the function returns without writing.
 *
 * Items without `segmentId` / `_segmentId` are matched against
 * `draft.materials.videos` by file name; on a match the material id is cached
 * back onto the item as `_segmentId`.
 *
 * @param {string} draftUrl - draft URL (only used to extract `draft_id`)
 * @param {Array<Object>} items - manifest items (read: segmentId, _segmentId, video)
 * @param {Array<Object>} timeline - timeline entries, index-aligned with items
 * @param {string} [trackId] - target track id; when omitted the first track
 *   of type 'video' is used
 * @param {Object} [options]
 * @param {string} [options.draftId] - explicit draft id (takes precedence)
 * @returns {Promise<void>}
 */
async function addSlotsLocally(draftUrl, items, timeline, trackId, options = {}) {
  const { US, getConfig } = require('./capcut-api')
  const fs = require('fs')

  // Prefer the explicit draft id, otherwise parse it out of the URL.
  let draftId = options?.draftId || null
  if (!draftId) {
    try {
      draftId = new URL(draftUrl).searchParams.get('draft_id')
    } catch {
      console.log(' 无法解析 draftUrl,跳过本地 slot 写入')
      return
    }
  }
  if (!draftId) {
    console.log(' 无法提取 draft_id,跳过本地 slot 写入')
    return
  }

  const jianyingPath = getConfig().jianyingDraftPath
  if (!jianyingPath) {
    // path.join() throws on a non-string argument; report instead of crashing.
    console.log(' 未配置 jianyingDraftPath,跳过本地 slot 写入')
    return
  }

  const draftPath = path.join(jianyingPath, draftId, 'draft_content.json')
  if (!fs.existsSync(draftPath)) {
    console.log(` 本地草稿不存在: ${draftPath},跳过 slot 写入`)
    return
  }

  let draft
  try {
    draft = JSON.parse(fs.readFileSync(draftPath, 'utf-8'))
  } catch {
    console.log(' draft_content.json 读取失败,跳过')
    return
  }

  // A draft without a tracks array simply has no video track.
  const tracks = Array.isArray(draft?.tracks) ? draft.tracks : []
  const videoTrack = trackId
    ? tracks.find(t => t.id === trackId)
    : tracks.find(t => t.type === 'video')
  if (!videoTrack) {
    console.log(' 未找到 video track,跳过')
    return
  }

  const materialVideos = draft.materials?.videos || []
  const slots = []
  for (let i = 0; i < items.length; i++) {
    const item = items[i]
    const tl = timeline[i]
    if (!tl) {
      // A missing timeline entry would otherwise crash inside buildSlot.
      console.log(` items[${i}] 缺少 timeline 条目,跳过该 slot`)
      continue
    }

    let segId = item.segmentId || item._segmentId
    if (!segId) {
      // No id recorded on the item: look the video up by file name.
      const fname = item.video ? path.basename(item.video) : ''
      const matVideo = materialVideos.find(v =>
        fname && path.basename(v.path || '').includes(fname)
      )
      if (!matVideo) continue
      item._segmentId = matVideo.id
      segId = matVideo.id
    }
    slots.push(buildSlot(segId, videoTrack.id, i, tl, US))
  }

  if (slots.length === 0) return

  videoTrack.slots = slots
  draft.duration = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
  fs.writeFileSync(draftPath, JSON.stringify(draft, null, 2), 'utf-8')
  console.log(` 已本地写入 ${slots.length} 个 slot 到视频轨道`)

  // Ask the editor to rescan the draft directory.
  triggerDirScan(path.dirname(draftPath))
}
|
||||
|
||||
/**
 * Build one video slot record for a track.
 *
 * @param {string} segId - material / segment id, stored as `material_id`
 * @param {string} trackId - id of the track the slot belongs to
 * @param {number} index - stored as `render_index`
 * @param {{start: number, duration: number}} tl - timeline entry; `start` and
 *   `duration` are copied verbatim into the time ranges
 * @param {number} US - accepted for signature compatibility; not read here
 * @returns {Object} slot with a freshly generated `id`
 */
function buildSlot(segId, trackId, index, tl, US) {
  const slotId = generateUUID()
  const begin = tl.start
  const length = tl.duration

  const commonProperty = {
    start_time: begin,
    // The source clip is always consumed from its own beginning.
    source_timerange: { start: 0, duration: length },
    target_timerange: { start: begin, duration: length },
    is_avatar: false,
    audio_fade: { fade_in_duration: 0, fade_out_duration: 0 },
    volume: 1.0,
  }

  return {
    id: slotId,
    material_id: segId,
    track_id: trackId,
    render_index: index,
    type: 'video',
    common_property: commonProperty,
  }
}
|
||||
|
||||
/**
 * Generate an upper-case, version-4-shaped UUID string from Math.random().
 *
 * @returns {string} 36-character id such as `1A2B3C4D-5E6F-4A7B-8C9D-0E1F2A3B4C5D`
 */
function generateUUID() {
  const template = 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'
  let uuid = ''
  for (const ch of template) {
    if (ch !== 'x' && ch !== 'y') {
      // Dashes and the fixed version digit '4' are copied through.
      uuid += ch
      continue
    }
    const nibble = Math.random() * 16 | 0
    // 'y' is restricted to 8..B (variant bits), 'x' is any hex digit.
    const value = ch === 'x' ? nibble : (nibble & 0x3 | 0x8)
    uuid += value.toString(16).toUpperCase()
  }
  return uuid
}
|
||||
|
||||
/**
 * Best-effort nudge for the editor to rescan a draft directory (macOS only).
 *
 * Copies `dir` to a sibling `<dir>.slot_tmp` with rsync and removes the copy
 * again. NOTE(review): presumably the file-system activity is what makes the
 * editor notice the change — confirm. Failures are logged and never thrown,
 * and the function returns before rsync finishes.
 *
 * @param {string} dir - draft directory path
 * @returns {void}
 */
function triggerDirScan(dir) {
  if (process.platform !== 'darwin') return

  const { execFile } = require('child_process')
  const tmp = dir + '.slot_tmp'

  execFile('rsync', ['-a', dir + '/', tmp], (err) => {
    if (err) {
      console.log(` 触发目录扫描失败(rsync): ${err.message}`)
    }
    // Always remove the temporary copy, even when rsync failed part-way.
    try {
      require('fs').rmSync(tmp, { recursive: true, force: true })
    } catch (cleanupErr) {
      console.log(` 临时目录清理失败: ${cleanupErr.message}`)
    }
  })
}
|
||||
|
||||
// ============================================================================
|
||||
// 添加 TTS 配音
|
||||
// ============================================================================
|
||||
|
||||
async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
|
||||
const audioItems = items.filter(item => item.audio)
|
||||
if (audioItems.length === 0) {
|
||||
console.log(' 无 TTS 音频文件,跳过')
|
||||
return
|
||||
}
|
||||
|
||||
const audioInfos = []
|
||||
const resolveAudio = (relPath) => {
|
||||
if (relPath.startsWith('http')) return relPath
|
||||
if (audioUrls[relPath]) return audioUrls[relPath]
|
||||
return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
|
||||
}
|
||||
// 优先使用 segments[] 逐段添加(精确对齐)
|
||||
// 无 segments 时降级为旧的整段方式
|
||||
const segmentsFlat = []
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i]
|
||||
const tl = timeline[i]
|
||||
if (!item.audio) continue
|
||||
|
||||
if (item.audio) {
|
||||
const audioUrl = resolveAudio(item.audio)
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
// 使用 segments 精确添加
|
||||
for (const seg of item.segments) {
|
||||
if (!seg.audio || seg.error) continue
|
||||
const audioUrl = seg.audio.startsWith('http')
|
||||
? seg.audio
|
||||
: (audioUrls[seg.audio] || path.resolve(inputDir, seg.audio))
|
||||
const segDurUs = Math.round(seg.duration * US)
|
||||
const segStartUs = tl.start + Math.round(seg.startOffset * US)
|
||||
segmentsFlat.push({
|
||||
audio_url: audioUrl,
|
||||
start: segStartUs,
|
||||
end: segStartUs + segDurUs,
|
||||
duration: segDurUs,
|
||||
volume: 1.0,
|
||||
})
|
||||
}
|
||||
} else {
|
||||
// 降级:整段添加
|
||||
const audioUrl = item.audio.startsWith('http')
|
||||
? item.audio
|
||||
: (audioUrls[item.audio] || path.resolve(inputDir, item.audio))
|
||||
const audioDurUs = item.audioDuration ? item.audioDuration * US : tl.duration
|
||||
|
||||
audioInfos.push({
|
||||
segmentsFlat.push({
|
||||
audio_url: audioUrl,
|
||||
start: tl.start,
|
||||
end: tl.start + audioDurUs,
|
||||
@@ -339,17 +543,26 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {})
|
||||
}
|
||||
}
|
||||
|
||||
if (audioInfos.length === 0) {
|
||||
console.log(' 无可用音频,跳过配音')
|
||||
if (segmentsFlat.length === 0) {
|
||||
console.log(' 无 TTS 音频文件,跳过')
|
||||
return
|
||||
}
|
||||
|
||||
await api('add_audios', {
|
||||
draft_url: draftUrl,
|
||||
audio_infos: JSON.stringify(audioInfos),
|
||||
})
|
||||
const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
|
||||
console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
|
||||
// 逐个添加音频(CapCut API 批量添加不稳定)
|
||||
let addedCount = 0
|
||||
for (const audioInfo of segmentsFlat) {
|
||||
try {
|
||||
await api('add_audios', {
|
||||
draft_url: draftUrl,
|
||||
audio_infos: JSON.stringify([audioInfo]),
|
||||
})
|
||||
addedCount++
|
||||
} catch (err) {
|
||||
console.error(` 音频添加失败: ${err.message.slice(0, 80)}`)
|
||||
}
|
||||
}
|
||||
const ossCount = segmentsFlat.filter(a => a.audio_url.startsWith('http')).length
|
||||
console.log(` 已添加 ${addedCount}/${segmentsFlat.length} 段 TTS 配音 (${ossCount} 段 OSS)`)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -402,7 +615,24 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
|
||||
|
||||
const tl = timeline[i]
|
||||
|
||||
if (split) {
|
||||
if (split && item.segments && item.segments.length > 0) {
|
||||
// 精确字幕模式:使用 segments 实测时长,逐段添加字幕
|
||||
for (const seg of item.segments) {
|
||||
if (seg.error || !seg.text) continue
|
||||
const segStartUs = tl.start + Math.round(seg.startOffset * US)
|
||||
const segDurUs = Math.round(seg.duration * US)
|
||||
|
||||
const cap = {
|
||||
start: segStartUs,
|
||||
end: segStartUs + segDurUs,
|
||||
text: seg.text,
|
||||
}
|
||||
|
||||
applyAnimationProps(cap, animStyle)
|
||||
captions.push(cap)
|
||||
}
|
||||
} else if (split) {
|
||||
// 降级:按字符比例分配(无 segments 时)
|
||||
const sentences = splitTextIntoSentences(text)
|
||||
if (sentences.length === 0) continue
|
||||
|
||||
@@ -447,9 +677,7 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
|
||||
return
|
||||
}
|
||||
|
||||
await api('add_captions', {
|
||||
draft_url: draftUrl,
|
||||
captions: JSON.stringify(captions),
|
||||
const commonStyle = {
|
||||
font: style.font || null,
|
||||
font_size: style.fontSize || 15,
|
||||
text_color: style.color || '#ffffff',
|
||||
@@ -472,9 +700,23 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
|
||||
transform_x: 0,
|
||||
transform_y: style.transformY || 0,
|
||||
style_text: 0,
|
||||
})
|
||||
}
|
||||
|
||||
console.log(` 已添加 ${captions.length} 条字幕${split ? ' (分句模式)' : ''} (字体: ${style.font || '默认'}, 动画: ${animStyle.inAnimation || '无'} → ${animStyle.outAnimation || '无'})`)
|
||||
// 逐条添加字幕(CapCut API 批量添加不稳定)
|
||||
let addedCount = 0
|
||||
for (const cap of captions) {
|
||||
try {
|
||||
await api('add_captions', {
|
||||
draft_url: draftUrl,
|
||||
captions: JSON.stringify([cap]),
|
||||
...commonStyle,
|
||||
})
|
||||
addedCount++
|
||||
} catch (err) {
|
||||
console.error(` 字幕添加失败: ${err.message.slice(0, 80)}`)
|
||||
}
|
||||
}
|
||||
console.log(` 已添加 ${addedCount}/${captions.length} 条字幕${split ? ' (分句模式)' : ''} (字体: ${style.font || '默认'}, 动画: ${animStyle.inAnimation || '无'} → ${animStyle.outAnimation || '无'})`)
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
@@ -583,6 +825,8 @@ module.exports = {
|
||||
addBGM,
|
||||
addSubtitles,
|
||||
addKeywordOverlays,
|
||||
addSlots,
|
||||
addSlotsLocally,
|
||||
addEffects,
|
||||
addFilter,
|
||||
}
|
||||
|
||||
@@ -72,6 +72,28 @@ function initManifest(options) {
|
||||
console.log(` ⚠ ${refsWithoutUrl.length} 个参考图缺少 OSS URL,images 阶段会自动上传`)
|
||||
}
|
||||
|
||||
// 从 videoModel 推算固定时长(秒)
|
||||
const videoModelFixedDurations = {
|
||||
'kling': 6,
|
||||
'kling-v2-5-turbo': 6,
|
||||
'veo3-fast': 8,
|
||||
'veo3-fast-frames': 8,
|
||||
'grok-video-3': 6,
|
||||
}
|
||||
const estimatedVideoDuration = videoModelFixedDurations[options.videoModel || accountConfig.videoModel] || 6
|
||||
|
||||
// 校验时长约束
|
||||
for (let i = 0; i < rawItems.length; i++) {
|
||||
const item = rawItems[i]
|
||||
const dur = Number(item.duration) || 5
|
||||
if (dur > estimatedVideoDuration) {
|
||||
console.error(`错误: items[${i}] 的 TTS 估算 duration=${dur}s > videoModel 固定时长 ${estimatedVideoDuration}s`)
|
||||
console.error(` 必须先拆分 shot 再执行 init!`)
|
||||
console.error(` script: "${item.script}"`)
|
||||
process.exit(1)
|
||||
}
|
||||
}
|
||||
|
||||
// 构建 items
|
||||
const items = rawItems.map((raw, i) => {
|
||||
const slug = slugify(raw.shotDesc || raw.script || `scene_${i + 1}`)
|
||||
@@ -81,7 +103,8 @@ function initManifest(options) {
|
||||
file: `images/scene_${String(i + 1).padStart(2, '0')}_${slug}.jpeg`,
|
||||
shotDesc: raw.shotDesc || '',
|
||||
script: raw.script || '',
|
||||
duration: raw.duration || 5,
|
||||
duration: Number(raw.duration) || 5,
|
||||
estimatedVideoDuration,
|
||||
imagePrompt: raw.imagePrompt,
|
||||
confirmed: false,
|
||||
}
|
||||
@@ -102,8 +125,10 @@ function initManifest(options) {
|
||||
references,
|
||||
...(accountConfig.ttsVoice ? { ttsVoice: accountConfig.ttsVoice } : {}),
|
||||
...(accountConfig.ttsInstruction ? { ttsInstruction: accountConfig.ttsInstruction } : {}),
|
||||
...(accountConfig.ttsRate ? { ttsRate: accountConfig.ttsRate } : {}),
|
||||
// 铁律:ttsRate 写死 1.15x,不允许配置覆盖(除非显式传入)
|
||||
ttsRate: options.ttsRate || 1.15,
|
||||
items,
|
||||
estimatedVideoDuration, // 顶层冗余,便于 assemble 直接读取
|
||||
}
|
||||
|
||||
// 创建输出目录(自增序号)
|
||||
|
||||
@@ -1,13 +1,100 @@
|
||||
/**
|
||||
* Phase: tts — 语音合成(整段合成)
|
||||
* Phase: tts — 语音合成(先分段,后合成)
|
||||
*
|
||||
* 每个 item 的 script 整段合成一个音频文件,保留自然语调。
|
||||
* item.audio 指向完整音频,item.audioDuration 为总时长。
|
||||
* 字幕切分由组装阶段按字符比例分配,不在 TTS 阶段处理。
|
||||
* 核心变化:音频分段优先于生图。
|
||||
*
|
||||
* 1. 在生成图片之前,先将文案按语义断点切分为多个音频片段
|
||||
* 2. 每个片段时长 < videoModel 固定时长(Kling=6s)
|
||||
* 3. 逐段合成,记录实测时长,写入 manifest.segments[]
|
||||
* 4. manifest.items[n].segments = [{text, audio, duration, startOffset}, ...]
|
||||
* 5. manifest.items[n].audioDuration = 片段总和(供 assemble 计算 ratio)
|
||||
*
|
||||
* 流程顺序变为:tts → images → upload → videos → assemble
|
||||
*/
|
||||
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
|
||||
|
||||
/**
 * Split a script into audio segments whose estimated duration fits one video
 * clip.
 *
 * Duration is estimated as `length / charsPerSec`. Sentences returned by
 * splitTextIntoSentences() are merged greedily while the running estimate
 * stays <= videoDur; each merged sentence is re-terminated with '。'
 * (NOTE(review): this assumes splitTextIntoSentences strips the terminator —
 * confirm). A sentence that is too long on its own is cut into the fewest
 * near-equal pieces that each fit; the last piece stays open so that following
 * short sentences can still be merged into it.
 *
 * Text with at most one sentence is returned whole when it fits, otherwise it
 * is split at a full-width comma near the middle when both sides fit, and cut
 * into near-equal pieces as a last resort.
 *
 * @param {string} text - full script
 * @param {number} videoDur - fixed clip length of the video model, in seconds
 * @param {number} [charsPerSec=5] - assumed speaking rate, characters per second
 * @returns {Array<{text: string, estimatedDuration: number}>}
 */
function splitIntoAudioSegments(text, videoDur, charsPerSec = 5) {
  const estimate = (str) => str.length / charsPerSec
  const fits = (str) => estimate(str) <= videoDur
  const toSegment = (str) => ({ text: str, estimatedDuration: estimate(str) })

  // Cut `str` into the fewest near-equal pieces that each fit videoDur
  // (never fewer than two, which reproduces the plain halving case).
  const forceSplit = (str) => {
    const needed = Math.ceil(estimate(str) / videoDur)
    const pieces = Number.isFinite(needed) ? Math.max(2, needed) : 2
    const chunks = []
    for (let k = 0; k < pieces; k++) {
      const from = Math.floor((k * str.length) / pieces)
      const to = Math.floor(((k + 1) * str.length) / pieces)
      if (to > from) chunks.push(str.slice(from, to))
    }
    return chunks
  }

  const naturalBreaks = splitTextIntoSentences(text)

  if (naturalBreaks.length <= 1) {
    // No sentence boundary to work with.
    if (fits(text)) {
      return [toSegment(text)]
    }

    // Prefer a comma at/after the middle, then the closest one before it,
    // but only accept a split whose two sides both fit.
    const mid = Math.floor(text.length / 2)
    const candidates = [text.indexOf(',', mid), text.lastIndexOf(',', mid - 1)]
    for (const breakIdx of candidates) {
      if (breakIdx <= 0) continue
      const head = text.slice(0, breakIdx + 1)
      const tail = text.slice(breakIdx + 1)
      if (tail && fits(head) && fits(tail)) {
        return [toSegment(head), toSegment(tail)]
      }
    }

    // No usable comma: cut by character count.
    return forceSplit(text).map(toSegment)
  }

  // Several sentences: merge short ones, split overlong ones.
  const result = []
  let currentText = ''
  let currentEstDur = 0

  const flush = () => {
    if (!currentText) return
    result.push({ text: currentText.trim(), estimatedDuration: currentEstDur })
    currentText = ''
    currentEstDur = 0
  }

  for (const sentence of naturalBreaks) {
    const sentenceEstDur = estimate(sentence)

    if (currentEstDur + sentenceEstDur <= videoDur) {
      // Still room in the open segment.
      currentText += sentence + '。'
      currentEstDur += sentenceEstDur
      continue
    }

    // The open segment is full: close it. It must stay in the result — it
    // holds earlier sentences, not the one being processed now.
    flush()

    if (sentenceEstDur <= videoDur) {
      currentText = sentence + '。'
      currentEstDur = sentenceEstDur
      continue
    }

    // The sentence alone exceeds one clip: emit all pieces but the last,
    // and keep the last piece open for merging.
    const chunks = forceSplit(sentence)
    const lastChunk = chunks.pop()
    for (const chunk of chunks) {
      result.push(toSegment(chunk))
    }
    currentText = lastChunk + '。'
    currentEstDur = estimate(lastChunk)
  }

  flush()
  return result
}
|
||||
|
||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
@@ -16,38 +103,89 @@ async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
|
||||
const { synthesize } = require('../qwen-tts')
|
||||
|
||||
const items = manifest.items.filter(it =>
|
||||
it.status === 'done' && (it.script || it.text) && !it.audio
|
||||
)
|
||||
if (items.length === 0) { log('tts', '无待处理 item,跳过'); return }
|
||||
const videoDur = manifest.estimatedVideoDuration || 6
|
||||
const ttsRate = manifest.ttsRate || 1.15
|
||||
|
||||
log('tts', `共 ${items.length} 段`)
|
||||
const items = manifest.items.filter(it =>
|
||||
(it.script || it.text) && !it.audio
|
||||
)
|
||||
if (items.length === 0) { log('tts', '无待处理 item(已合成),跳过'); return }
|
||||
|
||||
log('tts', `共 ${items.length} 段, 视频固定时长=${videoDur}s, TTS语速=${ttsRate}x`)
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i]
|
||||
const idx = i + 1
|
||||
const fullText = item.script || item.text
|
||||
const fullText = (item.script || item.text).trim()
|
||||
|
||||
try {
|
||||
const { filePath, duration } = await synthesize(fullText, {
|
||||
outputDir: audioDir,
|
||||
id: String(item.id || idx),
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
|
||||
const totalDuration = Math.round(duration * 1000) / 1000
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = totalDuration
|
||||
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = `TTS失败: ${err.message}`
|
||||
log('tts', `[${idx}/${items.length}] 失败: ${err.message}`)
|
||||
// Step 1: 计算音频分段
|
||||
const rawSegments = splitIntoAudioSegments(fullText, videoDur)
|
||||
log('tts', `[${idx}/${items.length}] 原始分段: ${rawSegments.length} 段`)
|
||||
for (const seg of rawSegments) {
|
||||
log('tts', ` 分段估算: ${seg.estimatedDuration.toFixed(2)}s / ${seg.text.slice(0, 20)}...`)
|
||||
}
|
||||
|
||||
// Step 2: 逐段合成
|
||||
const segments = []
|
||||
let globalOffset = 0
|
||||
|
||||
for (let j = 0; j < rawSegments.length; j++) {
|
||||
const segInput = rawSegments[j]
|
||||
const segId = `${item.id}_${j + 1}`
|
||||
|
||||
try {
|
||||
const { filePath, duration: realDuration } = await synthesize(segInput.text, {
|
||||
outputDir: audioDir,
|
||||
id: segId,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: ttsRate,
|
||||
})
|
||||
|
||||
const segment = {
|
||||
id: segId,
|
||||
text: segInput.text,
|
||||
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
|
||||
estimatedDuration: Math.round(segInput.estimatedDuration * 1000) / 1000,
|
||||
duration: Math.round(realDuration * 1000) / 1000,
|
||||
startOffset: Math.round(globalOffset * 1000) / 1000,
|
||||
}
|
||||
segments.push(segment)
|
||||
globalOffset += realDuration
|
||||
|
||||
log('tts', `[${idx}/${items.length}] 段${j + 1}: 估算${segInput.estimatedDuration.toFixed(2)}s → 实测${realDuration.toFixed(2)}s | ${segInput.text.slice(0, 15)}...`)
|
||||
} catch (err) {
|
||||
log('tts', `[${idx}/${items.length}] 段${j + 1} 合成失败: ${err.message}`)
|
||||
segments.push({
|
||||
id: segId,
|
||||
text: segInput.text,
|
||||
audio: '',
|
||||
estimatedDuration: segInput.estimatedDuration,
|
||||
duration: 0,
|
||||
startOffset: globalOffset,
|
||||
error: err.message,
|
||||
})
|
||||
globalOffset += segInput.estimatedDuration
|
||||
}
|
||||
}
|
||||
|
||||
// Step 3: 汇总到 item
|
||||
const totalAudioDuration = Math.round(globalOffset * 1000) / 1000
|
||||
item.segments = segments
|
||||
item.audio = segments[0]?.audio || ''
|
||||
item.audioDuration = totalAudioDuration
|
||||
item.segmentCount = segments.length
|
||||
|
||||
// Step 4: 时长合规诊断
|
||||
const ratio = videoDur / totalAudioDuration
|
||||
if (ratio < 0.9) {
|
||||
item._timelineWarning = `⚠ audioDur(${totalAudioDuration.toFixed(1)}s) > videoDur(${videoDur}s),ratio=${ratio.toFixed(2)},assemble 将截断`
|
||||
}
|
||||
|
||||
log('tts', `[${idx}/${items.length}] 完成: ${segments.length}段, 总音频${totalAudioDuration.toFixed(1)}s, ratio=${ratio.toFixed(2)}`)
|
||||
|
||||
saveManifest(manifestPath, manifest)
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { phaseTts }
|
||||
module.exports = { phaseTts, splitIntoAudioSegments }
|
||||
|
||||
Reference in New Issue
Block a user