feat(video-pipeline): 重构视频流水线,优化成片时间线规则和状态管理

- 引入 manifest.json 作为唯一状态源,所有子 Agent 操作回写 manifest
- 重构 timebuilder 逻辑,支持四种视频适配策略(加速/裁剪/放缓/画面停顿)
- 统一 TTS 阶段输出结构,单句和多句均写入 segments[]
- 重写字幕和配音生成,基于 segments 精确时长实现音画同步
- 新增 confirm 命令支持按 id 范围确认,上传阶段分离图片和视频
- 添加中间产物写入 output/ 目录的约束,清理废弃配置参数
This commit is contained in:
2026-05-02 00:14:40 +08:00
parent b4b92854db
commit 0998fd6ae1
14 changed files with 457 additions and 205 deletions

View File

@@ -215,28 +215,89 @@ function getAudioDurationSec(filePath) {
// 主流程
// ============================================================================
/**
 * Build the composition timeline: one slot per manifest item, laid out
 * back to back starting at offset 0.
 *
 * Core rules:
 *   Image mode: an image has no intrinsic duration — the TTS audio length
 *   IS the slot length; with no audio the slot is zero-length and marked
 *   `skip` so downstream stages can drop it.
 *   Video mode: TTS audio is the master track; footage adapts to it.
 *   Strategy is chosen from ratio = videoDur / audioDur:
 *     ratio > 1.1 (video longer):  ≤ 2x → speed_up (setpts compression)
 *                                  > 2x → trim (cut to audio length)
 *     ratio < 0.9 (video shorter): ≥ 0.5x → slow_down (setpts stretch)
 *                                  < 0.5x → freeze (normal speed + frozen
 *                                           last frame to pad the slot)
 *     0.9 ~ 1.1 → none (close enough, leave untouched)
 *
 * @param {Array<Object>} items manifest items; may carry segments[] (with
 *   per-segment measured `duration` in seconds), audioDuration,
 *   videoDuration; `video`/`videoUrl`/`url` mark video footage
 * @returns {Array<Object>} slots in microseconds:
 *   {start, end, duration, speed, strategy[, skip][, freezeExtra]}
 */
function buildTimeline(items) {
  let offset = 0
  return items.map(item => {
    // Prefer the sum of per-segment measured durations (exactly matches the
    // audio files on disk); fall back to the whole-item estimate.
    let audioDur
    if (item.segments && item.segments.length > 0) {
      audioDur = item.segments.reduce((sum, s) => sum + (s.duration || 0), 0) * US
    } else {
      audioDur = (item.audioDuration != null) ? item.audioDuration * US : 0
    }
    const videoDur = (item.videoDuration != null) ? item.videoDuration * US : 0
    const hasVideo = !!(item.video || item.videoUrl || item.url)
    // No TTS audio for this item
    if (audioDur <= 0) {
      if (hasVideo && videoDur > 0) {
        // Video without audio: keep the footage's own duration.
        const entry = { start: offset, end: offset + videoDur, duration: videoDur, speed: 1, strategy: 'none' }
        offset += videoDur
        return entry
      }
      // Image without audio: zero-length slot, flagged for skipping.
      return { start: offset, end: offset, duration: 0, speed: 1, strategy: 'none', skip: true }
    }
    // TTS present: the audio duration drives the slot.
    const dur = audioDur
    if (!hasVideo || videoDur <= 0) {
      // Image mode: slot length == audio length.
      const entry = { start: offset, end: offset + dur, duration: dur, speed: 1, strategy: 'none' }
      offset += dur
      return entry
    }
    // Video mode: pick an adaptation strategy from the length ratio.
    const ratio = videoDur / audioDur
    let entry
    if (ratio > 1.1) {
      // Video longer than audio: speed up (≤2x) or trim (>2x).
      entry = ratio <= 2
        ? { start: offset, end: offset + dur, duration: dur, speed: ratio, strategy: 'speed_up' }
        : { start: offset, end: offset + dur, duration: dur, speed: 1, strategy: 'trim' }
    } else if (ratio < 0.9) {
      // Video shorter than audio: slow down (≥0.5x) or freeze last frame.
      entry = ratio >= 0.5
        ? { start: offset, end: offset + dur, duration: dur, speed: ratio, strategy: 'slow_down' }
        : { start: offset, end: offset + dur, duration: dur, speed: 1, strategy: 'freeze', freezeExtra: dur - videoDur }
    } else {
      // 0.9 ~ 1.1: close enough — no adjustment needed.
      entry = { start: offset, end: offset + dur, duration: dur, speed: 1, strategy: 'none' }
    }
    offset += dur
    return entry
  })
}
@@ -253,7 +314,6 @@ async function assemble(args) {
filter: filterStr,
format = '9:16',
apiKey = '',
duration = '4',
animation = '轻微放大',
} = args
@@ -284,22 +344,44 @@ async function assemble(args) {
}
const { width, height } = getResolution(format)
const defaultDurationUs = parseFloat(duration) * US
// 过滤出实际存在的文件
const missingFileItems = []
const items = manifest.items.filter(item => {
if (item.url) return true // 视频模式可能用 URL
if (item.video) return true // 视频模式本地文件
if (!item.file) {
missingFileItems.push(item.id || '?')
return false
}
const filePath = path.join(inputDir, item.file)
return fs.existsSync(filePath)
})
if (items.length === 0) {
if (missingFileItems.length > 0) {
throw new Error(`没有可用的素材文件 — ${missingFileItems.length} 个 item 缺少 file 字段id: ${missingFileItems.join(', ')}),请先运行 images 阶段`)
}
throw new Error('没有可用的素材文件')
}
if (items.length === 0) throw new Error('没有可用的素材文件')
// 用 ffprobe 测量实际音频/视频时长,替代 manifest 中的估计值
let audioMeasured = 0, videoMeasured = 0
for (const item of items) {
// 测量 TTS 音频实际时长(有 segments 时跳过audioDuration 已是精确累计值)
if (item.audio && !item.audio.startsWith('http') && !item.segments) {
// 测量各 segment 音频文件实际时长
if (item.segments && item.segments.length > 0) {
for (const seg of item.segments) {
if (!seg.audio || seg.audio.startsWith('http')) continue
const audioPath = path.isAbsolute(seg.audio)
? seg.audio
: path.resolve(inputDir, seg.audio)
if (!fs.existsSync(audioPath)) continue
const actualDur = await getAudioDurationSec(audioPath)
if (actualDur != null) { seg.duration = actualDur; audioMeasured++ }
}
} else if (item.audio && !item.audio.startsWith('http')) {
const audioPath = path.isAbsolute(item.audio)
? item.audio
: path.resolve(inputDir, item.audio)
@@ -323,16 +405,32 @@ async function assemble(args) {
console.log(` 实际时长测量: 音频 ${audioMeasured} 个, 视频 ${videoMeasured}`)
}
const timeline = buildTimeline(items, defaultDurationUs)
const timeline = buildTimeline(items)
const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
const hasTTS = items.some(item => item.audio && item.audioDuration != null)
// 时间轴诊断
for (let i = 0; i < items.length; i++) {
const item = items[i]
const tl = timeline[i]
if (tl.skip) { console.log(` [${i + 1}] 跳过(无音频)`); continue }
const audioDur = item.segments
? item.segments.reduce((s, seg) => s + (seg.duration || 0), 0)
: (item.audioDuration || 0)
const slotDur = tl.duration / US
const diff = slotDur - audioDur
const videoDur = (item.videoDuration || 0)
const stratInfo = tl.strategy && tl.strategy !== 'none' ? ` 策略=${tl.strategy}` : ''
const marker = Math.abs(diff) > 0.05 ? ' ⚠️ 不对齐' : ''
console.log(` [${i + 1}] 画面=${slotDur.toFixed(2)}s 音频=${audioDur.toFixed(2)}s 视频=${videoDur.toFixed(2)}s${stratInfo}${marker}`)
}
// -- 读取转场策略(在 addImages/addVideos 之前) --
const transitionConfig = loadTransitions(manifest)
console.log(`\nCapCut 成片组装`)
console.log(` 模式: ${mode} 画幅: ${format} (${width}x${height})`)
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`} 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : '视频原始时长'} 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
console.log(` 字幕: ${subtitles} 配音: ${voiceover} 动画: ${animation}`)
if (finalEffects) console.log(` 特效: ${finalEffects}`)
if (finalFilter) console.log(` 滤镜: ${finalFilter}`)
@@ -386,10 +484,10 @@ async function assemble(args) {
for (let i = 0; i < items.length; i++) {
const item = items[i]
const tl = timeline[i]
if (tl.needAdjust && item.video) {
if (tl.strategy && tl.strategy !== 'none' && item.video) {
const videoPath = path.resolve(inputDir, item.video)
const audioDur = tl.duration / US
const adjustedPath = await adjustVideoSpeed(videoPath, audioDur)
const adjustedPath = await adjustVideoSpeed(videoPath, audioDur, tl.strategy, tl.speed, tl.freezeExtra || 0)
if (adjustedPath !== videoPath) {
item.video = path.relative(inputDir, adjustedPath)
item.videoDuration = audioDur
@@ -398,7 +496,7 @@ async function assemble(args) {
}
}
if (adjustedCount > 0) {
console.log(` 视频调: ${adjustedCount}/${items.length}`)
console.log(` 视频调: ${adjustedCount}/${items.length}`)
}
// Step 2: 上传(已调速的)视频到 OSS
@@ -547,7 +645,7 @@ async function assemble(args) {
console.log(` 草稿ID: ${draftId}`)
console.log(` 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
console.log(` 素材数: ${items.length}`)
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : '固定时长'}`)
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : '视频原始时长'}`)
if (mode === 'videos' && subtitles === 'false') {
console.log(`\n >> 视频模式未加字幕,请在剪映中打开草稿 → 识别字幕 → 语音识别生成\n`)
}
@@ -713,54 +811,142 @@ async function addKenBurns(draftUrl, segmentIds, items, timeline, manifest) {
// ============================================================================
/**
 * ffmpeg video adaptation: fit a clip to its audio-driven slot length.
 *
 * Strategy (selected upstream from ratio = videoDur / audioDur):
 *   speed_up  (1.1 < ratio ≤ 2)   → setpts time compression (faster)
 *   trim      (ratio > 2)         → cut to the target duration
 *   slow_down (0.5 ≤ ratio < 0.9) → setpts time stretch (slow motion)
 *   freeze    (ratio < 0.5)       → normal speed + last-frame freeze padding
 *   none      (0.9 ~ 1.1)         → no adjustment
 *
 * Any strategy that fails falls back to a plain stream-copy trim; if even
 * the trim fails, the original path is returned unchanged.
 *
 * @param {string} videoPath path to the source clip
 * @param {number} targetDurationSec slot length in seconds (audio master)
 * @param {string} [strategy='none'] one of the strategies above
 * @param {number} [speed=1] speed factor for speed_up / slow_down
 * @param {number} [freezeExtraUs=0] freeze padding in microseconds
 * @returns {Promise<string>} adjusted file path (original path on failure)
 */
async function adjustVideoSpeed(videoPath, targetDurationSec, strategy = 'none', speed = 1, freezeExtraUs = 0) {
  if (!fs.existsSync(videoPath)) return videoPath
  if (strategy === 'none') return videoPath
  // Last-resort fallback after a strategy failed: stream-copy trim.
  function fallbackTrim(cb) {
    execFile('ffmpeg', [
      '-y', '-i', videoPath,
      '-t', String(targetDurationSec),
      '-c', 'copy',
      videoPath.replace(/(\.\w+)$/, '_adj$1')
    ], { timeout: 30000 }, (err) => {
      if (err) { cb(videoPath); return }
      cb(videoPath.replace(/(\.\w+)$/, '_adj$1'))
    })
  }
  return new Promise((resolve) => {
    // Probe the clip's real duration before adjusting.
    execFile('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'csv=p=0', videoPath
    ], (err, stdout) => {
      if (err) { fallbackTrim(resolve); return }
      const videoDur = parseFloat(stdout.trim())
      if (!videoDur || videoDur <= 0) { fallbackTrim(resolve); return }
      const outPath = videoPath.replace(/(\.\w+)$/, '_adj$1')
      if (strategy === 'trim') {
        // Keep only the first targetDurationSec seconds (no re-encode).
        execFile('ffmpeg', [
          '-y', '-i', videoPath,
          '-t', String(targetDurationSec),
          '-c', 'copy',
          outPath
        ], { timeout: 30000 }, (err) => {
          if (err) { console.log(`   截断失败: ${err.message}`); resolve(videoPath); return }
          console.log(`   截断: ${videoDur.toFixed(1)}s → ${targetDurationSec.toFixed(1)}s`)
          resolve(outPath)
        })
      } else if (strategy === 'speed_up') {
        // setpts=PTS/speed compresses the timeline; audio track is dropped
        // (-an) since the TTS track replaces it downstream.
        const speedVal = speed.toFixed(3)
        execFile('ffmpeg', [
          '-y', '-i', videoPath,
          '-filter_complex', `setpts=PTS/${speedVal}`,
          '-an',
          outPath
        ], { timeout: 30000 }, (err) => {
          if (err) {
            console.log(`   加速失败,兜底截断: ${err.message}`)
            fallbackTrim(resolve)
            return
          }
          console.log(`   加速: ${videoDur.toFixed(1)}s → ${targetDurationSec.toFixed(1)}s (${speedVal}x)`)
          resolve(outPath)
        })
      } else if (strategy === 'slow_down') {
        // speed < 1 here, so setpts=PTS*(1/speed) stretches the timeline.
        const factor = (1 / speed).toFixed(3)
        execFile('ffmpeg', [
          '-y', '-i', videoPath,
          '-filter_complex', `setpts=PTS*${factor}`,
          '-an',
          outPath
        ], { timeout: 30000 }, (err) => {
          if (err) {
            console.log(`   放缓失败,兜底截断: ${err.message}`)
            fallbackTrim(resolve)
            return
          }
          console.log(`   放缓: ${videoDur.toFixed(1)}s → ${targetDurationSec.toFixed(1)}s (${speed.toFixed(2)}x speed)`)
          resolve(outPath)
        })
      } else if (strategy === 'freeze') {
        // Freeze: play at normal speed, then hold the last frame via tpad.
        const freezeSec = freezeExtraUs / US
        execFile('ffmpeg', [
          '-y', '-i', videoPath,
          '-filter_complex', `tpad=stop=-1:stop_duration=${freezeSec.toFixed(3)}`,
          '-an',
          outPath
        ], { timeout: 30000 }, (err) => {
          if (err) {
            // Fallback chain: grab last frame → render a still clip → concat.
            console.log(`   tpad freeze 失败,尝试 concat 方案: ${err.message}`)
            const lastFrame = videoPath.replace(/(\.\w+)$/, '_lastframe.png')
            const frozenVideo = videoPath.replace(/(\.\w+)$/, '_frozen.mp4')
            execFile('ffmpeg', [
              '-y', '-sseof', '-0.1', '-i', videoPath,
              '-frames:v', '1', lastFrame
            ], { timeout: 10000 }, (err2) => {
              if (err2) { console.log(`   concat 方案也失败,兜底截断`); fallbackTrim(resolve); return }
              execFile('ffmpeg', [
                '-y', '-loop', '1', '-i', lastFrame,
                '-t', String(freezeSec.toFixed(3)),
                '-pix_fmt', 'yuv420p',
                // Force even dimensions — yuv420p requires them.
                '-vf', 'scale=trunc(iw/2)*2:trunc(ih/2)*2',
                frozenVideo
              ], { timeout: 15000 }, (err3) => {
                if (err3) {
                  try { fs.unlinkSync(lastFrame) } catch (_) {}
                  console.log(`   冻结帧视频生成失败,兜底截断`)
                  fallbackTrim(resolve)
                  return
                }
                const concatList = path.join(path.dirname(videoPath), '_freeze_concat.txt')
                fs.writeFileSync(concatList, `file '${videoPath}'\nfile '${frozenVideo}'\n`)
                // NOTE(review): '-c copy' concat assumes both parts share
                // codec parameters — may fail for some sources; fallbackTrim
                // covers that case.
                execFile('ffmpeg', [
                  '-y', '-f', 'concat', '-safe', '0', '-i', concatList,
                  '-c', 'copy', outPath
                ], { timeout: 30000 }, (err4) => {
                  try { fs.unlinkSync(lastFrame); fs.unlinkSync(frozenVideo); fs.unlinkSync(concatList) } catch (_) {}
                  if (err4) { console.log(`   拼接失败,兜底截断`); fallbackTrim(resolve); return }
                  console.log(`   画面停顿: ${videoDur.toFixed(1)}s + 冻结 ${freezeSec.toFixed(1)}s = ${targetDurationSec.toFixed(1)}s`)
                  resolve(outPath)
                })
              })
            })
            return
          }
          console.log(`   画面停顿: ${videoDur.toFixed(1)}s + 冻结 ${freezeSec.toFixed(1)}s = ${targetDurationSec.toFixed(1)}s`)
          resolve(outPath)
        })
      } else {
        // Unknown strategy: leave the clip untouched.
        resolve(videoPath)
      }
    })
  })
}
@@ -829,8 +1015,8 @@ async function addVideos(draftUrl, inputDir, items, timeline, width, height, tra
async function batchUploadAudio(inputDir, items) {
const urls = {}
for (const item of items) {
// 上传 segments 中的每段音频
if (item.segments && item.segments.length > 1) {
// 上传所有 segment 音频文件
if (item.segments && item.segments.length > 0) {
for (const seg of item.segments) {
if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
const filePath = path.isAbsolute(seg.audio)
@@ -848,7 +1034,7 @@ async function batchUploadAudio(inputDir, items) {
}
}
}
// 上传 item.audio单段或 segments 的第一段
// 上传 item.audio向后兼容,segments[0].audio 通常等于此值
if (!item.audio || item.audio.startsWith('http')) {
if (item.audio) urls[item.audio] = item.audio
continue
@@ -893,24 +1079,29 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {})
for (let i = 0; i < items.length; i++) {
const item = items[i]
const tl = timeline[i]
const segments = item.segments && item.segments.length > 1 ? item.segments : null
if (segments) {
// 多段音频:按 segment 逐段添加,使用精确时长
const slots = distributeSegments(tl, segments)
for (const slot of slots) {
const audioUrl = resolveAudio(slot.audio)
if (item.segments && item.segments.length > 0) {
// 逐段添加,每段使用实际音频文件时长(不做比例分配,消除留白)
let currentTime = tl.start
for (let si = 0; si < item.segments.length; si++) {
const seg = item.segments[si]
const audioUrl = resolveAudio(seg.audio)
const segDurUs = (seg.duration || 0) * US
if (segDurUs <= 0) continue
// 最后一段对齐 timeline 末尾,吃掉浮点误差
const isLast = si === item.segments.length - 1
const endTime = isLast ? tl.end : currentTime + segDurUs
audioInfos.push({
audio_url: audioUrl,
start: slot.start,
end: slot.end,
duration: slot.duration,
start: currentTime,
end: endTime,
duration: endTime - currentTime,
volume: 1.0,
})
currentTime = endTime
}
} else if (item.audio) {
// 单段音频:用实际音频时长,不超过 timeline 时长
// 无 segments用实际音频时长
const audioUrl = resolveAudio(item.audio)
const audioDurUs = item.audioDuration ? item.audioDuration * US : tl.duration
@@ -981,23 +1172,6 @@ function applyAnimationProps(cap, style = {}) {
if (style.outAnimDuration) cap.out_animation_duration = style.outAnimDuration
}
// Proportionally split a timeline slot across TTS segments (DRY helper).
// Each segment's share of the slot follows its duration weight; the final
// segment absorbs rounding drift so the slots end exactly at tl.end, and
// every slot is floored at 100ms.
function distributeSegments(tl, segments) {
  const weightTotal = segments.reduce((acc, seg) => acc + (seg.duration || 0) * US, 0)
  if (weightTotal <= 0) return []
  const slotSpan = tl.end - tl.start
  const slots = []
  let cursor = tl.start
  for (let idx = 0; idx < segments.length; idx++) {
    const seg = segments[idx]
    const segUs = Math.round((seg.duration || 0) * US)
    let span = Math.round(slotSpan * (segUs / weightTotal))
    if (idx === segments.length - 1) span = tl.end - cursor
    span = Math.max(span, 100000)
    slots.push({ start: cursor, end: cursor + span, duration: span, text: seg.text, audio: seg.audio })
    cursor += span
  }
  return slots
}
function loadAccountConfig(manifest) {
const account = manifest.account
if (!account) return {}
@@ -1093,17 +1267,19 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
const tl = timeline[i]
if (split) {
// 分句模式:优先用 segmentsTTS 逐句生成的精确时长),回退到字数估算
const segments = item.segments && item.segments.length > 1 ? item.segments : null
if (segments) {
// 精确模式:用 segments 的实际音频时长
const slots = distributeSegments(tl, segments)
for (const slot of slots) {
const cap = { start: slot.start, end: slot.end, text: slot.text }
// 分句模式:优先用 segments 精确时长(与 addVoiceover 同步),回退到字数估算
if (item.segments && item.segments.length > 0) {
let currentTime = tl.start
for (let si = 0; si < item.segments.length; si++) {
const seg = item.segments[si]
const segDurUs = (seg.duration || 0) * US
if (segDurUs <= 0) continue
const isLast = si === item.segments.length - 1
const endTime = isLast ? tl.end : currentTime + segDurUs
const cap = { start: currentTime, end: endTime, text: seg.text }
applyAnimationProps(cap, animStyle)
captions.push(cap)
currentTime = endTime
}
} else {
// 回退:字数权重估算
@@ -1246,7 +1422,6 @@ async function main() {
console.log('选项:')
console.log(' --mode images|videos 素材类型(默认 images')
console.log(' --format 9:16 画幅比例')
console.log(' --duration 4 默认每段时长/秒无TTS时的fallback默认 4')
console.log(' --voiceover true|false 是否添加TTS配音轨道默认 true')
console.log(' --subtitles true|false 是否添加字幕(默认 true')
console.log(' --split-captions true|false 分句字幕模式(默认 true按标点切分')
@@ -1256,12 +1431,12 @@ async function main() {
console.log(' --apiKey <key> 云渲染 API Key可选')
console.log(' --manifest <path> manifest.json 路径')
console.log('')
console.log('时间线模式:')
console.log(' manifest.json 中每段包含 audio + duration → TTS音频驱动时间线')
console.log(' 无 audio/duration → 按 --duration 固定时长')
console.log('')
console.log('manifest.json 示例TTS驱动:')
console.log(' {"items":[{"file":"1.png","text":"文案","audio":"seg_1.mp3","duration":3.5}]}')
console.log('时间线规则:')
console.log(' 图片模式: TTS 音频时长 = 画面时长,无音频则跳过')
console.log(' 视频模式: TTS 为主轴,视频通过以下策略适配:')
console.log(' 视频比音频长 → 加速(≤2x) 或 裁剪(>2x)')
console.log(' 视频比音频短 → 放缓(≥0.5x) 或 画面停顿(<0.5x)')
console.log(' 所有策略失败 → 兜底截断')
console.log('')
console.log('配置:')
console.log(' 请运行 node setup.js 生成配置')