feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐

TTS 阶段将长文本按标点切分为短句,逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线,无 segments 时回退到字数权重估算。同时优化音频上传流程,支持分段音频独立上传 OSS 并在配音时按段映射时间线。
This commit is contained in:
2026-05-01 14:41:28 +08:00
parent f5d47ec5db
commit 9d19437a29
4 changed files with 236 additions and 122 deletions

View File

@@ -17,6 +17,7 @@ const path = require('path')
const fs = require('fs')
const { execFile } = require('child_process')
const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
const { splitTextIntoSentences } = require('./lib/pipeline-utils')
// ============================================================================
// 配置
@@ -248,8 +249,8 @@ async function assemble(args) {
// 用 ffprobe 测量实际音频/视频时长,替代 manifest 中的估计值
let audioMeasured = 0, videoMeasured = 0
for (const item of items) {
// 测量 TTS 音频实际时长
if (item.audio && !item.audio.startsWith('http')) {
// 测量 TTS 音频实际时长(有 segments 时跳过,audioDuration 已是精确累计值)
if (item.audio && !item.audio.startsWith('http') && !item.segments) {
const audioPath = path.isAbsolute(item.audio)
? item.audio
: path.resolve(inputDir, item.audio)
@@ -277,6 +278,9 @@ async function assemble(args) {
const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
const hasTTS = items.some(item => item.audio && item.audioDuration != null)
// -- 读取转场策略(在 addImages/addVideos 之前) --
const transitionConfig = loadTransitions(manifest)
console.log(`\nCapCut 成片组装`)
console.log(` 模式: ${mode} 画幅: ${format} (${width}x${height})`)
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`} 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
@@ -285,7 +289,7 @@ async function assemble(args) {
const steps = []
if (mode === 'images') steps.push('upload')
steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
const totalSteps = steps.length
let step = 0
@@ -371,10 +375,22 @@ async function assemble(args) {
await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
}
// -- 上传 TTS 音频到 OSS --
let audioUrls = {}
if (voiceover === 'true' && hasTTS) {
step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`)
try {
audioUrls = await batchUploadAudio(inputDir, items)
console.log(` 成功: ${Object.keys(audioUrls).length} 段音频\n`)
} catch (err) {
console.log(` OSS 上传失败,将尝试本地路径: ${err.message}\n`)
}
}
// -- 添加 TTS 配音 --
step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
if (voiceover === 'true' && hasTTS) {
await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls)
} else {
console.log(' 跳过(无 TTS 音频或未启用)')
}
@@ -393,9 +409,6 @@ async function assemble(args) {
console.log(` 字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
}
// -- 读取转场策略 --
const transitionConfig = loadTransitions(manifest)
// -- 添加字幕 --
step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
if (subtitles === 'true' && items.some(i => i.script || i.text)) {
@@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) {
async function batchUploadAudio(inputDir, items) {
const urls = {}
for (const item of items) {
// 上传 segments 中的每段音频
if (item.segments && item.segments.length > 1) {
for (const seg of item.segments) {
if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
const filePath = path.isAbsolute(seg.audio)
? seg.audio
: path.resolve(inputDir, seg.audio)
if (!fs.existsSync(filePath)) {
console.error(` 音频文件不存在: ${filePath}`)
continue
}
try {
urls[seg.audio] = await uploadAudioToOSS(filePath)
console.log(` 上传: ${path.basename(filePath)} -> OK`)
} catch (err) {
console.error(` 上传失败: ${path.basename(filePath)} - ${err.message}`)
}
}
}
// 上传 item.audio(单段或 segments 的第一段)
if (!item.audio || item.audio.startsWith('http')) {
if (item.audio) urls[item.audio] = item.audio
continue
}
// audio 可以是相对路径或绝对路径
if (urls[item.audio]) continue
const filePath = path.isAbsolute(item.audio)
? item.audio
: path.resolve(inputDir, item.audio)
if (!fs.existsSync(filePath)) {
console.error(` 音频文件不存在: ${filePath}`)
continue
@@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) {
// 添加 TTS 配音(每段音频按时间线排列)
// ============================================================================
async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) {
async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
// 收集音频
const audioItems = items.filter(item => item.audio)
const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
if (audioItems.length === 0) {
console.log(' 无 TTS 音频文件,跳过')
return
}
const audioInfos = []
const resolveAudio = (relPath) => {
if (relPath.startsWith('http')) return relPath
if (audioUrls[relPath]) return audioUrls[relPath]
return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
}
if (localAudio) {
// 本地模式:直接用本地路径,不上传 OSS
for (let i = 0; i < items.length; i++) {
const item = items[i]
if (!item.audio) continue
for (let i = 0; i < items.length; i++) {
const item = items[i]
const tl = timeline[i]
const segments = item.segments && item.segments.length > 1 ? item.segments : null
const filePath = item.audio.startsWith('http')
? item.audio
: (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio))
if (segments) {
// 多段音频:按 segment 逐段添加,使用精确时长
const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
const tlDuration = tl.end - tl.start
let currentTime = tl.start
if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) {
console.error(` 音频文件不存在: ${filePath}`)
continue
for (let j = 0; j < segments.length; j++) {
const seg = segments[j]
const segDurUs = Math.round(seg.duration * US)
let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
if (j === segments.length - 1) duration = tl.end - currentTime
duration = Math.max(duration, 100000)
const audioUrl = resolveAudio(seg.audio)
audioInfos.push({
audio_url: audioUrl,
start: currentTime,
end: currentTime + duration,
duration,
volume: 1.0,
})
currentTime += duration
}
} else if (item.audio) {
// 单段音频
const audioUrl = resolveAudio(item.audio)
const tl = timeline[i]
audioInfos.push({
audio_url: filePath,
start: tl.start,
end: tl.end,
duration: tl.duration,
volume: 1.0,
})
}
} else {
// 上传模式:先传 OSS 再用 URL
const audioUrls = await batchUploadAudio(inputDir, items)
for (let i = 0; i < items.length; i++) {
const item = items[i]
if (!item.audio) continue
const audioUrl = audioUrls[item.audio]
if (!audioUrl) continue
const tl = timeline[i]
audioInfos.push({
audio_url: audioUrl,
start: tl.start,
@@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr
draft_url: draftUrl,
audio_infos: JSON.stringify(audioInfos),
})
console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`)
const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
}
// ============================================================================
@@ -793,40 +829,6 @@ function loadTransitions(manifest) {
// 添加字幕(支持关键词高亮 + 账号字幕风格 + 分句切分)
// ============================================================================
/**
 * Split text into short sentences on punctuation and strip that punctuation
 * from the returned pieces.
 *
 * Sentence-ending marks always close the pending chunk. Clause-level marks
 * (comma, colon, enumeration comma) close it only when the pending chunk,
 * including the mark itself, is longer than 8 UTF-16 units, so very short
 * clauses stay attached to the text that follows them.
 *
 * Both ASCII and full-width forms of each mark are recognised. Chunks that
 * are empty after punctuation and surrounding whitespace are removed are
 * dropped, so callers never receive an empty sentence.
 *
 * @param {string} text - Text to split; null/undefined yields an empty array.
 * @returns {string[]} Non-empty sentences in original order, without the
 *   punctuation listed above.
 */
function splitTextIntoSentences(text) {
  if (text == null) return []

  // \u3002 is the ideographic full stop; \uFF01 \uFF1F \uFF1B are full-width ! ? ;
  const sentenceEnders = /[\u3002!?;\uFF01\uFF1F\uFF1B]/
  // \u3001 is the enumeration comma; \uFF0C \uFF1A are full-width , :
  // The original class was empty, which made the clause branch unreachable.
  const clauseEnders = /[,:\u3001\uFF0C\uFF1A]/
  // Union of both classes: everything removed from the emitted sentences.
  const punctuation = /[\u3002!?;\uFF01\uFF1F\uFF1B,:\u3001\uFF0C\uFF1A]/g
  const minClauseLength = 8

  const sentences = []
  const flush = (chunk) => {
    // Strip first, then trim, so whitespace left next to removed marks goes too.
    const cleaned = chunk.replace(punctuation, '').trim()
    if (cleaned) sentences.push(cleaned)
  }

  let current = ''
  for (const char of String(text)) {
    current += char
    const endsSentence = sentenceEnders.test(char)
    const endsClause =
      !endsSentence && clauseEnders.test(char) && current.length > minClauseLength
    if (endsSentence || endsClause) {
      flush(current)
      current = ''
    }
  }
  // Whatever follows the last mark is a sentence of its own.
  flush(current)

  return sentences
}
async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
const captions = []
@@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
const tl = timeline[i]
if (split) {
// 分句模式:切分长文本
const sentences = splitTextIntoSentences(text)
if (sentences.length === 0) continue
// 分句模式:优先用 segments(TTS 逐句生成的精确时长),回退到字数估算
const segments = item.segments && item.segments.length > 1 ? item.segments : null
const totalDuration = tl.end - tl.start
if (segments) {
// 精确模式:用 segments 的实际音频时长
const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
const tlDuration = tl.end - tl.start
let currentTime = tl.start
// 按字数权重分配时间(改进版)
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
let currentTime = tl.start
segments.forEach((seg, idx) => {
const segDurUs = Math.round(seg.duration * US)
// 按实际时长占比映射到时间线(处理 ffprobe 重新测量的差异)
let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
if (idx === segments.length - 1) {
duration = tl.end - currentTime
}
duration = Math.max(duration, 1000000)
sentences.forEach((sentence, idx) => {
// 按字数比例计算时长
const charRatio = sentence.length / totalChars
let duration = Math.round(totalDuration * charRatio)
const cap = {
start: currentTime,
end: currentTime + duration,
text: seg.text,
keyword: '',
keyword_color: '',
}
// 最后一句使用剩余全部时间(避免精度误差)
if (idx === sentences.length - 1) {
duration = tl.end - currentTime
}
if (inAnimation) cap.in_animation = inAnimation
if (outAnimation) cap.out_animation = outAnimation
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
// 最小1秒避免太短
duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒
captions.push(cap)
currentTime += duration
})
} else {
// 回退:字数权重估算
const sentences = splitTextIntoSentences(text)
if (sentences.length === 0) continue
const cap = {
start: currentTime,
end: currentTime + duration,
text: sentence,
keyword: '',
keyword_color: '',
}
const totalDuration = tl.end - tl.start
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
let currentTime = tl.start
if (inAnimation) cap.in_animation = inAnimation
if (outAnimation) cap.out_animation = outAnimation
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
sentences.forEach((sentence, idx) => {
const charRatio = sentence.length / totalChars
let duration = Math.round(totalDuration * charRatio)
captions.push(cap)
currentTime += duration
})
if (idx === sentences.length - 1) {
duration = tl.end - currentTime
}
duration = Math.max(duration, 1000000)
const cap = {
start: currentTime,
end: currentTime + duration,
text: sentence,
keyword: '',
keyword_color: '',
}
if (inAnimation) cap.in_animation = inAnimation
if (outAnimation) cap.out_animation = outAnimation
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
captions.push(cap)
currentTime += duration
})
}
} else {
// 原始模式:一句字幕
const keyword = ''

View File

@@ -1,11 +1,12 @@
/**
* Phase: tts — 语音合成
* Phase: tts — 语音合成(逐句分句生成)
*
* 使用通义千问 TTS 生成旁白音频
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频
* 结果写入 item.segments[],实现字幕与语音精确对齐。
*/
const path = require('path')
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
async function phaseTts(manifest, manifestPath, options = {}) {
const dir = getManifestDir(manifestPath)
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
for (let i = 0; i < items.length; i++) {
const item = items[i]
const idx = i + 1
const fullText = item.script || item.text
try {
const { filePath, duration } = await synthesize(item.script || item.text, {
outputDir: audioDir,
id: item.id || idx,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
item.audioDuration = Math.round(duration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
const sentences = splitTextIntoSentences(fullText)
if (sentences.length <= 1) {
// 单句:不需要 segments,走原逻辑
const { filePath, duration } = await synthesize(fullText, {
outputDir: audioDir,
id: item.id || idx,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
item.audioDuration = Math.round(duration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
} else {
// 多句:逐句生成,写入 segments
const segments = []
let totalDuration = 0
for (let j = 0; j < sentences.length; j++) {
const sentence = sentences[j]
const segId = `${item.id || idx}_${j + 1}`
const { filePath, duration } = await synthesize(sentence, {
outputDir: audioDir,
id: segId,
voice: manifest.ttsVoice || undefined,
instruction: manifest.ttsInstruction || undefined,
rate: manifest.ttsRate || undefined,
})
segments.push({
text: sentence,
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
duration: Math.round(duration * 1000) / 1000,
})
totalDuration += duration
}
item.segments = segments
item.audio = segments[0].audio
item.audioDuration = Math.round(totalDuration * 1000) / 1000
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
}
} catch (err) {
item.status = 'failed'
item.error = `TTS失败: ${err.message}`

View File

@@ -160,6 +160,36 @@ function getManifestDir(manifestPath) {
return path.dirname(path.resolve(manifestPath))
}
// ============================================================================
// 文本切分
// ============================================================================
/**
 * Split text into short sentences on punctuation and strip that punctuation
 * from the returned pieces.
 *
 * Sentence-ending marks always close the pending chunk. Clause-level marks
 * (comma, colon, enumeration comma) close it only when the pending chunk,
 * including the mark itself, is longer than 8 UTF-16 units, so very short
 * clauses stay attached to the text that follows them.
 *
 * Both ASCII and full-width forms of each mark are recognised. Chunks that
 * are empty after punctuation and surrounding whitespace are removed are
 * dropped, so callers never receive an empty sentence.
 *
 * @param {string} text - Text to split; null/undefined yields an empty array.
 * @returns {string[]} Non-empty sentences in original order, without the
 *   punctuation listed above.
 */
function splitTextIntoSentences(text) {
  if (text == null) return []

  // \u3002 is the ideographic full stop; \uFF01 \uFF1F \uFF1B are full-width ! ? ;
  const sentenceEnders = /[\u3002!?;\uFF01\uFF1F\uFF1B]/
  // \u3001 is the enumeration comma; \uFF0C \uFF1A are full-width , :
  // The original class was empty, which made the clause branch unreachable.
  const clauseEnders = /[,:\u3001\uFF0C\uFF1A]/
  // Union of both classes: everything removed from the emitted sentences.
  const punctuation = /[\u3002!?;\uFF01\uFF1F\uFF1B,:\u3001\uFF0C\uFF1A]/g
  const minClauseLength = 8

  const sentences = []
  const flush = (chunk) => {
    // Strip first, then trim, so whitespace left next to removed marks goes too.
    const cleaned = chunk.replace(punctuation, '').trim()
    if (cleaned) sentences.push(cleaned)
  }

  let current = ''
  for (const char of String(text)) {
    current += char
    const endsSentence = sentenceEnders.test(char)
    const endsClause =
      !endsSentence && clauseEnders.test(char) && current.length > minClauseLength
    if (endsSentence || endsClause) {
      flush(current)
      current = ''
    }
  }
  // Whatever follows the last mark is a sentence of its own.
  flush(current)

  return sentences
}
// ============================================================================
// Exports
// ============================================================================
@@ -178,6 +208,7 @@ module.exports = {
ensureDir,
slugify,
renameGeneratedFile,
splitTextIntoSentences,
log,
getManifestDir,
}