refactor(video-pipeline): 移除 segments 机制,改为整段音频合成
移除 TTS 阶段的逐句切分及 segments 数组逻辑,统一为整段音频合成。CapCut 字幕切分改由组装阶段按字符比例分配,从而简化音频上传、时间线构建和字幕生成流程,减少冗余处理分支。
This commit is contained in:
@@ -65,24 +65,6 @@ async function batchUploadToOSS(inputDir, files, concurrency = 3) {
|
||||
async function batchUploadAudio(inputDir, items) {
|
||||
const urls = {}
|
||||
for (const item of items) {
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
for (const seg of item.segments) {
|
||||
if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
|
||||
const filePath = path.isAbsolute(seg.audio)
|
||||
? seg.audio
|
||||
: path.resolve(inputDir, seg.audio)
|
||||
if (!fs.existsSync(filePath)) {
|
||||
console.error(` 音频文件不存在: ${filePath}`)
|
||||
continue
|
||||
}
|
||||
try {
|
||||
urls[seg.audio] = await uploadToOSS(filePath)
|
||||
console.log(` 上传: ${path.basename(filePath)} -> OK`)
|
||||
} catch (err) {
|
||||
console.error(` 上传失败: ${path.basename(filePath)} - ${err.message}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!item.audio || item.audio.startsWith('http')) {
|
||||
if (item.audio) urls[item.audio] = item.audio
|
||||
continue
|
||||
@@ -174,17 +156,7 @@ async function assemble(args) {
|
||||
// ffprobe 测量实际时长
|
||||
let audioMeasured = 0, videoMeasured = 0
|
||||
for (const item of items) {
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
for (const seg of item.segments) {
|
||||
if (!seg.audio || seg.audio.startsWith('http')) continue
|
||||
const audioPath = path.isAbsolute(seg.audio)
|
||||
? seg.audio
|
||||
: path.resolve(inputDir, seg.audio)
|
||||
if (!fs.existsSync(audioPath)) continue
|
||||
const actualDur = await getAudioDurationSec(audioPath)
|
||||
if (actualDur != null) { seg.duration = actualDur; audioMeasured++ }
|
||||
}
|
||||
} else if (item.audio && !item.audio.startsWith('http')) {
|
||||
if (item.audio && !item.audio.startsWith('http')) {
|
||||
const audioPath = path.isAbsolute(item.audio)
|
||||
? item.audio
|
||||
: path.resolve(inputDir, item.audio)
|
||||
@@ -216,9 +188,7 @@ async function assemble(args) {
|
||||
const item = items[i]
|
||||
const tl = timeline[i]
|
||||
if (tl.skip) { console.log(` [${i + 1}] 跳过(无音频)`); continue }
|
||||
const audioDur = item.segments
|
||||
? item.segments.reduce((s, seg) => s + (seg.duration || 0), 0)
|
||||
: (item.audioDuration || 0)
|
||||
const audioDur = item.audioDuration || 0
|
||||
const slotDur = tl.duration / US
|
||||
const diff = slotDur - audioDur
|
||||
const videoDur = (item.videoDuration || 0)
|
||||
@@ -341,14 +311,6 @@ async function assemble(args) {
|
||||
item.audio = audioUrls[item.audio]
|
||||
changed = true
|
||||
}
|
||||
if (item.segments) {
|
||||
for (const seg of item.segments) {
|
||||
if (seg.audio && audioUrls[seg.audio]) {
|
||||
seg.audio = audioUrls[seg.audio]
|
||||
changed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (changed) saveManifest(manifestFile, manifest)
|
||||
}
|
||||
|
||||
@@ -23,12 +23,7 @@ const { US } = require('./capcut-api')
|
||||
function buildTimeline(items) {
|
||||
let offset = 0
|
||||
return items.map(item => {
|
||||
let audioDur
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
audioDur = item.segments.reduce((sum, s) => sum + (s.duration || 0), 0) * US
|
||||
} else {
|
||||
audioDur = (item.audioDuration != null) ? item.audioDuration * US : 0
|
||||
}
|
||||
const audioDur = (item.audioDuration != null) ? item.audioDuration * US : 0
|
||||
const videoDur = (item.videoDuration != null) ? item.videoDuration * US : 0
|
||||
const hasVideo = !!(item.video || item.videoUrl || item.url)
|
||||
|
||||
|
||||
@@ -308,7 +308,7 @@ async function addVideos(draftUrl, inputDir, items, timeline, width, height, tra
|
||||
// ============================================================================
|
||||
|
||||
async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
|
||||
const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
|
||||
const audioItems = items.filter(item => item.audio)
|
||||
if (audioItems.length === 0) {
|
||||
console.log(' 无 TTS 音频文件,跳过')
|
||||
return
|
||||
@@ -325,25 +325,7 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {})
|
||||
const item = items[i]
|
||||
const tl = timeline[i]
|
||||
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
let currentTime = tl.start
|
||||
for (let si = 0; si < item.segments.length; si++) {
|
||||
const seg = item.segments[si]
|
||||
const audioUrl = resolveAudio(seg.audio)
|
||||
const segDurUs = (seg.duration || 0) * US
|
||||
if (segDurUs <= 0) continue
|
||||
const isLast = si === item.segments.length - 1
|
||||
const endTime = isLast ? tl.end : currentTime + segDurUs
|
||||
audioInfos.push({
|
||||
audio_url: audioUrl,
|
||||
start: currentTime,
|
||||
end: endTime,
|
||||
duration: endTime - currentTime,
|
||||
volume: 1.0,
|
||||
})
|
||||
currentTime = endTime
|
||||
}
|
||||
} else if (item.audio) {
|
||||
if (item.audio) {
|
||||
const audioUrl = resolveAudio(item.audio)
|
||||
const audioDurUs = item.audioDuration ? item.audioDuration * US : tl.duration
|
||||
|
||||
@@ -421,48 +403,33 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
|
||||
const tl = timeline[i]
|
||||
|
||||
if (split) {
|
||||
if (item.segments && item.segments.length > 0) {
|
||||
let currentTime = tl.start
|
||||
for (let si = 0; si < item.segments.length; si++) {
|
||||
const seg = item.segments[si]
|
||||
const segDurUs = (seg.duration || 0) * US
|
||||
if (segDurUs <= 0) continue
|
||||
const isLast = si === item.segments.length - 1
|
||||
const endTime = isLast ? tl.end : currentTime + segDurUs
|
||||
const cap = { start: currentTime, end: endTime, text: seg.text }
|
||||
applyAnimationProps(cap, animStyle)
|
||||
captions.push(cap)
|
||||
currentTime = endTime
|
||||
const sentences = splitTextIntoSentences(text)
|
||||
if (sentences.length === 0) continue
|
||||
|
||||
const totalDuration = tl.end - tl.start
|
||||
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
|
||||
let currentTime = tl.start
|
||||
|
||||
sentences.forEach((sentence, idx) => {
|
||||
const charRatio = sentence.length / totalChars
|
||||
let duration = Math.round(totalDuration * charRatio)
|
||||
|
||||
if (idx === sentences.length - 1) {
|
||||
duration = tl.end - currentTime
|
||||
}
|
||||
} else {
|
||||
const sentences = splitTextIntoSentences(text)
|
||||
if (sentences.length === 0) continue
|
||||
|
||||
const totalDuration = tl.end - tl.start
|
||||
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
|
||||
let currentTime = tl.start
|
||||
duration = Math.max(duration, 500000)
|
||||
|
||||
sentences.forEach((sentence, idx) => {
|
||||
const charRatio = sentence.length / totalChars
|
||||
let duration = Math.round(totalDuration * charRatio)
|
||||
const cap = {
|
||||
start: currentTime,
|
||||
end: currentTime + duration,
|
||||
text: sentence,
|
||||
}
|
||||
|
||||
if (idx === sentences.length - 1) {
|
||||
duration = tl.end - currentTime
|
||||
}
|
||||
|
||||
duration = Math.max(duration, 500000)
|
||||
|
||||
const cap = {
|
||||
start: currentTime,
|
||||
end: currentTime + duration,
|
||||
text: sentence,
|
||||
}
|
||||
|
||||
applyAnimationProps(cap, animStyle)
|
||||
captions.push(cap)
|
||||
currentTime += duration
|
||||
})
|
||||
}
|
||||
applyAnimationProps(cap, animStyle)
|
||||
captions.push(cap)
|
||||
currentTime += duration
|
||||
})
|
||||
} else {
|
||||
const cap = {
|
||||
start: tl.start,
|
||||
|
||||
@@ -28,7 +28,7 @@ async function phaseAssemble(manifest, manifestPath, options) {
|
||||
manifest: manifestPath,
|
||||
mode,
|
||||
format: manifest.format || accountConfig.defaultFormat || '9:16',
|
||||
subtitles: mode === 'images' ? 'true' : 'false',
|
||||
subtitles: 'true',
|
||||
voiceover: manifest.items.some(it => it.audio) ? 'true' : 'false',
|
||||
animation: capcutConfig.animation || '渐显+放大',
|
||||
}
|
||||
|
||||
@@ -1,13 +1,13 @@
|
||||
/**
|
||||
* Phase: tts — 语音合成(逐句分句生成)
|
||||
* Phase: tts — 语音合成(整段合成)
|
||||
*
|
||||
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。
|
||||
* 统一写入 item.segments[],单句时数组仅 1 个元素。
|
||||
* item.audio 指向第一段,item.audioDuration 为累计时长。
|
||||
* 每个 item 的 script 整段合成一个音频文件,保留自然语调。
|
||||
* item.audio 指向完整音频,item.audioDuration 为总时长。
|
||||
* 字幕切分由组装阶段按字符比例分配,不在 TTS 阶段处理。
|
||||
*/
|
||||
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
|
||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
@@ -29,33 +29,18 @@ async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const fullText = item.script || item.text
|
||||
|
||||
try {
|
||||
const sentences = splitTextIntoSentences(fullText)
|
||||
const segments = []
|
||||
let totalDuration = 0
|
||||
const { filePath, duration } = await synthesize(fullText, {
|
||||
outputDir: audioDir,
|
||||
id: String(item.id || idx),
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
|
||||
for (let j = 0; j < sentences.length; j++) {
|
||||
const sentence = sentences[j]
|
||||
const segId = `${item.id || idx}_${j + 1}`
|
||||
const { filePath, duration } = await synthesize(sentence, {
|
||||
outputDir: audioDir,
|
||||
id: segId,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
rate: manifest.ttsRate || undefined,
|
||||
})
|
||||
segments.push({
|
||||
text: sentence,
|
||||
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
|
||||
duration: Math.round(duration * 1000) / 1000,
|
||||
})
|
||||
totalDuration += duration
|
||||
}
|
||||
|
||||
// 统一使用 segments 数组(单句 = 1 元素,多句 = N 元素)
|
||||
item.segments = segments
|
||||
item.audio = segments[0].audio
|
||||
item.audioDuration = Math.round(totalDuration * 1000) / 1000
|
||||
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
|
||||
const totalDuration = Math.round(duration * 1000) / 1000
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = totalDuration
|
||||
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = `TTS失败: ${err.message}`
|
||||
|
||||
@@ -165,8 +165,8 @@ function getManifestDir(manifestPath) {
|
||||
// ============================================================================
|
||||
|
||||
function splitTextIntoSentences(text) {
|
||||
const sentenceEnders = /[。!?;]/
|
||||
const clauseEnders = /[,:]/
|
||||
// 在句号、感叹号、分号、逗号处断句——它们是口播语音的天然呼吸点。
|
||||
const sentenceEnders = /[。!;,]/
|
||||
|
||||
const sentences = []
|
||||
let current = ''
|
||||
@@ -175,16 +175,13 @@ function splitTextIntoSentences(text) {
|
||||
current += char
|
||||
|
||||
if (sentenceEnders.test(char)) {
|
||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
||||
current = ''
|
||||
} else if (clauseEnders.test(char) && current.length > 8) {
|
||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
||||
sentences.push(current.trim().replace(/[。!;,:?、——…]/g, ''))
|
||||
current = ''
|
||||
}
|
||||
}
|
||||
|
||||
if (current.trim()) {
|
||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
||||
sentences.push(current.trim().replace(/[。!;,:?、——…]/g, ''))
|
||||
}
|
||||
|
||||
return sentences
|
||||
|
||||
Reference in New Issue
Block a user