From 9d19437a292ab39b9cddb995a4641c4a3ec943b6 Mon Sep 17 00:00:00 2001 From: sion123 <450702724@qq.com> Date: Fri, 1 May 2026 14:41:28 +0800 Subject: [PATCH] =?UTF-8?q?feat(video-pipeline):=20=E5=AE=9E=E7=8E=B0=20TT?= =?UTF-8?q?S=20=E9=80=90=E5=8F=A5=E5=88=86=E5=8F=A5=E7=94=9F=E6=88=90?= =?UTF-8?q?=E4=B8=8E=E5=AD=97=E5=B9=95=E7=B2=BE=E7=A1=AE=E5=AF=B9=E9=BD=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit TTS 阶段将长文本按标点切分为短句,逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线,无 segments 时回退到字数权重估算。同时优化音频上传流程,支持分段音频独立上传 OSS 并在配音时按段映射时间线。 --- .../references/manifest-schema.md | 17 +- .../scripts/capcut_assemble.js | 249 ++++++++++-------- .../scripts/lib/phase-tts.js | 61 ++++- .../scripts/lib/pipeline-utils.js | 31 +++ 4 files changed, 236 insertions(+), 122 deletions(-) diff --git a/.claude/skills/video-from-script/references/manifest-schema.md b/.claude/skills/video-from-script/references/manifest-schema.md index f9466a9..d554389 100644 --- a/.claude/skills/video-from-script/references/manifest-schema.md +++ b/.claude/skills/video-from-script/references/manifest-schema.md @@ -82,8 +82,9 @@ node scripts/pipeline.js validate --manifest | `video` | 生成的视频路径 | videos | | `videoDuration` | 视频时长(秒),Grok=6, VEO=8 | videos | | `videoUrl` | 视频 OSS 公网 URL | videos | -| `audio` | TTS 音频路径 | tts | +| `audio` | TTS 音频路径(多句时为合并后的完整音频) | tts | | `audioDuration` | 音频时长(秒) | tts | +| `segments` | 分句音频数组(仅多句时存在),见下方 | tts | ### Agent 审查时可操作 @@ -199,3 +200,17 @@ output/{account}_{YYYYMMDD}_{NNN}/ ``` slug 从 `shotDesc` 派生(slugify: 保留中文和字母数字,最多 20 字符)。 + +--- + +## segments[] 字段(TTS 分句) + +TTS 阶段自动生成。仅当 `script` 被切分为 2 句及以上时才写入。单句时不写 segments。 + +| 字段 | 说明 | +|------|------| +| `text` | 分句文本(已去除标点) | +| `audio` | 该句音频路径(相对 manifest) | +| `duration` | 该句音频时长(秒) | + +`item.audio` 指向所有分段合并后的完整音频,`item.audioDuration` 为各段累计时长。assemble 阶段优先用 `segments` 的精确时长对齐字幕,无 segments 时回退到字数权重估算。 diff --git a/.claude/skills/video-from-script/scripts/capcut_assemble.js b/.claude/skills/video-from-script/scripts/capcut_assemble.js index 9a5f8d6..6b2c989 100644 --- a/.claude/skills/video-from-script/scripts/capcut_assemble.js +++ b/.claude/skills/video-from-script/scripts/capcut_assemble.js @@ -17,6 +17,7 @@ const path = require('path') const fs = require('fs') const { execFile } = require('child_process') const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying') +const { splitTextIntoSentences } = require('./lib/pipeline-utils') // ============================================================================ // 配置 @@ -248,8 +249,8 @@ async function assemble(args) { // 用 ffprobe 测量实际音频/视频时长,替代 manifest 中的估计值 let audioMeasured = 0, videoMeasured = 0 for (const item of items) { - // 测量 TTS 音频实际时长 - if (item.audio && !item.audio.startsWith('http')) { + // 测量 TTS 音频实际时长(有 segments 时跳过,audioDuration 已是精确累计值) + if (item.audio && !item.audio.startsWith('http') && !item.segments) { const audioPath = path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio) @@ -277,6 +278,9 @@ async function assemble(args) { const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0 const hasTTS = items.some(item => item.audio && item.audioDuration != null) + // -- 读取转场策略(在 addImages/addVideos 之前) -- + const transitionConfig = loadTransitions(manifest) + console.log(`\nCapCut 成片组装`) console.log(` 模式: ${mode} 画幅: ${format} (${width}x${height})`) console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`} 总时长: ${(totalDurationUs / US).toFixed(1)}s`) @@ -285,7 +289,7 @@ async function assemble(args) { const steps = [] if (mode === 'images') steps.push('upload') - steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync') + steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync') const totalSteps = steps.length let step = 0 @@ -371,10 +375,22 @@ async function assemble(args) { await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig) } + // -- 上传 TTS 音频到 OSS -- + let audioUrls = {} + if (voiceover === 'true' && hasTTS) { + step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`) + try { + audioUrls = await batchUploadAudio(inputDir, items) + console.log(` 成功: ${Object.keys(audioUrls).length} 段音频\n`) + } catch (err) { + console.log(` OSS 上传失败,将尝试本地路径: ${err.message}\n`) + } + } + // -- 添加 TTS 配音 -- step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`) if (voiceover === 'true' && hasTTS) { - await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true') + await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls) } else { console.log(' 跳过(无 TTS 音频或未启用)') } @@ -393,9 +409,6 @@ async function assemble(args) { console.log(` 字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`) } - // -- 读取转场策略 -- - const transitionConfig = loadTransitions(manifest) - // -- 添加字幕 -- step++; console.log(`[${step}/${totalSteps}] 添加字幕...`) if (subtitles === 'true' && items.some(i => i.script || i.text)) { @@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) { async function batchUploadAudio(inputDir, items) { const urls = {} for (const item of items) { + // 上传 segments 中的每段音频 + if (item.segments && item.segments.length > 1) { + for (const seg of item.segments) { + if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue + const filePath = path.isAbsolute(seg.audio) + ? seg.audio + : path.resolve(inputDir, seg.audio) + if (!fs.existsSync(filePath)) { + console.error(` 音频文件不存在: ${filePath}`) + continue + } + try { + urls[seg.audio] = await uploadAudioToOSS(filePath) + console.log(` 上传: ${path.basename(filePath)} -> OK`) + } catch (err) { + console.error(` 上传失败: ${path.basename(filePath)} - ${err.message}`) + } + } + } + // 上传 item.audio(单段或 segments 的第一段) if (!item.audio || item.audio.startsWith('http')) { if (item.audio) urls[item.audio] = item.audio continue } - // audio 可以是相对路径或绝对路径 + if (urls[item.audio]) continue const filePath = path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio) - if (!fs.existsSync(filePath)) { console.error(` 音频文件不存在: ${filePath}`) continue @@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) { // 添加 TTS 配音(每段音频按时间线排列) // ============================================================================ -async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) { +async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) { // 收集音频 - const audioItems = items.filter(item => item.audio) + const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0)) if (audioItems.length === 0) { console.log(' 无 TTS 音频文件,跳过') return } const audioInfos = [] + const resolveAudio = (relPath) => { + if (relPath.startsWith('http')) return relPath + if (audioUrls[relPath]) return audioUrls[relPath] + return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath) + } - if (localAudio) { - // 本地模式:直接用本地路径,不上传 OSS - for (let i = 0; i < items.length; i++) { - const item = items[i] - if (!item.audio) continue + for (let i = 0; i < items.length; i++) { + const item = items[i] + const tl = timeline[i] + const segments = item.segments && item.segments.length > 1 ? item.segments : null - const filePath = item.audio.startsWith('http') - ? item.audio - : (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio)) + if (segments) { + // 多段音频:按 segment 逐段添加,使用精确时长 + const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0) + const tlDuration = tl.end - tl.start + let currentTime = tl.start - if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) { - console.error(` 音频文件不存在: ${filePath}`) - continue + for (let j = 0; j < segments.length; j++) { + const seg = segments[j] + const segDurUs = Math.round(seg.duration * US) + let duration = Math.round(tlDuration * (segDurUs / totalSegDur)) + if (j === segments.length - 1) duration = tl.end - currentTime + duration = Math.max(duration, 100000) + + const audioUrl = resolveAudio(seg.audio) + + audioInfos.push({ + audio_url: audioUrl, + start: currentTime, + end: currentTime + duration, + duration, + volume: 1.0, + }) + currentTime += duration } + } else if (item.audio) { + // 单段音频 + const audioUrl = resolveAudio(item.audio) - const tl = timeline[i] - audioInfos.push({ - audio_url: filePath, - start: tl.start, - end: tl.end, - duration: tl.duration, - volume: 1.0, - }) - } - } else { - // 上传模式:先传 OSS 再用 URL - const audioUrls = await batchUploadAudio(inputDir, items) - for (let i = 0; i < items.length; i++) { - const item = items[i] - if (!item.audio) continue - - const audioUrl = audioUrls[item.audio] - if (!audioUrl) continue - - const tl = timeline[i] audioInfos.push({ audio_url: audioUrl, start: tl.start, @@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr draft_url: draftUrl, audio_infos: JSON.stringify(audioInfos), }) - console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`) + const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length + console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`) } // ============================================================================ @@ -793,40 +829,6 @@ function loadTransitions(manifest) { // 添加字幕(支持关键词高亮 + 账号字幕风格 + 分句切分) // ============================================================================ -/** - * 按标点符号切分文本为短句(去除所有标点符号) - */ -function splitTextIntoSentences(text) { - const sentenceEnders = /[。!?;]/ - const clauseEnders = /[,:]/ - - const sentences = [] - let current = '' - let chars = text.split('') - - for (let i = 0; i < chars.length; i++) { - const char = chars[i] - current += char - - if (sentenceEnders.test(char)) { - // 切分并去掉所有标点 - sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) - current = '' - } else if (clauseEnders.test(char) && current.length > 8) { - // 切分并去掉所有标点 - sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) - current = '' - } - } - - // 处理剩余文本 - if (current.trim()) { - sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) - } - - return sentences -} - async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) { const captions = [] @@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false const tl = timeline[i] if (split) { - // 分句模式:切分长文本 - const sentences = splitTextIntoSentences(text) - if (sentences.length === 0) continue + // 分句模式:优先用 segments(TTS 逐句生成的精确时长),回退到字数估算 + const segments = item.segments && item.segments.length > 1 ? item.segments : null - const totalDuration = tl.end - tl.start + if (segments) { + // 精确模式:用 segments 的实际音频时长 + const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0) + const tlDuration = tl.end - tl.start + let currentTime = tl.start - // 按字数权重分配时间(改进版) - const totalChars = sentences.reduce((sum, s) => sum + s.length, 0) - let currentTime = tl.start + segments.forEach((seg, idx) => { + const segDurUs = Math.round(seg.duration * US) + // 按实际时长占比映射到时间线(处理 ffprobe 重新测量的差异) + let duration = Math.round(tlDuration * (segDurUs / totalSegDur)) + if (idx === segments.length - 1) { + duration = tl.end - currentTime + } + duration = Math.max(duration, 1000000) - sentences.forEach((sentence, idx) => { - // 按字数比例计算时长 - const charRatio = sentence.length / totalChars - let duration = Math.round(totalDuration * charRatio) + const cap = { + start: currentTime, + end: currentTime + duration, + text: seg.text, + keyword: '', + keyword_color: '', + } - // 最后一句使用剩余全部时间(避免精度误差) - if (idx === sentences.length - 1) { - duration = tl.end - currentTime - } + if (inAnimation) cap.in_animation = inAnimation + if (outAnimation) cap.out_animation = outAnimation + if (inAnimDuration) cap.in_animation_duration = inAnimDuration + if (outAnimDuration) cap.out_animation_duration = outAnimDuration - // 最小1秒,避免太短 - duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒 + captions.push(cap) + currentTime += duration + }) + } else { + // 回退:字数权重估算 + const sentences = splitTextIntoSentences(text) + if (sentences.length === 0) continue - const cap = { - start: currentTime, - end: currentTime + duration, - text: sentence, - keyword: '', - keyword_color: '', - } + const totalDuration = tl.end - tl.start + const totalChars = sentences.reduce((sum, s) => sum + s.length, 0) + let currentTime = tl.start - if (inAnimation) cap.in_animation = inAnimation - if (outAnimation) cap.out_animation = outAnimation - if (inAnimDuration) cap.in_animation_duration = inAnimDuration - if (outAnimDuration) cap.out_animation_duration = outAnimDuration + sentences.forEach((sentence, idx) => { + const charRatio = sentence.length / totalChars + let duration = Math.round(totalDuration * charRatio) - captions.push(cap) - currentTime += duration - }) + if (idx === sentences.length - 1) { + duration = tl.end - currentTime + } + + duration = Math.max(duration, 1000000) + + const cap = { + start: currentTime, + end: currentTime + duration, + text: sentence, + keyword: '', + keyword_color: '', + } + + if (inAnimation) cap.in_animation = inAnimation + if (outAnimation) cap.out_animation = outAnimation + if (inAnimDuration) cap.in_animation_duration = inAnimDuration + if (outAnimDuration) cap.out_animation_duration = outAnimDuration + + captions.push(cap) + currentTime += duration + }) + } } else { // 原始模式:一句字幕 const keyword = '' diff --git a/.claude/skills/video-from-script/scripts/lib/phase-tts.js b/.claude/skills/video-from-script/scripts/lib/phase-tts.js index 97e5f14..16b9c85 100644 --- a/.claude/skills/video-from-script/scripts/lib/phase-tts.js +++ b/.claude/skills/video-from-script/scripts/lib/phase-tts.js @@ -1,11 +1,12 @@ /** - * Phase: tts — 语音合成 + * Phase: tts — 语音合成(逐句分句生成) * - * 使用通义千问 TTS 生成旁白音频 + * 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。 + * 结果写入 item.segments[],实现字幕与语音精确对齐。 */ const path = require('path') -const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils') +const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils') async function phaseTts(manifest, manifestPath, options = {}) { const dir = getManifestDir(manifestPath) @@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) { for (let i = 0; i < items.length; i++) { const item = items[i] const idx = i + 1 + const fullText = item.script || item.text + try { - const { filePath, duration } = await synthesize(item.script || item.text, { - outputDir: audioDir, - id: item.id || idx, - voice: manifest.ttsVoice || undefined, - instruction: manifest.ttsInstruction || undefined, - rate: manifest.ttsRate || undefined, - }) - item.audio = path.relative(dir, filePath).replace(/\\/g, '/') - item.audioDuration = Math.round(duration * 1000) / 1000 - log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`) + const sentences = splitTextIntoSentences(fullText) + + if (sentences.length <= 1) { + // 单句:不需要 segments,走原逻辑 + const { filePath, duration } = await synthesize(fullText, { + outputDir: audioDir, + id: item.id || idx, + voice: manifest.ttsVoice || undefined, + instruction: manifest.ttsInstruction || undefined, + rate: manifest.ttsRate || undefined, + }) + item.audio = path.relative(dir, filePath).replace(/\\/g, '/') + item.audioDuration = Math.round(duration * 1000) / 1000 + log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`) + } else { + // 多句:逐句生成,写入 segments + const segments = [] + let totalDuration = 0 + + for (let j = 0; j < sentences.length; j++) { + const sentence = sentences[j] + const segId = `${item.id || idx}_${j + 1}` + const { filePath, duration } = await synthesize(sentence, { + outputDir: audioDir, + id: segId, + voice: manifest.ttsVoice || undefined, + instruction: manifest.ttsInstruction || undefined, + rate: manifest.ttsRate || undefined, + }) + segments.push({ + text: sentence, + audio: path.relative(dir, filePath).replace(/\\/g, '/'), + duration: Math.round(duration * 1000) / 1000, + }) + totalDuration += duration + } + + item.segments = segments + item.audio = segments[0].audio + item.audioDuration = Math.round(totalDuration * 1000) / 1000 + log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`) + } } catch (err) { item.status = 'failed' item.error = `TTS失败: ${err.message}` diff --git a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js index be73053..df2d756 100644 --- a/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js +++ b/.claude/skills/video-from-script/scripts/lib/pipeline-utils.js @@ -160,6 +160,36 @@ function getManifestDir(manifestPath) { return path.dirname(path.resolve(manifestPath)) } +// ============================================================================ +// 文本切分 +// ============================================================================ + +function splitTextIntoSentences(text) { + const sentenceEnders = /[。!?;]/ + const clauseEnders = /[,:]/ + + const sentences = [] + let current = '' + + for (const char of text) { + current += char + + if (sentenceEnders.test(char)) { + sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) + current = '' + } else if (clauseEnders.test(char) && current.length > 8) { + sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) + current = '' + } + } + + if (current.trim()) { + sentences.push(current.trim().replace(/[。!?;,:、]/g, '')) + } + + return sentences +} + // ============================================================================ // Exports // ============================================================================ @@ -178,6 +208,7 @@ module.exports = { ensureDir, slugify, renameGeneratedFile, + splitTextIntoSentences, log, getManifestDir, }