feat(video-pipeline): 实现 TTS 逐句分句生成与字幕精确对齐
TTS 阶段将长文本按标点切分为短句,逐句生成音频并记录每句时长到 `item.segments[]`。assemble 阶段优先使用 segments 的精确时长分配字幕时间线,无 segments 时回退到字数权重估算。同时优化音频上传流程,支持分段音频独立上传 OSS 并在配音时按段映射时间线。
This commit is contained in:
@@ -82,8 +82,9 @@ node scripts/pipeline.js validate --manifest <path>
|
|||||||
| `video` | 生成的视频路径 | videos |
|
| `video` | 生成的视频路径 | videos |
|
||||||
| `videoDuration` | 视频时长(秒),Grok=6, VEO=8 | videos |
|
| `videoDuration` | 视频时长(秒),Grok=6, VEO=8 | videos |
|
||||||
| `videoUrl` | 视频 OSS 公网 URL | videos |
|
| `videoUrl` | 视频 OSS 公网 URL | videos |
|
||||||
| `audio` | TTS 音频路径 | tts |
|
| `audio` | TTS 音频路径(多句时指向第一段分句音频,各分句音频见 `segments`) | tts |
|
||||||
| `audioDuration` | 音频时长(秒) | tts |
|
| `audioDuration` | 音频时长(秒) | tts |
|
||||||
|
| `segments` | 分句音频数组(仅多句时存在),见下方 | tts |
|
||||||
|
|
||||||
### Agent 审查时可操作
|
### Agent 审查时可操作
|
||||||
|
|
||||||
@@ -199,3 +200,17 @@ output/{account}_{YYYYMMDD}_{NNN}/
|
|||||||
```
|
```
|
||||||
|
|
||||||
slug 从 `shotDesc` 派生(slugify: 保留中文和字母数字,最多 20 字符)。
|
slug 从 `shotDesc` 派生(slugify: 保留中文和字母数字,最多 20 字符)。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## segments[] 字段(TTS 分句)
|
||||||
|
|
||||||
|
TTS 阶段自动生成。仅当 `script` 被切分为 2 句及以上时才写入。单句时不写 segments。
|
||||||
|
|
||||||
|
| 字段 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `text` | 分句文本(已去除标点) |
|
||||||
|
| `audio` | 该句音频路径(相对 manifest) |
|
||||||
|
| `duration` | 该句音频时长(秒) |
|
||||||
|
|
||||||
|
多句时 `item.audio` 指向第一段分句音频(即 `segments[0].audio`,当前实现不会合并各段音频),`item.audioDuration` 为各段时长的累计值。assemble 阶段优先用 `segments` 的精确时长对齐字幕并按段添加配音,无 segments 时回退到字数权重估算。
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ const path = require('path')
|
|||||||
const fs = require('fs')
|
const fs = require('fs')
|
||||||
const { execFile } = require('child_process')
|
const { execFile } = require('child_process')
|
||||||
const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
|
const { syncDraft, registerDraft, triggerDirectoryScan } = require('./sync-to-jianying')
|
||||||
|
const { splitTextIntoSentences } = require('./lib/pipeline-utils')
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// 配置
|
// 配置
|
||||||
@@ -248,8 +249,8 @@ async function assemble(args) {
|
|||||||
// 用 ffprobe 测量实际音频/视频时长,替代 manifest 中的估计值
|
// 用 ffprobe 测量实际音频/视频时长,替代 manifest 中的估计值
|
||||||
let audioMeasured = 0, videoMeasured = 0
|
let audioMeasured = 0, videoMeasured = 0
|
||||||
for (const item of items) {
|
for (const item of items) {
|
||||||
// 测量 TTS 音频实际时长
|
// 测量 TTS 音频实际时长(有 segments 时跳过,audioDuration 已是精确累计值)
|
||||||
if (item.audio && !item.audio.startsWith('http')) {
|
if (item.audio && !item.audio.startsWith('http') && !item.segments) {
|
||||||
const audioPath = path.isAbsolute(item.audio)
|
const audioPath = path.isAbsolute(item.audio)
|
||||||
? item.audio
|
? item.audio
|
||||||
: path.resolve(inputDir, item.audio)
|
: path.resolve(inputDir, item.audio)
|
||||||
@@ -277,6 +278,9 @@ async function assemble(args) {
|
|||||||
const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
|
const totalDurationUs = timeline.length > 0 ? timeline[timeline.length - 1].end : 0
|
||||||
const hasTTS = items.some(item => item.audio && item.audioDuration != null)
|
const hasTTS = items.some(item => item.audio && item.audioDuration != null)
|
||||||
|
|
||||||
|
// -- 读取转场策略(在 addImages/addVideos 之前) --
|
||||||
|
const transitionConfig = loadTransitions(manifest)
|
||||||
|
|
||||||
console.log(`\nCapCut 成片组装`)
|
console.log(`\nCapCut 成片组装`)
|
||||||
console.log(` 模式: ${mode} 画幅: ${format} (${width}x${height})`)
|
console.log(` 模式: ${mode} 画幅: ${format} (${width}x${height})`)
|
||||||
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`} 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
|
console.log(` 时间线: ${hasTTS ? 'TTS音频驱动' : `固定${duration}s/段`} 总时长: ${(totalDurationUs / US).toFixed(1)}s`)
|
||||||
@@ -285,7 +289,7 @@ async function assemble(args) {
|
|||||||
|
|
||||||
const steps = []
|
const steps = []
|
||||||
if (mode === 'images') steps.push('upload')
|
if (mode === 'images') steps.push('upload')
|
||||||
steps.push('draft', 'materials', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
|
steps.push('draft', 'materials', 'audio_oss', 'voiceover', 'audio', 'subtitles', 'effects', 'filter', 'save', 'sync')
|
||||||
const totalSteps = steps.length
|
const totalSteps = steps.length
|
||||||
let step = 0
|
let step = 0
|
||||||
|
|
||||||
@@ -371,10 +375,22 @@ async function assemble(args) {
|
|||||||
await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
|
await addVideos(draftUrl, inputDir, items, timeline, width, height, transitionConfig)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// -- 上传 TTS 音频到 OSS --
|
||||||
|
let audioUrls = {}
|
||||||
|
if (voiceover === 'true' && hasTTS) {
|
||||||
|
step++; console.log(`[${step}/${totalSteps}] 上传 TTS 音频到 OSS...`)
|
||||||
|
try {
|
||||||
|
audioUrls = await batchUploadAudio(inputDir, items)
|
||||||
|
console.log(` 成功: ${Object.keys(audioUrls).length} 段音频\n`)
|
||||||
|
} catch (err) {
|
||||||
|
console.log(` OSS 上传失败,将尝试本地路径: ${err.message}\n`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// -- 添加 TTS 配音 --
|
// -- 添加 TTS 配音 --
|
||||||
step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
|
step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
|
||||||
if (voiceover === 'true' && hasTTS) {
|
if (voiceover === 'true' && hasTTS) {
|
||||||
await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
|
await addVoiceover(draftUrl, inputDir, items, timeline, audioUrls)
|
||||||
} else {
|
} else {
|
||||||
console.log(' 跳过(无 TTS 音频或未启用)')
|
console.log(' 跳过(无 TTS 音频或未启用)')
|
||||||
}
|
}
|
||||||
@@ -393,9 +409,6 @@ async function assemble(args) {
|
|||||||
console.log(` 字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
|
console.log(` 字幕风格: ${subtitleStyle.font || '默认'} ${subtitleStyle.inAnimation ? subtitleStyle.inAnimation + '→' + subtitleStyle.outAnimation : ''}`)
|
||||||
}
|
}
|
||||||
|
|
||||||
// -- 读取转场策略 --
|
|
||||||
const transitionConfig = loadTransitions(manifest)
|
|
||||||
|
|
||||||
// -- 添加字幕 --
|
// -- 添加字幕 --
|
||||||
step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
|
step++; console.log(`[${step}/${totalSteps}] 添加字幕...`)
|
||||||
if (subtitles === 'true' && items.some(i => i.script || i.text)) {
|
if (subtitles === 'true' && items.some(i => i.script || i.text)) {
|
||||||
@@ -640,15 +653,34 @@ async function uploadAudioToOSS(filePath) {
|
|||||||
async function batchUploadAudio(inputDir, items) {
|
async function batchUploadAudio(inputDir, items) {
|
||||||
const urls = {}
|
const urls = {}
|
||||||
for (const item of items) {
|
for (const item of items) {
|
||||||
|
// 上传 segments 中的每段音频
|
||||||
|
if (item.segments && item.segments.length > 1) {
|
||||||
|
for (const seg of item.segments) {
|
||||||
|
if (!seg.audio || seg.audio.startsWith('http') || urls[seg.audio]) continue
|
||||||
|
const filePath = path.isAbsolute(seg.audio)
|
||||||
|
? seg.audio
|
||||||
|
: path.resolve(inputDir, seg.audio)
|
||||||
|
if (!fs.existsSync(filePath)) {
|
||||||
|
console.error(` 音频文件不存在: ${filePath}`)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
urls[seg.audio] = await uploadAudioToOSS(filePath)
|
||||||
|
console.log(` 上传: ${path.basename(filePath)} -> OK`)
|
||||||
|
} catch (err) {
|
||||||
|
console.error(` 上传失败: ${path.basename(filePath)} - ${err.message}`)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 上传 item.audio(单段或 segments 的第一段)
|
||||||
if (!item.audio || item.audio.startsWith('http')) {
|
if (!item.audio || item.audio.startsWith('http')) {
|
||||||
if (item.audio) urls[item.audio] = item.audio
|
if (item.audio) urls[item.audio] = item.audio
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// audio 可以是相对路径或绝对路径
|
if (urls[item.audio]) continue
|
||||||
const filePath = path.isAbsolute(item.audio)
|
const filePath = path.isAbsolute(item.audio)
|
||||||
? item.audio
|
? item.audio
|
||||||
: path.resolve(inputDir, item.audio)
|
: path.resolve(inputDir, item.audio)
|
||||||
|
|
||||||
if (!fs.existsSync(filePath)) {
|
if (!fs.existsSync(filePath)) {
|
||||||
console.error(` 音频文件不存在: ${filePath}`)
|
console.error(` 音频文件不存在: ${filePath}`)
|
||||||
continue
|
continue
|
||||||
@@ -667,51 +699,54 @@ async function batchUploadAudio(inputDir, items) {
|
|||||||
// 添加 TTS 配音(每段音频按时间线排列)
|
// 添加 TTS 配音(每段音频按时间线排列)
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = true) {
|
async function addVoiceover(draftUrl, inputDir, items, timeline, audioUrls = {}) {
|
||||||
// 收集音频
|
// 收集音频
|
||||||
const audioItems = items.filter(item => item.audio)
|
const audioItems = items.filter(item => item.audio || (item.segments && item.segments.length > 0))
|
||||||
if (audioItems.length === 0) {
|
if (audioItems.length === 0) {
|
||||||
console.log(' 无 TTS 音频文件,跳过')
|
console.log(' 无 TTS 音频文件,跳过')
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
const audioInfos = []
|
const audioInfos = []
|
||||||
|
const resolveAudio = (relPath) => {
|
||||||
|
if (relPath.startsWith('http')) return relPath
|
||||||
|
if (audioUrls[relPath]) return audioUrls[relPath]
|
||||||
|
return path.isAbsolute(relPath) ? relPath : path.resolve(inputDir, relPath)
|
||||||
|
}
|
||||||
|
|
||||||
if (localAudio) {
|
for (let i = 0; i < items.length; i++) {
|
||||||
// 本地模式:直接用本地路径,不上传 OSS
|
const item = items[i]
|
||||||
for (let i = 0; i < items.length; i++) {
|
const tl = timeline[i]
|
||||||
const item = items[i]
|
const segments = item.segments && item.segments.length > 1 ? item.segments : null
|
||||||
if (!item.audio) continue
|
|
||||||
|
|
||||||
const filePath = item.audio.startsWith('http')
|
if (segments) {
|
||||||
? item.audio
|
// 多段音频:按 segment 逐段添加,使用精确时长
|
||||||
: (path.isAbsolute(item.audio) ? item.audio : path.resolve(inputDir, item.audio))
|
const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
|
||||||
|
const tlDuration = tl.end - tl.start
|
||||||
|
let currentTime = tl.start
|
||||||
|
|
||||||
if (!item.audio.startsWith('http') && !fs.existsSync(filePath)) {
|
for (let j = 0; j < segments.length; j++) {
|
||||||
console.error(` 音频文件不存在: ${filePath}`)
|
const seg = segments[j]
|
||||||
continue
|
const segDurUs = Math.round(seg.duration * US)
|
||||||
|
let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
|
||||||
|
if (j === segments.length - 1) duration = tl.end - currentTime
|
||||||
|
duration = Math.max(duration, 100000)
|
||||||
|
|
||||||
|
const audioUrl = resolveAudio(seg.audio)
|
||||||
|
|
||||||
|
audioInfos.push({
|
||||||
|
audio_url: audioUrl,
|
||||||
|
start: currentTime,
|
||||||
|
end: currentTime + duration,
|
||||||
|
duration,
|
||||||
|
volume: 1.0,
|
||||||
|
})
|
||||||
|
currentTime += duration
|
||||||
}
|
}
|
||||||
|
} else if (item.audio) {
|
||||||
|
// 单段音频
|
||||||
|
const audioUrl = resolveAudio(item.audio)
|
||||||
|
|
||||||
const tl = timeline[i]
|
|
||||||
audioInfos.push({
|
|
||||||
audio_url: filePath,
|
|
||||||
start: tl.start,
|
|
||||||
end: tl.end,
|
|
||||||
duration: tl.duration,
|
|
||||||
volume: 1.0,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// 上传模式:先传 OSS 再用 URL
|
|
||||||
const audioUrls = await batchUploadAudio(inputDir, items)
|
|
||||||
for (let i = 0; i < items.length; i++) {
|
|
||||||
const item = items[i]
|
|
||||||
if (!item.audio) continue
|
|
||||||
|
|
||||||
const audioUrl = audioUrls[item.audio]
|
|
||||||
if (!audioUrl) continue
|
|
||||||
|
|
||||||
const tl = timeline[i]
|
|
||||||
audioInfos.push({
|
audioInfos.push({
|
||||||
audio_url: audioUrl,
|
audio_url: audioUrl,
|
||||||
start: tl.start,
|
start: tl.start,
|
||||||
@@ -731,7 +766,8 @@ async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = tr
|
|||||||
draft_url: draftUrl,
|
draft_url: draftUrl,
|
||||||
audio_infos: JSON.stringify(audioInfos),
|
audio_infos: JSON.stringify(audioInfos),
|
||||||
})
|
})
|
||||||
console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${localAudio ? '本地路径' : 'OSS'})`)
|
const ossCount = audioInfos.filter(a => a.audio_url.startsWith('http')).length
|
||||||
|
console.log(` 已添加 ${audioInfos.length} 段 TTS 配音 (${ossCount > 0 ? `${ossCount} 段 OSS + ` : ''}${audioInfos.length - ossCount} 段本地)`)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -793,40 +829,6 @@ function loadTransitions(manifest) {
|
|||||||
// 添加字幕(支持关键词高亮 + 账号字幕风格 + 分句切分)
|
// 添加字幕(支持关键词高亮 + 账号字幕风格 + 分句切分)
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
/**
|
|
||||||
* 按标点符号切分文本为短句(去除所有标点符号)
|
|
||||||
*/
|
|
||||||
function splitTextIntoSentences(text) {
|
|
||||||
const sentenceEnders = /[。!?;]/
|
|
||||||
const clauseEnders = /[,:]/
|
|
||||||
|
|
||||||
const sentences = []
|
|
||||||
let current = ''
|
|
||||||
let chars = text.split('')
|
|
||||||
|
|
||||||
for (let i = 0; i < chars.length; i++) {
|
|
||||||
const char = chars[i]
|
|
||||||
current += char
|
|
||||||
|
|
||||||
if (sentenceEnders.test(char)) {
|
|
||||||
// 切分并去掉所有标点
|
|
||||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
|
||||||
current = ''
|
|
||||||
} else if (clauseEnders.test(char) && current.length > 8) {
|
|
||||||
// 切分并去掉所有标点
|
|
||||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
|
||||||
current = ''
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 处理剩余文本
|
|
||||||
if (current.trim()) {
|
|
||||||
sentences.push(current.trim().replace(/[。!?;,:、]/g, ''))
|
|
||||||
}
|
|
||||||
|
|
||||||
return sentences
|
|
||||||
}
|
|
||||||
|
|
||||||
async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
|
async function addSubtitles(draftUrl, items, timeline, style = {}, split = false) {
|
||||||
const captions = []
|
const captions = []
|
||||||
|
|
||||||
@@ -844,45 +846,76 @@ async function addSubtitles(draftUrl, items, timeline, style = {}, split = false
|
|||||||
const tl = timeline[i]
|
const tl = timeline[i]
|
||||||
|
|
||||||
if (split) {
|
if (split) {
|
||||||
// 分句模式:切分长文本
|
// 分句模式:优先用 segments(TTS 逐句生成的精确时长),回退到字数估算
|
||||||
const sentences = splitTextIntoSentences(text)
|
const segments = item.segments && item.segments.length > 1 ? item.segments : null
|
||||||
if (sentences.length === 0) continue
|
|
||||||
|
|
||||||
const totalDuration = tl.end - tl.start
|
if (segments) {
|
||||||
|
// 精确模式:用 segments 的实际音频时长
|
||||||
|
const totalSegDur = segments.reduce((sum, s) => sum + s.duration * US, 0)
|
||||||
|
const tlDuration = tl.end - tl.start
|
||||||
|
let currentTime = tl.start
|
||||||
|
|
||||||
// 按字数权重分配时间(改进版)
|
segments.forEach((seg, idx) => {
|
||||||
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
|
const segDurUs = Math.round(seg.duration * US)
|
||||||
let currentTime = tl.start
|
// 按实际时长占比映射到时间线(处理 ffprobe 重新测量的差异)
|
||||||
|
let duration = Math.round(tlDuration * (segDurUs / totalSegDur))
|
||||||
|
if (idx === segments.length - 1) {
|
||||||
|
duration = tl.end - currentTime
|
||||||
|
}
|
||||||
|
duration = Math.max(duration, 1000000)
|
||||||
|
|
||||||
sentences.forEach((sentence, idx) => {
|
const cap = {
|
||||||
// 按字数比例计算时长
|
start: currentTime,
|
||||||
const charRatio = sentence.length / totalChars
|
end: currentTime + duration,
|
||||||
let duration = Math.round(totalDuration * charRatio)
|
text: seg.text,
|
||||||
|
keyword: '',
|
||||||
|
keyword_color: '',
|
||||||
|
}
|
||||||
|
|
||||||
// 最后一句使用剩余全部时间(避免精度误差)
|
if (inAnimation) cap.in_animation = inAnimation
|
||||||
if (idx === sentences.length - 1) {
|
if (outAnimation) cap.out_animation = outAnimation
|
||||||
duration = tl.end - currentTime
|
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
|
||||||
}
|
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
|
||||||
|
|
||||||
// 最小1秒,避免太短
|
captions.push(cap)
|
||||||
duration = Math.max(duration, 1000000) // 1秒 = 1000000微秒
|
currentTime += duration
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
// 回退:字数权重估算
|
||||||
|
const sentences = splitTextIntoSentences(text)
|
||||||
|
if (sentences.length === 0) continue
|
||||||
|
|
||||||
const cap = {
|
const totalDuration = tl.end - tl.start
|
||||||
start: currentTime,
|
const totalChars = sentences.reduce((sum, s) => sum + s.length, 0)
|
||||||
end: currentTime + duration,
|
let currentTime = tl.start
|
||||||
text: sentence,
|
|
||||||
keyword: '',
|
|
||||||
keyword_color: '',
|
|
||||||
}
|
|
||||||
|
|
||||||
if (inAnimation) cap.in_animation = inAnimation
|
sentences.forEach((sentence, idx) => {
|
||||||
if (outAnimation) cap.out_animation = outAnimation
|
const charRatio = sentence.length / totalChars
|
||||||
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
|
let duration = Math.round(totalDuration * charRatio)
|
||||||
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
|
|
||||||
|
|
||||||
captions.push(cap)
|
if (idx === sentences.length - 1) {
|
||||||
currentTime += duration
|
duration = tl.end - currentTime
|
||||||
})
|
}
|
||||||
|
|
||||||
|
duration = Math.max(duration, 1000000)
|
||||||
|
|
||||||
|
const cap = {
|
||||||
|
start: currentTime,
|
||||||
|
end: currentTime + duration,
|
||||||
|
text: sentence,
|
||||||
|
keyword: '',
|
||||||
|
keyword_color: '',
|
||||||
|
}
|
||||||
|
|
||||||
|
if (inAnimation) cap.in_animation = inAnimation
|
||||||
|
if (outAnimation) cap.out_animation = outAnimation
|
||||||
|
if (inAnimDuration) cap.in_animation_duration = inAnimDuration
|
||||||
|
if (outAnimDuration) cap.out_animation_duration = outAnimDuration
|
||||||
|
|
||||||
|
captions.push(cap)
|
||||||
|
currentTime += duration
|
||||||
|
})
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
// 原始模式:一句字幕
|
// 原始模式:一句字幕
|
||||||
const keyword = ''
|
const keyword = ''
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
/**
|
/**
|
||||||
* Phase: tts — 语音合成
|
* Phase: tts — 语音合成(逐句分句生成)
|
||||||
*
|
*
|
||||||
* 使用通义千问 TTS 生成旁白音频
|
* 将每个 item 的 script 按标点切分为短句,每句单独生成 TTS 音频。
|
||||||
|
* 结果写入 item.segments[],实现字幕与语音精确对齐。
|
||||||
*/
|
*/
|
||||||
|
|
||||||
const path = require('path')
|
const path = require('path')
|
||||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
const { saveManifest, ensureDir, log, getManifestDir, splitTextIntoSentences } = require('./pipeline-utils')
|
||||||
|
|
||||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||||
const dir = getManifestDir(manifestPath)
|
const dir = getManifestDir(manifestPath)
|
||||||
@@ -24,17 +25,51 @@ async function phaseTts(manifest, manifestPath, options = {}) {
|
|||||||
for (let i = 0; i < items.length; i++) {
|
for (let i = 0; i < items.length; i++) {
|
||||||
const item = items[i]
|
const item = items[i]
|
||||||
const idx = i + 1
|
const idx = i + 1
|
||||||
|
const fullText = item.script || item.text
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { filePath, duration } = await synthesize(item.script || item.text, {
|
const sentences = splitTextIntoSentences(fullText)
|
||||||
outputDir: audioDir,
|
|
||||||
id: item.id || idx,
|
if (sentences.length <= 1) {
|
||||||
voice: manifest.ttsVoice || undefined,
|
// 单句:不需要 segments,走原逻辑
|
||||||
instruction: manifest.ttsInstruction || undefined,
|
const { filePath, duration } = await synthesize(fullText, {
|
||||||
rate: manifest.ttsRate || undefined,
|
outputDir: audioDir,
|
||||||
})
|
id: item.id || idx,
|
||||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
voice: manifest.ttsVoice || undefined,
|
||||||
item.audioDuration = Math.round(duration * 1000) / 1000
|
instruction: manifest.ttsInstruction || undefined,
|
||||||
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${(item.script || item.text).substring(0, 30)}...`)
|
rate: manifest.ttsRate || undefined,
|
||||||
|
})
|
||||||
|
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||||
|
item.audioDuration = Math.round(duration * 1000) / 1000
|
||||||
|
log('tts', `[${idx}/${items.length}] ${duration.toFixed(1)}s: ${fullText.substring(0, 30)}...`)
|
||||||
|
} else {
|
||||||
|
// 多句:逐句生成,写入 segments
|
||||||
|
const segments = []
|
||||||
|
let totalDuration = 0
|
||||||
|
|
||||||
|
for (let j = 0; j < sentences.length; j++) {
|
||||||
|
const sentence = sentences[j]
|
||||||
|
const segId = `${item.id || idx}_${j + 1}`
|
||||||
|
const { filePath, duration } = await synthesize(sentence, {
|
||||||
|
outputDir: audioDir,
|
||||||
|
id: segId,
|
||||||
|
voice: manifest.ttsVoice || undefined,
|
||||||
|
instruction: manifest.ttsInstruction || undefined,
|
||||||
|
rate: manifest.ttsRate || undefined,
|
||||||
|
})
|
||||||
|
segments.push({
|
||||||
|
text: sentence,
|
||||||
|
audio: path.relative(dir, filePath).replace(/\\/g, '/'),
|
||||||
|
duration: Math.round(duration * 1000) / 1000,
|
||||||
|
})
|
||||||
|
totalDuration += duration
|
||||||
|
}
|
||||||
|
|
||||||
|
item.segments = segments
|
||||||
|
item.audio = segments[0].audio
|
||||||
|
item.audioDuration = Math.round(totalDuration * 1000) / 1000
|
||||||
|
log('tts', `[${idx}/${items.length}] ${totalDuration.toFixed(1)}s (${segments.length}句): ${fullText.substring(0, 30)}...`)
|
||||||
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
item.status = 'failed'
|
item.status = 'failed'
|
||||||
item.error = `TTS失败: ${err.message}`
|
item.error = `TTS失败: ${err.message}`
|
||||||
|
|||||||
@@ -160,6 +160,36 @@ function getManifestDir(manifestPath) {
|
|||||||
return path.dirname(path.resolve(manifestPath))
|
return path.dirname(path.resolve(manifestPath))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ============================================================================
|
||||||
|
// 文本切分
|
||||||
|
// ============================================================================
|
||||||
|
|
||||||
|
/**
 * Split a piece of narration text into short sentences and strip punctuation.
 *
 * The text is scanned character by character (by code point, so surrogate
 * pairs are never cut in half). A fragment is closed when:
 *   - a sentence ender from `sentenceEnders` is reached, or
 *   - a clause ender from `clauseEnders` is reached AND the fragment collected
 *     so far (including the ender itself) is longer than 8 UTF-16 units, so
 *     very short clauses stay attached to the text that follows them.
 *
 * Every character in `punctuation` is removed from each fragment and the
 * result is trimmed. Fragments that end up empty (for example consecutive
 * punctuation marks, or whitespace only) are dropped, so callers never
 * receive an empty sentence.
 *
 * @param {string} text - Text to split. Non-string or empty input yields [].
 * @returns {string[]} Non-empty, punctuation-free sentences in source order.
 */
function splitTextIntoSentences(text) {
  // Callers pass `item.script || item.text`, which may be undefined.
  if (typeof text !== 'string' || text.length === 0) return []

  const sentenceEnders = /[。!?;]/
  const clauseEnders = /[,:]/
  const punctuation = /[。!?;,:、]/g
  const minClauseLength = 8

  const sentences = []
  let current = ''

  // Close the fragment being collected; keep it only if text remains
  // after punctuation and surrounding whitespace are removed.
  const flush = () => {
    const cleaned = current.replace(punctuation, '').trim()
    if (cleaned) sentences.push(cleaned)
    current = ''
  }

  for (const char of text) {
    current += char

    const endsSentence = sentenceEnders.test(char)
    const endsLongClause = clauseEnders.test(char) && current.length > minClauseLength
    if (endsSentence || endsLongClause) flush()
  }

  // Trailing text without a closing punctuation mark.
  flush()

  return sentences
}
|
||||||
|
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
// Exports
|
// Exports
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
@@ -178,6 +208,7 @@ module.exports = {
|
|||||||
ensureDir,
|
ensureDir,
|
||||||
slugify,
|
slugify,
|
||||||
renameGeneratedFile,
|
renameGeneratedFile,
|
||||||
|
splitTextIntoSentences,
|
||||||
log,
|
log,
|
||||||
getManifestDir,
|
getManifestDir,
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user