feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`:改为 3 张并发生成,每个 item 完成立即写入 manifest,支持 MJ task ID 恢复
- 重写 `phase-videos`:先恢复已有 task ID 再提交新任务(并发 3),支持中断恢复
- 迁移 TTS 引擎:从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口,支持音色/语气参数透传
- 精简账号系统:移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验,`references` 改为顶层字段
- 调整 `slugify`:限制中文字符 5 个、其他 10 个,避免文件名过长
- 更新文档:`manifest-schema.md` 中 `narration` 改为完整原文案,`account-creation.md` 新增 TTS 配置项
- 配置更新:默认 TTS 模型切换为 `cosyvoice-v3.5-plus`,新增 `localAudio` 参数
This commit is contained in:
2026-05-01 00:44:18 +08:00
parent 3326f6cb37
commit 7d526d2b60
19 changed files with 888 additions and 411 deletions

View File

@@ -1,7 +1,8 @@
#!/usr/bin/env node
/**
* 阿里云 Qwen-TTS 批量语音合成脚本
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
@@ -9,26 +10,25 @@
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "Cherry", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.wav", "duration": 3.456},
* ...
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'Cherry' })
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const axios = require('axios')
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
@@ -37,102 +37,185 @@ function loadConfig() {
return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf-8'))
}
/**
* 单段语音合成(非流式)
* @param {string} text - 要合成的文本
* @param {object} options - { voice, model, language, outputDir, id }
* @returns {{ filePath: string, duration: number }}
*/
async function synthesize(text, options = {}) {
const config = loadConfig()
const apiKey = options.apiKey || config.ttsApiKey
if (!apiKey) throw new Error('ttsApiKey 未配置,请在 config.json 中设置')
const baseUrl = (options.apiBaseUrl || config.ttsApiBaseUrl || 'https://dashscope.aliyuncs.com/api/v1').replace(/\/$/, '')
const model = options.model || config.ttsModel || 'qwen-tts'
const voice = options.voice || config.ttsVoice || 'Cherry'
const language = options.language || config.ttsLanguage || 'Chinese'
const outputDir = options.outputDir || './audio'
fs.mkdirSync(outputDir, { recursive: true })
// 确保文本有句末标点,让 TTS 生成自然语调和尾部停顿
text = text.trimEnd()
if (!/[。!?.!?…]$/.test(text)) text += '。'
const url = `${baseUrl}/services/aigc/multimodal-generation/generation`
let res
function getAudioDuration(filePath) {
try {
res = await axios.post(url, {
model,
input: {
text,
voice,
language_type: language,
},
}, {
headers: {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
timeout: 60000,
const out = execFileSync('ffprobe', [
'-v', 'quiet', '-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1', filePath,
], { encoding: 'utf-8', timeout: 10000 })
return parseFloat(out.trim())
} catch {
const stat = fs.statSync(filePath)
return stat.size * 8 / 32000
}
}
/**
* 单段语音合成(CosyVoice WebSocket)
* @param {string} text
* @param {object} options - { voice, model, outputDir, id, instruction }
* @returns {Promise<{filePath: string, duration: number}>}
*/
function synthesize(text, options = {}) {
return new Promise((resolve, reject) => {
const config = loadConfig()
const apiKey = options.apiKey || config.ttsApiKey
if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
const voice = options.voice || config.ttsVoice || 'longanyang'
const instruction = options.instruction || config.ttsInstruction || ''
const outputDir = options.outputDir || './audio'
fs.mkdirSync(outputDir, { recursive: true })
text = text.trimEnd()
if (!/[。!?.!?…]$/.test(text)) text += '。'
const id = options.id || 1
const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
const filePath = path.resolve(outputDir, fileName)
const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
const ws = new WebSocket(wsUrl, {
headers: { Authorization: `bearer ${apiKey}` },
})
} catch (err) {
const detail = err.response?.data
throw new Error(`TTS API 错误: ${err.message}${detail ? ' ' + JSON.stringify(detail) : ''}`)
}
const audioUrl = res.data?.output?.audio?.url
if (!audioUrl) {
throw new Error(`TTS API 未返回音频 URL: ${JSON.stringify(res.data)}`)
}
const taskId = `tts_${Date.now()}_${id}`
const chunks = []
let settled = false
// 下载音频到本地
const id = options.id || 1
const fileName = `seg_${String(id).padStart(3, '0')}.wav`
const filePath = path.resolve(outputDir, fileName)
const timer = setTimeout(() => {
if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
}, 60000)
const audioRes = await axios.get(audioUrl, { responseType: 'arraybuffer', timeout: 30000 })
const wavBuffer = Buffer.from(audioRes.data)
ws.addEventListener('open', () => {
// Step 1: run-task — empty input, no text
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'run-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
model,
parameters: {
voice,
format: 'mp3',
sample_rate: 24000,
volume: 50,
rate: 1.0,
pitch_rate: 1.0,
text_type: 'PlainText',
...(instruction ? { instruction } : {}),
},
input: {},
},
}))
})
// 追加 0.3s 静音(句间气口)
const silenceSec = options.silencePadding !== undefined ? options.silencePadding : 0.3
const silenceBytes = Math.round(24000 * 2 * silenceSec)
const silenceBuffer = Buffer.alloc(silenceBytes, 0)
const finalBuffer = Buffer.concat([wavBuffer, silenceBuffer])
// 更新 WAV 头的文件大小
finalBuffer.writeUInt32LE(finalBuffer.length - 8, 4)
finalBuffer.writeUInt32LE(wavBuffer.length - 44 + silenceBytes, 40)
fs.writeFileSync(filePath, finalBuffer)
ws.addEventListener('message', async (event) => {
if (typeof event.data !== 'string') {
const buf = event.data instanceof Blob
? Buffer.from(await event.data.arrayBuffer())
: Buffer.from(event.data)
chunks.push(buf)
return
}
try {
const msg = JSON.parse(event.data)
const evt = msg.header?.event
const duration = (finalBuffer.length - 44) / (24000 * 2)
if (evt === 'task-started') {
// Step 2: continue-task — send text
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'continue-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
model,
input: { text },
},
}))
return { filePath, duration }
// Step 3: finish-task
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'finish-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
input: {},
},
}))
} else if (evt === 'task-finished') {
clearTimeout(timer)
ws.close()
if (settled) return
settled = true
const audio = Buffer.concat(chunks)
if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
fs.writeFileSync(filePath, audio)
resolve({ filePath, duration: getAudioDuration(filePath) })
} else if (evt === 'task-failed') {
clearTimeout(timer)
ws.close()
if (settled) return
settled = true
reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
}
} catch {}
})
ws.addEventListener('error', (e) => {
clearTimeout(timer)
if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
})
})
}
/**
* 批量语音合成
* @param {Array<{id: number, text: string}>} segments
* @param {object} options - { voice, outputDir }
* @returns {Array<{id: number, text: string, audio: string, duration: number}>}
*/
async function synthesizeBatch(segments, options = {}) {
const results = []
for (const seg of segments) {
console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
const { filePath, duration } = await synthesize(seg.text, {
...options,
id: seg.id,
})
results.push({
id: seg.id,
text: seg.text,
audio: filePath,
duration: Math.round(duration * 1000) / 1000,
})
// 间隔 0.5 秒避免限流
try {
const { filePath, duration } = await synthesize(seg.text, {
...options,
id: seg.id,
})
results.push({
id: seg.id,
text: seg.text,
audio: filePath,
duration: Math.round(duration * 1000) / 1000,
})
} catch (err) {
results.push({
id: seg.id,
text: seg.text,
audio: '',
duration: 0,
error: err.message,
})
}
await new Promise(r => setTimeout(r, 500))
}
return results
@@ -147,22 +230,19 @@ async function main() {
console.error('input.json 格式:')
console.error(JSON.stringify({
segments: [{ id: 1, text: '文案' }],
voice: 'Cherry',
voice: 'longanyang',
output_dir: './audio',
}, null, 2))
process.exit(1)
}
const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
const segments = config.segments
const options = {
const results = await synthesizeBatch(config.segments, {
voice: config.voice,
outputDir: config.output_dir || './audio',
}
})
const results = await synthesizeBatch(segments, options)
const output = { segments: results }
process.stdout.write(JSON.stringify(output, null, 2) + '\n')
process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
}
if (require.main === module) {