2026-04-29 21:04:43 +08:00
|
|
|
|
#!/usr/bin/env node
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * CosyVoice TTS batch speech-synthesis script.
 * Calls the Alibaba Cloud DashScope CosyVoice API over WebSocket.
 *
 * Input JSON file format:
 * {
 *   "segments": [
 *     {"id": 1, "text": "第一段文案"},
 *     {"id": 2, "text": "第二段文案"}
 *   ],
 *   "voice": "longanyang",      // optional, overrides config
 *   "output_dir": "./audio"     // optional, defaults to ./audio
 * }
 *
 * Output JSON (stdout); "audio" is the absolute path of the written file:
 * {
 *   "segments": [
 *     {"id": 1, "text": "...", "audio": "/abs/path/to/audio/seg_001.mp3", "duration": 3.456}
 *   ]
 * }
 *
 * Can also be used as a module:
 *   const { synthesize } = require('./qwen-tts')
 *   const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
 */
|
|
|
|
|
|
|
|
|
|
|
|
const fs = require('fs')
|
|
|
|
|
|
const path = require('path')
|
2026-05-01 00:44:18 +08:00
|
|
|
|
const { execFileSync } = require('child_process')
|
2026-04-29 21:04:43 +08:00
|
|
|
|
|
|
|
|
|
|
// Path of config.json, located two directories above the directory containing this script.
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Read and parse the project's config.json.
 *
 * @returns {object} The parsed configuration object.
 * @throws {Error} When the file at CONFIG_PATH does not exist.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
function loadConfig() {
  const configExists = fs.existsSync(CONFIG_PATH)
  if (!configExists) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }

  const rawJson = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(rawJson)
}
|
|
|
|
|
|
|
2026-05-01 00:44:18 +08:00
|
|
|
|
/**
 * Return the duration of an audio file in seconds.
 *
 * The duration reported by `ffprobe` is preferred. When ffprobe is unavailable,
 * fails, times out (10 s), or prints something that is not a finite number,
 * the duration is estimated from the file size instead.
 *
 * @param {string} filePath - Path of the audio file to inspect.
 * @returns {number} Duration in seconds (measured, or estimated from file size).
 * @throws {Error} When the fallback is needed and the file cannot be stat'ed.
 */
function getAudioDuration(filePath) {
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })

    const seconds = Number.parseFloat(out.trim())
    // A successful exit does not guarantee a numeric value (the output may be
    // empty or a placeholder); never hand NaN back to the caller.
    if (Number.isFinite(seconds)) return seconds
  } catch {
    // Deliberate best effort: any ffprobe failure falls through to the estimate.
  }

  // Size-based estimate: bits divided by 32000, i.e. assumes a 32 kbit/s stream.
  return fs.statSync(filePath).size * 8 / 32000
}
|
|
|
|
|
|
|
2026-04-29 21:04:43 +08:00
|
|
|
|
/**
 * Synthesize one text segment to an MP3 file through the CosyVoice WebSocket API.
 *
 * Message flow driven by this function: `run-task` is sent when the socket
 * opens; on `task-started` the text is sent with `continue-task`, immediately
 * followed by `finish-task`; binary frames received meanwhile are collected as
 * audio and written to disk when `task-finished` arrives.
 *
 * Relies on a global `WebSocket` constructor that accepts a `headers` option.
 * NOTE(review): presumably Node's built-in client — confirm the minimum Node version.
 *
 * @param {string} text - Text to speak. Trailing whitespace is removed and "。"
 *   is appended when the text does not already end with sentence punctuation.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Overrides `ttsApiKey` from config.json.
 * @param {string} [options.model] - Overrides `ttsModel` (default 'cosyvoice-v3-flash').
 * @param {string} [options.voice] - Overrides `ttsVoice` (default 'longanyang').
 * @param {string} [options.instruction] - Overrides `ttsInstruction`; omitted from the request when empty.
 * @param {string} [options.outputDir='./audio'] - Directory for the MP3 (created if missing).
 * @param {number|string} [options.id=1] - Used for the file name `seg_<id padded to 3>.mp3` and the task id.
 * @returns {Promise<{filePath: string, duration: number}>} Absolute path of the
 *   written file and its duration in seconds. Rejects when the API key is
 *   missing, the task fails, no audio is returned, the socket errors or closes
 *   early, writing the file fails, or nothing completes within 60 seconds.
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()

    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) {
      reject(new Error('ttsApiKey 未配置'))
      return
    }

    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'

    fs.mkdirSync(outputDir, { recursive: true })

    // Make sure the text ends a sentence; both full-width and ASCII marks count.
    let spokenText = text.trimEnd()
    if (!/[。!?.!?…]$/.test(spokenText)) spokenText += '。'

    // `??` keeps an explicit id of 0 instead of silently turning it into 1.
    const id = options.id ?? 1
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)

    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    // Request ArrayBuffer frames so audio chunks can be appended synchronously,
    // in arrival order, before any later control message is handled.
    ws.binaryType = 'arraybuffer'

    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []
    // Tracks asynchronous Blob conversions (only used if a client still delivers Blobs).
    let pendingChunks = Promise.resolve()
    let settled = false
    let finalizing = false

    // Single exit point: settles the promise once, stops the timer, closes the socket.
    const settle = (error, result) => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      ws.close()
      if (error) reject(error)
      else resolve(result)
    }

    const timer = setTimeout(() => settle(new Error('TTS 超时 (60s)')), 60000)

    // Every protocol message shares the same header shape and payload prefix.
    const sendAction = (action, payload) => {
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action,
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          ...payload,
        },
      }))
    }

    ws.addEventListener('open', () => {
      try {
        // Step 1: run-task — synthesis parameters only, empty input, no text yet.
        sendAction('run-task', {
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: 1.0,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        })
      } catch (err) {
        settle(err)
      }
    })

    ws.addEventListener('message', (event) => {
      if (settled) return
      const { data } = event

      if (typeof data !== 'string') {
        if (typeof Blob !== 'undefined' && data instanceof Blob) {
          // Reserve the slot now so the chunk order matches the arrival order
          // even though the conversion completes later.
          const slot = chunks.push(null) - 1
          pendingChunks = Promise.all([pendingChunks, data.arrayBuffer()])
            .then(([, bytes]) => { chunks[slot] = Buffer.from(bytes) })
          pendingChunks.catch(settle)
        } else {
          chunks.push(Buffer.from(data))
        }
        return
      }

      let msg
      try {
        msg = JSON.parse(data)
      } catch {
        // Text frames that are not JSON carry no protocol event; ignore them.
        return
      }

      try {
        const evt = msg.header?.event

        if (evt === 'task-started') {
          // Step 2: continue-task — send the text.
          sendAction('continue-task', { model, input: { text: spokenText } })
          // Step 3: finish-task — no more text will follow.
          sendAction('finish-task', { input: {} })
        } else if (evt === 'task-finished') {
          finalizing = true
          // Wait for any in-flight chunk conversions before assembling the file.
          pendingChunks
            .then(() => {
              if (settled) return undefined
              const audio = Buffer.concat(chunks)
              if (audio.length === 0) throw new Error('TTS 未返回音频')
              fs.writeFileSync(filePath, audio)
              return { filePath, duration: getAudioDuration(filePath) }
            })
            .then((result) => settle(null, result), settle)
        } else if (evt === 'task-failed') {
          settle(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
      } catch (err) {
        // A failure while reacting to an event must reject, not leave the promise pending.
        settle(err)
      }
    })

    ws.addEventListener('error', (e) => {
      settle(new Error(`WebSocket 错误: ${e.message || '连接失败'}`))
    })

    ws.addEventListener('close', () => {
      // A close before task-finished means no result will ever arrive; fail now
      // instead of waiting for the 60 s timeout.
      if (!finalizing) settle(new Error('WebSocket 连接在任务完成前关闭'))
    })
  })
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Synthesize several segments sequentially, in the given order.
 *
 * A failure on one segment never aborts the batch: it is recorded as a result
 * with an empty `audio`, a `duration` of 0 and an `error` message. A 500 ms
 * pause is inserted between consecutive synthesis requests.
 *
 * @param {Iterable<{id: (number|string), text: string}>} segments - Segments to synthesize.
 * @param {object} [options] - Passed through to `synthesize` for every segment
 *   (the segment's own `id` always overrides `options.id`).
 * @returns {Promise<Array<{id: *, text: *, audio: string, duration: number, error?: string}>>}
 *   One result per segment, in input order; `duration` is rounded to milliseconds.
 * @throws {TypeError} When `segments` is not iterable.
 */
async function synthesizeBatch(segments, options = {}) {
  if (typeof segments?.[Symbol.iterator] !== 'function') {
    throw new TypeError('segments 必须是数组')
  }

  const results = []
  let requestSent = false

  for (const seg of segments) {
    try {
      // Validate inside the try so a malformed segment becomes a per-segment
      // error entry instead of aborting the whole batch.
      if (typeof seg?.text !== 'string') {
        throw new TypeError('segment.text 必须是字符串')
      }

      // Pause only between actual requests; there is nothing to wait for
      // before the first one or after the last one.
      if (requestSent) await new Promise((r) => setTimeout(r, 500))
      requestSent = true

      console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
      const { filePath, duration } = await synthesize(seg.text, {
        ...options,
        id: seg.id,
      })
      results.push({
        id: seg.id,
        text: seg.text,
        audio: filePath,
        duration: Math.round(duration * 1000) / 1000,
      })
    } catch (err) {
      results.push({
        id: seg?.id,
        text: seg?.text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }
  }

  return results
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * CLI entry point.
 *
 * Reads the input JSON file named by the first command-line argument,
 * synthesizes every segment, and writes `{ segments: [...] }` as pretty-printed
 * JSON to stdout. Without an argument it prints usage to stderr and exits
 * with status 1.
 *
 * @returns {Promise<void>}
 * @throws {Error} When the input file cannot be read or parsed, or when it has
 *   no `segments` array.
 */
async function main() {
  const inputJson = process.argv[2]
  if (!inputJson) {
    console.error('用法: node qwen-tts.js <input.json>')
    console.error('')
    console.error('input.json 格式:')
    console.error(JSON.stringify({
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }, null, 2))
    process.exit(1)
  }

  let input
  try {
    input = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
  } catch (err) {
    // Name the offending file; the raw fs/JSON error alone gives little context.
    throw new Error(`无法读取输入文件 ${inputJson}: ${err.message}`, { cause: err })
  }

  // Fail early with a clear message instead of a cryptic iteration error later.
  if (!Array.isArray(input?.segments)) {
    throw new Error('输入 JSON 缺少 segments 数组')
  }

  const results = await synthesizeBatch(input.segments, {
    voice: input.voice,
    outputDir: input.output_dir || './audio',
  })

  process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
}
|
|
|
|
|
|
|
|
|
|
|
|
// Run the CLI only when this file is executed directly, not when it is required.
if (module === require.main) {
  const reportAndExit = (failure) => {
    console.error('TTS 合成失败:', failure.message)
    process.exit(1)
  }
  main().catch(reportAndExit)
}

// Public API for use as a module.
module.exports = { synthesize, synthesizeBatch }
|