#!/usr/bin/env node
/**
 * CosyVoice TTS batch speech-synthesis script.
 * Calls the Aliyun DashScope CosyVoice API over WebSocket.
 *
 * Input JSON file format:
 * {
 *   "segments": [
 *     {"id": 1, "text": "first segment"},
 *     {"id": 2, "text": "second segment"}
 *   ],
 *   "voice": "longanyang",   // optional, overrides config
 *   "output_dir": "./audio"  // optional, default ./audio
 * }
 *
 * Output JSON (stdout):
 * {
 *   "segments": [
 *     {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
 *   ]
 * }
 *
 * Can also be used as a module:
 *   const { synthesize } = require('./qwen-tts')
 *   const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
 *
 * NOTE(review): relies on the global WebSocket implementation (stable in
 * Node.js >= 22); a clear error is raised when it is unavailable.
 */

'use strict'

const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')

const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')

/**
 * Load and parse config.json (two directories above this script).
 * @returns {object} parsed configuration
 * @throws {Error} if the file is missing or contains invalid JSON
 */
function loadConfig() {
  if (!fs.existsSync(CONFIG_PATH)) throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf-8'))
}

/**
 * Measure the duration (seconds) of an audio file via ffprobe.
 * Falls back to a size-based estimate (assumes ~32 kbps audio) when
 * ffprobe is unavailable, fails, or prints nothing usable.
 * @param {string} filePath - path to the audio file
 * @returns {number} duration in seconds
 */
function getAudioDuration(filePath) {
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet',
      '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1',
      filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const duration = parseFloat(out.trim())
    // ffprobe can exit 0 yet print nothing; only trust a finite number.
    if (Number.isFinite(duration)) return duration
  } catch {
    // ffprobe missing or failed — use the estimate below.
  }
  const stat = fs.statSync(filePath)
  return stat.size * 8 / 32000
}

/**
 * Synthesize a single text segment over the CosyVoice WebSocket protocol:
 * run-task → task-started → continue-task + finish-task → binary audio
 * frames → task-finished.
 * @param {string} text - text to synthesize (sentence-final punctuation is appended if missing)
 * @param {object} options - { apiKey, voice, model, outputDir, id, instruction }
 * @returns {Promise<{filePath: string, duration: number}>} absolute path of the
 *   written MP3 file and its duration in seconds
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()
    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
    // Global WebSocket is only available on recent Node.js; fail clearly
    // instead of throwing an opaque ReferenceError.
    if (typeof WebSocket !== 'function') {
      reject(new Error('global WebSocket 不可用 (需要 Node.js >= 22)'))
      return
    }

    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'
    fs.mkdirSync(outputDir, { recursive: true })

    // Ensure the text ends with sentence-final punctuation so synthesis
    // does not cut off abruptly.
    text = text.trimEnd()
    if (!/[。!?.!?…]$/.test(text)) text += '。'

    const id = options.id ?? 1 // ?? so an explicit id of 0 is preserved
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)

    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    // Receive audio frames as ArrayBuffer so they are collected
    // synchronously. The default Blob type forces an async conversion
    // that can race with 'task-finished' and truncate the output.
    ws.binaryType = 'arraybuffer'

    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []
    let settled = false

    // Single rejection path: clears the timer and closes the socket.
    const fail = (err) => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      try { ws.close() } catch { /* already closed */ }
      reject(err)
    }

    const timer = setTimeout(() => fail(new Error('TTS 超时 (60s)')), 60000)

    ws.addEventListener('open', () => {
      // Step 1: run-task — declares the task parameters; input stays empty.
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action: 'run-task',
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: 1.0,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        },
      }))
    })

    ws.addEventListener('message', (event) => {
      if (typeof event.data !== 'string') {
        // Binary frame = a chunk of MP3 audio (ArrayBuffer, see binaryType).
        chunks.push(Buffer.from(event.data))
        return
      }
      let msg
      try {
        msg = JSON.parse(event.data)
      } catch {
        return // ignore non-JSON text frames
      }
      const evt = msg.header?.event
      if (evt === 'task-started') {
        // Step 2: continue-task — send the actual text.
        ws.send(JSON.stringify({
          header: {
            task_id: taskId,
            action: 'continue-task',
            streaming: 'duplex',
          },
          payload: {
            task_group: 'audio',
            task: 'tts',
            function: 'SpeechSynthesizer',
            model,
            input: { text },
          },
        }))
        // Step 3: finish-task — no more text will follow.
        ws.send(JSON.stringify({
          header: {
            task_id: taskId,
            action: 'finish-task',
            streaming: 'duplex',
          },
          payload: {
            task_group: 'audio',
            task: 'tts',
            function: 'SpeechSynthesizer',
            input: {},
          },
        }))
      } else if (evt === 'task-finished') {
        if (settled) return
        settled = true
        clearTimeout(timer)
        ws.close()
        const audio = Buffer.concat(chunks)
        if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
        fs.writeFileSync(filePath, audio)
        resolve({ filePath, duration: getAudioDuration(filePath) })
      } else if (evt === 'task-failed') {
        fail(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
      }
    })

    ws.addEventListener('error', (e) => {
      fail(new Error(`WebSocket 错误: ${e.message || '连接失败'}`))
    })

    // Connection dropped without a terminal event — reject right away
    // instead of waiting for the 60s timeout. No-op once settled.
    ws.addEventListener('close', () => {
      fail(new Error('WebSocket 连接已关闭,任务未完成'))
    })
  })
}

/**
 * Synthesize segments sequentially. A failed segment is recorded with an
 * `error` field rather than aborting the batch; a 500 ms pause between
 * segments keeps the request rate low.
 * @param {Array<{id: number|string, text: string}>} segments
 * @param {object} options - forwarded to synthesize() (voice, outputDir, ...)
 * @returns {Promise<Array<{id, text, audio, duration, error?}>>}
 */
async function synthesizeBatch(segments, options = {}) {
  const results = []
  for (const seg of segments) {
    console.error(`  合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
    try {
      const { filePath, duration } = await synthesize(seg.text, {
        ...options,
        id: seg.id,
      })
      results.push({
        id: seg.id,
        text: seg.text,
        audio: filePath,
        duration: Math.round(duration * 1000) / 1000, // 3 decimal places
      })
    } catch (err) {
      results.push({
        id: seg.id,
        text: seg.text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }
    await new Promise(r => setTimeout(r, 500))
  }
  return results
}

/**
 * CLI entry point: read the input JSON path from argv, run the batch, and
 * print the result JSON to stdout (progress goes to stderr).
 */
async function main() {
  const inputJson = process.argv[2]
  if (!inputJson) {
    console.error('用法: node qwen-tts.js <input.json>')
    console.error('')
    console.error('input.json 格式:')
    console.error(JSON.stringify({
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }, null, 2))
    process.exit(1)
  }
  const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
  const results = await synthesizeBatch(config.segments, {
    voice: config.voice,
    outputDir: config.output_dir || './audio',
  })
  process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
}

if (require.main === module) {
  main().catch(err => {
    console.error('TTS 合成失败:', err.message)
    process.exit(1)
  })
}

module.exports = { synthesize, synthesizeBatch }