Files
video-create/.claude/skills/video-from-script/scripts/qwen-tts.js
sion123 18fce1b5a1 feat(video-from-script): 添加 TTS 音色管理和解析功能
- 在 config.json 中添加 `ttsVoices` 音色库,支持音色名称到 ID 的映射
- 实现 `resolveVoice` 函数,将音色名称解析为实际 ID
- 更新账号系统和批量管道,支持通过音色名称配置 TTS 语音
- Excel 导入和 CLI 参数新增音色字段,支持按行指定不同音色
2026-05-08 23:53:37 +08:00

265 lines
7.4 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
* "segments": [
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
/**
 * Read and parse the skill-level config.json.
 * @returns {object} Parsed configuration.
 * @throws {Error} If the config file does not exist at CONFIG_PATH.
 */
function loadConfig() {
  if (!fs.existsSync(CONFIG_PATH)) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }
  const raw = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(raw)
}
/**
 * Resolve a voice name to its actual voice ID.
 * Looks the name up in the config's `ttsVoices` map; anything not found
 * there (already an ID, or an unknown name) is passed through unchanged.
 * Falsy input (undefined/null/empty string) is returned as-is.
 * @param {string|undefined} voice - Voice name or raw voice ID.
 * @param {object} config - Loaded config, may carry a `ttsVoices` map.
 * @returns {string|undefined} The mapped voice ID, or the input itself.
 */
function resolveVoice(voice, config) {
  if (!voice) return voice
  return (config.ttsVoices || {})[voice] || voice
}
/**
 * Measure the duration (seconds) of an audio file via ffprobe.
 * Falls back to a size-based estimate (assumes ~32 kbps MP3) when ffprobe
 * is unavailable, errors out, or prints a non-numeric duration.
 *
 * Fix: the original returned NaN when ffprobe exited 0 but printed "N/A"
 * or an empty string — parseFloat's result was never validated. Both
 * failure paths now go through the same estimator.
 *
 * @param {string} filePath - Path to the audio file (must exist).
 * @returns {number} Duration in seconds (exact or estimated).
 */
function getAudioDuration(filePath) {
  // Fallback estimate: bits / bitrate. Rough, but good enough for pacing.
  const estimateFromSize = () => {
    const stat = fs.statSync(filePath)
    return stat.size * 8 / 32000
  }
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const duration = parseFloat(out.trim())
    // ffprobe can exit 0 yet print "N/A" or nothing; never return NaN.
    return Number.isFinite(duration) ? duration : estimateFromSize()
  } catch {
    return estimateFromSize()
  }
}
/**
 * Synthesize one text segment to an MP3 file via the CosyVoice WebSocket
 * API (DashScope duplex inference endpoint).
 *
 * Protocol (visible below): open -> send `run-task` with empty input ->
 * server replies `task-started` -> send `continue-task` carrying the text,
 * then `finish-task` -> binary frames accumulate as audio -> `task-finished`
 * writes the file and resolves.
 *
 * @param {string} text - Segment text; trailing punctuation is appended if missing.
 * @param {object} options - { voice, model, outputDir, id, instruction, apiKey, rate }
 * @returns {Promise<{filePath: string, duration: number}>} Absolute MP3 path and duration in seconds.
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()
    // options take precedence over config.json for every setting.
    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = resolveVoice(options.voice || config.ttsVoice, config) || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'
    fs.mkdirSync(outputDir, { recursive: true })
    // Ensure the text ends in sentence punctuation (CJK or ASCII) so the
    // synthesized audio does not end on an unterminated clause.
    text = text.trimEnd()
    if (!/[。!?;,.!?…]$/.test(text)) text += '。'
    const id = options.id || 1
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)
    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    // NOTE(review): relies on a global WebSocket that accepts an options
    // object with `headers` — confirm the target Node version provides
    // this (no `ws` package is required/imported here).
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []   // binary audio frames, concatenated on task-finished
    let settled = false // guards against double resolve/reject
    // Hard 60s deadline for the whole session.
    const timer = setTimeout(() => {
      if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
    }, 60000)
    ws.addEventListener('open', () => {
      // Step 1: run-task — empty input, no text
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action: 'run-task',
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: options.rate || 1.15,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            // `instruction` is only included when non-empty.
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        },
      }))
    })
    ws.addEventListener('message', async (event) => {
      // Non-string frames are audio data; buffer them until task-finished.
      if (typeof event.data !== 'string') {
        const buf = event.data instanceof Blob
          ? Buffer.from(await event.data.arrayBuffer())
          : Buffer.from(event.data)
        chunks.push(buf)
        return
      }
      try {
        const msg = JSON.parse(event.data)
        const evt = msg.header?.event
        if (evt === 'task-started') {
          // Step 2: continue-task — send text
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'continue-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              model,
              input: { text },
            },
          }))
          // Step 3: finish-task
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'finish-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              input: {},
            },
          }))
        } else if (evt === 'task-finished') {
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          const audio = Buffer.concat(chunks)
          if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
          fs.writeFileSync(filePath, audio)
          resolve({ filePath, duration: getAudioDuration(filePath) })
        } else if (evt === 'task-failed') {
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
      } catch {} // malformed/unrecognized control frames are ignored; the 60s timer backstops a stalled session
    })
    ws.addEventListener('error', (e) => {
      clearTimeout(timer)
      if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
    })
  })
}
/**
 * Synthesize a list of segments sequentially, one session per segment.
 * A failed segment is recorded with an `error` message and an empty
 * `audio` path instead of aborting the whole batch.
 * @param {Array<{id: number, text: string}>} segments
 * @param {object} options - Forwarded to synthesize() (voice, outputDir, ...).
 * @returns {Promise<Array<{id, text, audio, duration, error?}>>}
 */
async function synthesizeBatch(segments, options = {}) {
  const results = []
  for (const segment of segments) {
    console.error(` 合成 #${segment.id}: ${segment.text.substring(0, 30)}...`)
    let entry
    try {
      const { filePath, duration } = await synthesize(segment.text, {
        ...options,
        id: segment.id,
      })
      entry = {
        id: segment.id,
        text: segment.text,
        audio: filePath,
        // Round to millisecond precision for stable JSON output.
        duration: Math.round(duration * 1000) / 1000,
      }
    } catch (err) {
      entry = { id: segment.id, text: segment.text, audio: '', duration: 0, error: err.message }
    }
    results.push(entry)
    // Brief pause between sessions to avoid hammering the API.
    await new Promise((done) => setTimeout(done, 500))
  }
  return results
}
// CLI entry point: reads a job-description JSON from argv[2], runs the
// batch, and writes the result JSON to stdout (progress goes to stderr).
async function main() {
  const inputJson = process.argv[2]
  if (!inputJson) {
    const example = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }
    console.error('用法: node qwen-tts.js <input.json>')
    console.error('')
    console.error('input.json 格式:')
    console.error(JSON.stringify(example, null, 2))
    process.exit(1)
  }
  const job = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
  const results = await synthesizeBatch(job.segments, {
    voice: job.voice,
    outputDir: job.output_dir || './audio',
  })
  process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
}
if (require.main === module) {
  main().catch((err) => {
    console.error('TTS 合成失败:', err.message)
    process.exit(1)
  })
}
module.exports = { synthesize, synthesizeBatch }