Files
video-create/.claude/skills/video-from-script/scripts/qwen-tts.js
sion123 b309f54430 feat(capcut): 优化音频/字幕添加策略并重构语音切分逻辑
- 音频和字幕 API 调用改为先批量添加,批量失败时逐个兜底
- 重写 `splitIntoAudioSegments`,基于原始标点保留切分,合并短片段
- `qwen-tts.js` 补充中文逗号作为句末标点判断
2026-05-06 23:21:40 +08:00

256 lines
7.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
* "segments": [
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
/**
 * Read and parse the skill-level config.json located at CONFIG_PATH.
 *
 * @returns {object} The parsed JSON content of the config file.
 * @throws {Error} When the config file does not exist.
 * @throws {SyntaxError} When the file content is not valid JSON.
 */
function loadConfig() {
  const configExists = fs.existsSync(CONFIG_PATH)
  if (configExists === false) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }
  const rawJson = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(rawJson)
}
/**
 * Return the duration of an audio file in seconds.
 *
 * Asks `ffprobe` for the container duration first. When ffprobe is missing,
 * fails, times out (10 s), or prints something that is not a finite number
 * (for example `N/A`), the duration is estimated from the file size instead.
 *
 * @param {string} filePath - Path of the audio file to measure.
 * @returns {number} Duration in seconds (probed, or estimated from size).
 * @throws {Error} When the fallback is needed and the file cannot be stat'ed.
 */
function getAudioDuration(filePath) {
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const seconds = Number.parseFloat(out.trim())
    // ffprobe can exit 0 yet print an empty or non-numeric duration; never
    // hand NaN to callers (it would serialize as null in the JSON output).
    if (Number.isFinite(seconds)) return seconds
  } catch {
    // ffprobe unavailable or failed: fall through to the size-based estimate.
  }
  // bytes * 8 / 32000 is equivalent to assuming a constant 32 kbit/s stream;
  // this is only a rough estimate — NOTE(review): confirm the real bitrate.
  const { size } = fs.statSync(filePath)
  return size * 8 / 32000
}
// Characters that already count as sentence-final punctuation, written as
// escapes so the pattern survives Unicode normalization of the source file:
// ASCII . ! ? ; ,  plus U+3002 ideographic full stop, U+FF01/U+FF1F/U+FF1B/
// U+FF0C (full-width ! ? ; ,) and U+2026 horizontal ellipsis.
const TERMINAL_PUNCTUATION = /[.!?;,\u3002\uFF01\uFF1F\uFF1B\uFF0C\u2026]$/

/**
 * Trim trailing whitespace and append an ideographic full stop (U+3002)
 * unless the text already ends with sentence-final punctuation.
 *
 * @param {string} text
 * @returns {string}
 */
function ensureTerminalPunctuation(text) {
  const trimmed = text.trimEnd()
  return TERMINAL_PUNCTUATION.test(trimmed) ? trimmed : `${trimmed}\u3002`
}

/**
 * Serialize one duplex-streaming task message (run-task / continue-task /
 * finish-task). `payload` is merged after the fixed task descriptor fields.
 *
 * @param {string} taskId
 * @param {string} action
 * @param {object} payload
 * @returns {string} JSON text ready for `ws.send`.
 */
function buildTaskMessage(taskId, action, payload) {
  return JSON.stringify({
    header: { task_id: taskId, action, streaming: 'duplex' },
    payload: {
      task_group: 'audio',
      task: 'tts',
      function: 'SpeechSynthesizer',
      ...payload,
    },
  })
}

/**
 * Synthesize a single text segment over the DashScope inference WebSocket
 * and write the returned audio to `<outputDir>/seg_<id>.mp3`.
 *
 * Option values fall back to config.json (`ttsApiKey`, `ttsModel`,
 * `ttsVoice`, `ttsInstruction`) and then to built-in defaults.
 *
 * @param {string} text - Text to speak; a full stop is appended when it does
 *   not end with sentence-final punctuation.
 * @param {object} [options] - { apiKey, voice, model, outputDir, id, instruction, rate }
 * @returns {Promise<{filePath: string, duration: number}>} Absolute path of
 *   the written file and its duration in seconds.
 *   Rejects when the API key is missing, the task fails, no audio arrives,
 *   the socket errors or closes early, writing/probing the file fails, or
 *   nothing completes within 60 seconds.
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()
    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) {
      reject(new Error('ttsApiKey 未配置'))
      return
    }
    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'
    fs.mkdirSync(outputDir, { recursive: true })

    const spokenText = ensureTerminalPunctuation(text)
    const id = options.id || 1
    const filePath = path.resolve(outputDir, `seg_${String(id).padStart(3, '0')}.mp3`)
    const taskId = `tts_${Date.now()}_${id}`

    const ws = new WebSocket('wss://dashscope.aliyuncs.com/api-ws/v1/inference', {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    // Ask for ArrayBuffer frames so audio chunks are available synchronously
    // and in arrival order (the default 'blob' type needs an async read).
    ws.binaryType = 'arraybuffer'

    // Each entry is a Buffer/ArrayBuffer, or a Promise of one for Blob frames.
    const chunks = []
    let settled = false
    let finalizing = false

    // Single exit point: settle the promise once, stop the timer, drop the socket.
    const settle = (err, result) => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      try {
        ws.close()
      } catch {
        // The socket may already be closed or may never have opened.
      }
      if (err) reject(err)
      else resolve(result)
    }
    const timer = setTimeout(() => settle(new Error('TTS 超时 (60s)')), 60000)

    // Wait for every pending chunk, then persist the audio and measure it.
    const finalizeAudio = async () => {
      const parts = await Promise.all(chunks)
      if (settled) return undefined // e.g. timed out while chunks were decoding
      const audio = Buffer.concat(parts.map((part) => Buffer.from(part)))
      if (audio.length === 0) throw new Error('TTS 未返回音频')
      fs.writeFileSync(filePath, audio)
      return { filePath, duration: getAudioDuration(filePath) }
    }

    ws.addEventListener('open', () => {
      // Step 1: run-task — declares parameters, carries no text yet.
      try {
        ws.send(buildTaskMessage(taskId, 'run-task', {
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: options.rate || 1.15,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        }))
      } catch (err) {
        settle(err)
      }
    })

    ws.addEventListener('message', (event) => {
      const { data } = event
      if (typeof data !== 'string') {
        if (data instanceof Blob) {
          // Keep the promise in the chunk's slot so ordering is preserved.
          const pending = data.arrayBuffer()
          pending.catch(settle)
          chunks.push(pending)
        } else {
          chunks.push(data)
        }
        return
      }

      let msg
      try {
        msg = JSON.parse(data)
      } catch {
        return // Non-JSON text frames are ignored.
      }
      const evt = msg?.header?.event

      try {
        if (evt === 'task-started') {
          // Step 2: continue-task — send the text.
          ws.send(buildTaskMessage(taskId, 'continue-task', {
            model,
            input: { text: spokenText },
          }))
          // Step 3: finish-task — no more text will follow.
          ws.send(buildTaskMessage(taskId, 'finish-task', { input: {} }))
        } else if (evt === 'task-finished') {
          finalizing = true
          finalizeAudio().then((result) => settle(null, result), settle)
        } else if (evt === 'task-failed') {
          settle(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
      } catch (err) {
        // Surface send failures instead of leaving the promise pending.
        settle(err)
      }
    })

    ws.addEventListener('error', (e) => {
      settle(new Error(`WebSocket 错误: ${e.message || '连接失败'}`))
    })

    ws.addEventListener('close', (e) => {
      // A close while audio is being finalized is expected; otherwise the
      // task can no longer complete, so fail now rather than after 60 s.
      if (finalizing) return
      settle(new Error(`WebSocket 连接已关闭 (code ${e?.code ?? 'unknown'})`))
    })
  })
}
/**
 * Synthesize segments one at a time, in order, collecting a result per segment.
 *
 * A failing segment (including one whose `text` is missing or not a string)
 * does not abort the batch: it is reported with `audio: ''`, `duration: 0`
 * and an `error` message. Progress lines are written to stderr.
 *
 * @param {Iterable<{id: *, text: string}>} segments - Segments to synthesize.
 * @param {object} [options] - Passed through to `synthesize` (with `id` set
 *   per segment), except `delayMs`.
 * @param {number} [options.delayMs=500] - Pause between consecutive segments
 *   in milliseconds; no pause follows the last segment.
 * @returns {Promise<Array<{id: *, text: string, audio: string, duration: number, error?: string}>>}
 *   One entry per input segment; `duration` is rounded to milliseconds.
 */
async function synthesizeBatch(segments, options = {}) {
  const { delayMs = 500, ...synthOptions } = options ?? {}
  const queue = [...segments]
  const results = []
  for (const [index, seg] of queue.entries()) {
    const id = seg?.id
    const text = seg?.text
    try {
      // Validate inside the try so one malformed segment cannot lose the
      // results already collected for earlier segments.
      if (typeof text !== 'string') {
        throw new TypeError(`片段 #${id} 缺少 text 文本`)
      }
      console.error(` 合成 #${id}: ${text.substring(0, 30)}...`)
      const { filePath, duration } = await synthesize(text, { ...synthOptions, id })
      results.push({
        id,
        text,
        audio: filePath,
        duration: Math.round(duration * 1000) / 1000,
      })
    } catch (err) {
      results.push({
        id,
        text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }
    // Space out requests, but do not make the caller wait after the last one.
    const isLast = index === queue.length - 1
    if (!isLast && delayMs > 0) {
      await new Promise((resume) => setTimeout(resume, delayMs))
    }
  }
  return results
}
/**
 * CLI entry point.
 *
 * Expects the path of an input JSON file as the first argument. Without it,
 * prints usage plus a sample input to stderr and exits with status 1.
 * Otherwise synthesizes every segment in the file and writes
 * `{ segments: [...] }` as pretty-printed JSON to stdout.
 */
async function main() {
  const inputJson = process.argv[2]
  if (!inputJson) {
    const sampleInput = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }
    const usageLines = [
      '用法: node qwen-tts.js <input.json>',
      '',
      'input.json 格式:',
      JSON.stringify(sampleInput, null, 2),
    ]
    for (const line of usageLines) {
      console.error(line)
    }
    process.exit(1)
  }

  const rawInput = fs.readFileSync(inputJson, 'utf-8')
  const config = JSON.parse(rawInput)
  const segmentList = config.segments
  const batchOptions = {
    voice: config.voice,
    outputDir: config.output_dir || './audio',
  }
  const results = await synthesizeBatch(segmentList, batchOptions)

  const report = JSON.stringify({ segments: results }, null, 2)
  process.stdout.write(report + '\n')
}
// Public API for callers that require() this file as a module.
module.exports = { synthesize, synthesizeBatch }

// When this file is the process entry script, run the CLI; any rejection
// from main() is reported on stderr and turned into exit status 1.
if (require.main === module) {
  main().then(undefined, (failure) => {
    console.error('TTS 合成失败:', failure.message)
    process.exit(1)
  })
}