Files
video-create/.claude/skills/video-from-script/scripts/qwen-tts.js
sion123 f5d47ec5db feat(video-pipeline): 添加可配置的转场策略引擎和TTS语速支持
- 引入转场策略系统(`getTransition`),支持 `fixed`、`director`、`rhythm` 三种模式
- 根据账号配置文件动态读取转场配置(`loadTransitions`)
- 图片和视频轨道分别调用转场策略,替代原有的固定“闪白”转场
- 支持 `byPosition`(hook/body/keypoint/closing)和 `byDirector` 两种高级选择策略
- 图片动画支持 `loop_animation` 与 `in_animation` 解析(“缩放”、“弹入”等组合)
- TTS 合成新增 `rate` 字段(源自账号配置 `ttsRate`),默认语速调整为 1.1
- 默认动画类型从 `kenburns-zoom` 改为 `缩放`,适配中文 CapCut
2026-05-01 14:16:08 +08:00

256 lines
7.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
* "segments": [
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
/**
 * Read and parse the skill-level config file (config.json two levels up
 * from this script, see CONFIG_PATH).
 * @returns {object} parsed configuration object
 * @throws {Error} when config.json does not exist at CONFIG_PATH
 */
function loadConfig() {
  const present = fs.existsSync(CONFIG_PATH)
  if (!present) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }
  const raw = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(raw)
}
/**
 * Measure the duration of an audio file in seconds.
 * Tries ffprobe first; when ffprobe is unavailable, exits non-zero, or
 * prints a non-numeric value (e.g. "N/A" or empty output), falls back to a
 * rough bitrate-based estimate.
 * NOTE(review): the fallback assumes ~32 kbit/s audio — TODO confirm this
 * matches the actual CosyVoice mp3 bitrate.
 * @param {string} filePath - path to an existing audio file
 * @returns {number} duration in seconds (estimate when ffprobe fails)
 */
function getAudioDuration(filePath) {
  // size (bytes) * 8 bits / 32000 bps — best-effort estimate only
  const estimateFromSize = () => fs.statSync(filePath).size * 8 / 32000
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const duration = Number.parseFloat(out.trim())
    // Bug fix: ffprobe can exit 0 yet print nothing usable; the original
    // returned NaN here instead of falling back to the size estimate.
    return Number.isFinite(duration) ? duration : estimateFromSize()
  } catch {
    // ffprobe missing or probe failed — deliberate best-effort fallback.
    return estimateFromSize()
  }
}
/**
 * Single-segment speech synthesis over the CosyVoice WebSocket API
 * (DashScope duplex streaming protocol: run-task → continue-task → finish-task).
 *
 * Relies on the global WebSocket constructor (no `ws` import in this file —
 * presumably Node >= 22 / undici's built-in WebSocket; verify runtime).
 *
 * @param {string} text - text to synthesize; sentence-final punctuation is
 *   appended when missing
 * @param {object} options - { apiKey, voice, model, outputDir, id, instruction, rate }
 * @returns {Promise<{filePath: string, duration: number}>} resolved mp3 path
 *   and its duration in seconds
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()
    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
    // Fallback chain: explicit option → config.json → hard-coded default.
    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'
    fs.mkdirSync(outputDir, { recursive: true })
    // Ensure the text ends with sentence-final punctuation (CJK or ASCII),
    // otherwise append a Chinese full stop.
    text = text.trimEnd()
    if (!/[。!?.!?…]$/.test(text)) text += '。'
    const id = options.id || 1
    // Zero-padded file name, e.g. seg_001.mp3 / seg_012.mp3.
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)
    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []   // binary mp3 frames accumulated across messages
    let settled = false // guards against double resolve/reject
    // Hard 60 s cap on the whole round-trip; closes the socket on expiry.
    const timer = setTimeout(() => {
      if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
    }, 60000)
    ws.addEventListener('open', () => {
      // Step 1: run-task — declare model/voice parameters; input stays empty.
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action: 'run-task',
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            // NOTE(review): `||` makes rate 0 fall back to 1.1 — probably
            // intentional since 0 is not a meaningful speech rate, but `??`
            // would state the intent more precisely.
            rate: options.rate || 1.1,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            // Only include instruction when non-empty.
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        },
      }))
    })
    ws.addEventListener('message', async (event) => {
      // Binary frames are audio data; string frames are protocol JSON.
      if (typeof event.data !== 'string') {
        const buf = event.data instanceof Blob
          ? Buffer.from(await event.data.arrayBuffer())
          : Buffer.from(event.data)
        // NOTE(review): this handler is async — a Blob still being converted
        // when task-finished arrives could in principle be appended after the
        // concat below; confirm the server flushes audio before task-finished.
        chunks.push(buf)
        return
      }
      try {
        const msg = JSON.parse(event.data)
        const evt = msg.header?.event
        if (evt === 'task-started') {
          // Step 2: continue-task — now send the actual text.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'continue-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              model,
              input: { text },
            },
          }))
          // Step 3: finish-task — signal end of input immediately after.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'finish-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              input: {},
            },
          }))
        } else if (evt === 'task-finished') {
          // Success: write the concatenated mp3 and resolve with duration.
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          const audio = Buffer.concat(chunks)
          if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
          fs.writeFileSync(filePath, audio)
          resolve({ filePath, duration: getAudioDuration(filePath) })
        } else if (evt === 'task-failed') {
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
        // Other events (e.g. result-generated) are intentionally ignored.
      } catch {}
    })
    ws.addEventListener('error', (e) => {
      clearTimeout(timer)
      if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
    })
  })
}
/**
 * Batch speech synthesis: process segments strictly sequentially.
 * A failed segment is recorded as { audio: '', duration: 0, error } instead
 * of aborting the whole batch, so callers always get one result per segment.
 * @param {Array<{id: number|string, text: string}>} segments
 * @param {object} options - forwarded to synthesize() (voice, outputDir, ...)
 * @returns {Promise<Array<{id, text, audio, duration, error?}>>}
 */
async function synthesizeBatch(segments, options = {}) {
  const results = []
  for (let i = 0; i < segments.length; i++) {
    const seg = segments[i]
    console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
    try {
      const { filePath, duration } = await synthesize(seg.text, {
        ...options,
        id: seg.id,
      })
      results.push({
        id: seg.id,
        text: seg.text,
        audio: filePath,
        // round to milliseconds
        duration: Math.round(duration * 1000) / 1000,
      })
    } catch (err) {
      // Record the failure and keep going with the remaining segments.
      results.push({
        id: seg.id,
        text: seg.text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }
    // Throttle between API requests. Bug fix: the original also slept 500 ms
    // after the LAST segment, delaying every batch for no reason.
    if (i < segments.length - 1) {
      await new Promise(r => setTimeout(r, 500))
    }
  }
  return results
}
/**
 * CLI entry point: read the input JSON named by argv[2], synthesize every
 * segment in it, and print the result JSON to stdout.
 * Prints usage to stderr and exits with code 1 when no input file is given.
 */
async function main() {
  const [, , inputJson] = process.argv
  if (!inputJson) {
    const example = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }
    console.error('用法: node qwen-tts.js <input.json>')
    console.error('')
    console.error('input.json 格式:')
    console.error(JSON.stringify(example, null, 2))
    process.exit(1)
  }
  const input = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
  const segments = await synthesizeBatch(input.segments, {
    voice: input.voice,
    outputDir: input.output_dir || './audio',
  })
  process.stdout.write(JSON.stringify({ segments }, null, 2) + '\n')
}
// Run the CLI only when executed directly (`node qwen-tts.js <input.json>`);
// when require()d as a module, only the exports below are exposed.
if (require.main === module) {
main().catch(err => {
console.error('TTS 合成失败:', err.message)
process.exit(1)
})
}
// Public module API, as documented in the header comment.
module.exports = { synthesize, synthesizeBatch }