Files
video-create/.claude/skills/video-from-script/scripts/qwen-tts.js
lc 6eec0e8889 feat(skills): 完善视频生产 pipeline 及新增健身跟练账号
- SKILL.md: 新增工作流阶段定义、质量卡点、分镜规则
- manifest-schema.md: 补充完整字段规范及类型定义
- phase-tts.js: 优化 TTS 合成逻辑,添加进度追踪
- capcut-tracks.js: 扩展轨道构建能力,支持更多元素类型
- capcut-timeline.js: 改进时间线生成,支持淡入淡出
- capcut_assemble.js: 新增 assemble 阶段完整实现
- cmd-init.js: 完善 init 命令逻辑
- qwen-tts.js: 调整超时配置
- accounts/禁忌帝王学: 更新拆分/图像/台词提示词
- accounts/健身跟练: 新增账号含 account.json 及全套提示词模板
- 新增 workflow-issues-20260501.md 参考文档

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-06 22:53:37 +08:00

256 lines
7.2 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env node
/**
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
* "segments": [
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
// Shared skill configuration lives two directories above this script.
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
/**
 * Read and parse the skill-level config.json.
 * @returns {object} parsed configuration object
 * @throws {Error} when config.json is missing (invalid JSON also throws, from JSON.parse)
 */
function loadConfig() {
  const found = fs.existsSync(CONFIG_PATH)
  if (!found) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }
  const raw = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(raw)
}
/**
 * Probe an audio file's duration in seconds via ffprobe.
 *
 * Falls back to a rough size-based estimate (assumes ~32 kbps, i.e.
 * size_bytes * 8 / 32000) when ffprobe is unavailable, fails, or returns
 * no parseable duration.
 *
 * @param {string} filePath - path to an existing audio file
 * @returns {number} duration in seconds (the fallback is only an estimate)
 */
function getAudioDuration(filePath) {
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const duration = parseFloat(out.trim())
    // ffprobe can exit 0 yet print nothing (or "N/A") for files it cannot
    // parse — guard so NaN never propagates to callers.
    if (Number.isFinite(duration)) return duration
  } catch {
    // ffprobe missing or exited non-zero — fall through to the estimate.
  }
  const stat = fs.statSync(filePath)
  return stat.size * 8 / 32000
}
/**
 * Single-segment speech synthesis over the CosyVoice WebSocket API.
 *
 * Protocol (DashScope duplex streaming):
 *   1. on open            -> send `run-task` with synthesis parameters, empty input
 *   2. on `task-started`  -> send `continue-task` carrying the text, then `finish-task`
 *   3. binary frames are buffered as MP3 chunks until `task-finished`
 *
 * @param {string} text - segment text; terminal punctuation is appended if missing
 * @param {object} options - { voice, model, outputDir, id, instruction, apiKey, rate };
 *   unset values fall back to config.json, then to built-in defaults
 * @returns {Promise<{filePath: string, duration: number}>} absolute path of the
 *   written MP3 and its duration in seconds
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()
    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
    // Fallback chain: explicit option -> config.json -> built-in default.
    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'
    fs.mkdirSync(outputDir, { recursive: true })
    // Ensure the segment ends with terminal punctuation so the synthesized
    // voice does not cut off abruptly.
    text = text.trimEnd()
    if (!/[。!?.!?…]$/.test(text)) text += '。'
    const id = options.id || 1
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)
    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    // NOTE(review): relies on a global WebSocket (Node >= 22 / undici). The
    // WHATWG constructor signature is (url, protocols); an options object with
    // `headers` is the `ws` package's API. Confirm which implementation is in
    // scope here, otherwise the Authorization header may be silently dropped.
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []   // binary MP3 frames, concatenated on task-finished
    let settled = false // guards against double resolve/reject
    // Hard deadline for the whole round-trip; fires only if still unsettled.
    const timer = setTimeout(() => {
      if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
    }, 60000)
    ws.addEventListener('open', () => {
      // Step 1: run-task — declares synthesis parameters; input stays empty.
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action: 'run-task',
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: options.rate || 1.15, // speech speed multiplier; default slightly fast
            pitch_rate: 1.0,
            text_type: 'PlainText',
            // instruction is only accepted when non-empty.
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        },
      }))
    })
    ws.addEventListener('message', async (event) => {
      // Non-string frames carry audio data; buffer them until task-finished.
      if (typeof event.data !== 'string') {
        const buf = event.data instanceof Blob
          ? Buffer.from(await event.data.arrayBuffer())
          : Buffer.from(event.data)
        chunks.push(buf)
        return
      }
      try {
        const msg = JSON.parse(event.data)
        const evt = msg.header?.event
        if (evt === 'task-started') {
          // Step 2: continue-task — sends the actual text to synthesize.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'continue-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              model,
              input: { text },
            },
          }))
          // Step 3: finish-task — signals that no more text follows.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'finish-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              input: {},
            },
          }))
        } else if (evt === 'task-finished') {
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          const audio = Buffer.concat(chunks)
          if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
          fs.writeFileSync(filePath, audio)
          resolve({ filePath, duration: getAudioDuration(filePath) })
        } else if (evt === 'task-failed') {
          clearTimeout(timer)
          ws.close()
          if (settled) return
          settled = true
          reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
      } catch {} // NOTE(review): malformed JSON control frames are silently ignored
    })
    ws.addEventListener('error', (e) => {
      clearTimeout(timer)
      if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
    })
  })
}
/**
 * Batch speech synthesis: processes segments sequentially with a 500 ms
 * pause between consecutive requests (simple rate limiting).
 *
 * A failed segment does not abort the batch — it is recorded with
 * `audio: ''`, `duration: 0` and an `error` message so the caller can
 * retry selectively.
 *
 * @param {Array<{id: number|string, text: string}>} segments
 * @param {object} options - forwarded to synthesize() (voice, outputDir, ...)
 * @returns {Promise<Array<{id, text, audio, duration, error?}>>}
 */
async function synthesizeBatch(segments, options = {}) {
  const results = []
  for (let i = 0; i < segments.length; i++) {
    const seg = segments[i]
    console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
    try {
      const { filePath, duration } = await synthesize(seg.text, {
        ...options,
        id: seg.id,
      })
      results.push({
        id: seg.id,
        text: seg.text,
        audio: filePath,
        // keep millisecond precision only
        duration: Math.round(duration * 1000) / 1000,
      })
    } catch (err) {
      // Best-effort: record the failure and continue with the next segment.
      results.push({
        id: seg.id,
        text: seg.text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }
    // Throttle between requests; no need to sleep after the final segment
    // (the original slept unconditionally, wasting 500 ms per batch).
    if (i < segments.length - 1) {
      await new Promise(r => setTimeout(r, 500))
    }
  }
  return results
}
// CLI entry point: node qwen-tts.js <input.json>
async function main() {
  const inputJson = process.argv[2]
  if (!inputJson) {
    // No input file given: print usage plus an example payload and bail out.
    console.error('用法: node qwen-tts.js <input.json>')
    console.error('')
    console.error('input.json 格式:')
    const example = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }
    console.error(JSON.stringify(example, null, 2))
    process.exit(1)
  }
  const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
  const results = await synthesizeBatch(config.segments, {
    voice: config.voice,
    outputDir: config.output_dir || './audio',
  })
  // Result manifest goes to stdout so it can be piped into the next phase.
  const manifest = JSON.stringify({ segments: results }, null, 2)
  process.stdout.write(`${manifest}\n`)
}
// Run the CLI only when executed directly; when required as a module,
// just expose the API below.
if (require.main === module) {
  main().catch(err => {
    console.error('TTS 合成失败:', err.message)
    process.exit(1)
  })
}
// Public programmatic API: single-segment and batch synthesis.
module.exports = { synthesize, synthesizeBatch }