video-create/.claude/skills/video-from-script/scripts/qwen-tts.js

#!/usr/bin/env node

/**
 * 阿里云 Qwen-TTS 批量语音合成脚本
 *
 * 输入 JSON 文件格式:
 * {
 *   "segments": [
 *     {"id": 1, "text": "第一段文案"},
 *     {"id": 2, "text": "第二段文案"}
 *   ],
 *   "voice": "Cherry",          // 可选，覆盖 config
 *   "output_dir": "./audio"     // 可选，默认 ./audio
 * }
 *
 * 输出 JSON (stdout):
 * {
 *   "segments": [
 *     {"id": 1, "text": "...", "audio": "/abs/path/to/audio/seg_001.wav", "duration": 3.456},
 *     ...
 *   ]
 * }
 *
 * 也可作为模块调用:
 *   const { synthesize } = require('./qwen-tts')
 *   const { filePath, duration } = await synthesize('你好世界', { voice: 'Cherry' })
 */

const axios = require('axios')
const fs = require('fs')
const path = require('path')

// Location of config.json: two directory levels above the directory that contains this script.
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')

/**
 * Read and parse the JSON configuration stored at CONFIG_PATH.
 *
 * @returns {object} The parsed configuration object.
 * @throws {Error} If the file does not exist; JSON.parse errors propagate unchanged.
 */
function loadConfig() {
  const configMissing = !fs.existsSync(CONFIG_PATH)
  if (configMissing) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }

  const rawConfig = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(rawConfig)
}

/**
 * Find the `fmt ` and `data` chunks of an in-memory RIFF/WAVE file.
 *
 * The chunk list is walked instead of assuming the canonical 44-byte header,
 * because encoders may emit extra chunks (for example `LIST`) before `data`.
 *
 * @param {Buffer} wav - Complete WAV file contents.
 * @returns {{ byteRate: number, blockAlign: number, bitsPerSample: number,
 *   sizeOffset: number, dataStart: number, dataLength: number }}
 *   Format fields from `fmt `, the offset of the data chunk's size field,
 *   and the start/length of the sample payload.
 * @throws {Error} If the buffer is not RIFF/WAVE or has no usable fmt/data chunk.
 */
function locateWavChunks(wav) {
  const isWave = wav.length >= 12 &&
    wav.toString('latin1', 0, 4) === 'RIFF' &&
    wav.toString('latin1', 8, 12) === 'WAVE'
  if (!isWave) throw new Error('TTS 返回的音频不是有效的 WAV 文件')

  let format = null
  let offset = 12
  while (offset + 8 <= wav.length) {
    const chunkId = wav.toString('latin1', offset, offset + 4)
    const declaredSize = wav.readUInt32LE(offset + 4)
    const bodyStart = offset + 8

    if (chunkId === 'fmt ' && bodyStart + 16 <= wav.length) {
      format = {
        byteRate: wav.readUInt32LE(bodyStart + 8),
        blockAlign: wav.readUInt16LE(bodyStart + 12),
        bitsPerSample: wav.readUInt16LE(bodyStart + 14),
      }
    } else if (chunkId === 'data') {
      if (!format || !format.byteRate || !format.blockAlign) break
      // Streaming encoders may leave the size as 0 or 0xFFFFFFFF; in that case
      // (or when the size overruns the file) the payload runs to end of file.
      const available = wav.length - bodyStart
      const dataLength = declaredSize === 0 || declaredSize > available ? available : declaredSize
      return { ...format, sizeOffset: offset + 4, dataStart: bodyStart, dataLength }
    }

    // RIFF chunks are word-aligned: an odd-sized body is followed by one pad byte.
    offset = bodyStart + declaredSize + (declaredSize % 2)
  }
  throw new Error('TTS 返回的 WAV 文件缺少有效的 fmt 或 data 块')
}

/**
 * Return a copy of a WAV file with trailing silence added to its sample data.
 *
 * @param {Buffer} wav - Complete WAV file contents (not modified).
 * @param {number} silenceSec - Seconds of silence to add (>= 0).
 * @returns {{ buffer: Buffer, duration: number }} The new file and the length
 *   of its sample data in seconds, silence included.
 * @throws {Error} If the input is not a parseable WAV file.
 */
function appendWavSilence(wav, silenceSec) {
  const { byteRate, blockAlign, bitsPerSample, sizeOffset, dataStart, dataLength } = locateWavChunks(wav)

  // Whole sample frames only, so the padding never splits a sample.
  let silenceBytes = Math.round((byteRate * silenceSec) / blockAlign) * blockAlign
  // Keep the byte count even so the data chunk's parity (and therefore any
  // RIFF pad byte that follows it) stays valid.
  if (silenceBytes % 2 === 1) silenceBytes += blockAlign

  // 8-bit PCM is unsigned, so its zero level is 0x80; wider samples use 0.
  const silence = Buffer.alloc(silenceBytes, bitsPerSample === 8 ? 0x80 : 0)
  const dataEnd = dataStart + dataLength
  // Insert right after the samples so chunks that follow `data` stay intact.
  const padded = Buffer.concat([wav.subarray(0, dataEnd), silence, wav.subarray(dataEnd)])

  const paddedDataLength = dataLength + silenceBytes
  padded.writeUInt32LE(padded.length - 8, 4) // RIFF size = file size minus the 8-byte RIFF header
  padded.writeUInt32LE(paddedDataLength, sizeOffset)

  return { buffer: padded, duration: paddedDataLength / byteRate }
}

/**
 * Synthesize a single text segment (non-streaming) and save it as a WAV file.
 *
 * Sends the text to the TTS endpoint, downloads the returned audio URL, adds
 * trailing silence as a breathing gap between sentences, and writes the result
 * to `<outputDir>/seg_<id zero-padded to 3>.wav`.
 *
 * @param {string} text - Text to synthesize. Trailing whitespace is removed and
 *   a '。' is appended when the text does not end with sentence punctuation.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Overrides config.ttsApiKey.
 * @param {string} [options.apiBaseUrl] - Overrides config.ttsApiBaseUrl.
 * @param {string} [options.model] - Overrides config.ttsModel (default 'qwen-tts').
 * @param {string} [options.voice] - Overrides config.ttsVoice (default 'Cherry').
 * @param {string} [options.language] - Overrides config.ttsLanguage (default 'Chinese').
 * @param {string} [options.outputDir] - Output directory (default './audio'); created if missing.
 * @param {number|string} [options.id] - Segment id used in the file name (default 1).
 * @param {number} [options.silencePadding] - Seconds of trailing silence (default 0.3).
 * @returns {Promise<{ filePath: string, duration: number }>} Absolute path of
 *   the written file and its audio length in seconds, silence included.
 * @throws {TypeError} If `text` is not a string.
 * @throws {Error} If the API key is missing, a request fails, the API returns
 *   no audio URL, or the downloaded audio is not a parseable WAV file.
 */
async function synthesize(text, options = {}) {
  if (typeof text !== 'string') throw new TypeError('text 必须是字符串')

  const config = loadConfig()

  const apiKey = options.apiKey || config.ttsApiKey
  if (!apiKey) throw new Error('ttsApiKey 未配置，请在 config.json 中设置')

  const baseUrl = (options.apiBaseUrl || config.ttsApiBaseUrl || 'https://dashscope.aliyuncs.com/api/v1').replace(/\/$/, '')
  const model = options.model || config.ttsModel || 'qwen-tts'
  const voice = options.voice || config.ttsVoice || 'Cherry'
  const language = options.language || config.ttsLanguage || 'Chinese'
  const outputDir = options.outputDir || './audio'

  fs.mkdirSync(outputDir, { recursive: true })

  // Ensure sentence-final punctuation so the TTS produces natural intonation
  // and a pause at the end.
  let spokenText = text.trimEnd()
  if (!/[。！？.!?…]$/.test(spokenText)) spokenText += '。'

  const url = `${baseUrl}/services/aigc/multimodal-generation/generation`

  let res
  try {
    res = await axios.post(url, {
      model,
      input: {
        text: spokenText,
        voice,
        language_type: language,
      },
    }, {
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      timeout: 60000,
    })
  } catch (err) {
    const detail = err.response?.data
    throw new Error(`TTS API 错误: ${err.message}${detail ? ' ' + JSON.stringify(detail) : ''}`, { cause: err })
  }

  const audioUrl = res.data?.output?.audio?.url
  if (!audioUrl) {
    throw new Error(`TTS API 未返回音频 URL: ${JSON.stringify(res.data)}`)
  }

  // `??` rather than `||` so that an explicit id of 0 is kept (seg_000.wav)
  // instead of silently colliding with segment 1.
  const id = options.id ?? 1
  const fileName = `seg_${String(id).padStart(3, '0')}.wav`
  const filePath = path.resolve(outputDir, fileName)

  // Download the generated audio.
  let wavBuffer
  try {
    const audioRes = await axios.get(audioUrl, { responseType: 'arraybuffer', timeout: 30000 })
    wavBuffer = Buffer.from(audioRes.data)
  } catch (err) {
    throw new Error(`TTS 音频下载失败: ${err.message}`, { cause: err })
  }

  // Only `undefined` selects the 0.3 s default; negative or non-numeric
  // values are treated as "no padding" instead of crashing Buffer.alloc.
  const { silencePadding = 0.3 } = options
  const silenceSec = Math.max(0, Number(silencePadding) || 0)

  const { buffer, duration } = appendWavSilence(wavBuffer, silenceSec)
  fs.writeFileSync(filePath, buffer)

  return { filePath, duration }
}

/**
 * Synthesize several segments sequentially, one request at a time.
 *
 * A pause is inserted between consecutive requests to avoid rate limiting;
 * no pause is made after the final segment. Progress is logged to stderr.
 *
 * @param {Array<{id: number, text: string}>} segments - Segments to synthesize, in order.
 * @param {object} [options] - Options forwarded to `synthesize` (e.g. voice,
 *   outputDir); each segment's `id` is added per call.
 * @param {number} [options.requestIntervalMs=500] - Milliseconds to wait
 *   between requests; this option is not forwarded to `synthesize`.
 * @returns {Promise<Array<{id: number, text: string, audio: string, duration: number}>>}
 *   One entry per segment, with `duration` rounded to milliseconds.
 * @throws {TypeError} If `segments` is not an array.
 */
async function synthesizeBatch(segments, options = {}) {
  if (!Array.isArray(segments)) {
    throw new TypeError('segments 必须是数组')
  }

  const { requestIntervalMs = 500, ...synthOptions } = options || {}
  const pauseMs = Math.max(0, Number(requestIntervalMs) || 0)

  const results = []
  for (const [index, seg] of segments.entries()) {
    // Throttle only between requests, so the batch does not end with an idle wait.
    if (index > 0 && pauseMs > 0) {
      await new Promise((resolve) => setTimeout(resolve, pauseMs))
    }

    console.error(`  合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
    const { filePath, duration } = await synthesize(seg.text, {
      ...synthOptions,
      id: seg.id,
    })
    results.push({
      id: seg.id,
      text: seg.text,
      audio: filePath,
      duration: Math.round(duration * 1000) / 1000,
    })
  }
  return results
}

/**
 * CLI entry point.
 *
 * Reads the JSON file named by the first command-line argument, synthesizes
 * its `segments` with `synthesizeBatch`, and prints `{ segments: [...] }` as
 * pretty-printed JSON on stdout. When no argument is given, usage help is
 * printed to stderr and the process exits with status 1.
 */
async function main() {
  const [, , inputPath] = process.argv

  if (!inputPath) {
    const sampleInput = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'Cherry',
      output_dir: './audio',
    }
    const usageLines = [
      '用法: node qwen-tts.js <input.json>',
      '',
      'input.json 格式:',
      JSON.stringify(sampleInput, null, 2),
    ]
    for (const line of usageLines) {
      console.error(line)
    }
    process.exit(1)
  }

  const request = JSON.parse(fs.readFileSync(inputPath, 'utf-8'))
  const requestedSegments = request.segments
  const batchOptions = {
    voice: request.voice,
    outputDir: request.output_dir || './audio',
  }

  const synthesized = await synthesizeBatch(requestedSegments, batchOptions)
  const report = JSON.stringify({ segments: synthesized }, null, 2)
  process.stdout.write(report + '\n')
}

/**
 * Report a fatal CLI error on stderr and terminate with exit status 1.
 * @param {Error} failure - The error that aborted the run.
 */
function exitWithFailure(failure) {
  console.error('TTS 合成失败:', failure.message)
  process.exit(1)
}

// Run the CLI only when this file is executed directly; when it is loaded via
// require(), only the exports below are exposed.
if (require.main === module) {
  main().catch(exitWithFailure)
}

module.exports = { synthesize, synthesizeBatch }