.claude/skills/video-from-script/scripts/qwen-tts.js

#!/usr/bin/env node

/**
 * CosyVoice TTS 批量语音合成脚本
 * 通过 WebSocket 调用阿里云 DashScope CosyVoice API
 *
 * 输入 JSON 文件格式:
 * {
 *   "segments": [
 *     {"id": 1, "text": "第一段文案"},
 *     {"id": 2, "text": "第二段文案"}
 *   ],
 *   "voice": "longanyang",       // 可选，覆盖 config
 *   "output_dir": "./audio"      // 可选，默认 ./audio
 * }
 *
 * 输出 JSON (stdout):
 * {
 *   "segments": [
 *     {"id": 1, "text": "...", "audio": "/abs/path/to/audio/seg_001.mp3", "duration": 3.456}
 *   ]
 * }
 *
 * 也可作为模块调用:
 *   const { synthesize } = require('./qwen-tts')
 *   const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
 */

const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')

const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')

/**
 * Read and parse the JSON configuration file located at CONFIG_PATH.
 *
 * @returns {*} The value produced by JSON.parse on the file contents.
 * @throws {Error} If the file does not exist, cannot be read, or is not valid JSON.
 */
function loadConfig() {
  const configMissing = !fs.existsSync(CONFIG_PATH)
  if (configMissing) {
    throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
  }

  const rawJson = fs.readFileSync(CONFIG_PATH, 'utf-8')
  return JSON.parse(rawJson)
}

/**
 * Return the playback duration of an audio file in seconds.
 *
 * The container-level duration is requested from `ffprobe`. When ffprobe
 * cannot be run, exits with an error, exceeds the 10 s timeout, or prints
 * something that is not a non-negative finite number (for example "N/A" or
 * an empty string), the duration is estimated from the file size using a
 * fixed 32000 bits-per-second divisor.
 *
 * @param {string} filePath - Path of the audio file to inspect.
 * @returns {number} Duration in seconds (measured, or estimated from size).
 * @throws {Error} If the size-based fallback cannot stat the file.
 */
function getAudioDuration(filePath) {
  try {
    const out = execFileSync('ffprobe', [
      '-v', 'quiet', '-show_entries', 'format=duration',
      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
    ], { encoding: 'utf-8', timeout: 10000 })
    const seconds = Number.parseFloat(out.trim())
    // A zero exit status does not guarantee a usable number; never return NaN.
    if (Number.isFinite(seconds) && seconds >= 0) return seconds
  } catch {
    // ffprobe is missing, failed, or timed out: fall through to the estimate.
  }
  return fs.statSync(filePath).size * 8 / 32000
}

/**
 * Synthesize a single text segment to an MP3 file through the DashScope
 * CosyVoice WebSocket endpoint.
 *
 * Message flow as driven by this function: once the socket opens a
 * `run-task` message (parameters, empty input) is sent; when the server
 * answers `task-started`, a `continue-task` message carrying the text is sent,
 * immediately followed by `finish-task`. Binary frames received are collected
 * as audio data. `task-finished` writes the file and resolves; `task-failed`
 * rejects.
 *
 * The returned promise always settles: on success, on `task-failed`, on a
 * socket error, when the socket closes before the task finished, when
 * writing the file or measuring its duration throws, or after a 60 s timeout.
 *
 * @param {string} text - Text to speak. Trailing whitespace is trimmed and a
 *   '。' is appended when the text does not end with sentence punctuation.
 * @param {object} [options]
 * @param {string} [options.apiKey] - Overrides `ttsApiKey` from config.
 * @param {string} [options.model] - Overrides `ttsModel` from config
 *   (falls back to 'cosyvoice-v3-flash').
 * @param {string} [options.voice] - Overrides `ttsVoice` from config
 *   (falls back to 'longanyang').
 * @param {string} [options.instruction] - Overrides `ttsInstruction` from
 *   config; only sent when non-empty.
 * @param {string} [options.outputDir='./audio'] - Directory for the MP3 file
 *   (created if missing).
 * @param {number|string} [options.id=1] - Used for the file name
 *   `seg_<id padded to 3>.mp3` and as part of the task id. An id of 0 is
 *   respected rather than replaced by the default.
 * @returns {Promise<{filePath: string, duration: number}>} Absolute path of
 *   the written file and its duration in seconds.
 */
function synthesize(text, options = {}) {
  return new Promise((resolve, reject) => {
    const config = loadConfig()

    const apiKey = options.apiKey || config.ttsApiKey
    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }

    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
    const voice = options.voice || config.ttsVoice || 'longanyang'
    const instruction = options.instruction || config.ttsInstruction || ''
    const outputDir = options.outputDir || './audio'

    fs.mkdirSync(outputDir, { recursive: true })

    // Work on a local copy instead of reassigning the parameter.
    let speechText = text.trimEnd()
    if (!/[。！？.!?…]$/.test(speechText)) speechText += '。'

    // `??` keeps a legitimate id of 0 (with `||` it collided with id 1).
    const id = options.id ?? 1
    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
    const filePath = path.resolve(outputDir, fileName)

    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
    const ws = new WebSocket(wsUrl, {
      headers: { Authorization: `bearer ${apiKey}` },
    })
    // Receive binary frames as ArrayBuffer so each one can be appended
    // synchronously. With Blob delivery the frame has to be decoded
    // asynchronously, which lets `task-finished` be handled before the last
    // audio frames are stored and produces truncated audio.
    ws.binaryType = 'arraybuffer'

    const taskId = `tts_${Date.now()}_${id}`
    const chunks = []
    let settled = false
    let timer = null

    // Single exit point: settle at most once, stop the timer, close the socket.
    const settle = (finish, value) => {
      if (settled) return
      settled = true
      clearTimeout(timer)
      try {
        ws.close()
      } catch {
        // Closing is best-effort; the outcome below must still be delivered.
      }
      finish(value)
    }
    const fail = (err) => settle(reject, err)

    timer = setTimeout(() => fail(new Error('TTS 超时 (60s)')), 60000)

    ws.addEventListener('open', () => {
      // Step 1: run-task — parameters only, the text follows in continue-task.
      ws.send(JSON.stringify({
        header: {
          task_id: taskId,
          action: 'run-task',
          streaming: 'duplex',
        },
        payload: {
          task_group: 'audio',
          task: 'tts',
          function: 'SpeechSynthesizer',
          model,
          parameters: {
            voice,
            format: 'mp3',
            sample_rate: 24000,
            volume: 50,
            rate: 1.0,
            pitch_rate: 1.0,
            text_type: 'PlainText',
            ...(instruction ? { instruction } : {}),
          },
          input: {},
        },
      }))
    })

    ws.addEventListener('message', (event) => {
      if (settled) return
      try {
        // Binary frame: one piece of the audio stream.
        if (typeof event.data !== 'string') {
          chunks.push(Buffer.from(event.data))
          return
        }

        let msg
        try {
          msg = JSON.parse(event.data)
        } catch {
          // Text frames that are not JSON carry no event for us; ignore them.
          return
        }

        const evt = msg?.header?.event
        if (evt === 'task-started') {
          // Step 2: continue-task — send the text.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'continue-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              model,
              input: { text: speechText },
            },
          }))

          // Step 3: finish-task — no more text will follow.
          ws.send(JSON.stringify({
            header: {
              task_id: taskId,
              action: 'finish-task',
              streaming: 'duplex',
            },
            payload: {
              task_group: 'audio',
              task: 'tts',
              function: 'SpeechSynthesizer',
              input: {},
            },
          }))
        } else if (evt === 'task-finished') {
          const audio = Buffer.concat(chunks)
          if (audio.length === 0) { fail(new Error('TTS 未返回音频')); return }

          fs.writeFileSync(filePath, audio)
          settle(resolve, { filePath, duration: getAudioDuration(filePath) })
        } else if (evt === 'task-failed') {
          fail(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
        }
      } catch (err) {
        // Anything thrown while handling a frame (send, file write, duration
        // probe) must reject the promise instead of leaving it pending.
        fail(err)
      }
    })

    ws.addEventListener('error', (e) => {
      fail(new Error(`WebSocket 错误: ${e.message || '连接失败'}`))
    })

    // A close before task-finished/task-failed means no result will arrive.
    ws.addEventListener('close', (e) => {
      fail(new Error(`WebSocket 连接提前关闭 (code: ${e?.code})`))
    })
  })
}

/**
 * Synthesize a list of segments one after another.
 *
 * Each segment is passed to `synthesize` with `options` plus the segment's
 * id. A failure of one segment never aborts the batch: the segment is
 * reported with `audio: ''`, `duration: 0` and an `error` message. This also
 * covers malformed segments whose `text` is not a string, which are reported
 * without attempting synthesis.
 *
 * A 500 ms pause is inserted after each synthesis attempt when further
 * segments remain; no pause follows the final segment.
 *
 * @param {Iterable<{id: *, text: string}>} segments - Segments to synthesize.
 * @param {object} [options] - Options forwarded to `synthesize`
 *   (the per-segment `id` always overrides `options.id`).
 * @returns {Promise<Array<{id: *, text: *, audio: string, duration: number, error?: string}>>}
 *   One result per input segment, in input order; durations are rounded to
 *   three decimals.
 * @throws {TypeError} If `segments` is not iterable.
 */
async function synthesizeBatch(segments, options = {}) {
  if (segments == null || typeof segments[Symbol.iterator] !== 'function') {
    throw new TypeError('segments 必须是可迭代的数组')
  }

  const queue = [...segments]
  const results = []

  for (const [index, seg] of queue.entries()) {
    const id = seg?.id
    const text = seg?.text

    // Report malformed segments instead of letting them abort the batch.
    if (typeof text !== 'string') {
      results.push({ id, text, audio: '', duration: 0, error: 'text 必须是字符串' })
      continue
    }

    console.error(`  合成 #${id}: ${text.substring(0, 30)}...`)
    try {
      const { filePath, duration } = await synthesize(text, {
        ...options,
        id,
      })
      results.push({
        id,
        text,
        audio: filePath,
        duration: Math.round(duration * 1000) / 1000,
      })
    } catch (err) {
      results.push({
        id,
        text,
        audio: '',
        duration: 0,
        error: err.message,
      })
    }

    // Pause between requests only; nothing follows the last segment.
    if (index < queue.length - 1) {
      await new Promise(r => setTimeout(r, 500))
    }
  }
  return results
}

/**
 * CLI entry point.
 *
 * Expects the path of an input JSON file as the first command-line argument.
 * Without it, usage help is printed to stderr and the process exits with
 * status 1. Otherwise the file's `segments` are synthesized (using its
 * optional `voice` and `output_dir`) and the results are written to stdout
 * as pretty-printed JSON of the form `{ "segments": [...] }`.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const [, , inputPath] = process.argv

  if (!inputPath) {
    const exampleInput = {
      segments: [{ id: 1, text: '文案' }],
      voice: 'longanyang',
      output_dir: './audio',
    }
    const usageLines = [
      '用法: node qwen-tts.js <input.json>',
      '',
      'input.json 格式:',
      JSON.stringify(exampleInput, null, 2),
    ]
    for (const line of usageLines) {
      console.error(line)
    }
    process.exit(1)
  }

  const request = JSON.parse(fs.readFileSync(inputPath, 'utf-8'))
  const batchOptions = {
    voice: request.voice,
    outputDir: request.output_dir || './audio',
  }
  const segments = await synthesizeBatch(request.segments, batchOptions)

  const report = JSON.stringify({ segments }, null, 2)
  process.stdout.write(report + '\n')
}

// Run the CLI only when this file is executed directly, not when it is
// loaded through require(); any failure is reported on stderr with exit 1.
if (require.main === module) {
  const reportAndExit = (failure) => {
    console.error('TTS 合成失败:', failure.message)
    process.exit(1)
  }
  main().catch(reportAndExit)
}

// Programmatic API: single-segment and batch synthesis.
module.exports = { synthesize, synthesizeBatch }
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								#!/usr/bin/env node
 								/**
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								 * CosyVoice TTS 批量语音合成脚本
 								 * 通过 WebSocket 调用阿里云 DashScope CosyVoice API
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								 *
 								 * 输入 JSON 文件格式:
 								 * {
 								 *   "segments": [
 								 *     {"id": 1, "text": "第一段文案"},
 								 *     {"id": 2, "text": "第二段文案"}
 								 *   ],
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								 *   "voice": "longanyang",       // 可选，覆盖 config
 								 *   "output_dir": "./audio"      // 可选，默认 ./audio
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								 * }
 								 *
 								 * 输出 JSON (stdout):
 								 * {
 								 *   "segments": [
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								 *     {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								 *   ]
 								 * }
 								 *
 								 * 也可作为模块调用:
 								 *   const { synthesize } = require('./qwen-tts')
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								 *   const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								 */
 								const fs = require('fs')
 								const path = require('path')
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								const { execFileSync } = require('child_process')
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
 								const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
 								function loadConfig() {
 								  if (!fs.existsSync(CONFIG_PATH)) throw new Error(`config.json 不存在: ${CONFIG_PATH}`)
 								  return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf-8'))
 								}
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								function getAudioDuration(filePath) {
 								  try {
 								    const out = execFileSync('ffprobe', [
 								      '-v', 'quiet', '-show_entries', 'format=duration',
 								      '-of', 'default=noprint_wrappers=1:nokey=1', filePath,
 								    ], { encoding: 'utf-8', timeout: 10000 })
 								    return parseFloat(out.trim())
 								  } catch {
 								    const stat = fs.statSync(filePath)
 								    return stat.size * 8 / 32000
 								  }
 								}
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								/**
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								 * 单段语音合成（CosyVoice WebSocket）
 								 * @param {string} text
 								 * @param {object} options - { voice, model, outputDir, id, instruction }
 								 * @returns {Promise<{filePath: string, duration: number}>}
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								 */
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								function synthesize(text, options = {}) {
 								  return new Promise((resolve, reject) => {
 								    const config = loadConfig()
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    const apiKey = options.apiKey || config.ttsApiKey
 								    if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
 								    const voice = options.voice || config.ttsVoice || 'longanyang'
 								    const instruction = options.instruction || config.ttsInstruction || ''
 								    const outputDir = options.outputDir || './audio'
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    fs.mkdirSync(outputDir, { recursive: true })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    text = text.trimEnd()
 								    if (!/[。！？.!?…]$/.test(text)) text += '。'
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    const id = options.id || 1
 								    const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
 								    const filePath = path.resolve(outputDir, fileName)
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
 								    const ws = new WebSocket(wsUrl, {
 								      headers: { Authorization: `bearer ${apiKey}` },
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								    })
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    const taskId = `tts_${Date.now()}_${id}`
 								    const chunks = []
 								    let settled = false
 								    const timer = setTimeout(() => {
 								      if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
 								    }, 60000)
 								    ws.addEventListener('open', () => {
 								      // Step 1: run-task — empty input, no text
 								      ws.send(JSON.stringify({
 								        header: {
 								          task_id: taskId,
 								          action: 'run-task',
 								          streaming: 'duplex',
 								        },
 								        payload: {
 								          task_group: 'audio',
 								          task: 'tts',
 								          function: 'SpeechSynthesizer',
 								          model,
 								          parameters: {
 								            voice,
 								            format: 'mp3',
 								            sample_rate: 24000,
 								            volume: 50,
 								            rate: 1.0,
 								            pitch_rate: 1.0,
 								            text_type: 'PlainText',
 								            ...(instruction ? { instruction } : {}),
 								          },
 								          input: {},
 								        },
 								      }))
 								    })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    ws.addEventListener('message', async (event) => {
 								      if (typeof event.data !== 'string') {
 								        const buf = event.data instanceof Blob
 								          ? Buffer.from(await event.data.arrayBuffer())
 								          : Buffer.from(event.data)
 								        chunks.push(buf)
 								        return
 								      }
 								      try {
 								        const msg = JSON.parse(event.data)
 								        const evt = msg.header?.event
 								        if (evt === 'task-started') {
 								          // Step 2: continue-task — send text
 								          ws.send(JSON.stringify({
 								            header: {
 								              task_id: taskId,
 								              action: 'continue-task',
 								              streaming: 'duplex',
 								            },
 								            payload: {
 								              task_group: 'audio',
 								              task: 'tts',
 								              function: 'SpeechSynthesizer',
 								              model,
 								              input: { text },
 								            },
 								          }))
 								          // Step 3: finish-task
 								          ws.send(JSON.stringify({
 								            header: {
 								              task_id: taskId,
 								              action: 'finish-task',
 								              streaming: 'duplex',
 								            },
 								            payload: {
 								              task_group: 'audio',
 								              task: 'tts',
 								              function: 'SpeechSynthesizer',
 								              input: {},
 								            },
 								          }))
 								        } else if (evt === 'task-finished') {
 								          clearTimeout(timer)
 								          ws.close()
 								          if (settled) return
 								          settled = true
 								          const audio = Buffer.concat(chunks)
 								          if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
 								          fs.writeFileSync(filePath, audio)
 								          resolve({ filePath, duration: getAudioDuration(filePath) })
 								        } else if (evt === 'task-failed') {
 								          clearTimeout(timer)
 								          ws.close()
 								          if (settled) return
 								          settled = true
 								          reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
 								        }
 								      } catch {}
 								    })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    ws.addEventListener('error', (e) => {
 								      clearTimeout(timer)
 								      if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
 								    })
 								  })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								}
 								/**
 								 * 批量语音合成
 								 */
 								async function synthesizeBatch(segments, options = {}) {
 								  const results = []
 								  for (const seg of segments) {
 								    console.error(`  合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								    try {
 								      const { filePath, duration } = await synthesize(seg.text, {
 								        ...options,
 								        id: seg.id,
 								      })
 								      results.push({
 								        id: seg.id,
 								        text: seg.text,
 								        audio: filePath,
 								        duration: Math.round(duration * 1000) / 1000,
 								      })
 								    } catch (err) {
 								      results.push({
 								        id: seg.id,
 								        text: seg.text,
 								        audio: '',
 								        duration: 0,
 								        error: err.message,
 								      })
 								    }
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								    await new Promise(r => setTimeout(r, 500))
 								  }
 								  return results
 								}
 								// CLI 入口
 								async function main() {
 								  const inputJson = process.argv[2]
 								  if (!inputJson) {
 								    console.error('用法: node qwen-tts.js <input.json>')
 								    console.error('')
 								    console.error('input.json 格式:')
 								    console.error(JSON.stringify({
 								      segments: [{ id: 1, text: '文案' }],
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								      voice: 'longanyang',
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								      output_dir: './audio',
 								    }, null, 2))
 								    process.exit(1)
 								  }
 								  const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								  const results = await synthesizeBatch(config.segments, {
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								    voice: config.voice,
 								    outputDir: config.output_dir || './audio',
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								  })
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
-												feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`：改为并发 3 张并行生成，每个 item 完成立即写入 manifest，支持 MJ task ID 恢复
- 重写 `phase-videos`：先恢复已有 task ID 再提交新任务（并发 3），支持中断恢复
- 迁移 TTS 引擎：从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口，支持音色/语气参数透传
- 精简账号系统：移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验，`references` 改为顶层字段
- 调整 `slugify`：限制中文字符 5 个、其他 10 个，避免文件名过长
- 更新文档：`manifest-schema.md` 中 `narration` 改为完整原文案，`account-creation.md` 新增 TTS 配置项
- 配置更新：默认 TTS 模型切换为 `cosyvoice-v3.5-plus`，新增 `localAudio` 参数

											
										
										
											2026-05-01 00:44:18 +08:00
+								  process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
-												init: video-create project with skills and accounts

											
										
										
											2026-04-29 21:04:43 +08:00
+								}
 								if (require.main === module) {
 								  main().catch(err => {
 								    console.error('TTS 合成失败:', err.message)
 								    process.exit(1)
 								  })
 								}
 								module.exports = { synthesize, synthesizeBatch }