feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS

- 重写 `phase-images`:改为 3 张并发生成,每个 item 完成立即写入 manifest,支持 MJ task ID 恢复
- 重写 `phase-videos`:先恢复已有 task ID 再提交新任务(并发 3),支持中断恢复
- 迁移 TTS 引擎:从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口,支持音色/语气参数透传
- 精简账号系统:移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验,`references` 改为顶层字段
- 调整 `slugify`:限制中文字符 5 个、其他 10 个,避免文件名过长
- 更新文档:`manifest-schema.md` 中 `narration` 改为完整原文案,`account-creation.md` 新增 TTS 配置项
- 配置更新:默认 TTS 模型切换为 `cosyvoice-v3.5-plus`,新增 `localAudio` 参数
This commit is contained in:
2026-05-01 00:44:18 +08:00
parent 3326f6cb37
commit 7d526d2b60
19 changed files with 888 additions and 411 deletions

View File

@@ -1,7 +1,8 @@
#!/usr/bin/env node
/**
* 阿里云 Qwen-TTS 批量语音合成脚本
* CosyVoice TTS 批量语音合成脚本
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
*
* 输入 JSON 文件格式:
* {
@@ -9,26 +10,25 @@
* {"id": 1, "text": "第一段文案"},
* {"id": 2, "text": "第二段文案"}
* ],
* "voice": "Cherry", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* "voice": "longanyang", // 可选,覆盖 config
* "output_dir": "./audio" // 可选,默认 ./audio
* }
*
* 输出 JSON (stdout):
* {
* "segments": [
* {"id": 1, "text": "...", "audio": "./audio/seg_001.wav", "duration": 3.456},
* ...
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
* ]
* }
*
* 也可作为模块调用:
* const { synthesize } = require('./qwen-tts')
* const { filePath, duration } = await synthesize('你好世界', { voice: 'Cherry' })
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
*/
const axios = require('axios')
const fs = require('fs')
const path = require('path')
const { execFileSync } = require('child_process')
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
@@ -37,102 +37,185 @@ function loadConfig() {
return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf-8'))
}
/**
* 单段语音合成(非流式)
* @param {string} text - 要合成的文本
* @param {object} options - { voice, model, language, outputDir, id }
* @returns {{ filePath: string, duration: number }}
*/
async function synthesize(text, options = {}) {
const config = loadConfig()
const apiKey = options.apiKey || config.ttsApiKey
if (!apiKey) throw new Error('ttsApiKey 未配置,请在 config.json 中设置')
const baseUrl = (options.apiBaseUrl || config.ttsApiBaseUrl || 'https://dashscope.aliyuncs.com/api/v1').replace(/\/$/, '')
const model = options.model || config.ttsModel || 'qwen-tts'
const voice = options.voice || config.ttsVoice || 'Cherry'
const language = options.language || config.ttsLanguage || 'Chinese'
const outputDir = options.outputDir || './audio'
fs.mkdirSync(outputDir, { recursive: true })
// 确保文本有句末标点,让 TTS 生成自然语调和尾部停顿
text = text.trimEnd()
if (!/[。!?.!?…]$/.test(text)) text += '。'
const url = `${baseUrl}/services/aigc/multimodal-generation/generation`
let res
function getAudioDuration(filePath) {
try {
res = await axios.post(url, {
model,
input: {
text,
voice,
language_type: language,
},
}, {
headers: {
'Authorization': `Bearer ${apiKey}`,
'Content-Type': 'application/json',
},
timeout: 60000,
const out = execFileSync('ffprobe', [
'-v', 'quiet', '-show_entries', 'format=duration',
'-of', 'default=noprint_wrappers=1:nokey=1', filePath,
], { encoding: 'utf-8', timeout: 10000 })
return parseFloat(out.trim())
} catch {
const stat = fs.statSync(filePath)
return stat.size * 8 / 32000
}
}
/**
* 单段语音合成(CosyVoice WebSocket)
* @param {string} text
* @param {object} options - { voice, model, outputDir, id, instruction }
* @returns {Promise<{filePath: string, duration: number}>}
*/
function synthesize(text, options = {}) {
return new Promise((resolve, reject) => {
const config = loadConfig()
const apiKey = options.apiKey || config.ttsApiKey
if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
const voice = options.voice || config.ttsVoice || 'longanyang'
const instruction = options.instruction || config.ttsInstruction || ''
const outputDir = options.outputDir || './audio'
fs.mkdirSync(outputDir, { recursive: true })
text = text.trimEnd()
if (!/[。!?.!?…]$/.test(text)) text += '。'
const id = options.id || 1
const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
const filePath = path.resolve(outputDir, fileName)
const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
const ws = new WebSocket(wsUrl, {
headers: { Authorization: `bearer ${apiKey}` },
})
} catch (err) {
const detail = err.response?.data
throw new Error(`TTS API 错误: ${err.message}${detail ? ' ' + JSON.stringify(detail) : ''}`)
}
const audioUrl = res.data?.output?.audio?.url
if (!audioUrl) {
throw new Error(`TTS API 未返回音频 URL: ${JSON.stringify(res.data)}`)
}
const taskId = `tts_${Date.now()}_${id}`
const chunks = []
let settled = false
// 下载音频到本地
const id = options.id || 1
const fileName = `seg_${String(id).padStart(3, '0')}.wav`
const filePath = path.resolve(outputDir, fileName)
const timer = setTimeout(() => {
if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
}, 60000)
const audioRes = await axios.get(audioUrl, { responseType: 'arraybuffer', timeout: 30000 })
const wavBuffer = Buffer.from(audioRes.data)
ws.addEventListener('open', () => {
// Step 1: run-task — empty input, no text
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'run-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
model,
parameters: {
voice,
format: 'mp3',
sample_rate: 24000,
volume: 50,
rate: 1.0,
pitch_rate: 1.0,
text_type: 'PlainText',
...(instruction ? { instruction } : {}),
},
input: {},
},
}))
})
// 追加 0.3s 静音(句间气口)
const silenceSec = options.silencePadding !== undefined ? options.silencePadding : 0.3
const silenceBytes = Math.round(24000 * 2 * silenceSec)
const silenceBuffer = Buffer.alloc(silenceBytes, 0)
const finalBuffer = Buffer.concat([wavBuffer, silenceBuffer])
// 更新 WAV 头的文件大小
finalBuffer.writeUInt32LE(finalBuffer.length - 8, 4)
finalBuffer.writeUInt32LE(wavBuffer.length - 44 + silenceBytes, 40)
fs.writeFileSync(filePath, finalBuffer)
ws.addEventListener('message', async (event) => {
if (typeof event.data !== 'string') {
const buf = event.data instanceof Blob
? Buffer.from(await event.data.arrayBuffer())
: Buffer.from(event.data)
chunks.push(buf)
return
}
try {
const msg = JSON.parse(event.data)
const evt = msg.header?.event
const duration = (finalBuffer.length - 44) / (24000 * 2)
if (evt === 'task-started') {
// Step 2: continue-task — send text
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'continue-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
model,
input: { text },
},
}))
return { filePath, duration }
// Step 3: finish-task
ws.send(JSON.stringify({
header: {
task_id: taskId,
action: 'finish-task',
streaming: 'duplex',
},
payload: {
task_group: 'audio',
task: 'tts',
function: 'SpeechSynthesizer',
input: {},
},
}))
} else if (evt === 'task-finished') {
clearTimeout(timer)
ws.close()
if (settled) return
settled = true
const audio = Buffer.concat(chunks)
if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
fs.writeFileSync(filePath, audio)
resolve({ filePath, duration: getAudioDuration(filePath) })
} else if (evt === 'task-failed') {
clearTimeout(timer)
ws.close()
if (settled) return
settled = true
reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
}
} catch {}
})
ws.addEventListener('error', (e) => {
clearTimeout(timer)
if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
})
})
}
/**
* 批量语音合成
* @param {Array<{id: number, text: string}>} segments
* @param {object} options - { voice, outputDir }
* @returns {Array<{id: number, text: string, audio: string, duration: number}>}
*/
async function synthesizeBatch(segments, options = {}) {
const results = []
for (const seg of segments) {
console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
const { filePath, duration } = await synthesize(seg.text, {
...options,
id: seg.id,
})
results.push({
id: seg.id,
text: seg.text,
audio: filePath,
duration: Math.round(duration * 1000) / 1000,
})
// 间隔 0.5 秒避免限流
try {
const { filePath, duration } = await synthesize(seg.text, {
...options,
id: seg.id,
})
results.push({
id: seg.id,
text: seg.text,
audio: filePath,
duration: Math.round(duration * 1000) / 1000,
})
} catch (err) {
results.push({
id: seg.id,
text: seg.text,
audio: '',
duration: 0,
error: err.message,
})
}
await new Promise(r => setTimeout(r, 500))
}
return results
@@ -147,22 +230,19 @@ async function main() {
console.error('input.json 格式:')
console.error(JSON.stringify({
segments: [{ id: 1, text: '文案' }],
voice: 'Cherry',
voice: 'longanyang',
output_dir: './audio',
}, null, 2))
process.exit(1)
}
const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
const segments = config.segments
const options = {
const results = await synthesizeBatch(config.segments, {
voice: config.voice,
outputDir: config.output_dir || './audio',
}
})
const results = await synthesizeBatch(segments, options)
const output = { segments: results }
process.stdout.write(JSON.stringify(output, null, 2) + '\n')
process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
}
if (require.main === module) {