feat(video-pipeline): 重构多阶段生成管线并集成 CosyVoice TTS
- 重写 `phase-images`:改为并发 3 张并行生成,每个 item 完成立即写入 manifest,支持 MJ task ID 恢复 - 重写 `phase-videos`:先恢复已有 task ID 再提交新任务(并发 3),支持中断恢复 - 迁移 TTS 引擎:从 Qwen-TTS HTTP 接口切换为 CosyVoice WebSocket 接口,支持音色/语气参数透传 - 精简账号系统:移除 `styles/` 目录、`taskId` 过滤和 `--id` 正则校验,`references` 改为顶层字段 - 调整 `slugify`:限制中文字符 5 个、其他 10 个,避免文件名过长 - 更新文档:`manifest-schema.md` 中 `narration` 改为完整原文案,`account-creation.md` 新增 TTS 配置项 - 配置更新:默认 TTS 模型切换为 `cosyvoice-v3.5-plus`,新增 `localAudio` 参数
This commit is contained in:
@@ -28,7 +28,7 @@
|
||||
"ossExpires": 31536000,
|
||||
"ttsApiBaseUrl": "https://dashscope.aliyuncs.com/api/v1",
|
||||
"ttsApiKey": "sk-1c503705b0f844a6b4f2386f6c1cc35b",
|
||||
"ttsModel": "qwen3-tts-flash",
|
||||
"ttsVoice": "Cherry",
|
||||
"ttsModel": "cosyvoice-v3.5-plus",
|
||||
"ttsVoice": "cosyvoice-v3.5-plus-bailian-fa8787c0f70b4ba2a907c35511e6a6f6",
|
||||
"ttsLanguage": "Chinese"
|
||||
}
|
||||
|
||||
@@ -89,6 +89,8 @@ Phase 4: 技术配置(有默认值,可跳过)
|
||||
| 12 | 生图模型? | gemini | account.json 的 imageModel |
|
||||
| 13 | 视频模型? | veo3-fast | account.json 的 videoModel |
|
||||
| 14 | 参考图文件? | 无 | 用户稍后放入 references/ 目录,Agent 上传 OSS 回写 URL |
|
||||
| 15 | TTS 音色? | config.json 全局 ttsVoice | account.json 的 ttsVoice,留空用全局默认 |
|
||||
| 16 | TTS 语气指令? | 无 | account.json 的 ttsInstruction,描述期望的语气风格 |
|
||||
|
||||
**运动偏好 → 视频提示词映射**:
|
||||
|
||||
@@ -128,6 +130,8 @@ Phase 4: 技术配置(有默认值,可跳过)
|
||||
- 画幅:{Q11}
|
||||
- 生图模型:{Q12}
|
||||
- 视频模型:{Q13}
|
||||
- TTS音色:{Q15}
|
||||
- TTS语气:{Q16}
|
||||
|
||||
确认 "开始" → 创建账号
|
||||
修改 → 调整后重新输出
|
||||
|
||||
@@ -58,7 +58,7 @@ node pipeline.js validate --manifest <path>
|
||||
|------|------|
|
||||
| `status` | 固定写 `"pending"` |
|
||||
| `shotDesc` | 英文分镜描述(含隐性动势,40-80词) |
|
||||
| `narration` | 中文口播旁白(≤22字) |
|
||||
| `narration` | **该段的完整原文案**(不提炼,保留论证、例子、细节)|
|
||||
| `duration` | 计划视频时长(秒),来自分镜阶段 |
|
||||
| `imagePrompt` | 英文画面描述(给 Gemini/MJ),Step 2-A 生成 |
|
||||
| `directorRef` | 导演构图参考(tarantino / kitano / fincher),三层透传 |
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* CapCut 成片组装脚本
|
||||
*
|
||||
@@ -167,6 +166,7 @@ async function assemble(args) {
|
||||
apiKey = '',
|
||||
duration = '4',
|
||||
animation = 'kenburns-zoom',
|
||||
localAudio = 'false',
|
||||
} = args
|
||||
|
||||
if (!input) throw new Error('缺少 --input 参数')
|
||||
@@ -295,7 +295,7 @@ async function assemble(args) {
|
||||
// -- 添加 TTS 配音 --
|
||||
step++; console.log(`[${step}/${totalSteps}] 添加 TTS 配音...`)
|
||||
if (voiceover === 'true' && hasTTS) {
|
||||
await addVoiceover(draftUrl, inputDir, items, timeline)
|
||||
await addVoiceover(draftUrl, inputDir, items, timeline, localAudio === 'true')
|
||||
} else {
|
||||
console.log(' 跳过(无 TTS 音频或未启用)')
|
||||
}
|
||||
@@ -567,7 +567,7 @@ async function batchUploadAudio(inputDir, items) {
|
||||
// 添加 TTS 配音(每段音频按时间线排列)
|
||||
// ============================================================================
|
||||
|
||||
async function addVoiceover(draftUrl, inputDir, items, timeline) {
|
||||
async function addVoiceover(draftUrl, inputDir, items, timeline, localAudio = false) {
|
||||
// 收集需要上传的音频
|
||||
const audioItems = items.filter(item => item.audio)
|
||||
if (audioItems.length === 0) {
|
||||
@@ -576,8 +576,10 @@ async function addVoiceover(draftUrl, inputDir, items, timeline) {
|
||||
}
|
||||
|
||||
// 上传本地音频到 OSS(已有的 URL 直接通过)
|
||||
console.log(' 上传 TTS 音频到 OSS...')
|
||||
const audioUrls = await batchUploadAudio(inputDir, items)
|
||||
// 根据 localAudio 参数决定是否上传
|
||||
const audioUrls = localAudio
|
||||
? {} // 本地模式:不上传,使用本地路径
|
||||
: await batchUploadAudio(inputDir, items)
|
||||
|
||||
const audioInfos = []
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
/**
|
||||
* Command: create-account — 一键创建账号
|
||||
*
|
||||
* 创建目录结构 → 复制参考图 → 上传 OSS → 生成 account.json + 风格骨架
|
||||
* 创建目录结构 → 复制参考图 → 上传 OSS → 生成 account.json
|
||||
* prompt 文件通过 Agent Q&A 流程生成(见 account-creation.md)
|
||||
*/
|
||||
|
||||
const fs = require('fs')
|
||||
@@ -9,10 +10,9 @@ const path = require('path')
|
||||
const { ensureDir, log, ACCOUNTS_DIR } = require('./pipeline-utils')
|
||||
|
||||
async function createAccount(args) {
|
||||
const { id, name, desc, format, imageModel, videoModel, references } = args
|
||||
const { id, name, desc, format, imageModel, videoModel, references, ttsVoice, ttsInstruction } = args
|
||||
|
||||
if (!id) { console.error('错误: 必须指定 --id <账号ID>'); process.exit(1) }
|
||||
if (!/^[a-z0-9_-]+$/.test(id)) { console.error('错误: id 只允许小写字母/数字/短横线/下划线'); process.exit(1) }
|
||||
if (!name) { console.error('错误: 必须指定 --name <账号名>'); process.exit(1) }
|
||||
|
||||
const accountDir = path.join(ACCOUNTS_DIR, id)
|
||||
@@ -21,7 +21,6 @@ async function createAccount(args) {
|
||||
ensureDir(accountDir)
|
||||
ensureDir(path.join(accountDir, 'prompts'))
|
||||
ensureDir(path.join(accountDir, 'references'))
|
||||
ensureDir(path.join(accountDir, 'styles'))
|
||||
|
||||
// 复制参考图到 references/ 并上传 OSS
|
||||
const refs = (references || '').split(',').filter(Boolean)
|
||||
@@ -47,7 +46,6 @@ async function createAccount(args) {
|
||||
}
|
||||
|
||||
// 生成 account.json
|
||||
const styleName = args.style || id
|
||||
const accountConfig = {
|
||||
id,
|
||||
name,
|
||||
@@ -56,9 +54,12 @@ async function createAccount(args) {
|
||||
imageModel: imageModel || 'gemini',
|
||||
videoModel: videoModel || '',
|
||||
batchSize: 30,
|
||||
ttsVoice: ttsVoice || '',
|
||||
ttsInstruction: ttsInstruction || '',
|
||||
storyboardPrompt: 'prompts/分镜.md',
|
||||
imageStylePrompt: 'prompts/图片提示词.md',
|
||||
videoStylePrompt: 'prompts/视频提示词.md',
|
||||
references: uploadedRefs,
|
||||
capcut: {
|
||||
effects: [],
|
||||
filter: '',
|
||||
@@ -72,65 +73,15 @@ async function createAccount(args) {
|
||||
},
|
||||
}
|
||||
|
||||
if (uploadedRefs.length > 0) {
|
||||
accountConfig.styles = {
|
||||
[styleName]: { references: uploadedRefs },
|
||||
}
|
||||
}
|
||||
|
||||
const accountPath = path.join(accountDir, 'account.json')
|
||||
fs.writeFileSync(accountPath, JSON.stringify(accountConfig, null, 2), 'utf-8')
|
||||
|
||||
// 生成默认风格骨架
|
||||
const stylePath = path.join(accountDir, 'styles', `${styleName}.md`)
|
||||
const styleContent = [
|
||||
`# ${styleName}`,
|
||||
'',
|
||||
`${desc || name} 的视觉风格。`,
|
||||
'',
|
||||
'---',
|
||||
'',
|
||||
'## 图片提示词',
|
||||
'',
|
||||
'### 核心视觉要素',
|
||||
'',
|
||||
'(待填充:描述关键视觉元素)',
|
||||
'',
|
||||
'### 色调方案',
|
||||
'',
|
||||
'(待填充)',
|
||||
'',
|
||||
'### 图片 Prompt 模板',
|
||||
'',
|
||||
'(待填充)',
|
||||
'',
|
||||
'### 图片禁止项',
|
||||
'',
|
||||
'- 文字水印',
|
||||
'- 字幕覆盖',
|
||||
'',
|
||||
'---',
|
||||
'',
|
||||
'## 视频提示词',
|
||||
'',
|
||||
'### 运镜规则',
|
||||
'',
|
||||
'(待填充)',
|
||||
'',
|
||||
'### 视频 Prompt 模板',
|
||||
'',
|
||||
'(待填充)',
|
||||
'',
|
||||
].join('\n')
|
||||
fs.writeFileSync(stylePath, styleContent, 'utf-8')
|
||||
|
||||
console.log(`\n账号已创建: ${accountDir}`)
|
||||
console.log(` ID: ${id}`)
|
||||
console.log(` 名称: ${name}`)
|
||||
console.log(` 模型: ${accountConfig.imageModel} + ${accountConfig.videoModel || '(未指定)'}`)
|
||||
console.log(` 参考图: ${uploadedRefs.length} 张(${uploadedRefs.filter(r => r.url).length} 已上传)`)
|
||||
console.log(` 风格: ${styleName}`)
|
||||
console.log(`\n下一步: 编辑 ${stylePath} 完善提示词策略\n`)
|
||||
console.log(`\n下一步: 通过 Agent Q&A 流程生成 prompts/*.md(或手动创建)\n`)
|
||||
|
||||
return accountPath
|
||||
}
|
||||
|
||||
@@ -57,11 +57,9 @@ function initManifest(options) {
|
||||
}
|
||||
}
|
||||
|
||||
// 从 account.json 继承参考图
|
||||
const styles = accountConfig.styles || {}
|
||||
const firstStyleKey = Object.keys(styles)[0]
|
||||
const styleRefs = firstStyleKey ? (styles[firstStyleKey].references || []) : []
|
||||
const references = styleRefs.map(ref => {
|
||||
// 从 account.json 继承参考图(顶层 references)
|
||||
const accountRefs = accountConfig.references || []
|
||||
const references = accountRefs.map(ref => {
|
||||
const entry = {}
|
||||
if (ref.file) entry.file = path.join(ACCOUNTS_DIR, accountId, 'references', ref.file)
|
||||
if (ref.url) entry.url = ref.url
|
||||
@@ -88,11 +86,13 @@ function initManifest(options) {
|
||||
// 组装 manifest
|
||||
const manifest = {
|
||||
account: accountId,
|
||||
imageModel: accountConfig.imageModel || 'gemini',
|
||||
videoModel: accountConfig.videoModel || 'veo3-fast-frames',
|
||||
format: accountConfig.defaultFormat || '9:16',
|
||||
imageModel: options.imageModel || accountConfig.imageModel || 'gemini',
|
||||
videoModel: options.videoModel || accountConfig.videoModel || 'veo3-fast-frames',
|
||||
format: options.format || accountConfig.defaultFormat || '9:16',
|
||||
mode: resolvedMode,
|
||||
references,
|
||||
...(accountConfig.ttsVoice ? { ttsVoice: accountConfig.ttsVoice } : {}),
|
||||
...(accountConfig.ttsInstruction ? { ttsInstruction: accountConfig.ttsInstruction } : {}),
|
||||
items,
|
||||
}
|
||||
|
||||
|
||||
@@ -72,6 +72,24 @@ function validateAccount(accountId) {
|
||||
if (!config.imageModel) issues.push('缺少 imageModel')
|
||||
if (!config.defaultFormat) issues.push('缺少 defaultFormat')
|
||||
|
||||
// 检查 prompts 文件
|
||||
const promptFiles = [
|
||||
{ field: 'storyboardPrompt', label: '分镜' },
|
||||
{ field: 'imageStylePrompt', label: '图片提示词' },
|
||||
{ field: 'videoStylePrompt', label: '视频提示词' },
|
||||
]
|
||||
for (const { field, label } of promptFiles) {
|
||||
const relPath = config[field]
|
||||
if (!relPath) {
|
||||
issues.push(`缺少 ${field}(prompts 路径)`)
|
||||
} else {
|
||||
const absPath = path.join(accountDir, relPath)
|
||||
if (!fs.existsSync(absPath)) {
|
||||
issues.push(`${label}文件不存在: ${relPath}`)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const refDir = path.join(accountDir, 'references')
|
||||
const localRefs = fs.existsSync(refDir)
|
||||
? fs.readdirSync(refDir).filter(f => /\.(png|jpg|jpeg|webp)$/i.test(f))
|
||||
|
||||
@@ -23,19 +23,23 @@ async function phaseAssemble(manifest, manifestPath, options) {
|
||||
subtitles: mode === 'images' ? 'true' : 'false',
|
||||
voiceover: manifest.items.some(it => it.audio) ? 'true' : 'false',
|
||||
duration: '4',
|
||||
animation: 'kenburns-zoom',
|
||||
animation: capcutConfig.animation || 'kenburns-zoom',
|
||||
}
|
||||
|
||||
if (capcutConfig.defaultBGM) assembleArgs.bgm = capcutConfig.defaultBGM
|
||||
if (capcutConfig.effects) assembleArgs.effects = capcutConfig.effects.join(',')
|
||||
if (capcutConfig.filter) assembleArgs.filter = capcutConfig.filter
|
||||
|
||||
log('assemble', `模式: ${mode}, 字幕: true, 配音: ${assembleArgs.voiceover}`)
|
||||
log('assemble', `模式: ${mode}, 字幕: true, 配音: ${assembleArgs.voiceover}, 动画: ${assembleArgs.animation}`)
|
||||
|
||||
try {
|
||||
const { assemble } = require('../capcut_assemble')
|
||||
await assemble(assembleArgs)
|
||||
|
||||
log('assemble', '成片完成')
|
||||
} catch (err) {
|
||||
log('assemble', `成片失败: ${err.message}`)
|
||||
throw err
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = { phaseAssemble }
|
||||
|
||||
@@ -2,11 +2,15 @@
|
||||
* Phase: images — 图片生成
|
||||
*
|
||||
* 支持 Gemini / MJ / Kling 三种模型,含首尾帧模式
|
||||
* 并发生成,支持 task ID 恢复(MJ)
|
||||
*/
|
||||
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
const { saveManifest, getReferences, ensureDir, renameGeneratedFile, log, getManifestDir } = require('./pipeline-utils')
|
||||
|
||||
const IMAGE_CONCURRENCY = 3
|
||||
|
||||
async function phaseImages(manifest, manifestPath, options) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
const imagesDir = path.join(dir, 'images')
|
||||
@@ -21,104 +25,172 @@ async function phaseImages(manifest, manifestPath, options) {
|
||||
let model = options.imageModel || manifest.imageModel || accountConfig.imageModel || 'gemini'
|
||||
const ratio = manifest.format || accountConfig.defaultFormat || '9:16'
|
||||
|
||||
// 首尾帧模式:MJ 降级为 Gemini(MJ 出4张候选图无法一一对应首尾帧)
|
||||
// 首尾帧模式:MJ 降级为 Gemini
|
||||
if (model === 'mj' && manifest.mode === 'framePair') {
|
||||
log('images', '首尾帧模式不支持 MJ,自动降级为 Gemini')
|
||||
model = 'gemini'
|
||||
}
|
||||
const refs = getReferences(manifest, accountConfig)
|
||||
|
||||
log('images', `共 ${items.length} 张, 模型: ${model}, 画幅: ${ratio}, 参考图: ${refs.localPaths.length}本地/${refs.urls.length}URL`)
|
||||
log('images', `共 ${items.length} 张, 模型: ${model}, 画幅: ${ratio}, 参考图: ${refs.localPaths.length}本地/${refs.urls.length}URL, 并发: ${IMAGE_CONCURRENCY}`)
|
||||
|
||||
for (let i = 0; i < items.length; i++) {
|
||||
const item = items[i]
|
||||
const idx = i + 1
|
||||
// 分批并发处理
|
||||
for (let batchStart = 0; batchStart < items.length; batchStart += IMAGE_CONCURRENCY) {
|
||||
const batch = items.slice(batchStart, batchStart + IMAGE_CONCURRENCY)
|
||||
|
||||
const results = await Promise.allSettled(
|
||||
batch.map(async (item) => {
|
||||
const idx = item.id
|
||||
try {
|
||||
item.status = 'generating'
|
||||
saveManifest(manifestPath, manifest)
|
||||
|
||||
let result
|
||||
if (model === 'gemini') {
|
||||
result = await generateGemini(item, idx, dir, imagesDir, ratio, refs)
|
||||
} else if (model === 'mj') {
|
||||
result = await generateMJ(item, idx, dir, imagesDir, ratio, refs, manifestPath)
|
||||
} else if (model === 'kling') {
|
||||
result = await generateKling(item, idx, dir, imagesDir, ratio, refs)
|
||||
} else {
|
||||
throw new Error(`不支持的模型: ${model}(支持: gemini, mj, kling)`)
|
||||
}
|
||||
|
||||
if (result.file) {
|
||||
item.file = result.file
|
||||
if (result.candidates) item.candidates = result.candidates
|
||||
item.status = 'done'
|
||||
log('images', `[${idx}] 完成: ${item.file}`)
|
||||
} else {
|
||||
item.status = 'failed'
|
||||
item.error = '生成器未返回文件'
|
||||
log('images', `[${idx}] 失败: 生成器未返回文件`)
|
||||
}
|
||||
// 每个 item 完成后立即写盘,防止崩溃丢失已完成的结果
|
||||
saveManifest(manifestPath, manifest)
|
||||
|
||||
// 首尾帧模式:生成第二张图
|
||||
if (item.status === 'done' && manifest.mode === 'framePair' && item.lastFramePrompt && !item.lastFrame) {
|
||||
await generateLastFrame(item, idx, manifest, dir, imagesDir, model, ratio, manifestPath)
|
||||
}
|
||||
|
||||
return { ok: true }
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = err.message
|
||||
log('images', `[${idx}] 失败: ${err.message}`)
|
||||
saveManifest(manifestPath, manifest)
|
||||
return { ok: false, error: err.message }
|
||||
}
|
||||
})
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// 各模型生成逻辑
|
||||
// ============================================================================
|
||||
|
||||
async function generateGemini(item, idx, dir, imagesDir, ratio, refs) {
|
||||
const { generate: geminiGen, edit: geminiEdit } = require('../gemini-image-generator')
|
||||
let result
|
||||
if (refs.localPaths.length > 0) {
|
||||
log('images', `[${idx}/${items.length}] Gemini 图生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
log('images', `[${idx}] Gemini 图生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
result = await geminiEdit(item.imagePrompt, refs.localPaths, {
|
||||
outputDir: imagesDir,
|
||||
aspectRatio: ratio,
|
||||
})
|
||||
} else {
|
||||
log('images', `[${idx}/${items.length}] Gemini 文生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
log('images', `[${idx}] Gemini 文生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
result = await geminiGen(item.imagePrompt, {
|
||||
outputDir: imagesDir,
|
||||
aspectRatio: ratio,
|
||||
})
|
||||
}
|
||||
if (result.savedFiles && result.savedFiles.length > 0) {
|
||||
item.file = renameGeneratedFile(
|
||||
const file = (result.savedFiles && result.savedFiles.length > 0)
|
||||
? renameGeneratedFile(
|
||||
path.relative(dir, result.savedFiles[0]).replace(/\\/g, '/'),
|
||||
dir, idx, item.narration || item.shotDesc, ''
|
||||
)
|
||||
: null
|
||||
return { file }
|
||||
}
|
||||
|
||||
async function generateMJ(item, idx, dir, imagesDir, ratio, refs, manifestPath) {
|
||||
const { MJApi, ImageUtils } = require('../mj-image-generator')
|
||||
const referenceImages = refs.urls.length > 0 ? refs.urls : []
|
||||
const styleWeight = 200
|
||||
|
||||
let result
|
||||
|
||||
// 尝试恢复中断的 MJ 任务
|
||||
if (item.taskId && item.status === 'generating') {
|
||||
try {
|
||||
log('images', `[${idx}] 恢复 MJ 任务: ${item.taskId}`)
|
||||
const pollResult = await MJApi.poll(item.taskId)
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
|
||||
const gridFile = path.join(imagesDir, `${timestamp}_grid.png`)
|
||||
await ImageUtils.download(pollResult.imageUrl, gridFile)
|
||||
const splitFiles = await ImageUtils.split4(gridFile, imagesDir, timestamp)
|
||||
fs.unlinkSync(gridFile)
|
||||
result = { files: splitFiles }
|
||||
log('images', `[${idx}] MJ 任务恢复成功`)
|
||||
} catch (err) {
|
||||
log('images', `[${idx}] MJ 任务恢复失败: ${err.message},重新提交`)
|
||||
delete item.taskId
|
||||
}
|
||||
} else if (model === 'mj') {
|
||||
const { generate: mjGen } = require('../mj-image-generator')
|
||||
const mjOpts = { outputDir: imagesDir, aspectRatio: ratio, split: true }
|
||||
if (refs.urls.length > 0) {
|
||||
mjOpts.referenceImages = refs.urls
|
||||
mjOpts.styleWeight = 200
|
||||
}
|
||||
log('images', `[${idx}/${items.length}] MJ 生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
result = await mjGen(item.imagePrompt, mjOpts)
|
||||
if (result.files && result.files.length > 0) {
|
||||
item.candidates = result.files.map((f, ci) =>
|
||||
|
||||
// 新提交
|
||||
if (!result) {
|
||||
log('images', `[${idx}] MJ 生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
const taskId = await MJApi.submit(item.imagePrompt, { referenceImages, aspectRatio: ratio, styleWeight })
|
||||
item.taskId = taskId
|
||||
saveManifest(manifestPath, manifest)
|
||||
|
||||
const pollResult = await MJApi.poll(taskId)
|
||||
const timestamp = new Date().toISOString().replace(/[:.]/g, '-')
|
||||
const gridFile = path.join(imagesDir, `${timestamp}_grid.png`)
|
||||
await ImageUtils.download(pollResult.imageUrl, gridFile)
|
||||
const splitFiles = await ImageUtils.split4(gridFile, imagesDir, timestamp)
|
||||
fs.unlinkSync(gridFile)
|
||||
result = { files: splitFiles }
|
||||
}
|
||||
|
||||
const file = (result.files && result.files.length > 0) ? result.files[0] : null
|
||||
const candidates = (result.files && result.files.length > 0)
|
||||
? result.files.map((f, ci) =>
|
||||
renameGeneratedFile(
|
||||
path.relative(dir, f).replace(/\\/g, '/'),
|
||||
dir, idx, item.narration || item.shotDesc, `cand${ci + 1}`
|
||||
)
|
||||
)
|
||||
item.file = item.candidates[0]
|
||||
log('images', `[${idx}/${items.length}] ${result.files.length} 张候选,默认选第1张`)
|
||||
: null
|
||||
|
||||
delete item.taskId
|
||||
if (candidates && candidates.length > 0) {
|
||||
log('images', `[${idx}] ${candidates.length} 张候选,默认选第1张`)
|
||||
return { file: candidates[0], candidates }
|
||||
}
|
||||
} else if (model === 'kling') {
|
||||
return { file }
|
||||
}
|
||||
|
||||
async function generateKling(item, idx, dir, imagesDir, ratio, refs) {
|
||||
const { generate: klingGen } = require('../kling-image-generator')
|
||||
const klingOpts = { outputDir: imagesDir, aspectRatio: ratio }
|
||||
if (refs.urls.length > 0) {
|
||||
klingOpts.styleImageUrl = refs.urls[0]
|
||||
}
|
||||
log('images', `[${idx}/${items.length}] 可灵生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
result = await klingGen(item.imagePrompt, klingOpts)
|
||||
if (result.savedFiles && result.savedFiles.length > 0) {
|
||||
item.file = renameGeneratedFile(
|
||||
if (refs.urls.length > 0) klingOpts.styleImageUrl = refs.urls[0]
|
||||
log('images', `[${idx}] 可灵生图: ${item.imagePrompt.substring(0, 60)}...`)
|
||||
const result = await klingGen(item.imagePrompt, klingOpts)
|
||||
const file = (result.savedFiles && result.savedFiles.length > 0)
|
||||
? renameGeneratedFile(
|
||||
path.relative(dir, result.savedFiles[0]).replace(/\\/g, '/'),
|
||||
dir, idx, item.narration || item.shotDesc, ''
|
||||
)
|
||||
}
|
||||
} else {
|
||||
throw new Error(`不支持的模型: ${model}(支持: gemini, mj, kling)`)
|
||||
}
|
||||
|
||||
if (item.file) {
|
||||
item.status = 'done'
|
||||
log('images', `[${idx}/${items.length}] 完成: ${item.file}`)
|
||||
} else {
|
||||
item.status = 'failed'
|
||||
item.error = '生成器未返回文件'
|
||||
log('images', `[${idx}/${items.length}] 失败: 生成器未返回文件`)
|
||||
}
|
||||
|
||||
// 首尾帧模式:生成第二张图(lastFrame)
|
||||
if (item.status === 'done' && manifest.mode === 'framePair' && item.lastFramePrompt && !item.lastFrame) {
|
||||
await generateLastFrame(item, idx, items.length, manifest, dir, imagesDir, model, ratio, manifestPath)
|
||||
}
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = err.message
|
||||
log('images', `[${idx}/${items.length}] 失败: ${err.message}`)
|
||||
}
|
||||
saveManifest(manifestPath, manifest)
|
||||
}
|
||||
: null
|
||||
return { file }
|
||||
}
|
||||
|
||||
async function generateLastFrame(item, idx, total, manifest, dir, imagesDir, model, ratio, manifestPath) {
|
||||
async function generateLastFrame(item, idx, manifest, dir, imagesDir, model, ratio, manifestPath) {
|
||||
try {
|
||||
item.status = 'generating'
|
||||
saveManifest(manifestPath, manifest)
|
||||
@@ -131,14 +203,6 @@ async function generateLastFrame(item, idx, total, manifest, dir, imagesDir, mod
|
||||
outputDir: imagesDir,
|
||||
aspectRatio: ratio,
|
||||
})
|
||||
} else if (model === 'mj') {
|
||||
const { generate: mjGen } = require('../mj-image-generator')
|
||||
const mjOpts = { outputDir: imagesDir, aspectRatio: ratio, split: false }
|
||||
if (item.url) {
|
||||
mjOpts.referenceImages = [item.url]
|
||||
mjOpts.styleWeight = 200
|
||||
}
|
||||
lastResult = await mjGen(item.lastFramePrompt, mjOpts)
|
||||
} else if (model === 'kling') {
|
||||
const { generate: klingGen } = require('../kling-image-generator')
|
||||
lastResult = await klingGen(item.lastFramePrompt, {
|
||||
@@ -156,17 +220,17 @@ async function generateLastFrame(item, idx, total, manifest, dir, imagesDir, mod
|
||||
dir, idx, item.narration || item.shotDesc, 'last'
|
||||
)
|
||||
item.status = 'done'
|
||||
log('images', `[${idx}/${total}] lastFrame 完成: ${item.lastFrame}`)
|
||||
log('images', `[${idx}] lastFrame 完成: ${item.lastFrame}`)
|
||||
} else {
|
||||
item.status = 'failed'
|
||||
item.error = 'lastFrame 生成器未返回文件'
|
||||
log('images', `[${idx}/${total}] lastFrame 失败: 未返回文件`)
|
||||
log('images', `[${idx}] lastFrame 失败: 未返回文件`)
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
item.status = 'failed'
|
||||
item.error = `lastFrame 失败: ${err.message}`
|
||||
log('images', `[${idx}/${total}] lastFrame 失败: ${err.message}`)
|
||||
log('images', `[${idx}] lastFrame 失败: ${err.message}`)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
|
||||
async function phaseTts(manifest, manifestPath) {
|
||||
async function phaseTts(manifest, manifestPath, options = {}) {
|
||||
const dir = getManifestDir(manifestPath)
|
||||
const audioDir = path.join(dir, 'audio')
|
||||
ensureDir(audioDir)
|
||||
@@ -28,6 +28,8 @@ async function phaseTts(manifest, manifestPath) {
|
||||
const { filePath, duration } = await synthesize(item.narration || item.text, {
|
||||
outputDir: audioDir,
|
||||
id: item.id || idx,
|
||||
voice: manifest.ttsVoice || undefined,
|
||||
instruction: manifest.ttsInstruction || undefined,
|
||||
})
|
||||
item.audio = path.relative(dir, filePath).replace(/\\/g, '/')
|
||||
item.audioDuration = Math.round(duration * 1000) / 1000
|
||||
|
||||
@@ -2,8 +2,10 @@
|
||||
* Phase: videos — 视频生成(VEO / Grok / Kling)
|
||||
*
|
||||
* 图生视频,批量提交,生成后自动上传 OSS
|
||||
* 支持 task ID 恢复:中断后重跑时优先恢复已有任务
|
||||
*/
|
||||
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
const { saveManifest, ensureDir, log, getManifestDir } = require('./pipeline-utils')
|
||||
|
||||
@@ -21,63 +23,140 @@ async function phaseVideos(manifest, manifestPath, options) {
|
||||
if (items.length === 0) { log('videos', '无待处理 item,跳过'); return }
|
||||
|
||||
// 选择生成器
|
||||
let generator
|
||||
let Api, pollFn
|
||||
const modelLower = videoModel.toLowerCase()
|
||||
if (modelLower.includes('grok')) {
|
||||
generator = require('../grok-video-generator')
|
||||
const gen = require('../grok-video-generator')
|
||||
Api = gen.GrokApi; pollFn = gen.pollWithRetry
|
||||
} else if (modelLower.includes('kling')) {
|
||||
generator = require('../kling-video-generator')
|
||||
const gen = require('../kling-video-generator')
|
||||
Api = gen.KlingApi; pollFn = gen.pollWithRetry
|
||||
} else {
|
||||
generator = require('../veo-video-generator')
|
||||
const gen = require('../veo-video-generator')
|
||||
Api = gen.VeoApi; pollFn = gen.pollWithRetry
|
||||
}
|
||||
|
||||
const ratio = manifest.format || '9:16'
|
||||
log('videos', `共 ${items.length} 个, 模型: ${videoModel}`)
|
||||
|
||||
const tasks = items.map((item, i) => {
|
||||
const task = {
|
||||
id: item.id || i + 1,
|
||||
prompt: item.videoPrompt,
|
||||
image: item.url,
|
||||
outputDir: videosDir,
|
||||
}
|
||||
if (item.lastFrameUrl) {
|
||||
task.images = [item.url, item.lastFrameUrl]
|
||||
task.lastFrameUrl = item.lastFrameUrl
|
||||
// Phase 1: 恢复已有任务(有 videoTaskId 的 item)
|
||||
const recovered = []
|
||||
const needSubmit = []
|
||||
|
||||
for (const item of items) {
|
||||
if (item.videoTaskId) {
|
||||
recovered.push(item)
|
||||
} else {
|
||||
task.images = [item.url]
|
||||
needSubmit.push(item)
|
||||
}
|
||||
}
|
||||
return task
|
||||
})
|
||||
|
||||
// 轮询恢复的任务
|
||||
if (recovered.length > 0) {
|
||||
log('videos', `尝试恢复 ${recovered.length} 个中断任务...`)
|
||||
await Promise.allSettled(
|
||||
recovered.map(async (item) => {
|
||||
try {
|
||||
const results = await generator.batchGenerate(tasks, {
|
||||
videoModel,
|
||||
aspectRatio: manifest.format || '9:16',
|
||||
log('videos', ` 恢复 item ${item.id}: ${item.videoTaskId}`)
|
||||
const result = await pollFn(item.videoTaskId, item.videoPrompt, {
|
||||
outputDir: videosDir,
|
||||
skipManifestWrite: true,
|
||||
aspectRatio: ratio,
|
||||
imageUrl: item.url,
|
||||
lastFrameUrl: item.lastFrameUrl || '',
|
||||
})
|
||||
|
||||
for (let i = 0; i < results.length; i++) {
|
||||
const result = results[i]
|
||||
const item = items[i]
|
||||
if (!item) continue
|
||||
if (result.success && result.file) {
|
||||
if (result.file) {
|
||||
item.video = path.relative(dir, result.file).replace(/\\/g, '/')
|
||||
item.videoDuration = result.duration
|
||||
} else {
|
||||
item.status = 'failed'
|
||||
item.error = result.error || '视频生成失败'
|
||||
log('videos', ` item ${(item.id || '?')} 失败: ${item.error}`)
|
||||
}
|
||||
delete item.videoTaskId
|
||||
log('videos', ` item ${item.id} 恢复成功`)
|
||||
}
|
||||
} catch (err) {
|
||||
log('videos', `批量生成失败: ${err.message}`)
|
||||
for (const item of items) {
|
||||
if (!item.video) {
|
||||
item.status = 'failed'
|
||||
item.error = `批量生成异常: ${err.message}`
|
||||
log('videos', ` item ${item.id} 恢复失败: ${err.message},将重新提交`)
|
||||
delete item.videoTaskId
|
||||
needSubmit.push(item)
|
||||
}
|
||||
saveManifest(manifestPath, manifest)
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
if (needSubmit.length === 0) { log('videos', '全部通过恢复完成'); return }
|
||||
|
||||
// Phase 2: 提交新任务(并发 3)
|
||||
const concurrency = 3
|
||||
log('videos', `提交 ${needSubmit.length} 个新任务(并发: ${concurrency})...`)
|
||||
|
||||
const submitted = []
|
||||
for (let i = 0; i < needSubmit.length; i += concurrency) {
|
||||
const batch = needSubmit.slice(i, i + concurrency)
|
||||
const batchResults = await Promise.allSettled(
|
||||
batch.map(async (item) => {
|
||||
const images = item.lastFrameUrl
|
||||
? [item.url, item.lastFrameUrl]
|
||||
: [item.url]
|
||||
const extraOpts = item.lastFrameUrl
|
||||
? { aspectRatio: ratio, lastFrameUrl: item.lastFrameUrl }
|
||||
: { aspectRatio: ratio }
|
||||
|
||||
try {
|
||||
const taskId = await Api.create(item.url, item.videoPrompt, extraOpts)
|
||||
return { item, taskId, error: null }
|
||||
} catch (err) {
|
||||
return { item, taskId: null, error: err.message }
|
||||
}
|
||||
})
|
||||
)
|
||||
for (const r of batchResults) {
|
||||
const val = r.status === 'fulfilled' ? r.value : { item: null, taskId: null, error: r.reason }
|
||||
submitted.push(val)
|
||||
if (val.item && val.taskId) {
|
||||
val.item.videoTaskId = val.taskId
|
||||
}
|
||||
}
|
||||
saveManifest(manifestPath, manifest)
|
||||
}
|
||||
|
||||
// Phase 3: 轮询新任务
|
||||
const pending = submitted.filter(s => s.taskId)
|
||||
if (pending.length === 0) {
|
||||
log('videos', '所有任务提交失败')
|
||||
for (const s of submitted) {
|
||||
if (s.item) { s.item.status = 'failed'; s.item.error = s.error || '提交失败' }
|
||||
}
|
||||
saveManifest(manifestPath, manifest)
|
||||
return
|
||||
}
|
||||
|
||||
log('videos', `等待 ${pending.length} 个视频生成...`)
|
||||
|
||||
const pollResults = await Promise.allSettled(
|
||||
pending.map(async ({ item, taskId }) => {
|
||||
try {
|
||||
const result = await pollFn(taskId, item.videoPrompt, {
|
||||
outputDir: videosDir,
|
||||
aspectRatio: ratio,
|
||||
imageUrl: item.url,
|
||||
lastFrameUrl: item.lastFrameUrl || '',
|
||||
})
|
||||
return { item, result, ok: true }
|
||||
} catch (err) {
|
||||
return { item, error: err.message, ok: false }
|
||||
}
|
||||
})
|
||||
)
|
||||
|
||||
for (const r of pollResults) {
|
||||
const val = r.status === 'fulfilled' ? r.value : { ok: false, error: r.reason?.message }
|
||||
if (val.ok && val.result.file) {
|
||||
val.item.video = path.relative(dir, val.result.file).replace(/\\/g, '/')
|
||||
val.item.videoDuration = val.result.duration
|
||||
delete val.item.videoTaskId
|
||||
} else if (val.item) {
|
||||
val.item.status = 'failed'
|
||||
val.item.error = val.error || '视频生成未返回文件'
|
||||
delete val.item.videoTaskId
|
||||
}
|
||||
saveManifest(manifestPath, manifest)
|
||||
}
|
||||
|
||||
// 上传视频到 OSS
|
||||
|
||||
@@ -12,7 +12,7 @@ const SCRIPTS_DIR = path.join(__dirname, '..')
|
||||
const SKILLS_DIR = path.join(SCRIPTS_DIR, '..')
|
||||
const PROJECT_ROOT = path.join(SKILLS_DIR, '..', '..')
|
||||
const CONFIG_PATH = path.join(SKILLS_DIR, 'config.json')
|
||||
const ACCOUNTS_DIR = path.join(PROJECT_ROOT, 'accounts')
|
||||
const ACCOUNTS_DIR = path.join(PROJECT_ROOT, '..', 'accounts')
|
||||
|
||||
// ============================================================================
|
||||
// 配置 & Manifest
|
||||
@@ -64,10 +64,9 @@ function getReferences(manifest, accountConfig) {
|
||||
log('images', 'manifest.references 全部无效,尝试 account fallback')
|
||||
}
|
||||
|
||||
// Fallback 1: 从 account.json 的 styles.*.references 读取
|
||||
const styles = accountConfig.styles || {}
|
||||
for (const [, style] of Object.entries(styles)) {
|
||||
for (const ref of (style.references || [])) {
|
||||
// Fallback 1: 从 account.json 的顶层 references 读取
|
||||
const topRefs = accountConfig.references || []
|
||||
for (const ref of topRefs) {
|
||||
if (ref.url) result.urls.push(ref.url)
|
||||
if (ref.file && accountId) {
|
||||
const localPath = path.join(ACCOUNTS_DIR, accountId, 'references', ref.file)
|
||||
@@ -76,7 +75,6 @@ function getReferences(manifest, accountConfig) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (result.localPaths.length > 0 || result.urls.length > 0) return result
|
||||
|
||||
// Fallback 2: 扫描 account 的 references 目录
|
||||
@@ -111,11 +109,26 @@ function ensureDir(dir) {
|
||||
}
|
||||
|
||||
function slugify(text) {
|
||||
return text
|
||||
.replace(/[^\w一-鿿]/g, '_')
|
||||
.replace(/_+/g, '_')
|
||||
.replace(/^_|_$/g, '')
|
||||
.substring(0, 20)
|
||||
// 限制中文字符最多5个,其他字符(英文数字)最多10个
|
||||
let chineseChars = []
|
||||
let otherChars = []
|
||||
|
||||
for (const char of text) {
|
||||
if (/\p{Script=Han}/u.test(char)) {
|
||||
// 中文字符
|
||||
if (chineseChars.length < 5) {
|
||||
chineseChars.push(char)
|
||||
}
|
||||
} else if (/\w/u.test(char)) {
|
||||
// 英文、数字
|
||||
if (otherChars.length < 10) {
|
||||
otherChars.push(char)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const result = chineseChars.concat(otherChars).join('')
|
||||
return result || 'untitled'
|
||||
}
|
||||
|
||||
function renameGeneratedFile(oldRelPath, dir, seq, nameHint, suffix) {
|
||||
|
||||
@@ -216,7 +216,7 @@ async function main() {
|
||||
console.log('用法:')
|
||||
console.log(' pipeline.js create-account --id <id> --name <名称> [--desc ...] [--references file1,file2]')
|
||||
console.log(' pipeline.js validate-account --account <id>')
|
||||
console.log(' pipeline.js init --account <id> --mode <single|framePair> --items <JSON> [--items-file <path>]')
|
||||
console.log(' pipeline.js init --account <id> --mode <single|framePair> --items <JSON> [--items-file <path>] [--image-model gemini|mj] [--video-model veo3-fast|grok|kling] [--format 9:16]')
|
||||
console.log(' pipeline.js validate --manifest <path>')
|
||||
console.log(' pipeline.js confirm --manifest <path> --all')
|
||||
console.log(' pipeline.js run --manifest <path> [--account id] [--phase p1,p2] [--resume] [--retry-failed]')
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
#!/usr/bin/env node
|
||||
|
||||
/**
|
||||
* 阿里云 Qwen-TTS 批量语音合成脚本
|
||||
* CosyVoice TTS 批量语音合成脚本
|
||||
* 通过 WebSocket 调用阿里云 DashScope CosyVoice API
|
||||
*
|
||||
* 输入 JSON 文件格式:
|
||||
* {
|
||||
@@ -9,26 +10,25 @@
|
||||
* {"id": 1, "text": "第一段文案"},
|
||||
* {"id": 2, "text": "第二段文案"}
|
||||
* ],
|
||||
* "voice": "Cherry", // 可选,覆盖 config
|
||||
* "voice": "longanyang", // 可选,覆盖 config
|
||||
* "output_dir": "./audio" // 可选,默认 ./audio
|
||||
* }
|
||||
*
|
||||
* 输出 JSON (stdout):
|
||||
* {
|
||||
* "segments": [
|
||||
* {"id": 1, "text": "...", "audio": "./audio/seg_001.wav", "duration": 3.456},
|
||||
* ...
|
||||
* {"id": 1, "text": "...", "audio": "./audio/seg_001.mp3", "duration": 3.456}
|
||||
* ]
|
||||
* }
|
||||
*
|
||||
* 也可作为模块调用:
|
||||
* const { synthesize } = require('./qwen-tts')
|
||||
* const { filePath, duration } = await synthesize('你好世界', { voice: 'Cherry' })
|
||||
* const { filePath, duration } = await synthesize('你好世界', { voice: 'longanyang' })
|
||||
*/
|
||||
|
||||
const axios = require('axios')
|
||||
const fs = require('fs')
|
||||
const path = require('path')
|
||||
const { execFileSync } = require('child_process')
|
||||
|
||||
const CONFIG_PATH = path.join(__dirname, '..', '..', 'config.json')
|
||||
|
||||
@@ -37,91 +37,166 @@ function loadConfig() {
|
||||
return JSON.parse(fs.readFileSync(CONFIG_PATH, 'utf-8'))
|
||||
}
|
||||
|
||||
function getAudioDuration(filePath) {
|
||||
try {
|
||||
const out = execFileSync('ffprobe', [
|
||||
'-v', 'quiet', '-show_entries', 'format=duration',
|
||||
'-of', 'default=noprint_wrappers=1:nokey=1', filePath,
|
||||
], { encoding: 'utf-8', timeout: 10000 })
|
||||
return parseFloat(out.trim())
|
||||
} catch {
|
||||
const stat = fs.statSync(filePath)
|
||||
return stat.size * 8 / 32000
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 单段语音合成(非流式)
|
||||
* @param {string} text - 要合成的文本
|
||||
* @param {object} options - { voice, model, language, outputDir, id }
|
||||
* @returns {{ filePath: string, duration: number }}
|
||||
* 单段语音合成(CosyVoice WebSocket)
|
||||
* @param {string} text
|
||||
* @param {object} options - { voice, model, outputDir, id, instruction }
|
||||
* @returns {Promise<{filePath: string, duration: number}>}
|
||||
*/
|
||||
async function synthesize(text, options = {}) {
|
||||
function synthesize(text, options = {}) {
|
||||
return new Promise((resolve, reject) => {
|
||||
const config = loadConfig()
|
||||
|
||||
const apiKey = options.apiKey || config.ttsApiKey
|
||||
if (!apiKey) throw new Error('ttsApiKey 未配置,请在 config.json 中设置')
|
||||
if (!apiKey) { reject(new Error('ttsApiKey 未配置')); return }
|
||||
|
||||
const baseUrl = (options.apiBaseUrl || config.ttsApiBaseUrl || 'https://dashscope.aliyuncs.com/api/v1').replace(/\/$/, '')
|
||||
const model = options.model || config.ttsModel || 'qwen-tts'
|
||||
const voice = options.voice || config.ttsVoice || 'Cherry'
|
||||
const language = options.language || config.ttsLanguage || 'Chinese'
|
||||
const model = options.model || config.ttsModel || 'cosyvoice-v3-flash'
|
||||
const voice = options.voice || config.ttsVoice || 'longanyang'
|
||||
const instruction = options.instruction || config.ttsInstruction || ''
|
||||
const outputDir = options.outputDir || './audio'
|
||||
|
||||
fs.mkdirSync(outputDir, { recursive: true })
|
||||
|
||||
// 确保文本有句末标点,让 TTS 生成自然语调和尾部停顿
|
||||
text = text.trimEnd()
|
||||
if (!/[。!?.!?…]$/.test(text)) text += '。'
|
||||
|
||||
const url = `${baseUrl}/services/aigc/multimodal-generation/generation`
|
||||
|
||||
let res
|
||||
try {
|
||||
res = await axios.post(url, {
|
||||
model,
|
||||
input: {
|
||||
text,
|
||||
voice,
|
||||
language_type: language,
|
||||
},
|
||||
}, {
|
||||
headers: {
|
||||
'Authorization': `Bearer ${apiKey}`,
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
timeout: 60000,
|
||||
})
|
||||
} catch (err) {
|
||||
const detail = err.response?.data
|
||||
throw new Error(`TTS API 错误: ${err.message}${detail ? ' ' + JSON.stringify(detail) : ''}`)
|
||||
}
|
||||
|
||||
const audioUrl = res.data?.output?.audio?.url
|
||||
if (!audioUrl) {
|
||||
throw new Error(`TTS API 未返回音频 URL: ${JSON.stringify(res.data)}`)
|
||||
}
|
||||
|
||||
// 下载音频到本地
|
||||
const id = options.id || 1
|
||||
const fileName = `seg_${String(id).padStart(3, '0')}.wav`
|
||||
const fileName = `seg_${String(id).padStart(3, '0')}.mp3`
|
||||
const filePath = path.resolve(outputDir, fileName)
|
||||
|
||||
const audioRes = await axios.get(audioUrl, { responseType: 'arraybuffer', timeout: 30000 })
|
||||
const wavBuffer = Buffer.from(audioRes.data)
|
||||
const wsUrl = 'wss://dashscope.aliyuncs.com/api-ws/v1/inference'
|
||||
const ws = new WebSocket(wsUrl, {
|
||||
headers: { Authorization: `bearer ${apiKey}` },
|
||||
})
|
||||
|
||||
// 追加 0.3s 静音(句间气口)
|
||||
const silenceSec = options.silencePadding !== undefined ? options.silencePadding : 0.3
|
||||
const silenceBytes = Math.round(24000 * 2 * silenceSec)
|
||||
const silenceBuffer = Buffer.alloc(silenceBytes, 0)
|
||||
const finalBuffer = Buffer.concat([wavBuffer, silenceBuffer])
|
||||
// 更新 WAV 头的文件大小
|
||||
finalBuffer.writeUInt32LE(finalBuffer.length - 8, 4)
|
||||
finalBuffer.writeUInt32LE(wavBuffer.length - 44 + silenceBytes, 40)
|
||||
fs.writeFileSync(filePath, finalBuffer)
|
||||
const taskId = `tts_${Date.now()}_${id}`
|
||||
const chunks = []
|
||||
let settled = false
|
||||
|
||||
const duration = (finalBuffer.length - 44) / (24000 * 2)
|
||||
const timer = setTimeout(() => {
|
||||
if (!settled) { settled = true; ws.close(); reject(new Error('TTS 超时 (60s)')) }
|
||||
}, 60000)
|
||||
|
||||
return { filePath, duration }
|
||||
ws.addEventListener('open', () => {
|
||||
// Step 1: run-task — empty input, no text
|
||||
ws.send(JSON.stringify({
|
||||
header: {
|
||||
task_id: taskId,
|
||||
action: 'run-task',
|
||||
streaming: 'duplex',
|
||||
},
|
||||
payload: {
|
||||
task_group: 'audio',
|
||||
task: 'tts',
|
||||
function: 'SpeechSynthesizer',
|
||||
model,
|
||||
parameters: {
|
||||
voice,
|
||||
format: 'mp3',
|
||||
sample_rate: 24000,
|
||||
volume: 50,
|
||||
rate: 1.0,
|
||||
pitch_rate: 1.0,
|
||||
text_type: 'PlainText',
|
||||
...(instruction ? { instruction } : {}),
|
||||
},
|
||||
input: {},
|
||||
},
|
||||
}))
|
||||
})
|
||||
|
||||
ws.addEventListener('message', async (event) => {
|
||||
if (typeof event.data !== 'string') {
|
||||
const buf = event.data instanceof Blob
|
||||
? Buffer.from(await event.data.arrayBuffer())
|
||||
: Buffer.from(event.data)
|
||||
chunks.push(buf)
|
||||
return
|
||||
}
|
||||
try {
|
||||
const msg = JSON.parse(event.data)
|
||||
const evt = msg.header?.event
|
||||
|
||||
if (evt === 'task-started') {
|
||||
// Step 2: continue-task — send text
|
||||
ws.send(JSON.stringify({
|
||||
header: {
|
||||
task_id: taskId,
|
||||
action: 'continue-task',
|
||||
streaming: 'duplex',
|
||||
},
|
||||
payload: {
|
||||
task_group: 'audio',
|
||||
task: 'tts',
|
||||
function: 'SpeechSynthesizer',
|
||||
model,
|
||||
input: { text },
|
||||
},
|
||||
}))
|
||||
|
||||
// Step 3: finish-task
|
||||
ws.send(JSON.stringify({
|
||||
header: {
|
||||
task_id: taskId,
|
||||
action: 'finish-task',
|
||||
streaming: 'duplex',
|
||||
},
|
||||
payload: {
|
||||
task_group: 'audio',
|
||||
task: 'tts',
|
||||
function: 'SpeechSynthesizer',
|
||||
input: {},
|
||||
},
|
||||
}))
|
||||
} else if (evt === 'task-finished') {
|
||||
clearTimeout(timer)
|
||||
ws.close()
|
||||
if (settled) return
|
||||
settled = true
|
||||
|
||||
const audio = Buffer.concat(chunks)
|
||||
if (audio.length === 0) { reject(new Error('TTS 未返回音频')); return }
|
||||
|
||||
fs.writeFileSync(filePath, audio)
|
||||
resolve({ filePath, duration: getAudioDuration(filePath) })
|
||||
} else if (evt === 'task-failed') {
|
||||
clearTimeout(timer)
|
||||
ws.close()
|
||||
if (settled) return
|
||||
settled = true
|
||||
reject(new Error(`TTS 失败: ${msg.header?.error_message || msg.header?.message || JSON.stringify(msg)}`))
|
||||
}
|
||||
} catch {}
|
||||
})
|
||||
|
||||
ws.addEventListener('error', (e) => {
|
||||
clearTimeout(timer)
|
||||
if (!settled) { settled = true; reject(new Error(`WebSocket 错误: ${e.message || '连接失败'}`)) }
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
/**
|
||||
* 批量语音合成
|
||||
* @param {Array<{id: number, text: string}>} segments
|
||||
* @param {object} options - { voice, outputDir }
|
||||
* @returns {Array<{id: number, text: string, audio: string, duration: number}>}
|
||||
*/
|
||||
async function synthesizeBatch(segments, options = {}) {
|
||||
const results = []
|
||||
for (const seg of segments) {
|
||||
console.error(` 合成 #${seg.id}: ${seg.text.substring(0, 30)}...`)
|
||||
try {
|
||||
const { filePath, duration } = await synthesize(seg.text, {
|
||||
...options,
|
||||
id: seg.id,
|
||||
@@ -132,7 +207,15 @@ async function synthesizeBatch(segments, options = {}) {
|
||||
audio: filePath,
|
||||
duration: Math.round(duration * 1000) / 1000,
|
||||
})
|
||||
// 间隔 0.5 秒避免限流
|
||||
} catch (err) {
|
||||
results.push({
|
||||
id: seg.id,
|
||||
text: seg.text,
|
||||
audio: '',
|
||||
duration: 0,
|
||||
error: err.message,
|
||||
})
|
||||
}
|
||||
await new Promise(r => setTimeout(r, 500))
|
||||
}
|
||||
return results
|
||||
@@ -147,22 +230,19 @@ async function main() {
|
||||
console.error('input.json 格式:')
|
||||
console.error(JSON.stringify({
|
||||
segments: [{ id: 1, text: '文案' }],
|
||||
voice: 'Cherry',
|
||||
voice: 'longanyang',
|
||||
output_dir: './audio',
|
||||
}, null, 2))
|
||||
process.exit(1)
|
||||
}
|
||||
|
||||
const config = JSON.parse(fs.readFileSync(inputJson, 'utf-8'))
|
||||
const segments = config.segments
|
||||
const options = {
|
||||
const results = await synthesizeBatch(config.segments, {
|
||||
voice: config.voice,
|
||||
outputDir: config.output_dir || './audio',
|
||||
}
|
||||
})
|
||||
|
||||
const results = await synthesizeBatch(segments, options)
|
||||
const output = { segments: results }
|
||||
process.stdout.write(JSON.stringify(output, null, 2) + '\n')
|
||||
process.stdout.write(JSON.stringify({ segments: results }, null, 2) + '\n')
|
||||
}
|
||||
|
||||
if (require.main === module) {
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
"imageModel": "gemini",
|
||||
"videoModel": "veo3-fast",
|
||||
"batchSize": 30,
|
||||
"ttsVoice": "",
|
||||
"ttsInstruction": "",
|
||||
"storyboardPrompt": "prompts/分镜.md",
|
||||
"imageStylePrompt": "prompts/图片提示词.md",
|
||||
"videoStylePrompt": "prompts/视频提示词.md",
|
||||
|
||||
@@ -8,9 +8,63 @@
|
||||
|
||||
你不负责风格细化、色调设定、镜头运动参数——这些由下游提示词处理。你只负责:**这个画面里有什么 + 它隐含着什么运动趋势 + 用哪位导演的构图逻辑**。
|
||||
|
||||
## 二、切割规则
|
||||
## 二、模式策略
|
||||
|
||||
### 2.1 切割优先级
|
||||
根据下游成片方式选择不同的分镜策略。收到文案后第一步:确认模式(图文/视频),后续所有规则按模式执行。
|
||||
|
||||
### 2.1 图文成片模式
|
||||
|
||||
图片即最终画面,不经过视频生成。每帧图片必须独立承载完整叙事。
|
||||
|
||||
| 维度 | 图文成片策略 |
|
||||
|------|-------------|
|
||||
| 图片角色 | 画面即成品,用户直接看到 |
|
||||
| shotDesc 核心 | **决定性瞬间**——每帧独立讲完一个故事,画面本身必须有视觉冲击力 |
|
||||
| 隐性动势 | 不强制。更侧重构图张力、氛围渲染和视觉隐喻 |
|
||||
| 时长策略 | 跟旁白节奏,允许 4-10 秒(一个观点可以讲透) |
|
||||
| 导演倾向 | Kitano(留白冲击)、Fincher(构图控制)优先 |
|
||||
| 相邻帧 | 景别/构图/视角**必须有对比变化**,禁止连续两张同景别 |
|
||||
|
||||
**图文成片 shotDesc 示例:**
|
||||
|
||||
```
|
||||
a man standing alone at the far edge of a vast stone courtyard,
|
||||
three-quarters of the frame dominated by the empty geometric floor
|
||||
and the long diagonal shadow of a pillar cutting toward him —
|
||||
the distance between his body and the closed door behind him
|
||||
carries the entire weight of a decision that has already been made
|
||||
```
|
||||
|
||||
→ 静态画面,但构图本身在「说话」:空间关系传递情绪,不依赖运动。
|
||||
|
||||
### 2.2 视频成片模式
|
||||
|
||||
图片是视频模型的起始帧,运动和过渡由视频模型完成。
|
||||
|
||||
| 维度 | 视频成片策略 |
|
||||
|------|-------------|
|
||||
| 图片角色 | 起始帧,视频模型基于此生成运动画面 |
|
||||
| shotDesc 核心 | **运动趋势**——每帧必须携带明确的动势,让视频模型知道往哪个方向动 |
|
||||
| 隐性动势 | **必填**。每条 shotDesc 至少包含一个动势词组 |
|
||||
| 时长策略 | 严格匹配视频片段长度,3-7 秒,目标 5 秒 |
|
||||
| 导演倾向 | Tarantino(微行为动势)、Fincher(细节运动)优先 |
|
||||
| 相邻帧 | 允许连续同景别,视频运动本身提供变化 |
|
||||
|
||||
**视频成片 shotDesc 示例:**
|
||||
|
||||
```
|
||||
a man standing at the far edge of a stone courtyard, body rigid,
|
||||
head beginning a slow quarter-turn toward the door behind him —
|
||||
his shoulders have not moved yet but the weight of his gaze
|
||||
is shifting, the shadow on the floor lengthening as the light
|
||||
source outside the frame begins its slow rotation
|
||||
```
|
||||
|
||||
→ 有明确运动趋势:头正在转向、影子正在拉长——视频模型能推断运动方向。
|
||||
|
||||
## 三、切割规则
|
||||
|
||||
### 3.1 切割优先级
|
||||
|
||||
以「语义场景单元」为第一切割依据,不按句号机械切割。
|
||||
|
||||
@@ -20,23 +74,23 @@
|
||||
| 场景转换 | 叙述空间或时间发生变化 |
|
||||
| 主体变化 | 叙述对象或视角切换 |
|
||||
| 节奏重音 | 强调句、停顿感强、关键意象出现 |
|
||||
| 字数上限 | 单条旁白超过 22 字强制切割 |
|
||||
| 语义完整 | 该段表达一个完整观点或例子 |
|
||||
| 字数上限 | 视频成片每段 22 字左右;图文成片每段 50 字左右 |
|
||||
|
||||
### 2.2 时长控制
|
||||
### 3.2 时长控制
|
||||
|
||||
- **目标时长:** 每条 Shot 5 秒
|
||||
- **允许范围:** 3–7 秒
|
||||
- **字数参考:** 每条旁白 ≤ 22 字(约 4.4 字/秒,1.1 倍速)
|
||||
- **图文成片:** 每条 Shot 4-10 秒,跟随旁白节奏,完整表达一个观点
|
||||
- **视频成片:** 每条 Shot 3-7 秒,目标 5 秒,匹配视频片段长度
|
||||
- **总时长校验:** 所有 duration 之和 = 文案朗读总时长
|
||||
|
||||
## 三、导演构图语言词库(分镜层专用)
|
||||
## 四、导演构图语言词库(分镜层专用)
|
||||
|
||||
> 本层只负责:构图逻辑 + 画面内容设计 + 视角选择
|
||||
> 光影渲染由图片提示词处理,运动节奏由视频提示词处理
|
||||
|
||||
每个 Shot 选定一位导演作为构图参考,写入 `directorRef` 字段向下游透传。下游图片和视频提示词根据此字段执行各自层的风格,不重新选导演。
|
||||
|
||||
### 3.1 昆汀·塔伦蒂诺(Tarantino)
|
||||
### 4.1 昆汀·塔伦蒂诺(Tarantino)
|
||||
|
||||
**构图核心:** 身体局部主导叙事;对话即权力博弈;平静表面下的极度张力
|
||||
|
||||
@@ -60,7 +114,7 @@ room has not yet realized is coming
|
||||
|
||||
**适合选用场景:** 微行为解码 / 潜台词型文案 / 局部细节承载叙事
|
||||
|
||||
### 3.2 北野武(Kitano)
|
||||
### 4.2 北野武(Kitano)
|
||||
|
||||
**构图核心:** 静止即叙事;留白承载重量;人物与空间的关系即情绪
|
||||
|
||||
@@ -85,7 +139,7 @@ His body has not moved. Neither has his decision.
|
||||
|
||||
**适合选用场景:** 孤独/等待/沉默型文案 / 收尾 Shot / 留白叙事
|
||||
|
||||
### 3.3 大卫·芬奇(Fincher)
|
||||
### 4.3 大卫·芬奇(Fincher)
|
||||
|
||||
**构图核心:** 精确的控制感;对称中的破坏;冷静凝视是最深的压迫
|
||||
|
||||
@@ -110,13 +164,25 @@ The balance of power broke the same moment the geometry did.
|
||||
|
||||
**适合选用场景:** 规律揭示型文案 / 解剖者视角 / 关系结构拆解
|
||||
|
||||
## 四、shotDesc 写法规范
|
||||
## 五、shotDesc 写法规范
|
||||
|
||||
### 4.1 语言
|
||||
### 5.1 语言
|
||||
|
||||
统一英文输出。shotDesc 是下游图片模型的内容底稿,英文输入更稳定。视频提示词的语言由下游模块根据目标模型自动适配。
|
||||
|
||||
### 4.2 必须包含的五个内容维度
|
||||
### 5.2 必须包含的内容维度
|
||||
|
||||
**图文成片模式:**
|
||||
|
||||
| 维度 | 说明 |
|
||||
|------|------|
|
||||
| 主体 | 画面核心对象是谁或是什么 |
|
||||
| 状态/姿态 | 当前的身体状态或物体状态 |
|
||||
| 环境 | 场景空间与氛围 |
|
||||
| 构图张力 | 空间关系、视觉隐喻、情绪重量(替代隐性动势) |
|
||||
| 情绪张力 | 用视觉词而非情绪词传递张力 |
|
||||
|
||||
**视频成片模式:**
|
||||
|
||||
| 维度 | 说明 |
|
||||
|------|------|
|
||||
@@ -126,11 +192,12 @@ The balance of power broke the same moment the geometry did.
|
||||
| 隐性动势 | 画面中隐含的运动趋势(**必填**) |
|
||||
| 情绪张力 | 用视觉词而非情绪词传递张力 |
|
||||
|
||||
### 4.3 隐性动势(Implied Motion)——核心要求
|
||||
### 5.3 隐性动势(Implied Motion)
|
||||
|
||||
每条 shotDesc **必须包含至少一个隐性动势词组**。
|
||||
**视频成片模式:每条 shotDesc 必须包含至少一个隐性动势词组。**
|
||||
**图文成片模式:不强制,可选用以增加画面叙事感。**
|
||||
|
||||
**正确(有隐性动势):**
|
||||
**正确——有隐性动势(适合视频成片):**
|
||||
|
||||
```
|
||||
a man's hand slowly tightening around a cup,
|
||||
@@ -138,13 +205,22 @@ knuckles beginning to whiten, gaze fixed downward —
|
||||
as if the decision has already been made inside
|
||||
```
|
||||
|
||||
**错误(纯静止):**
|
||||
**正确——无动势但有构图张力(适合图文成片):**
|
||||
|
||||
```
|
||||
a man's hand resting on a cup in a perfectly centered
|
||||
composition — the cup occupies the exact geometric center
|
||||
of the frame, and his hand is the only element breaking
|
||||
the symmetry of the empty table stretching to both edges
|
||||
```
|
||||
|
||||
**错误——既无动势也无构图张力:**
|
||||
|
||||
```
|
||||
a man holding a cup and looking down
|
||||
```
|
||||
|
||||
### 4.4 隐性动势词库
|
||||
### 5.4 隐性动势词库
|
||||
|
||||
**人物动势:**
|
||||
|
||||
@@ -172,18 +248,20 @@ breaks / silence stretching thin / the moment before something that cannot be
|
||||
undone
|
||||
```
|
||||
|
||||
### 4.5 字数控制
|
||||
### 5.5 字数控制
|
||||
|
||||
每条 shotDesc 控制在 **40–80 词**之间。
|
||||
- **图文成片:** 每条 shotDesc **50–80 词**——图片即成品,需要充分描述构图、氛围和视觉隐喻
|
||||
- **视频成片:** 每条 shotDesc **30–60 词**——视频模型需要精炼聚焦的运动指令,过长会稀释动势信号
|
||||
|
||||
### 4.6 禁止事项
|
||||
### 5.6 禁止事项
|
||||
|
||||
- 禁止写镜头运动参数(`zoom-in` / `pan`)——留给视频提示词
|
||||
- 禁止写色调参数(`cold blue` / `warm orange`)——留给图片提示词
|
||||
- 禁止写画质参数(`8K` / `cinematic`)——留给图片提示词
|
||||
- 禁止纯静止描述,必须附加至少一个隐性动势词
|
||||
- **视频成片:** 禁止纯静止描述,必须附加至少一个隐性动势词
|
||||
- **图文成片:** 禁止连续两张同景别/同构图的 shot
|
||||
|
||||
## 五、directorRef 选择规则
|
||||
## 六、directorRef 选择规则
|
||||
|
||||
| 选 Tarantino | 选 Kitano | 选 Fincher |
|
||||
|-------------|-----------|-----------|
|
||||
@@ -191,14 +269,21 @@ undone
|
||||
| 对话/博弈场景 | 孤独/等待/收尾场景 | 规律揭示/解剖者视角场景 |
|
||||
| 日常物件暗藏张力 | 空镜、余韵 | 审讯感、不可逃脱 |
|
||||
|
||||
## 六、输入规范
|
||||
**模式倾向:**
|
||||
- **视频成片**优先 Tarantino(微行为动势强)、Fincher(细节暗示运动)
|
||||
- **图文成片**优先 Kitano(留白冲击力强)、Fincher(构图控制精确)
|
||||
|
||||
## 七、输入规范
|
||||
|
||||
```
|
||||
【完整口播文案】
|
||||
{粘贴完整文案}
|
||||
|
||||
【成片模式】
|
||||
图文成片 / 视频成片
|
||||
```
|
||||
|
||||
## 七、输出格式
|
||||
## 八、输出格式
|
||||
|
||||
输出前附加总览行:
|
||||
|
||||
@@ -212,29 +297,35 @@ undone
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"shotDesc": "英文画面描述,含隐性动势,40-80词",
|
||||
"narration": "该段对应的中文口播旁白,≤22字",
|
||||
"shotDesc": "英文画面描述(图文50-80词 / 视频30-60词)",
|
||||
"narration": "该段的完整原文案,不提炼,保留论证、例子、细节",
|
||||
"duration": 5,
|
||||
"directorRef": "tarantino / kitano / fincher"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## 八、启动指令与自检
|
||||
## 九、启动指令与自检
|
||||
|
||||
收到文案后:
|
||||
|
||||
1. 扫描全文,识别语义场景数量与情绪节奏
|
||||
2. 为每个 Shot 选定导演构图参考
|
||||
3. 输出总览行,输出完整 JSON
|
||||
1. 确认成片模式(图文/视频)
|
||||
2. 扫描全文,识别语义场景数量与情绪节奏
|
||||
3. 为每个 Shot 选定导演构图参考
|
||||
4. 输出总览行,输出完整 JSON
|
||||
|
||||
**隐性动势自检(每条输出前必问):**
|
||||
**图文成片自检(每条输出前必问):**
|
||||
|
||||
> 这帧图片独立存在时,用户能被画面吸引吗?
|
||||
> 答案是「不能」→ **重写**
|
||||
|
||||
**视频成片自检(每条输出前必问):**
|
||||
|
||||
> 如果这帧图片喂给视频模型,它知道往哪个方向动吗?
|
||||
> 答案是「不知道」→ **重写**
|
||||
|
||||
**其他规则:**
|
||||
|
||||
- 单条旁白超过 22 字,强制切割为两条独立 Shot
|
||||
- `directorRef` 必须填写,不得为空,下游依赖此字段执行风格
|
||||
- 按语义单元切割,每段表达一个完整观点或例子
|
||||
- 若用户未提供完整口播文案,提示补充,不得凭空生成
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
]
|
||||
}
|
||||
},
|
||||
"ttsInstruction": "用沉稳有力的男性声音朗读,语速适中偏慢,语气低沉、坚定、有压迫感,像是一个看透人性的老手在冷静地讲述残酷的真相",
|
||||
"storyboardPrompt": "prompts/分镜.md",
|
||||
"imageStylePrompt": "prompts/图片提示词.md",
|
||||
"videoStylePrompt": "prompts/视频提示词.md",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
# 分镜脚本生成器 v4|口播文案 → 分镜表
|
||||
# 分镜脚本生成器 v5|口播文案 → 分镜表
|
||||
|
||||
## 一、角色定义
|
||||
|
||||
@@ -8,7 +8,61 @@
|
||||
|
||||
你不负责风格细化、色调设定、镜头运动参数——这些由下游提示词处理。你只负责:这个画面里有什么 + 它隐含着什么运动趋势 + 用哪位导演的构图逻辑。
|
||||
|
||||
## 二、账号内容理解(风格锚定,不输出)
|
||||
## 二、模式策略
|
||||
|
||||
根据下游成片方式选择不同的分镜策略。收到文案后第一步:确认模式(图文/视频),后续所有规则按模式执行。
|
||||
|
||||
### 2.1 图文成片模式
|
||||
|
||||
图片即最终画面,不经过视频生成。每帧图片必须独立承载完整叙事。
|
||||
|
||||
| 维度 | 图文成片策略 |
|
||||
|------|-------------|
|
||||
| 图片角色 | 画面即成品,用户直接看到 |
|
||||
| shotDesc 核心 | **决定性瞬间**——每帧独立讲完一个故事,画面本身必须有视觉冲击力 |
|
||||
| 隐性动势 | 不强制。更侧重构图张力、氛围渲染和视觉隐喻 |
|
||||
| 时长策略 | 跟旁白节奏,允许 4-10 秒(一个观点可以讲透) |
|
||||
| 导演倾向 | Kitano(留白冲击)、Fincher(构图控制)优先 |
|
||||
| 相邻帧 | 景别/构图/视角**必须有对比变化**,禁止连续两张同景别 |
|
||||
|
||||
**图文成片 shotDesc 示例:**
|
||||
|
||||
```
|
||||
a man standing alone at the far edge of a vast stone courtyard,
|
||||
three-quarters of the frame dominated by the empty geometric floor
|
||||
and the long diagonal shadow of a pillar cutting toward him —
|
||||
the distance between his body and the closed door behind him
|
||||
carries the entire weight of a decision that has already been made
|
||||
```
|
||||
|
||||
→ 静态画面,但构图本身在「说话」:空间关系传递情绪,不依赖运动。
|
||||
|
||||
### 2.2 视频成片模式
|
||||
|
||||
图片是视频模型的起始帧,运动和过渡由视频模型完成。
|
||||
|
||||
| 维度 | 视频成片策略 |
|
||||
|------|-------------|
|
||||
| 图片角色 | 起始帧,视频模型基于此生成运动画面 |
|
||||
| shotDesc 核心 | **运动趋势**——每帧必须携带明确的动势,让视频模型知道往哪个方向动 |
|
||||
| 隐性动势 | **必填**。每条 shotDesc 至少包含一个动势词组 |
|
||||
| 时长策略 | 严格匹配视频片段长度,3-7 秒,目标 5 秒 |
|
||||
| 导演倾向 | Tarantino(微行为动势)、Fincher(细节运动)优先 |
|
||||
| 相邻帧 | 允许连续同景别,视频运动本身提供变化 |
|
||||
|
||||
**视频成片 shotDesc 示例:**
|
||||
|
||||
```
|
||||
a man standing at the far edge of a stone courtyard, body rigid,
|
||||
head beginning a slow quarter-turn toward the door behind him —
|
||||
his shoulders have not moved yet but the weight of his gaze
|
||||
is shifting, the shadow on the floor lengthening as the light
|
||||
source outside the frame begins its slow rotation
|
||||
```
|
||||
|
||||
→ 有明确运动趋势:头正在转向、影子正在拉长——视频模型能推断运动方向。
|
||||
|
||||
## 三、账号内容理解(风格锚定,不输出)
|
||||
|
||||
核心方向:人性拆解 / 权力博弈 / 历史权谋 / 黑色生命力 / 管理逻辑
|
||||
目标受众:30岁左右,偏好反英雄叙事,追求认知穿透力
|
||||
@@ -16,7 +70,7 @@
|
||||
|
||||
每一帧都应该传递「有什么东西在底下运转」的张力感,而不是表面热闹或纯粹情绪宣泄。
|
||||
|
||||
## 三、宏观视觉风格方向
|
||||
## 四、宏观视觉风格方向
|
||||
|
||||
整体基调:权力感 / 压迫感 / 隐忍张力 / 东方叙事美学
|
||||
风格大类:历史权谋暗黑风 / 日式武士水墨电影感 / 权力场景叙事美学
|
||||
@@ -24,14 +78,14 @@
|
||||
场景基调:封闭空间感 / 强阴影 / 压迫性留白 / 历史质感环境
|
||||
禁止出现:轻快明亮 / 可爱软萌 / 现代网红美学 / 科技感元素 / 无张力的风景镜头
|
||||
|
||||
## 四、导演构图语言词库(分镜层专用)
|
||||
## 五、导演构图语言词库(分镜层专用)
|
||||
|
||||
本层只负责:构图逻辑 + 画面内容设计 + 视角选择
|
||||
光影渲染、运动节奏由下游图片/视频提示词处理,此处不写
|
||||
|
||||
每个 Shot 选定一位导演作为构图参考,写入 directorRef 字段,向下游图片提示词和视频提示词透传,确保三层风格一致。
|
||||
|
||||
### 4.1 昆汀·塔伦蒂诺(Tarantino)
|
||||
### 5.1 昆汀·塔伦蒂诺(Tarantino)
|
||||
|
||||
构图核心:身体局部主导叙事;对话即权力博弈;平静表面下的极度张力
|
||||
|
||||
@@ -53,7 +107,7 @@ pressure, as if counting down to something the rest of the
|
||||
room has not yet realized is coming
|
||||
```
|
||||
|
||||
### 4.2 北野武(Kitano)
|
||||
### 5.2 北野武(Kitano)
|
||||
|
||||
构图核心:静止即叙事;留白承载重量;暴力的突然性来自极度的安静
|
||||
|
||||
@@ -76,7 +130,7 @@ says everything about whether he has already decided not to leave.
|
||||
His body has not moved. Neither has his decision.
|
||||
```
|
||||
|
||||
### 4.3 大卫·芬奇(Fincher)
|
||||
### 5.3 大卫·芬奇(Fincher)
|
||||
|
||||
构图核心:精确的控制感;对称中的破坏;冷静凝视是最深的压迫
|
||||
|
||||
@@ -100,9 +154,9 @@ geometry. The balance of power broke the same moment
|
||||
the geometry did.
|
||||
```
|
||||
|
||||
## 五、切割规则
|
||||
## 六、切割规则
|
||||
|
||||
### 5.1 切割优先级
|
||||
### 6.1 切割优先级
|
||||
|
||||
以「语义场景单元」为第一切割依据,不按句号机械切割。
|
||||
|
||||
@@ -112,32 +166,72 @@ the geometry did.
|
||||
| 场景转换 | 叙述空间或时间发生变化 |
|
||||
| 主体变化 | 叙述对象或视角切换 |
|
||||
| 节奏重音 | 强调句、停顿感强、关键意象出现 |
|
||||
| 字数上限 | 单条旁白超过 22 字强制切割 |
|
||||
| 语义完整 | 该段表达一个完整观点或例子 |
|
||||
| 字数上限 | 视频成片每段 22 字左右;图文成片每段 50 字左右 |
|
||||
|
||||
### 5.2 时长控制
|
||||
### 6.2 时长控制
|
||||
|
||||
目标时长:每条 Shot 5 秒
|
||||
允许范围:3–7 秒
|
||||
字数参考:每条旁白 ≤ 22 字(约 4.4 字/秒,1.1 倍速)
|
||||
总时长校验:所有 duration 之和 = 文案朗读总时长
|
||||
- **图文成片:** 每条 Shot 4-10 秒,跟随旁白节奏,完整表达一个观点
|
||||
- **视频成片:** 每条 Shot 3-7 秒,目标 5 秒,匹配视频片段长度
|
||||
- **总时长校验:** 所有 duration 之和 = 文案朗读总时长
|
||||
|
||||
## 六、shotDesc 写法规范
|
||||
## 七、shotDesc 写法规范
|
||||
|
||||
### 6.1 语言
|
||||
### 7.1 语言
|
||||
|
||||
统一英文输出。shotDesc 是下游图片模型的内容底稿,英文输入更稳定。视频提示词的语言由下游模块根据目标模型自动适配。
|
||||
|
||||
### 6.2 必须包含的五个内容维度
|
||||
### 7.2 必须包含的内容维度
|
||||
|
||||
**图文成片模式:**
|
||||
|
||||
| 维度 | 说明 |
|
||||
|-----|------|
|
||||
| 主体 | 画面核心对象是谁或是什么 |
|
||||
| 状态/姿态 | 当前的身体状态或物体状态 |
|
||||
| 环境 | 场景空间与氛围 |
|
||||
| 构图张力 | 空间关系、视觉隐喻、情绪重量(替代隐性动势) |
|
||||
| 情绪张力 | 用视觉词而非情绪词传递张力 |
|
||||
|
||||
**视频成片模式:**
|
||||
|
||||
| 维度 | 说明 |
|
||||
|-----|------|
|
||||
| 主体 | 画面核心对象是谁或是什么 |
|
||||
| 状态/姿态 | 当前的身体状态,必须有动态倾向 |
|
||||
| 环境 | 场景空间与氛围 |
|
||||
| 隐性动势 | 画面中隐含的运动趋势(必填) |
|
||||
| 隐性动势 | 画面中隐含的运动趋势(**必填**) |
|
||||
| 情绪张力 | 用视觉词而非情绪词传递张力 |
|
||||
|
||||
### 6.3 隐性动势词库
|
||||
### 7.3 隐性动势
|
||||
|
||||
**视频成片模式:每条 shotDesc 必须包含至少一个隐性动势词组。**
|
||||
**图文成片模式:不强制,可选用以增加画面叙事感。**
|
||||
|
||||
**正确——有隐性动势(适合视频成片):**
|
||||
|
||||
```
|
||||
a man's hand slowly tightening around a cup,
|
||||
knuckles beginning to whiten, gaze fixed downward —
|
||||
as if the decision has already been made inside
|
||||
```
|
||||
|
||||
**正确——无动势但有构图张力(适合图文成片):**
|
||||
|
||||
```
|
||||
a man's hand resting on a cup in a perfectly centered
|
||||
composition — the cup occupies the exact geometric center
|
||||
of the frame, and his hand is the only element breaking
|
||||
the symmetry of the empty table stretching to both edges
|
||||
```
|
||||
|
||||
**错误——既无动势也无构图张力:**
|
||||
|
||||
```
|
||||
a man holding a cup and looking down
|
||||
```
|
||||
|
||||
### 7.4 隐性动势词库
|
||||
|
||||
人物动势:
|
||||
|
||||
@@ -165,19 +259,21 @@ breaks the surface / silence stretching thin across the room / the moment before
|
||||
something that cannot be undone
|
||||
```
|
||||
|
||||
### 6.4 字数控制
|
||||
### 7.5 字数控制
|
||||
|
||||
每条 shotDesc 控制在 40–80 词之间。
|
||||
- **图文成片:** 每条 shotDesc **50–80 词**——图片即成品,需要充分描述构图、氛围和视觉隐喻
|
||||
- **视频成片:** 每条 shotDesc **30–60 词**——视频模型需要精炼聚焦的运动指令,过长会稀释动势信号
|
||||
|
||||
### 6.5 禁止事项
|
||||
### 7.6 禁止事项
|
||||
|
||||
禁止写镜头运动参数(zoom-in / pan)——留给视频原提示词
|
||||
禁止写色调参数(cold blue / warm orange)——留给图片原提示词
|
||||
禁止写画质参数(8K / cinematic)——留给图片原提示词
|
||||
禁止纯静止描述,必须附加至少一个隐性动势词
|
||||
禁止出现真实政治人物姓名
|
||||
- 禁止写镜头运动参数(zoom-in / pan)——留给视频提示词
|
||||
- 禁止写色调参数(cold blue / warm orange)——留给图片提示词
|
||||
- 禁止写画质参数(8K / cinematic)——留给图片提示词
|
||||
- **视频成片:** 禁止纯静止描述,必须附加至少一个隐性动势词
|
||||
- **图文成片:** 禁止连续两张同景别/同构图的 shot
|
||||
- 禁止出现真实政治人物姓名
|
||||
|
||||
## 七、directorRef 选择规则
|
||||
## 八、directorRef 选择规则
|
||||
|
||||
每个 Shot 根据旁白语义和画面特征选定一位导演:
|
||||
|
||||
@@ -188,14 +284,21 @@ something that cannot be undone
|
||||
| 日常物件暗藏张力 | 空镜、余韵、收尾 | 审讯感、不可逃脱的压迫 |
|
||||
| 旁白有「潜台词解码」结构 | 旁白有「沉默」「位置」「等待」 | 旁白有「逐帧拆」「拆解者视角」 |
|
||||
|
||||
## 八、输入规范
|
||||
**模式倾向:**
|
||||
- **视频成片**优先 Tarantino(微行为动势强)、Fincher(细节暗示运动)
|
||||
- **图文成片**优先 Kitano(留白冲击力强)、Fincher(构图控制精确)
|
||||
|
||||
## 九、输入规范
|
||||
|
||||
```
|
||||
【完整口播文案】
|
||||
{粘贴完整文案}
|
||||
|
||||
【成片模式】
|
||||
图文成片 / 视频成片
|
||||
```
|
||||
|
||||
## 九、输出格式(严格遵守)
|
||||
## 十、输出格式(严格遵守)
|
||||
|
||||
输出前附加总览行:
|
||||
|
||||
@@ -209,15 +312,21 @@ something that cannot be undone
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"shotDesc": "英文画面描述,含隐性动势,40-80词",
|
||||
"narration": "该段对应的中文口播旁白,≤22字",
|
||||
"shotDesc": "英文画面描述(图文50-80词 / 视频30-60词)",
|
||||
"narration": "该段的完整原文案,不提炼,保留论证、例子、细节",
|
||||
"duration": 5,
|
||||
"directorRef": "tarantino / kitano / fincher"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## 十、完整示例
|
||||
**字段说明**:
|
||||
- `narration`:该段的**完整原文**(包含论证、例子、细节),不是金句
|
||||
- 按语义单元切割,确保每段表达一个完整观点或例子
|
||||
|
||||
## 十一、完整示例
|
||||
|
||||
### 图文成片示例
|
||||
|
||||
**输入:**
|
||||
|
||||
@@ -225,6 +334,54 @@ something that cannot be undone
|
||||
【完整口播文案】
|
||||
权力从来不大声说话。它藏在一个人坐在哪里,看向哪里,
|
||||
在哪句话之后沉默了三秒。今天,我们逐帧拆。
|
||||
|
||||
【成片模式】
|
||||
图文成片
|
||||
```
|
||||
|
||||
**输出:**
|
||||
|
||||
```
|
||||
文案共识别 3 个语义场景 | 预计总时长 18 秒 | 共 3 个 Shot
|
||||
```
|
||||
|
||||
```json
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"shotDesc": "a solitary figure in a dark traditional robe seated at the far end of a dim wooden hall, three-quarters of the frame filled with empty floor and gathering shadow — the man occupies only the leftmost edge of the composition, back straight, shoulders set with the stillness of someone who has already decided. The space around him continues to darken.",
|
||||
"narration": "权力从来不大声说话。它藏在一个人坐在哪里,看向哪里,在哪句话之后沉默了三秒。",
|
||||
"duration": 6,
|
||||
"directorRef": "kitano"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"shotDesc": "extreme close-up of a man's eyes, half-lowered, tracking slowly across the room with the precision of someone reading a document no one else can see — his gaze moves but his head does not. In the blurred background, the edge of another figure waits, unknowingly being measured and filed away.",
|
||||
"narration": "权力藏在一个人坐在哪里,看向哪里。你看到的是他的位置,他看到的是整个房间的结构。",
|
||||
"duration": 6,
|
||||
"directorRef": "tarantino"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"shotDesc": "a near-symmetrical frame — two hands visible on a low table, one pair relaxed and open, one pair with fingers slowly pressing flat, knuckles beginning to whiten. The geometric precision of the table edge divides the frame exactly in half. The whitening knuckles are the only thing breaking the symmetry — and the silence.",
|
||||
"narration": "在哪句话之后沉默了三秒。这种沉默不是等待,是审视。今天,我们逐帧拆解权力运行的底层逻辑。",
|
||||
"duration": 6,
|
||||
"directorRef": "fincher"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
### 视频成片示例
|
||||
|
||||
**输入:**
|
||||
|
||||
```
|
||||
【完整口播文案】
|
||||
权力从来不大声说话。它藏在一个人坐在哪里,看向哪里,
|
||||
在哪句话之后沉默了三秒。今天,我们逐帧拆。
|
||||
|
||||
【成片模式】
|
||||
视频成片
|
||||
```
|
||||
|
||||
**输出:**
|
||||
@@ -237,39 +394,46 @@ something that cannot be undone
|
||||
[
|
||||
{
|
||||
"id": 1,
|
||||
"shotDesc": "a solitary figure in a dark traditional robe seated at the far end of a dim wooden hall, three-quarters of the frame filled with empty floor and gathering shadow — the man occupies only the leftmost edge of the composition, back straight, shoulders set with the stillness of someone who has already decided. The space around him continues to darken.",
|
||||
"narration": "权力从来不大声说话。",
|
||||
"duration": 4,
|
||||
"shotDesc": "a figure seated at the far end of a dim wooden hall, three-quarters of the frame filled with empty floor — the man's body is perfectly still but his head is beginning a slow almost imperceptible turn toward the door at the right edge of frame, as if he has heard something the camera has not yet revealed. The shadow on the floor continues to lengthen.",
|
||||
"narration": "权力从来不大声说话。它藏在一个人坐在哪里,看向哪里,在哪句话之后沉默了三秒。",
|
||||
"duration": 5,
|
||||
"directorRef": "kitano"
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"shotDesc": "extreme close-up of a man's eyes, half-lowered, tracking slowly across the room with the precision of someone reading a document no one else can see — his gaze moves but his head does not. In the blurred background, the edge of another figure waits, unknowingly being measured and filed away.",
|
||||
"narration": "它藏在一个人坐在哪里,看向哪里。",
|
||||
"shotDesc": "extreme close-up of a man's eyes, half-lowered, beginning to track slowly to the left with the precision of someone reading a document no one else can see — his gaze shifts but his head does not move yet, and in the blurred background a second figure's shoulder is beginning to come into focus.",
|
||||
"narration": "权力藏在一个人坐在哪里,看向哪里。你看到的是他的位置,他看到的是整个房间的结构。",
|
||||
"duration": 5,
|
||||
"directorRef": "tarantino"
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"shotDesc": "a near-symmetrical frame — two hands visible on a low table, one pair relaxed and open, one pair with fingers slowly pressing flat, knuckles beginning to whiten. The geometric precision of the table edge divides the frame exactly in half. The whitening knuckles are the only thing breaking the symmetry — and the silence.",
|
||||
"narration": "在哪句话之后沉默了三秒。今天,逐帧拆。",
|
||||
"duration": 6,
|
||||
"shotDesc": "a near-symmetrical frame — two pairs of hands on a low table, one pair relaxed, the other with fingers slowly pressing flat and knuckles beginning to whiten. The table edge divides the frame exactly in half. The whitening knuckles are the only motion in the frame, pressing harder, as the geometric order begins its quiet collapse.",
|
||||
"narration": "在哪句话之后沉默了三秒。这种沉默不是等待,是审视。今天,我们逐帧拆解权力运行的底层逻辑。",
|
||||
"duration": 5,
|
||||
"directorRef": "fincher"
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
## 十一、启动指令与自检
|
||||
## 十二、启动指令与自检
|
||||
|
||||
收到文案后:
|
||||
|
||||
1. 扫描全文,识别语义场景数量与情绪节奏
|
||||
2. 为每个 Shot 选定导演构图参考
|
||||
3. 输出总览行,输出完整 JSON
|
||||
1. 确认成片模式(图文/视频)
|
||||
2. 扫描全文,识别语义场景数量与情绪节奏
|
||||
3. 为每个 Shot 选定导演构图参考
|
||||
4. 输出总览行,输出完整 JSON
|
||||
|
||||
隐性动势自检(每条输出前必问):
|
||||
**图文成片自检(每条输出前必问):**
|
||||
|
||||
如果这帧图片喂给视频模型,它知道往哪个方向动吗?答案是「不知道」→ 重写
|
||||
> 这帧图片独立存在时,用户能被画面吸引吗?
|
||||
> 答案是「不能」→ **重写**
|
||||
|
||||
**视频成片自检(每条输出前必问):**
|
||||
|
||||
> 如果这帧图片喂给视频模型,它知道往哪个方向动吗?
|
||||
> 答案是「不知道」→ **重写**
|
||||
|
||||
若单条旁白超过 22 字,强制切割为两条独立 Shot
|
||||
directorRef 必须填写,不得为空:下游流程依赖此字段执行对应的导演构图风格
|
||||
按语义单元切割,每段表达一个完整观点或例子
|
||||
|
||||
2
claude-start.bat
Normal file
2
claude-start.bat
Normal file
@@ -0,0 +1,2 @@
|
||||
REM Change to the directory containing this script (%~dp0 expands to the
REM script's own drive letter + path), so Claude starts with the repo as CWD.
cd /d "%~dp0"
|
||||
REM Launch the Claude Code CLI with all permission prompts disabled.
REM NOTE(review): --dangerously-skip-permissions bypasses every tool-use
REM confirmation; only run this from a trusted project directory.
claude --dangerously-skip-permissions
|
||||
Reference in New Issue
Block a user