From 9e056b752d6485886d5b75a074e461fb00b31116 Mon Sep 17 00:00:00 2001 From: Ken Yasue Date: Sat, 30 Aug 2025 23:22:11 +0200 Subject: [PATCH] save current changes --- src/generatePromptVideo.ts | 167 ---------------- src/generatePromptVideoFromImage.ts | 300 ++++++++++++++++++++++++++++ src/pinterest_keywords.json | 78 ++++++-- src/piterest_styletransfer_video.ts | 254 ++++++++++++++++++----- 4 files changed, 562 insertions(+), 237 deletions(-) delete mode 100644 src/generatePromptVideo.ts create mode 100644 src/generatePromptVideoFromImage.ts diff --git a/src/generatePromptVideo.ts b/src/generatePromptVideo.ts deleted file mode 100644 index 2552364..0000000 --- a/src/generatePromptVideo.ts +++ /dev/null @@ -1,167 +0,0 @@ -import fs from 'fs'; -import path from 'path'; -import { query } from './lib/mysql'; -import { logger } from './lib/logger'; -import { callLMStudio } from './lib/lmstudio'; - -async function main() { - await updatePromptsFromDB(); - process.exit(); -} - -/** - * Find DB records whose video_prompt contains 'cut' or 'zoom' (case-insensitive), - * regenerate the video_prompt using LMStudio, and update the record. - * - * If the newly generated prompt still contains any banned words/phrases, regenerate - * again (up to maxAttempts). If after attempts the prompt is still invalid, skip update. - */ -async function updatePromptsFromDB() { - logger.info("Starting DB sweep for video_prompt containing 'cut' or 'zoom'..."); - - // Banned regex per requirement - const banned = /\b(cut|cuts|cutting|quick cut|insert|macro insert|close-?up|extreme close-?up|zoom|zooming|push-?in|pull-?out|whip|switch angle|change angle|montage|cross-?cut|smash cut|transition|meanwhile|later)\b/i; - - let rows: any[] = []; - try { - // Case-insensitive search for 'cut' or 'zoom' anywhere in video_prompt - rows = (await query( - "SELECT id, genre, sub_genre, scene, action, camera, video_prompt FROM video WHERE LOWER(COALESCE(video_prompt,'')) LIKE ? 
OR LOWER(COALESCE(video_prompt,'')) LIKE ?", - ['%cut%', '%zoom%'] - )) as any[]; - } catch (err) { - logger.error('DB query failed while searching for problematic prompts:', err); - return; - } - - if (!rows || rows.length === 0) { - logger.info("No records found with 'cut' or 'zoom' in video_prompt."); - return; - } - - logger.info(`Found ${rows.length} record(s) to process.`); - - for (const row of rows) { - const id = row.id; - const genre = row.genre || ''; - const subGenre = row.sub_genre || ''; - const scene = row.scene || ''; - const action = row.action || ''; - const camera = row.camera || ''; - - if (!genre || !subGenre || !scene) { - logger.info(`Skipping id=${id} due to missing identification fields: genre='${genre}', sub_genre='${subGenre}', scene='${scene}'`); - continue; - } - - // Build LM input (similar ruleset to previous implementation) - const lmInput = buildLMInputFromRecord(genre, subGenre, scene, action, camera, row.video_prompt); - - let finalPrompt: string | null = null; - const maxAttempts = 10; - - for (let attempt = 1; attempt <= maxAttempts; attempt++) { - let lmResponse: any = null; - try { - lmResponse = await callLMStudio(lmInput); - } catch (err) { - logger.warn(`LMStudio call failed for id=${id} (attempt ${attempt}): ${err}`); - // Retry on next loop iteration - continue; - } - - if (!lmResponse) { - logger.warn(`LMStudio returned empty response for id=${id} (attempt ${attempt}).`); - continue; - } - - const videoPrompt = lmResponse.videoPrompt || lmResponse.video_prompt || lmResponse.prompt || null; - if (!videoPrompt || typeof videoPrompt !== 'string') { - logger.warn(`LMStudio did not return a valid videoPrompt for id=${id} (attempt ${attempt}).`); - continue; - } - - // Check banned regex - if (banned.test(videoPrompt)) { - logger.info(`Generated prompt for id=${id} (attempt ${attempt}) still contains banned phrases - retrying.`); - logger.info(videoPrompt); - // If last attempt, we will fall through and skip update - continue; 
- } - - // Passed banned check - finalPrompt = videoPrompt; - break; - } - - if (!finalPrompt) { - logger.warn(`Could not generate a clean prompt for id=${id} after ${maxAttempts} attempts. Skipping update.`); - continue; - } - - // Update DB - try { - await query('UPDATE video SET video_prompt = ? WHERE id = ?', [finalPrompt, id]); - logger.info(`Updated video_prompt for id=${id}`); - } catch (err) { - logger.error(`Failed to update video_prompt for id=${id}: ${err}`); - } - } - - logger.info('Finished DB sweep for problematic prompts.'); -} - -/** - * Helper to construct LM input for a single DB record. - * Keeps the same HARD RULES and prohibited list as previous data-driven generation. - */ -function buildLMInputFromRecord( - genre: string, - subGenre: string, - finalScene: string, - chosenAction: string, - camera: string, - existingPrompt: string | undefined -) { - const accents = 'none'; - const mood = 'n/a'; - const lighting = 'n/a'; - const style = 'n/a'; - - const lmInput = ` -Return exactly one JSON object: { "videoPrompt": "..." } and nothing else. - -Write "videoPrompt" in 100–150 words, present tense, plain concrete language. - -HARD RULES (must comply): -- One continuous shot ("one take", "oner"). Real-time 8 seconds. No edits. -- Fixed location and vantage. Do not change background or angle. -- Lens and focal length locked. No zooms, no close-ups that imply a lens change, no rack zoom. -- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. -- Keep framing consistent (e.g., medium-wide two-shot). No “another shot/meanwhile.” -- Describe: (1) main action, (2) framing & motion, (3) lighting & mood, (4) style & small accents. -- Use clear simple sentences. No metaphors or poetic language. 
- -PROHIBITED WORDS/PHRASES (case-insensitive): -cut, cuts, cutting, quick cut, insert, macro insert, close-up, extreme close-up, -zoom, zooms, zooming, push-in, pull-out, whip, switch angle, change angle, -montage, cross-cut, smash cut, transition, meanwhile, later. - -If proximity is needed, say: "the camera glides slightly closer while staying in the same position." - -Here is information of the scene, please generate prompt for the video based on these information for key "videoPrompt": -Genre: ${genre} -Sub-Genre: ${subGenre} -Scene: ${finalScene} -Action: ${chosenAction || 'n/a'} -Camera: ${camera || 'static or subtle movement (stay within scene)'} -Accents: ${accents} -Mood: ${mood} -Lighting: ${lighting} -Style: ${style} -`; - - return lmInput; -} - -main(); diff --git a/src/generatePromptVideoFromImage.ts b/src/generatePromptVideoFromImage.ts new file mode 100644 index 0000000..7866914 --- /dev/null +++ b/src/generatePromptVideoFromImage.ts @@ -0,0 +1,300 @@ +import fs from 'fs'; +import path from 'path'; +import { query } from './lib/mysql'; +import { logger } from './lib/logger'; +import { callLMStudioWithFile } from './lib/lmstudio'; + +async function main() { + await updatePromptsFromDB(); + process.exit(); +} + +/** + * Utility: extract JSON substring from a text. + * Tries to extract from fenced ```json blocks first, otherwise extracts first {...} span. 
+ */ +function extractJsonFromText(text: string): any | null { + if (!text || typeof text !== 'string') return null; + + // Try fenced code block with optional json language + const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); + if (fenced && fenced[1]) { + try { + return JSON.parse(fenced[1].trim()); + } catch (e) { + // fall through to brace extraction + } + } + + // Try to extract first {...} match (greedy between first { and last }) + const brace = text.match(/\{[\s\S]*\}/); + if (brace && brace[0]) { + try { + return JSON.parse(brace[0]); + } catch (e) { + return null; + } + } + + return null; +} + +/** + * Wrapper to call LMStudio with an image and prompt, and extract JSON reliably. + * - Uses callLMStudioWithFile to pass the image. + * - Tries to parse JSON from response if needed. + * - Retries up to maxRetries times (default 5) when parsing fails or an error occurs. + */ +async function callLMWithImageAndExtract(imagePath: string, prompt: string, maxRetries = 5): Promise { + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + const res = await callLMStudioWithFile(imagePath, prompt); + // callLMStudioWithFile attempts to return parsed JSON already. Accept objects directly. + if (res && typeof res === 'object') { + return res; + } + + // If it returned text, try to extract JSON + if (typeof res === 'string') { + const parsed = extractJsonFromText(res); + if (parsed) return parsed; + } + + logger.warn(`callLMWithImageAndExtract: attempt ${attempt} returned unexpected result. 
Retrying...`); + } catch (err) { + logger.warn(`callLMWithImageAndExtract: attempt ${attempt} failed: ${err}`); + } + } + + logger.error(`callLMWithImageAndExtract: failed to get valid JSON after ${maxRetries} attempts`); + return null; +} + +/** + * Main sweep: find DB records whose video_prompt contains 'cut' or 'zoom' (case-insensitive), + * run multi-step LMStudio flow (object -> action -> camerawork -> final prompt) using the image, + * and update the record. + */ +async function updatePromptsFromDB() { + logger.info("Starting DB sweep for video_prompt containing 'cut' or 'zoom'..."); + + // Banned regex per requirement + const banned = /\b(cut|cuts|cutting|quick cut|insert|macro insert|close-?up|extreme close-?up|zoom|zooming|push-?in|pull-?out|whip|switch angle|change angle|montage|cross-?cut|smash cut|transition|meanwhile|later)\b/i; + + let rows: any[] = []; + try { + // Case-insensitive search for 'cut' or 'zoom' anywhere in video_prompt + rows = (await query( + "SELECT id, genre, sub_genre, scene, action, camera, video_prompt, image_path FROM video where (video_path = '' or video_path is null) and modified_at < '2025-08-30 09:15:33'", + )) as any[]; + } catch (err) { + logger.error('DB query failed while searching for problematic prompts:', err); + return; + } + + if (!rows || rows.length === 0) { + logger.info("No records found with 'cut' or 'zoom' in video_prompt."); + return; + } + + logger.info(`Found ${rows.length} record(s) to process.`); + + for (const row of rows) { + const id = row.id; + const genre = row.genre || ''; + const subGenre = row.sub_genre || ''; + const scene = row.scene || ''; + const action = row.action || ''; + const camera = row.camera || ''; + const imagePathRaw = row.image_path || row.image || null; + + if (!genre || !subGenre || !scene) { + logger.info(`Skipping id=${id} due to missing identification fields: genre='${genre}', sub_genre='${subGenre}', scene='${scene}'`); + continue; + } + + if (!imagePathRaw) { + 
logger.info(`Skipping id=${id} because image_path is empty for this record.`); + continue; + } + + // Resolve the image path: if relative, make absolute based on cwd + let imageFullPath = imagePathRaw; + if (!path.isAbsolute(imageFullPath)) { + imageFullPath = path.resolve(process.cwd(), imageFullPath); + } + + if (!fs.existsSync(imageFullPath)) { + logger.info(`Skipping id=${id} because image not found at path: ${imageFullPath}`); + continue; + } + + logger.info(`Processing id=${id} using image: ${imageFullPath}`); + + // Step 1: Detect main object + const step1Prompt = ` +Return exactly one JSON object and nothing else: { "mainobject": "..." }. +Look at the provided image and determine the single most prominent/main object or subject in the scene. +Answer with a short noun or short phrase (no extra commentary). +If unsure, give the best concise guess. +`; + const step1Res = await callLMWithImageAndExtract(imageFullPath, step1Prompt, 5); + const mainobject = (step1Res && (step1Res.mainobject || step1Res.mainObject || step1Res.object)) ? String(step1Res.mainobject || step1Res.mainObject || step1Res.object).trim() : ''; + + if (!mainobject) { + logger.warn(`id=${id} - could not detect main object. Skipping record.`); + continue; + } + + logger.info(`id=${id} - detected main object: ${mainobject}`); + + // Step 2: Determine best action for this scene + const step2Prompt = ` +You have access to the image and the detected main object: "${mainobject}". +Decide which single action type best fits this scene from the list: +- no action +- micro animation (animate object but small movement) +- big movement +- impossible movement + +Return exactly one JSON object and nothing else: { "actiontype": "...", "action": ""}. +Do not add commentary. Choose the single best option from the list above. +`; + const step2Res = await callLMWithImageAndExtract(imageFullPath, step2Prompt, 5); + const actiontype = (step2Res && (step2Res.actiontype || step2Res.actionType)) ? 
String(step2Res.actiontype || step2Res.actionType).trim() : ''; + + if (!actiontype) { + logger.warn(`id=${id} - could not determine action type. Skipping record.`); + continue; + } + + logger.info(`id=${id} - decided action type: ${actiontype}`); + + // Step 3: Ask LMStudio what is the best camera work for the scene + const step3Prompt = ` +Given the image and the following information: +- main object: "${mainobject}" +- chosen action type: "${actiontype}" + +From the options below pick the single best camera approach for this scene: +- static camera +- pan +- rotation +- follow the moving object +- zoom to the object +- impossible camera work + +Return exactly one JSON object and nothing else: { "cameraworkType": "..." }. +Choose one of the listed options and do not add commentary. +`; + const step3Res = await callLMWithImageAndExtract(imageFullPath, step3Prompt, 5); + const cameraworkType = (step3Res && (step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera)) ? String(step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera).trim() : ''; + + if (!cameraworkType) { + logger.warn(`id=${id} - could not determine camera work. Skipping record.`); + continue; + } + + logger.info(`id=${id} - decided camera work: ${cameraworkType}`); + + // Step 4: Generate final video prompt using all gathered info + const finalPromptInput = buildLMInputFromRecordWithImageInfo( + genre, + subGenre, + scene, + action, + camera, + mainobject, + actiontype, + cameraworkType + ); + + // Use wrapper to call LM and extract JSON { videoPrompt: "" } + const finalRes = await callLMWithImageAndExtract(imageFullPath, finalPromptInput, 5); + const videoPrompt = (finalRes && (finalRes.videoPrompt || finalRes.video_prompt || finalRes.prompt)) ? 
String(finalRes.videoPrompt || finalRes.video_prompt || finalRes.prompt).trim() : null; + logger.info(`id=${id} - videoPrompt: ${videoPrompt}`); + if (!videoPrompt) { + logger.warn(`id=${id} - LM did not return a valid videoPrompt. Skipping record.`); + continue; + } + + // Check banned regex + if (banned.test(videoPrompt)) { + logger.info(`Generated prompt for id=${id} contains banned phrases - skipping update.`); + logger.info(videoPrompt); + continue; + } + + + // Update DB + try { + await query('UPDATE video SET video_prompt = ? WHERE id = ?', [videoPrompt, id]); + logger.info(`Updated video_prompt for id=${id}`); + } catch (err) { + logger.error(`Failed to update video_prompt for id=${id}: ${err}`); + } + + } + + logger.info('Finished DB sweep for problematic prompts.'); +} + +/** + * Build final LM input for step 4, including HARD RULES and scene info. + * The LM should return: { "videoPrompt": "..." } + */ +function buildLMInputFromRecordWithImageInfo( + genre: string, + subGenre: string, + finalScene: string, + chosenAction: string, + camera: string, + mainobject: string, + actiontype: string, + cameraworkType: string +) { + const accents = 'none'; + const mood = 'n/a'; + const lighting = 'n/a'; + const style = 'n/a'; + + const lmInput = ` +Return exactly one JSON object: { "videoPrompt": "..." } and nothing else. + +Write "videoPrompt" in 100–150 words, present tense, plain concrete language. + +HARD RULES (must comply): +- One continuous shot ("one take", "oner"). Real-time 8 seconds. No edits. +- Fixed location and vantage. Do not change background or angle. +- Lens and focal length locked. No zooms, no close-ups that imply a lens change, no rack zoom. +- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. +- Keep framing consistent (e.g., medium-wide two-shot). No “another shot/meanwhile.” +- Describe: (1) main action, (2) framing & motion, (3) lighting & mood, (4) style & small accents. 
+- Use clear simple sentences. No metaphors or poetic language. + +PROHIBITED WORDS/PHRASES (case-insensitive): +cut, cuts, cutting, quick cut, insert, macro insert, close-up, extreme close-up, +zoom, zooms, zooming, push-in, pull-out, whip, switch angle, change angle, +montage, cross-cut, smash cut, transition, meanwhile, later. + +If proximity is needed, say: "the camera glides slightly closer while staying in the same position." + +Here is information of the scene, please generate prompt for the video based on these information for key "videoPrompt": +Genre: ${genre} +Sub-Genre: ${subGenre} +Scene: ${finalScene} +Existing Action Field: ${chosenAction || 'n/a'} +Existing Camera Field: ${camera || 'static or subtle movement (stay within scene)'} +Detected Main Object: ${mainobject} +Suggested Action Type: ${actiontype} +Suggested Camera Work: ${cameraworkType} +Accents: ${accents} +Mood: ${mood} +Lighting: ${lighting} +Style: ${style} +`; + + return lmInput; +} + +main(); diff --git a/src/pinterest_keywords.json b/src/pinterest_keywords.json index 3ff158b..030cdcc 100644 --- a/src/pinterest_keywords.json +++ b/src/pinterest_keywords.json @@ -3589,28 +3589,34 @@ }, { "genre": "fantasy", - "subGenre": "Talking Animals", + "subGenre": "Animals", "pinIds": [ - "112941903147144539", - "281543725720325", - "36943659439371180", - "11751649022612432", - "18436679719492625", - "1407443629725511", - "6544361954178116", - "1759287347907216", - "4151824652348137", - "569705421631320751", - "168603579798518857", - "301670875061735522", - "1407443629997072", - "1266706140762662", - "2603712281941095", - "633387443385941", + "166422148725296622", + "151996556170151090", + "673780794293391738", + "17099673582089553", + "715861303309575585", + "13933080091832547", + "11047961582093288", + "4011087180789478", + "75857574972480900", + "10836855346968288", + "19140367162342308", + "518969557079186929", + "6966574420158002", + "63754150969272553", + "18718154695261333", + 
"115193702963985873", "3166662232947239", - "1079245498202717474", - "3025924745501558", - "1407443629997060" + "35606653299616695", + "252553491598336020", + "3166662232949549", + "52917364367365713", + "322288917102637980", + "63191201015204585", + "53058101854654159", + "211174978191847", + "20266267068530882" ] }, { @@ -9332,6 +9338,38 @@ "14707136280473866" ] }, + { + "genre": "sports", + "subGenre": "Motocross", + "pinIds": [ + "1049338781913408309", + "60728294964705253", + "41376890329005782", + "54676582969961779", + "6544361952522477", + "30962316182630482", + "66498531996848387", + "35184440833770285", + "8233211827095090", + "155303888199282668", + "2814818510870558", + "24418022972582884", + "17099673582215596", + "69946600457122783", + "911697518289700421", + "13229392651243233", + "207658232811269227", + "335588609755373167" + ], + "videoInstructions": [ + "Ultra low angle: the tire passes extremely close to the camera, mud splashes onto the lens.", + "High-speed slow motion: capture the rider soaring through the air during a jump.", + "Drone tracking: follow the rider from directly above, from jump to landing.", + "Exaggerated perspective: use an ultra-wide lens to make cornering look dramatic and powerful.", + "Consecutive jumps from the side: riders leap one after another, seen in sequence from the side of the course.", + "Head-on approach: place the camera just before the jump, the rider flies directly overhead." + ] + }, { "genre": "technology", "subGenre": "3D Printing", diff --git a/src/piterest_styletransfer_video.ts b/src/piterest_styletransfer_video.ts index 57e0844..0d14776 100644 --- a/src/piterest_styletransfer_video.ts +++ b/src/piterest_styletransfer_video.ts @@ -11,6 +11,60 @@ import { VideoModel } from './lib/db/video'; dotenv.config(); + +// Utility: extract JSON substring from a text. +// Tries fenced ```json``` blocks first, otherwise extracts first {...} span. 
+function extractJsonFromText(text: string): any | null { + if (!text || typeof text !== 'string') return null; + + // Try fenced code block with optional json language + const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); + if (fenced && fenced[1]) { + try { + return JSON.parse(fenced[1].trim()); + } catch (e) { + // fall through to brace extraction + } + } + + // Try to extract first {...} match + const brace = text.match(/\{[\s\S]*\}/); + if (brace && brace[0]) { + try { + return JSON.parse(brace[0]); + } catch (e) { + return null; + } + } + + return null; +} + +// Wrapper to call OpenAI with an image and prompt, and extract JSON reliably. +// - Uses callOpenAIWithFile to pass the image. +// - Tries to parse JSON from response if needed. +// - Retries up to maxRetries times (default 5) when parsing fails or an error occurs. +async function callOpenAIWithFileAndExtract(imagePath: string, prompt: string, maxRetries = 5): Promise { + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + const res = await callOpenAIWithFile(imagePath, prompt); + // callOpenAIWithFile may return an object or parsed JSON already + if (res && typeof res === 'object') { + return res; + } + if (typeof res === 'string') { + const parsed = extractJsonFromText(res); + if (parsed) return parsed; + } + // unexpected shape -> retry + logger.warn(`callOpenAIWithFileAndExtract: attempt ${attempt} returned unexpected result. 
Retrying...`); + } catch (err) { + logger.warn(`callOpenAIWithFileAndExtract: attempt ${attempt} failed: ${err}`); + } + } + logger.error(`callOpenAIWithFileAndExtract: failed to get valid JSON after ${maxRetries} attempts`); + return null; +} const servers = [ { baseUrl: process.env.SERVER1_COMFY_BASE_URL, @@ -36,9 +90,10 @@ interface GenerationTask { scene: string; action: string; camera: string; + videoInstructions?: string[]; } -async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: string, subGenre: string): Promise { +async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: string, subGenre: string, videoInstructions: string[] = []): Promise { const pinId = pinUrl.split('/').filter(Boolean).pop() || `pin_${Date.now()}`; const timestamp = new Date().getTime(); const imageFileName = `${pinId}_${timestamp}.png`; @@ -54,26 +109,121 @@ async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: s const imageForPrompt = renamedImagePaths[Math.floor(Math.random() * renamedImagePaths.length)]; try { - const promptResponse = await callOpenAIWithFile(imageForPrompt, - `Analyze the provided image and generate the following: - 1. 'scene': A description of the image's environment. - 2. 'action': A description of the main action occurring in the image. - 3. 'camera': A description of the camera shot (e.g., 'close-up', 'wide-angle'). - 4. 'image_prompt': A short and detailed prompt to generate this photo. - 5. 'video_prompt': A prompt describing a creative and subtle movement of the main object. The camera should be slight panning or static. + // Step 1: Detect main object + const step1Prompt = ` +Return exactly one JSON object and nothing else: { "mainobject": "..." }. +Look at the provided image and determine the single most prominent/main object or subject in the scene. +Answer with a short noun or short phrase (no extra commentary). +If unsure, give the best concise guess. 
+`; + const step1Res = await callOpenAIWithFileAndExtract(imageForPrompt, step1Prompt, 5); + const mainobject = (step1Res && (step1Res.mainobject || step1Res.mainObject || step1Res.object)) ? String(step1Res.mainobject || step1Res.mainObject || step1Res.object).trim() : ''; + + if (!mainobject) { + throw new Error('Could not detect main object'); + } + logger.info(`Detected main object for ${imageForPrompt}: ${mainobject}`); + + // Step 2: Determine best action for this scene + const step2Prompt = ` +You have access to the image and the detected main object: "${mainobject}". +Decide which single action type best fits this scene from the list: +- no action +- micro animation (animate object but small movement) +- big movement +- impossible movement + +Return exactly one JSON object and nothing else: { "actiontype": "..." }. +Do not add commentary. Choose the single best option from the list above. +`; + const step2Res = await callOpenAIWithFileAndExtract(imageForPrompt, step2Prompt, 5); + const actiontype = (step2Res && (step2Res.actiontype || step2Res.actionType)) ? String(step2Res.actiontype || step2Res.actionType).trim() : ''; + + if (!actiontype) { + throw new Error('Could not determine action type'); + } + logger.info(`Decided action type for ${imageForPrompt}: ${actiontype}`); + + // Step 3: Ask OpenAI what is the best camera work for the scene + const step3Prompt = ` +Given the image and the following information: +- main object: "${mainobject}" +- chosen action type: "${actiontype}" + +From the options below pick the single best camera approach for this scene: +- static camera +- pan +- rotation +- follow the moving object +- zoom to the object +- impossible camera work + +Return exactly one JSON object and nothing else: { "cameraworkType": "..." }. +Choose one of the listed options and do not add commentary. 
+`;
+        const step3Res = await callOpenAIWithFileAndExtract(imageForPrompt, step3Prompt, 5);
+        const cameraworkType = (step3Res && (step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera)) ? String(step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera).trim() : '';
+
+        if (!cameraworkType) {
+            throw new Error('Could not determine camera work');
+        }
+        logger.info(`Decided camera work for ${imageForPrompt}: ${cameraworkType}`);
+
+        let videoInstruction = "";
+        if (videoInstructions && videoInstructions.length > 0) {
+
+            const videoInstructionPrompt = `
+Given the image and the following information:
+- main object: "${mainobject}"
+
+From the options below pick the single best camera approach for this scene:
+${videoInstructions.join(",\r\n")}
+
+Return exactly one JSON object and nothing else: { "videoInstruction": "..." }.
+Choose one of the listed options and do not add commentary.
+`;
+            const videoInstructionRes = await callOpenAIWithFileAndExtract(imageForPrompt, videoInstructionPrompt, 5);
+            const videoInstructionFinalRes = (videoInstructionRes && (videoInstructionRes.videoInstruction || videoInstructionRes.camera)) ? String(videoInstructionRes.videoInstruction || videoInstructionRes.camera).trim() : '';
+
+            if (videoInstructionFinalRes)
+                videoInstruction = videoInstructionFinalRes
+
+        }
+        // Step 4: Generate final video prompt (and image prompt) using all gathered info
+        const finalPrompt = `
+Return exactly one JSON object: { "scene": "...", "action":"...", "camera":"...", "image_prompt":"...", "videoPrompt":"..." } and nothing else.
+
+Write "videoPrompt" in 100–150 words, present tense, plain concrete language.
+Write "image_prompt" as a concise, detailed prompt suitable for generating a similar image.
+
+HARD RULES (must comply for videoPrompt):
+- One continuous shot. Real-time 8 seconds. No edits.
+- Fixed location and vantage. 
Do not change background or angle. +- Lens and focal length locked. No zooms, no close-ups that imply a lens change. +- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. +- Keep framing consistent. No “another shot/meanwhile.” +- Use clear simple sentences. No metaphors or poetic language. + +Here is information of the scene, please generate fields accordingly: +Detected Main Object: ${mainobject} +Suggested Action Type: ${actiontype} +Suggested Camera Work: ${cameraworkType} +Genre: ${genre} +Sub-Genre: ${subGenre} +${videoInstruction ? 'video instruction:' + videoInstruction : ""} +`; + const finalRes = await callOpenAIWithFileAndExtract(imageForPrompt, finalPrompt, 5); + + const scene = finalRes && (finalRes.scene || finalRes.Scene) ? String(finalRes.scene) : ''; + const action = finalRes && (finalRes.action || finalRes.Action) ? String(finalRes.action) : ''; + const camera = finalRes && (finalRes.camera || finalRes.Camera) ? String(finalRes.camera) : ''; + const imagePrompt = finalRes && (finalRes.image_prompt || finalRes.imagePrompt || finalRes.image_prompt) ? String(finalRes.image_prompt || finalRes.imagePrompt) : ''; + const videoPrompt = finalRes && (finalRes.videoPrompt || finalRes.video_prompt || finalRes.video_prompt) ? 
String(finalRes.videoPrompt || finalRes.video_prompt) : ''; + + if (!imagePrompt || !videoPrompt) { + throw new Error('Final LM output did not include image_prompt or videoPrompt'); + } - Output should be in this JSON format: - --- - { - "scene": "{result comes here}", - "action": "{result comes here}", - "camera": "{result comes here}", - "image_prompt": "{result comes here}", - "video_prompt": "{result comes here}" - } - --- - `); - const { scene, action, camera, image_prompt: imagePrompt, video_prompt: videoPrompt } = promptResponse; logger.info(`Image prompt for ${imageForPrompt}:`, imagePrompt); logger.info(`Video prompt for ${imageForPrompt}:`, videoPrompt); @@ -179,7 +329,7 @@ async function getPinUrlFromPinterest(keyword: string): Promise { (async () => { // Load pinterest keywords JSON, pick up to 20 subGenres and choose 1 pinId per subGenre const keywordsFilePath = path.resolve(process.cwd(), 'src', 'pinterest_keywords.json'); - let allKeywords: { genre: string; subGenre: string; pinIds?: string[]; pinId?: string[] }[] = []; + let allKeywords: { genre: string; subGenre: string; pinIds?: string[]; pinId?: string[], videoInstructions?: string[] }[] = []; try { const raw = await fs.readFile(keywordsFilePath, 'utf-8'); allKeywords = JSON.parse(raw); @@ -189,7 +339,7 @@ async function getPinUrlFromPinterest(keyword: string): Promise { } allKeywords = allKeywords.filter(a => { - return (a.genre == "food" && a.subGenre == "imagination") + return (a.genre == "sports" && a.subGenre == "Motocross") }); function shuffle(arr: T[]): T[] { @@ -205,24 +355,39 @@ async function getPinUrlFromPinterest(keyword: string): Promise { // Download up to `count` images from a pin URL by opening the pin page and scrolling up to 5 times to trigger lazy loading // Returns an array of saved image paths (may be empty) async function downloadOneImageFromPin(pinUrl: string, count: number = 1): Promise { - const browser = await puppeteer.launch({ headless: true }); + const browser = 
await puppeteer.launch({ headless: false }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'); await page.setViewport({ width: 1920, height: 1080 }); try { await page.goto(pinUrl, { waitUntil: 'networkidle2', timeout: 30000 }); - for (let i = 0; i < 5; i++) { + for (let i = 0; i < 3; i++) { await page.evaluate('window.scrollTo(0, document.body.scrollHeight)'); await new Promise((r) => setTimeout(r, 700 + Math.random() * 800)); } - const imgs: string[] = await page.$$eval('img', imgs => - imgs.map(img => (img as HTMLImageElement).src) - .filter(src => !!src && (src.includes('pinimg') || /\.(jpe?g|png|webp)$/i.test(src))) - ); + const imgs: string[] = await page.$$eval('img', imgs => { + // For each try to extract the 4x (original) URL from srcset. + // srcset example: + // "https://i.pinimg.com/236x/...jpg 1x, https://i.pinimg.com/474x/...jpg 2x, https://i.pinimg.com/736x/...jpg 3x, https://i.pinimg.com/originals/...jpg 4x" + const urls: string[] = imgs.map(img => { + const srcset = (img as HTMLImageElement).getAttribute('srcset') || ''; + if (!srcset) return ''; + const parts = srcset.split(',').map(p => p.trim()); + for (const part of parts) { + const m = part.match(/^(\S+)\s+4x$/); + if (m && m[1]) return m[1]; + } + // fallback: if src contains "originals" return src + const src = (img as HTMLImageElement).src || ''; + if (src.includes('/originals/')) return src; + return ''; + }).filter(s => !!s && s.includes('pinimg')); + return urls; + }); if (!imgs || imgs.length === 0) { - logger.warn(`No image src found on pin page ${pinUrl}`); + logger.warn(`No image src (4x) found on pin page ${pinUrl}`); return []; } @@ -265,13 +430,16 @@ async function getPinUrlFromPinterest(keyword: string): Promise { } } + const numberOfPinIds = 20; // Build keywords list with single chosen pinId per selected subGenre - const keywords: { genre: string; 
subGenre: string; pinId: string[] }[] = []; + const keywords: { + genre: string; subGenre: string; pinIds: string[], videoInstructions?: string[] + }[] = []; for (const entry of selectedEntries) { const pinIds = (entry.pinIds || entry.pinId) as string[] | undefined; if (!Array.isArray(pinIds) || pinIds.length === 0) continue; - const chosenPinId = pinIds[Math.floor(Math.random() * pinIds.length)]; - keywords.push({ genre: entry.genre, subGenre: entry.subGenre, pinId: [chosenPinId] }); + const chosenPinId = pinIds.splice(0, numberOfPinIds); + keywords.push({ genre: entry.genre, subGenre: entry.subGenre, pinIds: chosenPinId, videoInstructions: entry.videoInstructions }); } if (keywords.length === 0) { @@ -291,24 +459,9 @@ async function getPinUrlFromPinterest(keyword: string): Promise { const { genre, subGenre } = genreSubGenre; - for (let i = 0; i < 10; i++) { - // pinId is now an array with a single chosen id. Pick the first element. - const pinIdField = (genreSubGenre as any).pinId; - let selectedPinId: string | undefined; - if (Array.isArray(pinIdField) && pinIdField.length > 0) { - selectedPinId = pinIdField[0]; - logger.info(`Selected chosen pinId ${selectedPinId} for ${genre} / ${subGenre}`); - } else if (typeof pinIdField === 'string' && pinIdField) { - selectedPinId = pinIdField; - logger.info(`Using single pinId ${selectedPinId} for ${genre} / ${subGenre}`); - } + for (const pinId of genreSubGenre.pinIds) { - if (!selectedPinId) { - logger.warn(`No pinId available for ${genre}/${subGenre}. 
Skipping.`); - continue; - } - - const pin = `https://www.pinterest.com/pin/${selectedPinId}/`; + const pin = `https://www.pinterest.com/pin/${pinId}/`; logger.info(`--- Starting processing for pin: ${pin} ---`); // download images from the pin page (pass desired count as second arg) @@ -323,8 +476,9 @@ async function getPinUrlFromPinterest(keyword: string): Promise { // proceed if we have at least one image if (selectedImages.length >= 1) { - const task = await getPromptsForImage(selectedImages, pin, genre, subGenre); + const task = await getPromptsForImage(selectedImages, pin, genre, subGenre, genreSubGenre.videoInstructions); if (task) { + task.videoInstructions = genreSubGenre.videoInstructions; generationTasks.push(task); } } else {