From 9e056b752d6485886d5b75a074e461fb00b31116 Mon Sep 17 00:00:00 2001 From: Ken Yasue Date: Sat, 30 Aug 2025 23:22:11 +0200 Subject: [PATCH] save current changes --- src/generatePromptVideo.ts | 167 ---------------- src/generatePromptVideoFromImage.ts | 300 ++++++++++++++++++++++++++++ src/pinterest_keywords.json | 78 ++++++-- src/piterest_styletransfer_video.ts | 254 ++++++++++++++++++----- 4 files changed, 562 insertions(+), 237 deletions(-) delete mode 100644 src/generatePromptVideo.ts create mode 100644 src/generatePromptVideoFromImage.ts diff --git a/src/generatePromptVideo.ts b/src/generatePromptVideo.ts deleted file mode 100644 index 2552364..0000000 --- a/src/generatePromptVideo.ts +++ /dev/null @@ -1,167 +0,0 @@ -import fs from 'fs'; -import path from 'path'; -import { query } from './lib/mysql'; -import { logger } from './lib/logger'; -import { callLMStudio } from './lib/lmstudio'; - -async function main() { - await updatePromptsFromDB(); - process.exit(); -} - -/** - * Find DB records whose video_prompt contains 'cut' or 'zoom' (case-insensitive), - * regenerate the video_prompt using LMStudio, and update the record. - * - * If the newly generated prompt still contains any banned words/phrases, regenerate - * again (up to maxAttempts). If after attempts the prompt is still invalid, skip update. - */ -async function updatePromptsFromDB() { - logger.info("Starting DB sweep for video_prompt containing 'cut' or 'zoom'..."); - - // Banned regex per requirement - const banned = /\b(cut|cuts|cutting|quick cut|insert|macro insert|close-?up|extreme close-?up|zoom|zooming|push-?in|pull-?out|whip|switch angle|change angle|montage|cross-?cut|smash cut|transition|meanwhile|later)\b/i; - - let rows: any[] = []; - try { - // Case-insensitive search for 'cut' or 'zoom' anywhere in video_prompt - rows = (await query( - "SELECT id, genre, sub_genre, scene, action, camera, video_prompt FROM video WHERE LOWER(COALESCE(video_prompt,'')) LIKE ? 
OR LOWER(COALESCE(video_prompt,'')) LIKE ?", - ['%cut%', '%zoom%'] - )) as any[]; - } catch (err) { - logger.error('DB query failed while searching for problematic prompts:', err); - return; - } - - if (!rows || rows.length === 0) { - logger.info("No records found with 'cut' or 'zoom' in video_prompt."); - return; - } - - logger.info(`Found ${rows.length} record(s) to process.`); - - for (const row of rows) { - const id = row.id; - const genre = row.genre || ''; - const subGenre = row.sub_genre || ''; - const scene = row.scene || ''; - const action = row.action || ''; - const camera = row.camera || ''; - - if (!genre || !subGenre || !scene) { - logger.info(`Skipping id=${id} due to missing identification fields: genre='${genre}', sub_genre='${subGenre}', scene='${scene}'`); - continue; - } - - // Build LM input (similar ruleset to previous implementation) - const lmInput = buildLMInputFromRecord(genre, subGenre, scene, action, camera, row.video_prompt); - - let finalPrompt: string | null = null; - const maxAttempts = 10; - - for (let attempt = 1; attempt <= maxAttempts; attempt++) { - let lmResponse: any = null; - try { - lmResponse = await callLMStudio(lmInput); - } catch (err) { - logger.warn(`LMStudio call failed for id=${id} (attempt ${attempt}): ${err}`); - // Retry on next loop iteration - continue; - } - - if (!lmResponse) { - logger.warn(`LMStudio returned empty response for id=${id} (attempt ${attempt}).`); - continue; - } - - const videoPrompt = lmResponse.videoPrompt || lmResponse.video_prompt || lmResponse.prompt || null; - if (!videoPrompt || typeof videoPrompt !== 'string') { - logger.warn(`LMStudio did not return a valid videoPrompt for id=${id} (attempt ${attempt}).`); - continue; - } - - // Check banned regex - if (banned.test(videoPrompt)) { - logger.info(`Generated prompt for id=${id} (attempt ${attempt}) still contains banned phrases - retrying.`); - logger.info(videoPrompt); - // If last attempt, we will fall through and skip update - continue; 
- } - - // Passed banned check - finalPrompt = videoPrompt; - break; - } - - if (!finalPrompt) { - logger.warn(`Could not generate a clean prompt for id=${id} after ${maxAttempts} attempts. Skipping update.`); - continue; - } - - // Update DB - try { - await query('UPDATE video SET video_prompt = ? WHERE id = ?', [finalPrompt, id]); - logger.info(`Updated video_prompt for id=${id}`); - } catch (err) { - logger.error(`Failed to update video_prompt for id=${id}: ${err}`); - } - } - - logger.info('Finished DB sweep for problematic prompts.'); -} - -/** - * Helper to construct LM input for a single DB record. - * Keeps the same HARD RULES and prohibited list as previous data-driven generation. - */ -function buildLMInputFromRecord( - genre: string, - subGenre: string, - finalScene: string, - chosenAction: string, - camera: string, - existingPrompt: string | undefined -) { - const accents = 'none'; - const mood = 'n/a'; - const lighting = 'n/a'; - const style = 'n/a'; - - const lmInput = ` -Return exactly one JSON object: { "videoPrompt": "..." } and nothing else. - -Write "videoPrompt" in 100–150 words, present tense, plain concrete language. - -HARD RULES (must comply): -- One continuous shot ("one take", "oner"). Real-time 8 seconds. No edits. -- Fixed location and vantage. Do not change background or angle. -- Lens and focal length locked. No zooms, no close-ups that imply a lens change, no rack zoom. -- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. -- Keep framing consistent (e.g., medium-wide two-shot). No “another shot/meanwhile.” -- Describe: (1) main action, (2) framing & motion, (3) lighting & mood, (4) style & small accents. -- Use clear simple sentences. No metaphors or poetic language. 
- -PROHIBITED WORDS/PHRASES (case-insensitive): -cut, cuts, cutting, quick cut, insert, macro insert, close-up, extreme close-up, -zoom, zooms, zooming, push-in, pull-out, whip, switch angle, change angle, -montage, cross-cut, smash cut, transition, meanwhile, later. - -If proximity is needed, say: "the camera glides slightly closer while staying in the same position." - -Here is information of the scene, please generate prompt for the video based on these information for key "videoPrompt": -Genre: ${genre} -Sub-Genre: ${subGenre} -Scene: ${finalScene} -Action: ${chosenAction || 'n/a'} -Camera: ${camera || 'static or subtle movement (stay within scene)'} -Accents: ${accents} -Mood: ${mood} -Lighting: ${lighting} -Style: ${style} -`; - - return lmInput; -} - -main(); diff --git a/src/generatePromptVideoFromImage.ts b/src/generatePromptVideoFromImage.ts new file mode 100644 index 0000000..7866914 --- /dev/null +++ b/src/generatePromptVideoFromImage.ts @@ -0,0 +1,300 @@ +import fs from 'fs'; +import path from 'path'; +import { query } from './lib/mysql'; +import { logger } from './lib/logger'; +import { callLMStudioWithFile } from './lib/lmstudio'; + +async function main() { + await updatePromptsFromDB(); + process.exit(); +} + +/** + * Utility: extract JSON substring from a text. + * Tries to extract from fenced ```json blocks first, otherwise extracts first {...} span. 
+ */ +function extractJsonFromText(text: string): any | null { + if (!text || typeof text !== 'string') return null; + + // Try fenced code block with optional json language + const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); + if (fenced && fenced[1]) { + try { + return JSON.parse(fenced[1].trim()); + } catch (e) { + // fall through to brace extraction + } + } + + // Try to extract first {...} match (greedy between first { and last }) + const brace = text.match(/\{[\s\S]*\}/); + if (brace && brace[0]) { + try { + return JSON.parse(brace[0]); + } catch (e) { + return null; + } + } + + return null; +} + +/** + * Wrapper to call LMStudio with an image and prompt, and extract JSON reliably. + * - Uses callLMStudioWithFile to pass the image. + * - Tries to parse JSON from response if needed. + * - Retries up to maxRetries times (default 5) when parsing fails or an error occurs. + */ +async function callLMWithImageAndExtract(imagePath: string, prompt: string, maxRetries = 5): Promise { + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + const res = await callLMStudioWithFile(imagePath, prompt); + // callLMStudioWithFile attempts to return parsed JSON already. Accept objects directly. + if (res && typeof res === 'object') { + return res; + } + + // If it returned text, try to extract JSON + if (typeof res === 'string') { + const parsed = extractJsonFromText(res); + if (parsed) return parsed; + } + + logger.warn(`callLMWithImageAndExtract: attempt ${attempt} returned unexpected result. 
Retrying...`); + } catch (err) { + logger.warn(`callLMWithImageAndExtract: attempt ${attempt} failed: ${err}`); + } + } + + logger.error(`callLMWithImageAndExtract: failed to get valid JSON after ${maxRetries} attempts`); + return null; +} + +/** + * Main sweep: find DB records whose video_prompt contains 'cut' or 'zoom' (case-insensitive), + * run multi-step LMStudio flow (object -> action -> camerawork -> final prompt) using the image, + * and update the record. + */ +async function updatePromptsFromDB() { + logger.info("Starting DB sweep for video_prompt containing 'cut' or 'zoom'..."); + + // Banned regex per requirement + const banned = /\b(cut|cuts|cutting|quick cut|insert|macro insert|close-?up|extreme close-?up|zoom|zooming|push-?in|pull-?out|whip|switch angle|change angle|montage|cross-?cut|smash cut|transition|meanwhile|later)\b/i; + + let rows: any[] = []; + try { + // Case-insensitive search for 'cut' or 'zoom' anywhere in video_prompt + rows = (await query( + "SELECT id, genre, sub_genre, scene, action, camera, video_prompt, image_path FROM video where (video_path = '' or video_path is null) and modified_at < '2025-08-30 09:15:33'", + )) as any[]; + } catch (err) { + logger.error('DB query failed while searching for problematic prompts:', err); + return; + } + + if (!rows || rows.length === 0) { + logger.info("No records found with 'cut' or 'zoom' in video_prompt."); + return; + } + + logger.info(`Found ${rows.length} record(s) to process.`); + + for (const row of rows) { + const id = row.id; + const genre = row.genre || ''; + const subGenre = row.sub_genre || ''; + const scene = row.scene || ''; + const action = row.action || ''; + const camera = row.camera || ''; + const imagePathRaw = row.image_path || row.image || null; + + if (!genre || !subGenre || !scene) { + logger.info(`Skipping id=${id} due to missing identification fields: genre='${genre}', sub_genre='${subGenre}', scene='${scene}'`); + continue; + } + + if (!imagePathRaw) { + 
logger.info(`Skipping id=${id} because image_path is empty for this record.`); + continue; + } + + // Resolve the image path: if relative, make absolute based on cwd + let imageFullPath = imagePathRaw; + if (!path.isAbsolute(imageFullPath)) { + imageFullPath = path.resolve(process.cwd(), imageFullPath); + } + + if (!fs.existsSync(imageFullPath)) { + logger.info(`Skipping id=${id} because image not found at path: ${imageFullPath}`); + continue; + } + + logger.info(`Processing id=${id} using image: ${imageFullPath}`); + + // Step 1: Detect main object + const step1Prompt = ` +Return exactly one JSON object and nothing else: { "mainobject": "..." }. +Look at the provided image and determine the single most prominent/main object or subject in the scene. +Answer with a short noun or short phrase (no extra commentary). +If unsure, give the best concise guess. +`; + const step1Res = await callLMWithImageAndExtract(imageFullPath, step1Prompt, 5); + const mainobject = (step1Res && (step1Res.mainobject || step1Res.mainObject || step1Res.object)) ? String(step1Res.mainobject || step1Res.mainObject || step1Res.object).trim() : ''; + + if (!mainobject) { + logger.warn(`id=${id} - could not detect main object. Skipping record.`); + continue; + } + + logger.info(`id=${id} - detected main object: ${mainobject}`); + + // Step 2: Determine best action for this scene + const step2Prompt = ` +You have access to the image and the detected main object: "${mainobject}". +Decide which single action type best fits this scene from the list: +- no action +- micro animation (animate object but small movement) +- big movement +- impossible movement + +Return exactly one JSON object and nothing else: { "actiontype": "...", "action": ""}. +Do not add commentary. Choose the single best option from the list above. +`; + const step2Res = await callLMWithImageAndExtract(imageFullPath, step2Prompt, 5); + const actiontype = (step2Res && (step2Res.actiontype || step2Res.actionType)) ? 
String(step2Res.actiontype || step2Res.actionType).trim() : ''; + + if (!actiontype) { + logger.warn(`id=${id} - could not determine action type. Skipping record.`); + continue; + } + + logger.info(`id=${id} - decided action type: ${actiontype}`); + + // Step 3: Ask LMStudio what is the best camera work for the scene + const step3Prompt = ` +Given the image and the following information: +- main object: "${mainobject}" +- chosen action type: "${actiontype}" + +From the options below pick the single best camera approach for this scene: +- static camera +- pan +- rotation +- follow the moving object +- zoom to the object +- impossible camera work + +Return exactly one JSON object and nothing else: { "cameraworkType": "..." }. +Choose one of the listed options and do not add commentary. +`; + const step3Res = await callLMWithImageAndExtract(imageFullPath, step3Prompt, 5); + const cameraworkType = (step3Res && (step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera)) ? String(step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera).trim() : ''; + + if (!cameraworkType) { + logger.warn(`id=${id} - could not determine camera work. Skipping record.`); + continue; + } + + logger.info(`id=${id} - decided camera work: ${cameraworkType}`); + + // Step 4: Generate final video prompt using all gathered info + const finalPromptInput = buildLMInputFromRecordWithImageInfo( + genre, + subGenre, + scene, + action, + camera, + mainobject, + actiontype, + cameraworkType + ); + + // Use wrapper to call LM and extract JSON { videoPrompt: "" } + const finalRes = await callLMWithImageAndExtract(imageFullPath, finalPromptInput, 5); + const videoPrompt = (finalRes && (finalRes.videoPrompt || finalRes.video_prompt || finalRes.prompt)) ? 
String(finalRes.videoPrompt || finalRes.video_prompt || finalRes.prompt).trim() : null; + logger.info(`id=${id} - videoPrompt: ${videoPrompt}`); + if (!videoPrompt) { + logger.warn(`id=${id} - LM did not return a valid videoPrompt. Skipping record.`); + continue; + } + + // Check banned regex + if (banned.test(videoPrompt)) { + logger.info(`Generated prompt for id=${id} contains banned phrases - skipping update.`); + logger.info(videoPrompt); + continue; + } + + + // Update DB + try { + await query('UPDATE video SET video_prompt = ? WHERE id = ?', [videoPrompt, id]); + logger.info(`Updated video_prompt for id=${id}`); + } catch (err) { + logger.error(`Failed to update video_prompt for id=${id}: ${err}`); + } + + } + + logger.info('Finished DB sweep for problematic prompts.'); +} + +/** + * Build final LM input for step 4, including HARD RULES and scene info. + * The LM should return: { "videoPrompt": "..." } + */ +function buildLMInputFromRecordWithImageInfo( + genre: string, + subGenre: string, + finalScene: string, + chosenAction: string, + camera: string, + mainobject: string, + actiontype: string, + cameraworkType: string +) { + const accents = 'none'; + const mood = 'n/a'; + const lighting = 'n/a'; + const style = 'n/a'; + + const lmInput = ` +Return exactly one JSON object: { "videoPrompt": "..." } and nothing else. + +Write "videoPrompt" in 100–150 words, present tense, plain concrete language. + +HARD RULES (must comply): +- One continuous shot ("one take", "oner"). Real-time 8 seconds. No edits. +- Fixed location and vantage. Do not change background or angle. +- Lens and focal length locked. No zooms, no close-ups that imply a lens change, no rack zoom. +- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. +- Keep framing consistent (e.g., medium-wide two-shot). No “another shot/meanwhile.” +- Describe: (1) main action, (2) framing & motion, (3) lighting & mood, (4) style & small accents. 
+- Use clear simple sentences. No metaphors or poetic language. + +PROHIBITED WORDS/PHRASES (case-insensitive): +cut, cuts, cutting, quick cut, insert, macro insert, close-up, extreme close-up, +zoom, zooms, zooming, push-in, pull-out, whip, switch angle, change angle, +montage, cross-cut, smash cut, transition, meanwhile, later. + +If proximity is needed, say: "the camera glides slightly closer while staying in the same position." + +Here is information of the scene, please generate prompt for the video based on these information for key "videoPrompt": +Genre: ${genre} +Sub-Genre: ${subGenre} +Scene: ${finalScene} +Existing Action Field: ${chosenAction || 'n/a'} +Existing Camera Field: ${camera || 'static or subtle movement (stay within scene)'} +Detected Main Object: ${mainobject} +Suggested Action Type: ${actiontype} +Suggested Camera Work: ${cameraworkType} +Accents: ${accents} +Mood: ${mood} +Lighting: ${lighting} +Style: ${style} +`; + + return lmInput; +} + +main(); diff --git a/src/pinterest_keywords.json b/src/pinterest_keywords.json index 3ff158b..030cdcc 100644 --- a/src/pinterest_keywords.json +++ b/src/pinterest_keywords.json @@ -3589,28 +3589,34 @@ }, { "genre": "fantasy", - "subGenre": "Talking Animals", + "subGenre": "Animals", "pinIds": [ - "112941903147144539", - "281543725720325", - "36943659439371180", - "11751649022612432", - "18436679719492625", - "1407443629725511", - "6544361954178116", - "1759287347907216", - "4151824652348137", - "569705421631320751", - "168603579798518857", - "301670875061735522", - "1407443629997072", - "1266706140762662", - "2603712281941095", - "633387443385941", + "166422148725296622", + "151996556170151090", + "673780794293391738", + "17099673582089553", + "715861303309575585", + "13933080091832547", + "11047961582093288", + "4011087180789478", + "75857574972480900", + "10836855346968288", + "19140367162342308", + "518969557079186929", + "6966574420158002", + "63754150969272553", + "18718154695261333", + 
"115193702963985873", "3166662232947239", - "1079245498202717474", - "3025924745501558", - "1407443629997060" + "35606653299616695", + "252553491598336020", + "3166662232949549", + "52917364367365713", + "322288917102637980", + "63191201015204585", + "53058101854654159", + "211174978191847", + "20266267068530882" ] }, { @@ -9332,6 +9338,38 @@ "14707136280473866" ] }, + { + "genre": "sports", + "subGenre": "Motocross", + "pinIds": [ + "1049338781913408309", + "60728294964705253", + "41376890329005782", + "54676582969961779", + "6544361952522477", + "30962316182630482", + "66498531996848387", + "35184440833770285", + "8233211827095090", + "155303888199282668", + "2814818510870558", + "24418022972582884", + "17099673582215596", + "69946600457122783", + "911697518289700421", + "13229392651243233", + "207658232811269227", + "335588609755373167" + ], + "videoInstructions": [ + "Ultra low angle: the tire passes extremely close to the camera, mud splashes onto the lens.", + "High-speed slow motion: capture the rider soaring through the air during a jump.", + "Drone tracking: follow the rider from directly above, from jump to landing.", + "Exaggerated perspective: use an ultra-wide lens to make cornering look dramatic and powerful.", + "Consecutive jumps from the side: riders leap one after another, seen in sequence from the side of the course.", + "Head-on approach: place the camera just before the jump, the rider flies directly overhead." + ] + }, { "genre": "technology", "subGenre": "3D Printing", diff --git a/src/piterest_styletransfer_video.ts b/src/piterest_styletransfer_video.ts index 57e0844..0d14776 100644 --- a/src/piterest_styletransfer_video.ts +++ b/src/piterest_styletransfer_video.ts @@ -11,6 +11,60 @@ import { VideoModel } from './lib/db/video'; dotenv.config(); + +// Utility: extract JSON substring from a text. +// Tries fenced ```json``` blocks first, otherwise extracts first {...} span. 
+function extractJsonFromText(text: string): any | null { + if (!text || typeof text !== 'string') return null; + + // Try fenced code block with optional json language + const fenced = text.match(/```(?:json)?\s*([\s\S]*?)\s*```/i); + if (fenced && fenced[1]) { + try { + return JSON.parse(fenced[1].trim()); + } catch (e) { + // fall through to brace extraction + } + } + + // Try to extract first {...} match + const brace = text.match(/\{[\s\S]*\}/); + if (brace && brace[0]) { + try { + return JSON.parse(brace[0]); + } catch (e) { + return null; + } + } + + return null; +} + +// Wrapper to call OpenAI with an image and prompt, and extract JSON reliably. +// - Uses callOpenAIWithFile to pass the image. +// - Tries to parse JSON from response if needed. +// - Retries up to maxRetries times (default 5) when parsing fails or an error occurs. +async function callOpenAIWithFileAndExtract(imagePath: string, prompt: string, maxRetries = 5): Promise { + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + const res = await callOpenAIWithFile(imagePath, prompt); + // callOpenAIWithFile may return an object or parsed JSON already + if (res && typeof res === 'object') { + return res; + } + if (typeof res === 'string') { + const parsed = extractJsonFromText(res); + if (parsed) return parsed; + } + // unexpected shape -> retry + logger.warn(`callOpenAIWithFileAndExtract: attempt ${attempt} returned unexpected result. 
Retrying...`); + } catch (err) { + logger.warn(`callOpenAIWithFileAndExtract: attempt ${attempt} failed: ${err}`); + } + } + logger.error(`callOpenAIWithFileAndExtract: failed to get valid JSON after ${maxRetries} attempts`); + return null; +} const servers = [ { baseUrl: process.env.SERVER1_COMFY_BASE_URL, @@ -36,9 +90,10 @@ interface GenerationTask { scene: string; action: string; camera: string; + videoInstructions?: string[]; } -async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: string, subGenre: string): Promise { +async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: string, subGenre: string, videoInstructions: string[] = []): Promise { const pinId = pinUrl.split('/').filter(Boolean).pop() || `pin_${Date.now()}`; const timestamp = new Date().getTime(); const imageFileName = `${pinId}_${timestamp}.png`; @@ -54,26 +109,121 @@ async function getPromptsForImage(imagePaths: string[], pinUrl: string, genre: s const imageForPrompt = renamedImagePaths[Math.floor(Math.random() * renamedImagePaths.length)]; try { - const promptResponse = await callOpenAIWithFile(imageForPrompt, - `Analyze the provided image and generate the following: - 1. 'scene': A description of the image's environment. - 2. 'action': A description of the main action occurring in the image. - 3. 'camera': A description of the camera shot (e.g., 'close-up', 'wide-angle'). - 4. 'image_prompt': A short and detailed prompt to generate this photo. - 5. 'video_prompt': A prompt describing a creative and subtle movement of the main object. The camera should be slight panning or static. + // Step 1: Detect main object + const step1Prompt = ` +Return exactly one JSON object and nothing else: { "mainobject": "..." }. +Look at the provided image and determine the single most prominent/main object or subject in the scene. +Answer with a short noun or short phrase (no extra commentary). +If unsure, give the best concise guess. 
+`; + const step1Res = await callOpenAIWithFileAndExtract(imageForPrompt, step1Prompt, 5); + const mainobject = (step1Res && (step1Res.mainobject || step1Res.mainObject || step1Res.object)) ? String(step1Res.mainobject || step1Res.mainObject || step1Res.object).trim() : ''; + + if (!mainobject) { + throw new Error('Could not detect main object'); + } + logger.info(`Detected main object for ${imageForPrompt}: ${mainobject}`); + + // Step 2: Determine best action for this scene + const step2Prompt = ` +You have access to the image and the detected main object: "${mainobject}". +Decide which single action type best fits this scene from the list: +- no action +- micro animation (animate object but small movement) +- big movement +- impossible movement + +Return exactly one JSON object and nothing else: { "actiontype": "..." }. +Do not add commentary. Choose the single best option from the list above. +`; + const step2Res = await callOpenAIWithFileAndExtract(imageForPrompt, step2Prompt, 5); + const actiontype = (step2Res && (step2Res.actiontype || step2Res.actionType)) ? String(step2Res.actiontype || step2Res.actionType).trim() : ''; + + if (!actiontype) { + throw new Error('Could not determine action type'); + } + logger.info(`Decided action type for ${imageForPrompt}: ${actiontype}`); + + // Step 3: Ask OpenAI what is the best camera work for the scene + const step3Prompt = ` +Given the image and the following information: +- main object: "${mainobject}" +- chosen action type: "${actiontype}" + +From the options below pick the single best camera approach for this scene: +- static camera +- pan +- rotation +- follow the moving object +- zoom to the object +- impossible camera work + +Return exactly one JSON object and nothing else: { "cameraworkType": "..." }. +Choose one of the listed options and do not add commentary. 
+`;
+        const step3Res = await callOpenAIWithFileAndExtract(imageForPrompt, step3Prompt, 5);
+        const cameraworkType = (step3Res && (step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera)) ? String(step3Res.cameraworkType || step3Res.cameraWorkType || step3Res.camera).trim() : '';
+
+        if (!cameraworkType) {
+            throw new Error('Could not determine camera work');
+        }
+        logger.info(`Decided camera work for ${imageForPrompt}: ${cameraworkType}`);
+
+        let videoInstruction = "";
+        if (videoInstructions && videoInstructions.length > 0) {
+
+            const videoInstructionPrompt = `
+Given the image and the following information:
+- main object: "${mainobject}"
+
+From the options below pick the single best camera approach for this scene:
+${videoInstructions.join(",\r\n")}
+
+Return exactly one JSON object and nothing else: { "videoInstruction": "..." }.
+Choose one of the listed options and do not add commentary.
+`;
+            const videoInstructionRes = await callOpenAIWithFileAndExtract(imageForPrompt, videoInstructionPrompt, 5);
+            const videoInstructionFinalRes = (videoInstructionRes && (videoInstructionRes.videoInstruction || videoInstructionRes.camera)) ? String(videoInstructionRes.videoInstruction || videoInstructionRes.camera).trim() : '';
+
+            if (videoInstructionFinalRes)
+                videoInstruction = videoInstructionFinalRes
+
+        }
+        // Step 4: Generate final video prompt (and image prompt) using all gathered info
+        const finalPrompt = `
+Return exactly one JSON object: { "scene": "...", "action":"...", "camera":"...", "image_prompt":"...", "videoPrompt":"..." } and nothing else.
+
+Write "videoPrompt" in 100–150 words, present tense, plain concrete language.
+Write "image_prompt" as a concise, detailed prompt suitable for generating a similar image.
+
+HARD RULES (must comply for videoPrompt):
+- One continuous shot. Real-time 8 seconds. No edits.
+- Fixed location and vantage. 
Do not change background or angle. +- Lens and focal length locked. No zooms, no close-ups that imply a lens change. +- Camera motion: at most subtle pan/tilt/dolly within 1 meter while staying in the same spot. +- Keep framing consistent. No “another shot/meanwhile.” +- Use clear simple sentences. No metaphors or poetic language. + +Here is information of the scene, please generate fields accordingly: +Detected Main Object: ${mainobject} +Suggested Action Type: ${actiontype} +Suggested Camera Work: ${cameraworkType} +Genre: ${genre} +Sub-Genre: ${subGenre} +${videoInstruction ? 'video instruction:' + videoInstruction : ""} +`; + const finalRes = await callOpenAIWithFileAndExtract(imageForPrompt, finalPrompt, 5); + + const scene = finalRes && (finalRes.scene || finalRes.Scene) ? String(finalRes.scene) : ''; + const action = finalRes && (finalRes.action || finalRes.Action) ? String(finalRes.action) : ''; + const camera = finalRes && (finalRes.camera || finalRes.Camera) ? String(finalRes.camera) : ''; + const imagePrompt = finalRes && (finalRes.image_prompt || finalRes.imagePrompt || finalRes.image_prompt) ? String(finalRes.image_prompt || finalRes.imagePrompt) : ''; + const videoPrompt = finalRes && (finalRes.videoPrompt || finalRes.video_prompt || finalRes.video_prompt) ? 
String(finalRes.videoPrompt || finalRes.video_prompt) : ''; + + if (!imagePrompt || !videoPrompt) { + throw new Error('Final LM output did not include image_prompt or videoPrompt'); + } - Output should be in this JSON format: - --- - { - "scene": "{result comes here}", - "action": "{result comes here}", - "camera": "{result comes here}", - "image_prompt": "{result comes here}", - "video_prompt": "{result comes here}" - } - --- - `); - const { scene, action, camera, image_prompt: imagePrompt, video_prompt: videoPrompt } = promptResponse; logger.info(`Image prompt for ${imageForPrompt}:`, imagePrompt); logger.info(`Video prompt for ${imageForPrompt}:`, videoPrompt); @@ -179,7 +329,7 @@ async function getPinUrlFromPinterest(keyword: string): Promise { (async () => { // Load pinterest keywords JSON, pick up to 20 subGenres and choose 1 pinId per subGenre const keywordsFilePath = path.resolve(process.cwd(), 'src', 'pinterest_keywords.json'); - let allKeywords: { genre: string; subGenre: string; pinIds?: string[]; pinId?: string[] }[] = []; + let allKeywords: { genre: string; subGenre: string; pinIds?: string[]; pinId?: string[], videoInstructions?: string[] }[] = []; try { const raw = await fs.readFile(keywordsFilePath, 'utf-8'); allKeywords = JSON.parse(raw); @@ -189,7 +339,7 @@ async function getPinUrlFromPinterest(keyword: string): Promise { } allKeywords = allKeywords.filter(a => { - return (a.genre == "food" && a.subGenre == "imagination") + return (a.genre == "sports" && a.subGenre == "Motocross") }); function shuffle(arr: T[]): T[] { @@ -205,24 +355,39 @@ async function getPinUrlFromPinterest(keyword: string): Promise { // Download up to `count` images from a pin URL by opening the pin page and scrolling up to 5 times to trigger lazy loading // Returns an array of saved image paths (may be empty) async function downloadOneImageFromPin(pinUrl: string, count: number = 1): Promise { - const browser = await puppeteer.launch({ headless: true }); + const browser = 
await puppeteer.launch({ headless: false }); const page = await browser.newPage(); await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36'); await page.setViewport({ width: 1920, height: 1080 }); try { await page.goto(pinUrl, { waitUntil: 'networkidle2', timeout: 30000 }); - for (let i = 0; i < 5; i++) { + for (let i = 0; i < 3; i++) { await page.evaluate('window.scrollTo(0, document.body.scrollHeight)'); await new Promise((r) => setTimeout(r, 700 + Math.random() * 800)); } - const imgs: string[] = await page.$$eval('img', imgs => - imgs.map(img => (img as HTMLImageElement).src) - .filter(src => !!src && (src.includes('pinimg') || /\.(jpe?g|png|webp)$/i.test(src))) - ); + const imgs: string[] = await page.$$eval('img', imgs => { + // For each try to extract the 4x (original) URL from srcset. + // srcset example: + // "https://i.pinimg.com/236x/...jpg 1x, https://i.pinimg.com/474x/...jpg 2x, https://i.pinimg.com/736x/...jpg 3x, https://i.pinimg.com/originals/...jpg 4x" + const urls: string[] = imgs.map(img => { + const srcset = (img as HTMLImageElement).getAttribute('srcset') || ''; + if (!srcset) return ''; + const parts = srcset.split(',').map(p => p.trim()); + for (const part of parts) { + const m = part.match(/^(\S+)\s+4x$/); + if (m && m[1]) return m[1]; + } + // fallback: if src contains "originals" return src + const src = (img as HTMLImageElement).src || ''; + if (src.includes('/originals/')) return src; + return ''; + }).filter(s => !!s && s.includes('pinimg')); + return urls; + }); if (!imgs || imgs.length === 0) { - logger.warn(`No image src found on pin page ${pinUrl}`); + logger.warn(`No image src (4x) found on pin page ${pinUrl}`); return []; } @@ -265,13 +430,16 @@ async function getPinUrlFromPinterest(keyword: string): Promise { } } + const numberOfPinIds = 20; // Build keywords list with single chosen pinId per selected subGenre - const keywords: { genre: string; 
subGenre: string; pinId: string[] }[] = []; + const keywords: { + genre: string; subGenre: string; pinIds: string[], videoInstructions?: string[] + }[] = []; for (const entry of selectedEntries) { const pinIds = (entry.pinIds || entry.pinId) as string[] | undefined; if (!Array.isArray(pinIds) || pinIds.length === 0) continue; - const chosenPinId = pinIds[Math.floor(Math.random() * pinIds.length)]; - keywords.push({ genre: entry.genre, subGenre: entry.subGenre, pinId: [chosenPinId] }); + const chosenPinId = pinIds.splice(0, numberOfPinIds); + keywords.push({ genre: entry.genre, subGenre: entry.subGenre, pinIds: chosenPinId, videoInstructions: entry.videoInstructions }); } if (keywords.length === 0) { @@ -291,24 +459,9 @@ async function getPinUrlFromPinterest(keyword: string): Promise { const { genre, subGenre } = genreSubGenre; - for (let i = 0; i < 10; i++) { - // pinId is now an array with a single chosen id. Pick the first element. - const pinIdField = (genreSubGenre as any).pinId; - let selectedPinId: string | undefined; - if (Array.isArray(pinIdField) && pinIdField.length > 0) { - selectedPinId = pinIdField[0]; - logger.info(`Selected chosen pinId ${selectedPinId} for ${genre} / ${subGenre}`); - } else if (typeof pinIdField === 'string' && pinIdField) { - selectedPinId = pinIdField; - logger.info(`Using single pinId ${selectedPinId} for ${genre} / ${subGenre}`); - } + for (const pinId of genreSubGenre.pinIds) { - if (!selectedPinId) { - logger.warn(`No pinId available for ${genre}/${subGenre}. 
Skipping.`); - continue; - } - - const pin = `https://www.pinterest.com/pin/${selectedPinId}/`; + const pin = `https://www.pinterest.com/pin/${pinId}/`; logger.info(`--- Starting processing for pin: ${pin} ---`); // download images from the pin page (pass desired count as second arg) @@ -323,8 +476,9 @@ async function getPinUrlFromPinterest(keyword: string): Promise { // proceed if we have at least one image if (selectedImages.length >= 1) { - const task = await getPromptsForImage(selectedImages, pin, genre, subGenre); + const task = await getPromptsForImage(selectedImages, pin, genre, subGenre, genreSubGenre.videoInstructions); if (task) { + task.videoInstructions = genreSubGenre.videoInstructions; generationTasks.push(task); } } else {