#!/usr/bin/env python3
"""
Wan 2.2 I2V (Image-to-Video) API prompt builder and submitter.
Uses Kijai's WanVideoWrapper nodes with our downloaded fp8 models.
"""
import json, requests, uuid

API = "http://127.0.0.1:8188"
CLIENT_ID = str(uuid.uuid4())

def submit(prompt_dict: dict, label: str = "", *, timeout: float = 60.0):
    """Queue an API-format prompt on the local ComfyUI server.

    Args:
        prompt_dict: Node graph in ComfyUI API format
            (node id -> {"class_type", "inputs"}).
        label: Short tag used only for console output.
        timeout: Seconds to wait for the HTTP response; prevents the
            script from hanging forever when the server is unreachable.

    Returns:
        The queued prompt id string on success, otherwise ``None``.
    """
    r = requests.post(
        f"{API}/prompt",
        json={"prompt": prompt_dict, "client_id": CLIENT_ID},
        timeout=timeout,
    )
    if r.status_code == 200:
        pid = r.json().get("prompt_id", "?")
        print(f"✅ [{label}]: {pid}")
        return pid
    # Surface the start of the server's error body to aid debugging.
    print(f"❌ [{label}]: {r.status_code} {r.text[:300]}")
    return None

def make_wan22_i2v_prompt(
    input_image: str,
    prompt_text: str,
    negative_prompt: str = "色调艳丽，过曝，静态，细节模糊不清，字幕，风格，作品，画作，画面，静止，整体发灰，最差质量，低质量，JPEG压缩残留",
    width: int = 832,
    height: int = 480,
    frames: int = 81,
    steps: int = 4,
    cfg: float = 1.0,
    shift: float = 3.0,
    seed: int = 42,
    save_prefix: str = "wan22_i2v/output"
) -> dict:
    """Build a ComfyUI API-format node graph for Wan 2.2 image-to-video.

    Targets Kijai's WanVideoWrapper custom nodes with the fp8-scaled
    checkpoints. Files expected in the ComfyUI models folders:

    - HIGH model: Wan2_2-I2V-A14B-HIGH_fp8_e4m3fn_scaled_KJ.safetensors
    - LOW model: Wan2_2-I2V-A14B-LOW_fp8_e4m3fn_scaled_KJ.safetensors
    - Text encoder: umt5-xxl-enc-bf16.safetensors
    - VAE: wan_2.1_vae.safetensors
    - CLIP vision: open-clip-xlm-roberta-large-vit-huge-14_visual_fp16.safetensors

    NOTE(review): ``negative_prompt`` is accepted but never wired into any
    node below — confirm whether the text-encode node should receive it.
    NOTE(review): the sampler's ``image_embeds`` comes from the raw CLIP
    vision encode ("3") while the I2V encode output ("7") feeds
    ``samples`` — verify against the wrapper's expected wiring.

    Returns:
        dict mapping node id -> {"class_type", "inputs"}, ready to POST
        to the ComfyUI ``/prompt`` endpoint.
    """
    def node(class_type: str, **inputs) -> dict:
        # One node entry in ComfyUI API format; ["<id>", <slot>] values
        # reference another node's output.
        return {"class_type": class_type, "inputs": inputs}

    return {
        # Source still image.
        "1": node("LoadImage", image=input_image),
        # CLIP vision tower used for image conditioning.
        "2": node(
            "CLIPVisionLoader",
            clip_name="open-clip-xlm-roberta-large-vit-huge-14_visual_fp16.safetensors",
        ),
        # Embed the input image with CLIP vision.
        "3": node("CLIPVisionEncode", clip_vision=["2", 0], image=["1", 0], crop="center"),
        # UMT5-XXL text encoder.
        "4": node(
            "LoadWanVideoT5TextEncoder",
            model_name="umt5-xxl-enc-bf16.safetensors",
            precision="bf16",
        ),
        # Positive prompt embedding.
        "5": node("WanVideoTextEncode", text=prompt_text, text_encoder=["4", 0]),
        # Wan 2.1 VAE (shared with 2.2).
        "6": node("WanVideoVAELoader", model_name="wan_2.1_vae.safetensors", dtype="bf16"),
        # Turn the start image into I2V latents/embeds.
        "7": node(
            "WanVideoImageToVideoEncode",
            width=width,
            height=height,
            num_frames=frames,
            noise_aug_strength=0.0,
            start_latent_strength=1.0,
            end_latent_strength=1.0,
            force_offload=True,
            vae=["6", 0],
            clip_embeds=["3", 0],
            start_image=["1", 0],
        ),
        # HIGH-noise diffusion model (fp8 scaled), loaded on CPU and offloaded.
        "8": node(
            "WanVideoModelLoader",
            model="I2V/Wan2_2-I2V-A14B-HIGH_fp8_e4m3fn_scaled_KJ.safetensors",
            base_precision="fp8_e4m3fn",
            quantization="fp8_e4m3fn_scaled",
            load_device="cpu",
        ),
        # Sampling pass with the HIGH model.
        "9": node(
            "WanVideoSampler",
            model=["8", 0],
            image_embeds=["3", 0],
            steps=steps,
            cfg=cfg,
            shift=shift,
            seed=seed,
            force_offload=True,
            scheduler="unipc",
            riflex_freq_index=0,
            text_embeds=["5", 0],
            samples=["7", 0],
        ),
        # Decode latents back to frames.
        "10": node(
            "WanVideoDecode",
            vae=["6", 0],
            samples=["9", 0],
            enable_vae_tiling=False,
            tile_x=512,
            tile_y=512,
            tile_stride_x=256,
            tile_stride_y=256,
        ),
        # Write the frames out as a WEBM clip.
        "11": node(
            "SaveWEBM",
            images=["10", 0],
            filename_prefix=save_prefix,
            fps=24,
            lossless=False,
            quality=85,
        ),
    }


# ── Example usage ────────────────────────────────────────────────────────────

if __name__ == "__main__":
    import sys

    # CLI: <input_image> <prompt> [save_prefix]
    args = sys.argv[1:]
    if len(args) < 2:
        print("Usage: python submit_wan22_i2v.py <input_image> <prompt> [save_prefix]")
        print("Example: python submit_wan22_i2v.py test.jpg 'a woman walking through a forest'")
        sys.exit(1)

    image_path, text = args[0], args[1]
    prefix = args[2] if len(args) >= 3 else "wan22_i2v/test"

    graph = make_wan22_i2v_prompt(
        input_image=image_path,
        prompt_text=text,
        save_prefix=prefix,
        width=832,
        height=480,
        frames=81,
        steps=4,
        cfg=1.0,
        seed=42,
    )

    # Use the final path component of the prefix as the console label.
    submit(graph, prefix.split("/")[-1])
    print("\nView output at: http://fred:8188")
