3. Latent diffusion

This tutorial demonstrates how to generate images with latent diffusion models.

# Notebook setup commands (shell escapes, left commented out): clone the EDM2
# repository into ./edm2 and install the extra Python packages.
# !git clone --depth 1 --single-branch https://github.com/NVlabs/edm2
# !pip install accelerate diffusers
import sys
import torch

# Put the cloned `edm2` directory on the import path. This is done before the
# remaining imports; presumably the pre-trained checkpoint needs the EDM2
# modules to be importable -- TODO confirm.
sys.path.append("edm2")

from torchvision.transforms.functional import to_pil_image

from azula.plugins import eldm
from azula.sample import LMSSampler

# Device that every module and tensor below is moved to (hard-coded to CUDA).
device = "cuda"
# Seed PyTorch's global RNG. The return value is bound to `_`, presumably so
# that the notebook cell does not display it.
_ = torch.manual_seed(0)

3.1. Pre-trained latent diffusion model

# Load the pre-trained denoiser and its autoencoder by model name. The log
# line printed below shows that the checkpoint is cached on disk and the
# download is skipped when the file already exists.
denoiser, autoencoder = eldm.load_model("imagenet_512x512_xxl")
# Move both modules to the selected device.
denoiser, autoencoder = denoiser.to(device), autoencoder.to(device)
Skipping download as /mnt/home/frozet/.cache/azula/hub/https.nvlabs-fi-cdn.nvidia.com.edm2.posthoc-reconstructions.edm2-img512-xxl-0939524-0.150.pkl already exists.
def postprocess(x):
    """Return `x` with every element limited to the closed range [0, 1].

    Values below 0 become 0 and values above 1 become 1; the input tensor is
    left untouched. In this tutorial the result is handed to `to_pil_image`.
    """
    lower, upper = 0, 1
    return torch.clamp(x, min=lower, max=upper)
# Wrap the denoiser in an LMS sampler configured with steps=32 and move it to
# the selected device.
sampler = LMSSampler(denoiser, steps=32).to(device)
# One-hot vector of length 1000 selecting class index 88, used as the
# conditioning label.
label = torch.nn.functional.one_hot(torch.tensor(88), 1000).to(device)

# Starting point for a single sample, flattened to 4 * 64 * 64 values
# (presumably random noise drawn by the sampler -- TODO confirm).
z1 = sampler.init((1, 4 * 64 * 64))
# Run the sampler from z1, passing the label through as a keyword argument.
z0 = sampler(z1, label=label)

# Unflatten the result to (batch, 4, 64, 64), decode it with the autoencoder
# and reshape the output to a single 3 x 512 x 512 image.
x = autoencoder.decode(z0.reshape(-1, 4, 64, 64)).reshape(3, 512, 512)
# Clip values to [0, 1] and convert to a PIL image (the cell's displayed output).
to_pil_image(postprocess(x))
../_images/40c49605d87b8cd9f2ca7591b2583087e6bed38dd2409cd4c4ac6904b3522044.png

3.2. Classifier-free guidance

# One-hot vector of length 1000 selecting class index 33.
label = torch.nn.functional.one_hot(torch.tensor(33), 1000).to(device)

# Fresh starting point for a single sample, flattened to 4 * 64 * 64 values.
z1 = sampler.init((1, 4 * 64 * 64))
# Same sampler call as before, with the extra keyword `omega=1.5`
# (presumably the classifier-free guidance strength, given the section
# title -- TODO confirm against the denoiser's signature).
z0 = sampler(z1, label=label, omega=1.5)

# Unflatten the result to (batch, 4, 64, 64), decode it with the autoencoder
# and reshape the output to a single 3 x 512 x 512 image.
x = autoencoder.decode(z0.reshape(-1, 4, 64, 64)).reshape(3, 512, 512)
# Clip values to [0, 1] and convert to a PIL image (the cell's displayed output).
to_pil_image(postprocess(x))
../_images/0b5f5b43676affadf0e5c746266c9c41276a68e3e37e16463ff0ce5c4321ef25.png