## Setup and Configuration
### ComfyUI Workflow
```json
{
"last_node_id": 15,
"last_link_id": 14,
"nodes": [
{
"id": 6,
"type": "CLIPTextEncode",
"pos": [100, 100],
"size": {"0": 400, "1": 200},
"inputs": {"clip": ["4", 1]},
"outputs": [["3", 0]],
"widgets_values": ["beautiful landscape, mountains, lake, sunset, highly detailed, 8k"]
},
{
"id": 7,
"type": "CLIPTextEncode",
"pos": [100, 400],
"size": {"0": 400, "1": 200},
"inputs": {"clip": ["4", 1]},
"outputs": [["3", 1]],
"widgets_values": ["blurry, low quality, distorted"]
}
]
}
```
### Python API
```python
import torch
from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler

# Load SDXL base in half precision from safetensors.
pipe = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    variant="fp16",
    use_safetensors=True,
)

# DPM++ SDE scheduler: good quality at ~30 steps.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config,
    algorithm_type="sde-dpmsolver++",
)

pipe.enable_xformers_memory_efficient_attention()
# NOTE: do NOT also call pipe.to("cuda") here — enable_model_cpu_offload()
# manages device placement itself (it moves each sub-model to the GPU only
# while it runs); moving the whole pipeline to CUDA first defeats the
# offload and errors in recent diffusers versions.
pipe.enable_model_cpu_offload()

prompt = "professional photo of an astronaut riding a horse on mars, detailed, 8k"
negative_prompt = "blurry, low quality, distorted, amateur"

# Fixed seed for reproducible output.
image = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=1024,
    height=1024,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=torch.Generator("cuda").manual_seed(42),
).images[0]
image.save("output.png")
```
## LoRA Training
### Kohya_ss Configuration
```bash
git clone https://github.com/bmaltais/kohya_ss.git
cd kohya_ss
./setup.sh
```

### Training config (JSON)
```json
{
"pretrained_model_name_or_path": "stabilityai/stable-diffusion-xl-base-1.0",
"vae": "madebyollin/sdxl-vae-fp16-fix",
"train_data_dir": "/path/to/dataset",
"output_dir": "/path/to/output",
"max_train_steps": 1000,
"save_every_n_steps": 100,
"learning_rate": 0.0001,
"lr_scheduler": "constant",
"network_dim": 32,
"network_alpha": 16,
"resolution": "1024,1024",
"train_batch_size": 1,
"mixed_precision": "fp16",
"optimizer_type": "AdamW8bit",
"xformers": true,
"gradient_checkpointing": true
}
```
### Using LoRA
```python
import torch
from diffusers import StableDiffusionXLPipeline

# Load the SDXL base model in half precision on the GPU.
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")

# Attach the trained LoRA and blend it at 80% strength.
pipeline.load_lora_weights("path/to/lora.safetensors", adapter_name="my_lora")
pipeline.set_adapters(["my_lora"], adapter_weights=[0.8])

# Generate with the LoRA's trigger word in the prompt.
result = pipeline(
    "photo of <trigger_word>, high quality",
    num_inference_steps=30,
)
image = result.images[0]
```
## ControlNet
```python
import torch  # was missing: torch.float16 is used below
import cv2
import numpy as np
from PIL import Image  # was missing: Image.fromarray is used below
from diffusers import ControlNetModel, StableDiffusionXLControlNetPipeline
from diffusers.utils import load_image

# Canny-edge ControlNet trained for SDXL.
canny_controlnet = ControlNetModel.from_pretrained(
    "diffusers/controlnet-canny-sdxl-1.0",
    torch_dtype=torch.float16,
)
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    controlnet=canny_controlnet,
    torch_dtype=torch.float16,
).to("cuda")

# Build the conditioning image: Canny edges, replicated to 3 channels
# (the ControlNet expects an RGB control image).
image = load_image("input.jpg")
image_np = np.array(image)
edges = cv2.Canny(image_np, 100, 200)
edges_rgb = np.concatenate([edges[:, :, None]] * 3, axis=2)
control_image = Image.fromarray(edges_rgb)

result = pipe(
    prompt="modern building, architectural photography",
    image=control_image,
    controlnet_conditioning_scale=0.7,  # how strongly edges constrain the output
    num_inference_steps=30,
).images[0]
```
## Advanced Techniques
### Img2Img
```python
import torch
from diffusers import StableDiffusionXLImg2ImgPipeline  # txt2img pipelines don't accept image=/strength=
from diffusers.utils import load_image

# A dedicated img2img pipeline is required; the base txt2img pipeline
# has no `image`/`strength` parameters.
pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")

init_image = load_image("input.jpg").resize((1024, 1024))
result = pipe(
    prompt="transform into watercolor painting",
    image=init_image,
    strength=0.75,  # How much to change (0 = keep input, 1 = ignore it)
    num_inference_steps=30,
).images[0]
```
### Inpainting
```python
import torch
from PIL import Image, ImageDraw  # was missing: Image/ImageDraw used below
from diffusers import StableDiffusionXLInpaintPipeline
from diffusers.utils import load_image

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
).to("cuda")

# Image to edit (was previously undefined in this snippet).
init_image = load_image("input.jpg").resize((1024, 1024))

# Mask: black = keep, white = region to inpaint.
mask = Image.new("L", (1024, 1024), 0)  # Black
mask_draw = ImageDraw.Draw(mask)
mask_draw.rectangle([400, 400, 600, 600], fill=255)  # White region to inpaint

result = pipe(
    prompt="a cat sitting",
    image=init_image,
    mask_image=mask,
    num_inference_steps=30,
).images[0]
```
## Best Practices
- Use the fp16-fixed VAE (madebyollin/sdxl-vae-fp16-fix) to avoid color artifacts in half precision
- Enable xformers memory-efficient attention to reduce VRAM usage
- Use a DPMSolver++ scheduler for good quality at ~30 inference steps
- Train LoRAs with at least 10-20 well-captioned images
- Use ControlNet for precise composition control