# Hunyuan Video training config
# Config file
# Top-level diffusion-pipe training settings for a HunyuanVideo LoRA run.
# Directory where training output (checkpoints, saved weights) is written.
output_dir = 'output/pv'
# Separate TOML describing the training data consumed by the trainer.
dataset = 'datasets/pv_dataset.toml'

# Upper bound on training epochs; runs are typically stopped earlier by hand.
epochs = 1000
# Per-GPU micro batch size; effective batch also depends on accumulation below.
micro_batch_size_per_gpu = 1
# 1 = pipeline parallelism disabled (single pipeline stage).
pipeline_stages = 1
# Gradients are accumulated over 4 micro-batches before each optimizer step.
gradient_accumulation_steps = 4
# Gradient clipping threshold (presumably max global grad norm — standard for
# diffusion-pipe/DeepSpeed; confirm against the trainer docs).
gradient_clipping = 1.0
# Learning-rate warmup length, in optimizer steps.
warmup_steps = 100

# Evaluate once per epoch, and once before the first training step.
eval_every_n_epochs = 1
eval_before_first_step = true
# Eval batching mirrors training, but without gradient accumulation.
eval_micro_batch_size_per_gpu = 1
eval_gradient_accumulation_steps = 1

# Save trained weights every 2 epochs; full resumable checkpoint every 2 epochs.
save_every_n_epochs = 2
checkpoint_every_n_epochs = 2
# Batch size for the one-time latent / text-embedding caching pass.
caching_batch_size = 1
# Log training metrics every step.
steps_per_print = 1
# How a training clip is extracted from each video; 'single_beginning' takes
# one clip from the start of the file (other modes exist — NOTE(review):
# confirm available values against the diffusion-pipe version in use).
video_clip_mode = 'single_beginning'

[model]
type = 'hunyuan-video'
# fp8 (e4m3fn) cfg-distilled transformer checkpoint; kept in float8 at runtime
# via transformer_dtype below.
transformer_path = '/u/SD/models/HunyuanVideo/models/hunyuan/hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors'
vae_path = '/u/SD/models/HunyuanVideo/models/hunyuan/hunyuan_video_vae_bf16.safetensors'
# LLaVA-LLaMA-3-8B text encoder + tokenizer directory.
llm_path = '/u/SD/models/HunyuanVideo/models/llm/llava-llama-3-8b-text-encoder-tokenizer'
# CLIP ViT-L/14 encoder directory.
clip_path = '/u/SD/models/HunyuanVideo/models/clip/clip-vit-large-patch14'
# Base compute dtype for the run.
dtype = 'bfloat16'
# Keep transformer weights in float8 to reduce VRAM.
transformer_dtype = 'float8'
timestep_sample_method = 'logit_normal'

# Dataset config
# NOTE(review): the keys below are dataset-level settings, but a TOML parser
# attaches them to [model] because they appear under this header — and
# `dataset = 'datasets/pv_dataset.toml'` above already points at a separate
# dataset file. This looks like two configs pasted into one file; verify these
# keys are not silently ignored here and move them into the dataset TOML if so.
resolutions = [768]
enable_ar_bucket = true
# Aspect-ratio bucketing range (width/height) and bucket count.
min_ar = 0.5
max_ar = 2.0
num_ar_buckets = 8
# Frame-count buckets: 1 = still images, 33 = video clips.
frame_buckets = [1, 33]

# Still images, trained at 768px; repeated 4x per epoch to balance against the
# video directories below.
[[directory]]
path = '/u/SD/training/video/p/imgs'
resolutions = [768]
num_repeats = 4

# 640x480 source videos: pinned to a single 4:3 bucket, 33-frame clips.
[[directory]]
path = '/u/SD/training/video/p/vids/640x480'
ar_buckets = [[640, 480]]
resolutions = [[640, 480]]
frame_buckets = [33]
num_repeats = 2

# 1280x720 source videos, trained downscaled to 640x360 (same 16:9 aspect).
[[directory]]
path = '/u/SD/training/video/p/vids/1280x720'
ar_buckets = [[640, 360]]
resolutions = [[640, 360]]
frame_buckets = [33]
num_repeats = 2

# 720x1280 portrait source videos, trained downscaled to 360x640.
[[directory]]
path = '/u/SD/training/video/p/vids/720x1280'
ar_buckets = [[360, 640]]
resolutions = [[360, 640]]
frame_buckets = [33]
num_repeats = 2

[adapter]
type = 'lora'
# LoRA rank (dimension of the low-rank update matrices).
rank = 32
# Dtype of the LoRA weights.
dtype = 'bfloat16'

# You need to patch diffusion-pipe for this.
# See https://github.com/tdrussell/diffusion-pipe/issues/66#issuecomment-2616116274
# Restrict LoRA training to the transformer's double-stream blocks only.
only_double_blocks = true

[optimizer]
# 8-bit AdamW (presumably bitsandbytes AdamW8bit) — cuts optimizer-state VRAM.
type = 'adamw8bit'
lr = 8e-5
# beta2 = 0.99 (below the AdamW default of 0.999) — faster-adapting second moment.
betas = [0.9, 0.99]
weight_decay = 0.01
eps = 1e-8