
Commit cefa28f

sayakpaul and stevhliu authored
[docs] Promote AutoModel usage (#11300)
* docs: promote the usage of automodel.
* bitsandbytes
* Apply suggestions from code review

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
1 parent 8819cda commit cefa28f

File tree

8 files changed: +44 -38 lines
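Every file in this commit makes the same substitution: a concrete model class such as `FluxTransformer2DModel` or `UNet2DConditionModel` is replaced with the [`AutoModel`] API, which reads the checkpoint's config and instantiates the matching class itself. A minimal sketch of the promoted pattern, using the Flux transformer that appears throughout these docs (the `torch_dtype` argument is only illustrative):

```python
import torch
from diffusers import AutoModel  # instead of, e.g., FluxTransformer2DModel

# AutoModel inspects the config in the given subfolder and returns the
# concrete model class for that checkpoint.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
```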

Diff for: docs/source/en/quantization/bitsandbytes.md

+16-16
@@ -49,7 +49,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
 
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 from transformers import T5EncoderModel
 
 quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,)
@@ -63,7 +63,7 @@ text_encoder_2_8bit = T5EncoderModel.from_pretrained(
 
 quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True,)
 
-transformer_8bit = FluxTransformer2DModel.from_pretrained(
+transformer_8bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -74,7 +74,7 @@ transformer_8bit = FluxTransformer2DModel.from_pretrained(
 By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
 
 ```diff
-transformer_8bit = FluxTransformer2DModel.from_pretrained(
+transformer_8bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -133,7 +133,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
 
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 from transformers import T5EncoderModel
 
 quant_config = TransformersBitsAndBytesConfig(load_in_4bit=True,)
@@ -147,7 +147,7 @@ text_encoder_2_4bit = T5EncoderModel.from_pretrained(
 
 quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True,)
 
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
+transformer_4bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -158,7 +158,7 @@ transformer_4bit = FluxTransformer2DModel.from_pretrained(
 By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
 
 ```diff
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
+transformer_4bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -217,11 +217,11 @@ print(model.get_memory_footprint())
 Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:
 
 ```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
+from diffusers import AutoModel, BitsAndBytesConfig
 
 quantization_config = BitsAndBytesConfig(load_in_4bit=True)
 
-model_4bit = FluxTransformer2DModel.from_pretrained(
+model_4bit = AutoModel.from_pretrained(
     "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer"
 )
 ```
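The hunk above loads an already-quantized checkpoint, where the quantization config travels with the model and nothing has to be re-specified. A self-contained sketch of the updated snippet; the memory-footprint check mirrors the `print(model.get_memory_footprint())` call visible in the hunk header:

```python
from diffusers import AutoModel

# The bitsandbytes config is stored in the checkpoint, so no
# quantization_config needs to be passed at load time.
model_4bit = AutoModel.from_pretrained(
    "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer"
)
print(model_4bit.get_memory_footprint())
```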
@@ -243,13 +243,13 @@ An "outlier" is a hidden state value greater than a certain threshold, and these
 To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]:
 
 ```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
+from diffusers import AutoModel, BitsAndBytesConfig
 
 quantization_config = BitsAndBytesConfig(
     load_in_8bit=True, llm_int8_threshold=10,
 )
 
-model_8bit = FluxTransformer2DModel.from_pretrained(
+model_8bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quantization_config,
@@ -305,7 +305,7 @@ NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
 
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 from transformers import T5EncoderModel
 
 quant_config = TransformersBitsAndBytesConfig(
@@ -325,7 +325,7 @@ quant_config = DiffusersBitsAndBytesConfig(
     bnb_4bit_quant_type="nf4",
 )
 
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
+transformer_4bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -343,7 +343,7 @@ Nested quantization is a technique that can save additional memory at no additio
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
 
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 from transformers import T5EncoderModel
 
 quant_config = TransformersBitsAndBytesConfig(
@@ -363,7 +363,7 @@ quant_config = DiffusersBitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
+transformer_4bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
@@ -379,7 +379,7 @@ Once quantized, you can dequantize a model to its original precision, but this m
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
 
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 from transformers import T5EncoderModel
 
 quant_config = TransformersBitsAndBytesConfig(
@@ -399,7 +399,7 @@ quant_config = DiffusersBitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
+transformer_4bit = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     quantization_config=quant_config,
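None of the hunks above show the quantized components being plugged into a pipeline; that part of the page is unchanged. As a rough sketch of how the updated 4-bit example fits together, where the `text_encoder_2` subfolder and the pipeline assembly are assumptions based on the hunk headers rather than part of this diff:

```python
import torch
from diffusers import AutoModel, FluxPipeline
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
from transformers import T5EncoderModel

# 4-bit quantize the T5 text encoder (a transformers model).
text_encoder_2_4bit = T5EncoderModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="text_encoder_2",
    quantization_config=TransformersBitsAndBytesConfig(load_in_4bit=True),
)

# 4-bit quantize the Flux transformer through AutoModel.
transformer_4bit = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    quantization_config=DiffusersBitsAndBytesConfig(load_in_4bit=True),
)

# Assemble the pipeline around the quantized components.
pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_4bit,
    text_encoder_2=text_encoder_2_4bit,
    torch_dtype=torch.float16,
)
pipe.enable_model_cpu_offload()
```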

Diff for: docs/source/en/quantization/torchao.md

+12-9
@@ -26,13 +26,13 @@ The example below only quantizes the weights to int8.
 
 ```python
 import torch
-from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig
+from diffusers import FluxPipeline, AutoModel, TorchAoConfig
 
 model_id = "black-forest-labs/FLUX.1-dev"
 dtype = torch.bfloat16
 
 quantization_config = TorchAoConfig("int8wo")
-transformer = FluxTransformer2DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     model_id,
     subfolder="transformer",
     quantization_config=quantization_config,
@@ -99,10 +99,10 @@ To serialize a quantized model in a given dtype, first load the model with the d
 
 ```python
 import torch
-from diffusers import FluxTransformer2DModel, TorchAoConfig
+from diffusers import AutoModel, TorchAoConfig
 
 quantization_config = TorchAoConfig("int8wo")
-transformer = FluxTransformer2DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "black-forest-labs/Flux.1-Dev",
     subfolder="transformer",
     quantization_config=quantization_config,
@@ -115,9 +115,9 @@ To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] me
 
 ```python
 import torch
-from diffusers import FluxPipeline, FluxTransformer2DModel
+from diffusers import FluxPipeline, AutoModel
 
-transformer = FluxTransformer2DModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False)
+transformer = AutoModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False)
 pipe = FluxPipeline.from_pretrained("black-forest-labs/Flux.1-Dev", transformer=transformer, torch_dtype=torch.bfloat16)
 pipe.to("cuda")
 
@@ -131,10 +131,10 @@ If you are using `torch<=2.6.0`, some quantization methods, such as `uint4wo`, c
 ```python
 import torch
 from accelerate import init_empty_weights
-from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig
+from diffusers import FluxPipeline, AutoModel, TorchAoConfig
 
 # Serialize the model
-transformer = FluxTransformer2DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "black-forest-labs/Flux.1-Dev",
     subfolder="transformer",
     quantization_config=TorchAoConfig("uint4wo"),
@@ -146,10 +146,13 @@ transformer.save_pretrained("/path/to/flux_uint4wo", safe_serialization=False, m
 # Load the model
 state_dict = torch.load("/path/to/flux_uint4wo/diffusion_pytorch_model.bin", weights_only=False, map_location="cpu")
 with init_empty_weights():
-    transformer = FluxTransformer2DModel.from_config("/path/to/flux_uint4wo/config.json")
+    transformer = AutoModel.from_config("/path/to/flux_uint4wo/config.json")
 transformer.load_state_dict(state_dict, strict=True, assign=True)
 ```
 
+> [!TIP]
+> The [`AutoModel`] API is supported for PyTorch >= 2.6 as shown in the examples below.
+
 ## Resources
 
 - [TorchAO Quantization API](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md)
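Taken together, the first torchao hunk amounts to the following runnable sketch; the pipeline assembly at the end is assumed from the surrounding doc rather than shown in this diff:

```python
import torch
from diffusers import AutoModel, FluxPipeline, TorchAoConfig

model_id = "black-forest-labs/FLUX.1-dev"
dtype = torch.bfloat16

# Weight-only int8 quantization of the Flux transformer, loaded via AutoModel.
quantization_config = TorchAoConfig("int8wo")
transformer = AutoModel.from_pretrained(
    model_id,
    subfolder="transformer",
    quantization_config=quantization_config,
    torch_dtype=dtype,
)

pipe = FluxPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=dtype)
pipe.to("cuda")
```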

Diff for: docs/source/en/quicktour.md

+3
@@ -163,6 +163,9 @@ Models are initiated with the [`~ModelMixin.from_pretrained`] method which also
 >>> model = UNet2DModel.from_pretrained(repo_id, use_safetensors=True)
 ```
 
+> [!TIP]
+> Use the [`AutoModel`] API to automatically select a model class if you're unsure of which one to use.
+
 To access the model parameters, call `model.config`:
 
 ```py
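Applied to the quicktour's own snippet, the new tip works out to something like the following; the DDPM checkpoint is an assumption standing in for the quicktour's `repo_id`:

```python
from diffusers import AutoModel

# Assumed checkpoint; any repo (or subfolder) with a model config works the same way.
repo_id = "google/ddpm-cat-256"

# AutoModel reads the config and returns the matching class (a UNet2DModel here).
model = AutoModel.from_pretrained(repo_id, use_safetensors=True)
```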

Diff for: docs/source/en/training/adapt_a_model.md

+2-2
@@ -31,10 +31,10 @@ To adapt your text-to-image model for inpainting, you'll need to change the numb
 Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model weights, and change `in_channels` to 9. Changing the number of `in_channels` means you need to set `ignore_mismatched_sizes=True` and `low_cpu_mem_usage=False` to avoid a size mismatch error because the shape is different now.
 
 ```py
-from diffusers import UNet2DConditionModel
+from diffusers import AutoModel
 
 model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
-unet = UNet2DConditionModel.from_pretrained(
+unet = AutoModel.from_pretrained(
     model_id,
     subfolder="unet",
     in_channels=9,
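The hunk cuts off before the remaining keyword arguments; per the prose at the top of the hunk, the full call also needs `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`, so a completed sketch would look roughly like:

```python
from diffusers import AutoModel

model_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"

# in_channels=9 changes the input conv shape, so mismatched sizes must be
# ignored and low_cpu_mem_usage disabled, as the surrounding text explains.
unet = AutoModel.from_pretrained(
    model_id,
    subfolder="unet",
    in_channels=9,
    low_cpu_mem_usage=False,
    ignore_mismatched_sizes=True,
)
```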

Diff for: docs/source/en/training/distributed_inference.md

+2-2
@@ -165,10 +165,10 @@ flush()
 Load the diffusion transformer next which has 12.5B parameters. This time, set `device_map="auto"` to automatically distribute the model across two 16GB GPUs. The `auto` strategy is backed by [Accelerate](https://hf.co/docs/accelerate/index) and available as a part of the [Big Model Inference](https://hf.co/docs/accelerate/concept_guides/big_model_inference) feature. It starts by distributing a model across the fastest device first (GPU) before moving to slower devices like the CPU and hard drive if needed. The trade-off of storing model parameters on slower devices is slower inference latency.
 
 ```py
-from diffusers import FluxTransformer2DModel
+from diffusers import AutoModel
 import torch
 
-transformer = FluxTransformer2DModel.from_pretrained(
+transformer = AutoModel.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="transformer",
     device_map="auto",
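A self-contained version of the updated snippet; the `torch_dtype` argument is an assumption, the rest comes straight from the hunk:

```python
import torch
from diffusers import AutoModel

# device_map="auto" lets Accelerate spread the 12.5B-parameter transformer
# across the available GPUs first, then CPU/disk if needed.
transformer = AutoModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    subfolder="transformer",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
```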

Diff for: docs/source/en/tutorials/inference_with_big_models.md

+4-4
@@ -32,9 +32,9 @@ The denoiser checkpoint can also have multiple shards and supports inference tha
 For example, let's save a sharded checkpoint for the [SDXL UNet](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/unet):
 
 ```python
-from diffusers import UNet2DConditionModel
+from diffusers import AutoModel
 
-unet = UNet2DConditionModel.from_pretrained(
+unet = AutoModel.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
 )
 unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
@@ -43,10 +43,10 @@ unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
 The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:
 
 ```python
-from diffusers import UNet2DConditionModel, StableDiffusionXLPipeline
+from diffusers import AutoModel, StableDiffusionXLPipeline
 import torch
 
-unet = UNet2DConditionModel.from_pretrained(
+unet = AutoModel.from_pretrained(
     "sayakpaul/sdxl-unet-sharded", torch_dtype=torch.float16
 )
 pipeline = StableDiffusionXLPipeline.from_pretrained(
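End to end, the two hunks above amount to the following sketch; loading from the locally saved `sdxl-unet-sharded` folder (rather than the `sayakpaul/sdxl-unet-sharded` Hub repo) and the `unet=unet` pipeline argument are assumptions, since the diff truncates that part:

```python
import torch
from diffusers import AutoModel, StableDiffusionXLPipeline

# Save the fp32 SDXL UNet (~10.4GB) as roughly 5GB shards.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
)
unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")

# Reload the shards in fp16 and plug them into the pipeline.
unet = AutoModel.from_pretrained("sdxl-unet-sharded", torch_dtype=torch.float16)
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
)
```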

Diff for: docs/source/en/using-diffusers/loading_adapters.md

+2-2
@@ -134,7 +134,7 @@ The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads L
 - the LoRA weights don't have separate identifiers for the UNet and text encoder
 - the LoRA weights have separate identifiers for the UNet and text encoder
 
-To directly load (and save) a LoRA adapter at the *model-level*, use [`~PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
+To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
 
 Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
 
@@ -155,7 +155,7 @@ image
 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
 </div>
 
-Save an adapter with [`~PeftAdapterMixin.save_lora_adapter`].
+Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
 
 To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
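For reference, a model-level call to the corrected [`~loaders.PeftAdapterMixin.load_lora_adapter`] reference looks roughly like this; the pipeline and the LoRA repo are assumptions used only to illustrate the `weight_name` and `prefix` parameters mentioned in the unchanged context:

```python
import torch
from diffusers import AutoPipelineForText2Image

pipeline = AutoPipelineForText2Image.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")

# Hypothetical SDXL LoRA repo; weight_name picks the file and
# prefix="unet" keeps only the UNet keys from the state dict.
pipeline.unet.load_lora_adapter(
    "jbilcke-hf/sdxl-cinematic-1",
    weight_name="pytorch_lora_weights.safetensors",
    prefix="unet",
)
```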

Diff for: docs/source/en/using-diffusers/merge_loras.md

+3-3
@@ -66,10 +66,10 @@ Let's dive deeper into what these steps entail.
 1. Load a UNet that corresponds to the UNet in the LoRA checkpoint. In this case, both LoRAs use the SDXL UNet as their base model.
 
 ```python
-from diffusers import UNet2DConditionModel
+from diffusers import AutoModel
 import torch
 
-unet = UNet2DConditionModel.from_pretrained(
+unet = AutoModel.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
     torch_dtype=torch.float16,
     use_safetensors=True,
@@ -136,7 +136,7 @@ feng_peft_model.load_state_dict(original_state_dict, strict=True)
 ```python
 from peft import PeftModel
 
-base_unet = UNet2DConditionModel.from_pretrained(
+base_unet = AutoModel.from_pretrained(
     "stabilityai/stable-diffusion-xl-base-1.0",
     torch_dtype=torch.float16,
     use_safetensors=True,
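Both hunks truncate the `from_pretrained` call after `use_safetensors=True`; a completed sketch of the base-UNet load, where `variant="fp16"` and `subfolder="unet"` are assumptions about the elided arguments:

```python
import torch
from diffusers import AutoModel

# Base SDXL UNet that both LoRAs in this guide were trained on.
# variant and subfolder are assumed; the diff cuts the call off before them.
unet = AutoModel.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    torch_dtype=torch.float16,
    use_safetensors=True,
    variant="fp16",
    subfolder="unet",
)
```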
