Merge branch 'main' of https://github.com/huggingface/diffusers into v0.18.0-release

Release: v0.18.0
2023-07-06 19:29:40 +02:00 · 2023-07-06 19:22:46 +02:00
24 changed files with 160 additions and 495 deletions
@@ -43,7 +43,7 @@ pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-256").to(devic

 output = pipe()
 display(output.images[0])
-display(Audio(output.audios[0], rate=pipe.mel.get_sample_rate()))
+display(Audio(output.audios[0], rate=mel.get_sample_rate()))
 ```

 ### Latent Audio Diffusion
@@ -21,7 +21,7 @@ The abstract of the paper is the following:
 ## Tips

 - Stable Diffusion XL works especially well with images between 768 and 1024.
- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.
+- Stable Diffusion XL output image can be improved by making use of a refiner as shown below.

 ### Available checkpoints:

@@ -40,7 +40,7 @@ pip install safetensors
 pip install invisible-watermark>=2.0
 ```

-### Text-to-Image
+### *Text-to-Image*

 You can use SDXL as follows for *text-to-image*:

@@ -71,7 +71,6 @@ pipe = StableDiffusionXLPipeline.from_pretrained(
 )
 pipe.to("cuda")

-use_refiner = True
 refiner = StableDiffusionXLImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
 )
@@ -83,29 +82,7 @@ image = pipe(prompt=prompt, output_type="latent" if use_refiner else "pil").imag
 image = refiner(prompt=prompt, image=image[None, :]).images[0]
 ```

-### Image-to-image 
-
-```py 
-import torch
-from diffusers import StableDiffusionXLImg2ImgPipeline
-from diffusers.utils import load_image
-
-pipe = StableDiffusionXLImg2ImgPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-refiner-0.9", torch_dtype=torch.float16
-)
-pipe = pipe.to("cuda")
-url = "https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/aa_xl/000000009.png"
-
-init_image = load_image(url).convert("RGB")
-prompt = "a photo of an astronaut riding a horse on mars"
-image = pipe(prompt, image=init_image).images[0]
-```
-
-| Original Image | Refined Image |
-|---|---|
-| ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/init_image.png) | ![](https://huggingface.co/datasets/diffusers/docs-images/resolve/main/sd_xl/refined_image.png) |
-
-### Loading single file checkpoints / original file format
+### Loading single file checkpoints / original file format

 By making use of [`~diffusers.loaders.FromSingleFileMixin.from_single_file`] you can also load the 
 original file format into `diffusers`:
@@ -150,7 +127,7 @@ You can speed up inference by making use of `torch.compile`. This should give yo
 + refiner.unet = torch.compile(refiner.unet, mode="reduce-overhead", fullgraph=True)
 ```

-### Running with `torch` \< 2.0
+### Running with `torch` < 2.0

 **Note** that if you want to run Stable Diffusion XL with `torch` < 2.0, please make sure to enable xformers 
 attention:
@@ -436,12 +436,6 @@ def parse_args(input_args=None):
        default=None,
        help="The optional `class_label` conditioning to pass to the unet, available values are `timesteps`.",
    )
-    parser.add_argument(
-        "--rank",
-        type=int,
-        default=4,
-        help=("The dimension of the LoRA update matrices."),
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -851,9 +845,7 @@ def main(args):
                LoRAAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else LoRAAttnProcessor
            )
        unet_lora_attn_procs[name] = lora_attn_processor_class(
-            hidden_size=hidden_size,
-            cross_attention_dim=cross_attention_dim,
-            rank=args.rank,
+            hidden_size=hidden_size, cross_attention_dim=cross_attention_dim
        )

    unet.set_attn_processor(unet_lora_attn_procs)
@@ -868,9 +860,7 @@ def main(args):
        for name, module in text_encoder.named_modules():
            if name.endswith(TEXT_ENCODER_ATTN_MODULE):
                text_lora_attn_procs[name] = LoRAAttnProcessor(
-                    hidden_size=module.out_proj.out_features,
-                    cross_attention_dim=None,
-                    rank=args.rank,
+                    hidden_size=module.out_proj.out_features, cross_attention_dim=None
                )
        text_encoder_lora_layers = AttnProcsLayers(text_lora_attn_procs)
        temp_pipeline = DiffusionPipeline.from_pretrained(
@@ -232,7 +232,7 @@ install_requires = [

 setup(
    name="diffusers",
-    version="0.18.2",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.18.0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="Diffusers",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
@@ -1,4 +1,4 @@
-__version__ = "0.18.2"
+__version__ = "0.18.0"

 from .configuration_utils import ConfigMixin
 from .utils import (
@@ -607,7 +607,7 @@ def register_to_config(init):

        # Take note of the parameters that were not present in the loaded config
        if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
+            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)

        new_kwargs = {**config_init_kwargs, **new_kwargs}
        getattr(self, "register_to_config")(**new_kwargs)
@@ -655,7 +655,7 @@ def flax_register_to_config(cls):

        # Take note of the parameters that were not present in the loaded config
        if len(set(new_kwargs.keys()) - set(init_kwargs)) > 0:
-            new_kwargs["_use_default_values"] = list(set(new_kwargs.keys()) - set(init_kwargs))
+            new_kwargs["_use_default_values"] = set(new_kwargs.keys()) - set(init_kwargs)

        getattr(self, "register_to_config")(**new_kwargs)
        original_init(self, *args, **kwargs)
@@ -177,7 +177,7 @@ class UNet2DConditionLoadersMixin:

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -589,7 +589,7 @@ class TextualInversionLoaderMixin:

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -806,7 +806,7 @@ class LoraLoaderMixin:

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -1054,7 +1054,7 @@ class LoraLoaderMixin:

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -1394,7 +1394,7 @@ class FromSingleFileMixin:
        use_auth_token = kwargs.pop("use_auth_token", None)
        revision = kwargs.pop("revision", None)
        extract_ema = kwargs.pop("extract_ema", False)
-        image_size = kwargs.pop("image_size", None)
+        image_size = kwargs.pop("image_size", 512)
        scheduler_type = kwargs.pop("scheduler_type", "pndm")
        num_in_channels = kwargs.pop("num_in_channels", None)
        upcast_attention = kwargs.pop("upcast_attention", None)
@@ -152,7 +152,6 @@ class FlaxAttention(nn.Module):
        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")

        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
-        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def reshape_heads_to_batch_dim(self, tensor):
        batch_size, seq_len, dim = tensor.shape
@@ -215,7 +214,7 @@ class FlaxAttention(nn.Module):

        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
        hidden_states = self.proj_attn(hidden_states)
-        return self.dropout_layer(hidden_states, deterministic=deterministic)
+        return hidden_states


 class FlaxBasicTransformerBlock(nn.Module):
@@ -261,7 +260,6 @@ class FlaxBasicTransformerBlock(nn.Module):
        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
-        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, context, deterministic=True):
        # self attention
@@ -282,7 +280,7 @@ class FlaxBasicTransformerBlock(nn.Module):
        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
        hidden_states = hidden_states + residual

-        return self.dropout_layer(hidden_states, deterministic=deterministic)
+        return hidden_states


 class FlaxTransformer2DModel(nn.Module):
@@ -358,8 +356,6 @@ class FlaxTransformer2DModel(nn.Module):
                dtype=self.dtype,
            )

-        self.dropout_layer = nn.Dropout(rate=self.dropout)
-
    def __call__(self, hidden_states, context, deterministic=True):
        batch, height, width, channels = hidden_states.shape
        residual = hidden_states
@@ -382,7 +378,7 @@ class FlaxTransformer2DModel(nn.Module):
            hidden_states = self.proj_out(hidden_states)

        hidden_states = hidden_states + residual
-        return self.dropout_layer(hidden_states, deterministic=deterministic)
+        return hidden_states


 class FlaxFeedForward(nn.Module):
@@ -413,7 +409,7 @@ class FlaxFeedForward(nn.Module):
        self.net_2 = nn.Dense(self.dim, dtype=self.dtype)

    def __call__(self, hidden_states, deterministic=True):
-        hidden_states = self.net_0(hidden_states, deterministic=deterministic)
+        hidden_states = self.net_0(hidden_states)
        hidden_states = self.net_2(hidden_states)
        return hidden_states

@@ -438,9 +434,8 @@ class FlaxGEGLU(nn.Module):
    def setup(self):
        inner_dim = self.dim * 4
        self.proj = nn.Dense(inner_dim * 2, dtype=self.dtype)
-        self.dropout_layer = nn.Dropout(rate=self.dropout)

    def __call__(self, hidden_states, deterministic=True):
        hidden_states = self.proj(hidden_states)
        hidden_linear, hidden_gelu = jnp.split(hidden_states, 2, axis=2)
-        return self.dropout_layer(hidden_linear * nn.gelu(hidden_gelu), deterministic=deterministic)
+        return hidden_linear * nn.gelu(hidden_gelu)
@@ -456,7 +456,7 @@ class ModelMixin(torch.nn.Module):

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -204,7 +204,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
    transformers_index_format = r"\d{5}-of-\d{5}"

    if variant is not None:
-        # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
+        # `diffusion_pytorch_model.fp16.bin` as well as `model.fp16-00001-of-00002.safetensors`
        variant_file_re = re.compile(
            rf"({'|'.join(weight_prefixes)})\.({variant}|{variant}-{transformers_index_format})\.({'|'.join(weight_suffixs)})$"
        )
@@ -213,7 +213,7 @@ def variant_compatible_siblings(filenames, variant=None) -> Union[List[os.PathLi
            rf"({'|'.join(weight_prefixes)})\.({'|'.join(weight_suffixs)})\.index\.{variant}\.json$"
        )

-    # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
+    # `diffusion_pytorch_model.bin` as well as `model-00001-of-00002.safetensors`
    non_variant_file_re = re.compile(
        rf"({'|'.join(weight_prefixes)})(-{transformers_index_format})?\.({'|'.join(weight_suffixs)})$"
    )
@@ -1168,7 +1168,7 @@ class DiffusionPipeline(ConfigMixin):

        if use_safetensors and not is_safetensors_available():
            raise ValueError(
-                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetensors"
+                "`use_safetensors`=True but safetensors is not installed. Please install safetensors with `pip install safetenstors"
            )

        allow_pickle = False
@@ -1213,15 +1213,6 @@ class DiffusionPipeline(ConfigMixin):
            filenames = {sibling.rfilename for sibling in info.siblings}
            model_filenames, variant_filenames = variant_compatible_siblings(filenames, variant=variant)

-            if len(variant_filenames) == 0 and variant is not None:
-                deprecation_message = (
-                    f"You are trying to load the model files of the `variant={variant}`, but no such modeling files are available."
-                    f"The default model files: {model_filenames} will be loaded instead. Make sure to not load from `variant={variant}`"
-                    "if such variant modeling files are not available. Doing so will lead to an error in v0.22.0 as defaulting to non-variant"
-                    "modeling files is deprecated."
-                )
-                deprecate("no variant default", "0.22.0", deprecation_message, standard_warn=False)
-
            # remove ignored filenames
            model_filenames = set(model_filenames) - set(ignore_filenames)
            variant_filenames = set(variant_filenames) - set(ignore_filenames)
@@ -1311,7 +1302,7 @@ class DiffusionPipeline(ConfigMixin):
            snapshot_folder = Path(config_file).parent
            pipeline_is_cached = all((snapshot_folder / f).is_file() for f in expected_files)

-            if pipeline_is_cached and not force_download:
+            if pipeline_is_cached:
                # if the pipeline is cached, we can directly return it
                # else call snapshot_download
                return snapshot_folder
@@ -24,7 +24,6 @@ from transformers import (
    AutoFeatureExtractor,
    BertTokenizerFast,
    CLIPImageProcessor,
-    CLIPTextConfig,
    CLIPTextModel,
    CLIPTextModelWithProjection,
    CLIPTokenizer,
@@ -49,7 +48,7 @@ from ...schedulers import (
    PNDMScheduler,
    UnCLIPScheduler,
 )
-from ...utils import is_accelerate_available, is_omegaconf_available, is_safetensors_available, logging
+from ...utils import is_omegaconf_available, is_safetensors_available, logging
 from ...utils.import_utils import BACKENDS_MAPPING
 from ..latent_diffusion.pipeline_latent_diffusion import LDMBertConfig, LDMBertModel
 from ..paint_by_example import PaintByExampleImageEncoder
@@ -58,10 +57,6 @@ from .safety_checker import StableDiffusionSafetyChecker
 from .stable_unclip_image_normalizer import StableUnCLIPImageNormalizer


-if is_accelerate_available():
-    from accelerate import init_empty_weights
-    from accelerate.utils import set_module_tensor_to_device
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -396,8 +391,8 @@ def convert_ldm_unet_checkpoint(

        # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
        if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
-            logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
-            logger.warning(
+            print(f"Checkpoint {path} has both EMA and non-EMA weights.")
+            print(
                "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
                " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
            )
@@ -407,7 +402,7 @@ def convert_ldm_unet_checkpoint(
                    unet_state_dict[key.replace(unet_key, "")] = checkpoint.pop(flat_ema_key)
        else:
            if sum(k.startswith("model_ema") for k in keys) > 100:
-                logger.warning(
+                print(
                    "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                    " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
                )
@@ -775,12 +770,11 @@ def convert_ldm_bert_checkpoint(checkpoint, config):


 def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder=None):
-    if text_encoder is None:
-        config_name = "openai/clip-vit-large-patch14"
-        config = CLIPTextConfig.from_pretrained(config_name)
-
-        with init_empty_weights():
-            text_model = CLIPTextModel(config)
+    text_model = (
+        CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14", local_files_only=local_files_only)
+        if text_encoder is None
+        else text_encoder
+    )

    keys = list(checkpoint.keys())

@@ -793,8 +787,7 @@ def convert_ldm_clip_checkpoint(checkpoint, local_files_only=False, text_encoder
            if key.startswith(prefix):
                text_model_dict[key[len(prefix + ".") :]] = checkpoint[key]

-    for param_name, param in text_model_dict.items():
-        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    text_model.load_state_dict(text_model_dict)

    return text_model

@@ -891,26 +884,14 @@ def convert_paint_by_example_checkpoint(checkpoint):
    return model


-def convert_open_clip_checkpoint(
-    checkpoint, config_name, prefix="cond_stage_model.model.", has_projection=False, **config_kwargs
-):
+def convert_open_clip_checkpoint(checkpoint, prefix="cond_stage_model.model."):
    # text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-    # text_model = CLIPTextModelWithProjection.from_pretrained(
-    #    "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
-    # )
-    config = CLIPTextConfig.from_pretrained(config_name, **config_kwargs)
-
-    with init_empty_weights():
-        text_model = CLIPTextModelWithProjection(config) if has_projection else CLIPTextModel(config)
+    text_model = CLIPTextModelWithProjection.from_pretrained(
+        "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", projection_dim=1280
+    )

    keys = list(checkpoint.keys())

-    keys_to_ignore = []
-    if config_name == "stabilityai/stable-diffusion-2" and config.num_hidden_layers == 23:
-        # make sure to remove all keys > 22
-        keys_to_ignore += [k for k in keys if k.startswith("cond_stage_model.model.transformer.resblocks.23")]
-        keys_to_ignore += ["cond_stage_model.model.text_projection"]
-
    text_model_dict = {}

    if prefix + "text_projection" in checkpoint:
@@ -921,8 +902,8 @@ def convert_open_clip_checkpoint(
    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")

    for key in keys:
-        if key in keys_to_ignore:
-            continue
+        # if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
+        #     continue
        if key[len(prefix) :] in textenc_conversion_map:
            if key.endswith("text_projection"):
                value = checkpoint[key].T
@@ -950,8 +931,7 @@ def convert_open_clip_checkpoint(

                text_model_dict[new_key] = checkpoint[key]

-    for param_name, param in text_model_dict.items():
-        set_module_tensor_to_device(text_model, param_name, "cpu", value=param)
+    text_model.load_state_dict(text_model_dict)

    return text_model

@@ -1081,7 +1061,7 @@ def convert_controlnet_checkpoint(
 def download_from_original_stable_diffusion_ckpt(
    checkpoint_path: str,
    original_config_file: str = None,
-    image_size: Optional[int] = None,
+    image_size: int = 512,
    prediction_type: str = None,
    model_type: str = None,
    extract_ema: bool = False,
@@ -1164,7 +1144,6 @@ def download_from_original_stable_diffusion_ckpt(
        LDMTextToImagePipeline,
        PaintByExamplePipeline,
        StableDiffusionControlNetPipeline,
-        StableDiffusionInpaintPipeline,
        StableDiffusionPipeline,
        StableDiffusionXLImg2ImgPipeline,
        StableDiffusionXLPipeline,
@@ -1187,9 +1166,12 @@ def download_from_original_stable_diffusion_ckpt(
        if not is_safetensors_available():
            raise ValueError(BACKENDS_MAPPING["safetensors"][1])

-        from safetensors.torch import load_file as safe_load
+        from safetensors import safe_open

-        checkpoint = safe_load(checkpoint_path, device="cpu")
+        checkpoint = {}
+        with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
+            for key in f.keys():
+                checkpoint[key] = f.get_tensor(key)
    else:
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
@@ -1201,7 +1183,7 @@ def download_from_original_stable_diffusion_ckpt(
    if "global_step" in checkpoint:
        global_step = checkpoint["global_step"]
    else:
-        logger.debug("global_step key not found in model")
+        print("global_step key not found in model")
        global_step = None

    # NOTE: this while loop isn't great but this controlnet checkpoint has one additional
@@ -1248,15 +1230,8 @@ def download_from_original_stable_diffusion_ckpt(
            model_type = "SDXL"
        else:
            model_type = "SDXL-Refiner"
-        if image_size is None:
-            image_size = 1024

-    if num_in_channels is None and pipeline_class == StableDiffusionInpaintPipeline:
-        num_in_channels = 9
-    elif num_in_channels is None:
-        num_in_channels = 4
-
-    if "unet_config" in original_config.model.params:
+    if num_in_channels is not None:
        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels

    if (
@@ -1288,6 +1263,7 @@ def download_from_original_stable_diffusion_ckpt(
    num_train_timesteps = getattr(original_config.model.params, "timesteps", None) or 1000

    if model_type in ["SDXL", "SDXL-Refiner"]:
+        image_size = 1024
        scheduler_dict = {
            "beta_schedule": "scaled_linear",
            "beta_start": 0.00085,
@@ -1303,6 +1279,7 @@ def download_from_original_stable_diffusion_ckpt(
        }
        scheduler = EulerDiscreteScheduler.from_config(scheduler_dict)
        scheduler_type = "euler"
+        vae_path = "stabilityai/sdxl-vae"
    else:
        beta_start = getattr(original_config.model.params, "linear_start", None) or 0.02
        beta_end = getattr(original_config.model.params, "linear_end", None) or 0.085
@@ -1341,45 +1318,25 @@ def download_from_original_stable_diffusion_ckpt(
    # Convert the UNet2DConditionModel model.
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
    unet_config["upcast_attention"] = upcast_attention
-    with init_empty_weights():
-        unet = UNet2DConditionModel(**unet_config)
+    unet = UNet2DConditionModel(**unet_config)

    converted_unet_checkpoint = convert_ldm_unet_checkpoint(
        checkpoint, unet_config, path=checkpoint_path, extract_ema=extract_ema
    )
-
-    for param_name, param in converted_unet_checkpoint.items():
-        set_module_tensor_to_device(unet, param_name, "cpu", value=param)
+    unet.load_state_dict(converted_unet_checkpoint)

    # Convert the VAE model.
    if vae_path is None:
        vae_config = create_vae_diffusers_config(original_config, image_size=image_size)
        converted_vae_checkpoint = convert_ldm_vae_checkpoint(checkpoint, vae_config)

-        if (
-            "model" in original_config
-            and "params" in original_config.model
-            and "scale_factor" in original_config.model.params
-        ):
-            vae_scaling_factor = original_config.model.params.scale_factor
-        else:
-            vae_scaling_factor = 0.18215  # default SD scaling factor
-
-        vae_config["scaling_factor"] = vae_scaling_factor
-
-        with init_empty_weights():
-            vae = AutoencoderKL(**vae_config)
-
-        for param_name, param in converted_vae_checkpoint.items():
-            set_module_tensor_to_device(vae, param_name, "cpu", value=param)
+        vae = AutoencoderKL(**vae_config)
+        vae.load_state_dict(converted_vae_checkpoint)
    else:
        vae = AutoencoderKL.from_pretrained(vae_path)

    if model_type == "FrozenOpenCLIPEmbedder":
-        config_name = "stabilityai/stable-diffusion-2"
-        config_kwargs = {"subfolder": "text_encoder"}
-
-        text_model = convert_open_clip_checkpoint(checkpoint, config_name, **config_kwargs)
+        text_model = convert_open_clip_checkpoint(checkpoint)
        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2", subfolder="tokenizer")

        if stable_unclip is None:
@@ -1512,12 +1469,7 @@ def download_from_original_stable_diffusion_ckpt(
            tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
            text_encoder = convert_ldm_clip_checkpoint(checkpoint, local_files_only=local_files_only)
            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-
-            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
-            config_kwargs = {"projection_dim": 1280}
-            text_encoder_2 = convert_open_clip_checkpoint(
-                checkpoint, config_name, prefix="conditioner.embedders.1.model.", has_projection=True, **config_kwargs
-            )
+            text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.1.model.")

            pipe = StableDiffusionXLPipeline(
                vae=vae,
@@ -1533,12 +1485,7 @@ def download_from_original_stable_diffusion_ckpt(
            tokenizer = None
            text_encoder = None
            tokenizer_2 = CLIPTokenizer.from_pretrained("laion/CLIP-ViT-bigG-14-laion2B-39B-b160k", pad_token="!")
-
-            config_name = "laion/CLIP-ViT-bigG-14-laion2B-39B-b160k"
-            config_kwargs = {"projection_dim": 1280}
-            text_encoder_2 = convert_open_clip_checkpoint(
-                checkpoint, config_name, prefix="conditioner.embedders.0.model.", has_projection=True, **config_kwargs
-            )
+            text_encoder_2 = convert_open_clip_checkpoint(checkpoint, prefix="conditioner.embedders.0.model.")

            pipe = StableDiffusionXLImg2ImgPipeline(
                vae=vae,
@@ -24,7 +24,7 @@ from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

 from ...configuration_utils import FrozenDict
 from ...image_processor import VaeImageProcessor
-from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
+from ...loaders import LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, UNet2DConditionModel
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import deprecate, is_accelerate_available, is_accelerate_version, logging, randn_tensor
@@ -153,9 +153,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
    return mask, masked_image


-class StableDiffusionInpaintPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
-):
+class StableDiffusionInpaintPipeline(DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-guided image inpainting using Stable Diffusion.

@@ -748,19 +748,15 @@ class StableDiffusionUpscalePipeline(DiffusionPipeline, TextualInversionLoaderMi
        # make sure the VAE is in float32 mode, as it overflows in float16
        self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-            ),
-        )
-
+        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
+            AttnProcessor2_0,
+            XFormersAttnProcessor,
+            LoRAXFormersAttnProcessor,
+            LoRAAttnProcessor2_0,
+        ]
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
+        if not use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(latents.dtype)
            self.vae.decoder.conv_in.to(latents.dtype)
            self.vae.decoder.mid_block.to(latents.dtype)
@@ -8,6 +8,7 @@ from ...utils import BaseOutput, is_invisible_watermark_available, is_torch_avai


@dataclass
+# Copied from diffusers.pipelines.stable_diffusion.__init__.StableDiffusionPipelineOutput with StableDiffusion->StableDiffusionXL
 class StableDiffusionXLPipelineOutput(BaseOutput):
    """
    Output class for Stable Diffusion pipelines.
@@ -16,9 +17,13 @@ class StableDiffusionXLPipelineOutput(BaseOutput):
        images (`List[PIL.Image.Image]` or `np.ndarray`)
            List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width,
            num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline.
+        nsfw_content_detected (`List[bool]`)
+            List of flags denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, or `None` if safety checking could not be performed.
    """

    images: Union[List[PIL.Image.Image], np.ndarray]
+    nsfw_content_detected: Optional[List[bool]]


 if is_transformers_available() and is_torch_available() and is_invisible_watermark_available():
@@ -129,11 +129,9 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-        self.default_sample_size = self.unet.config.sample_size

        self.watermark = StableDiffusionXLWatermarker()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding.
@@ -143,7 +141,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.enable_slicing()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -151,7 +148,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.disable_slicing()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
    def enable_vae_tiling(self):
        r"""
        Enable tiled VAE decoding.
@@ -161,7 +157,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.enable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
    def disable_vae_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -188,7 +183,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
            self.to("cpu", silence_dtype_warnings=True)
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
            cpu_offload(cpu_offloaded_model, device)

    def enable_model_cpu_offload(self, gpu_id=0):
@@ -222,7 +217,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        self.final_offload_hook = hook

    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
    def _execution_device(self):
        r"""
        Returns the device on which the pipeline's models will be executed. After calling
@@ -243,14 +237,12 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
    def encode_prompt(
        self,
        prompt,
-        device: Optional[torch.device] = None,
-        num_images_per_prompt: int = 1,
-        do_classifier_free_guidance: bool = True,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        r"""
@@ -276,18 +268,9 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
-        device = device or self._execution_device
-
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -416,7 +399,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

-        bs_embed = pooled_prompt_embeds.shape[0]
        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
@@ -426,7 +408,20 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -453,8 +448,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        negative_prompt=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
-        pooled_prompt_embeds=None,
-        negative_pooled_prompt_embeds=None,
    ):
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
@@ -493,17 +486,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
                    f" {negative_prompt_embeds.shape}."
                )

-        if prompt_embeds is not None and pooled_prompt_embeds is None:
-            raise ValueError(
-                "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
-            )
-
-        if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
-            raise ValueError(
-                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
-            )
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
        shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor)
        if isinstance(generator, list) and len(generator) != batch_size:
@@ -553,8 +535,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -608,13 +588,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -653,23 +626,15 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
            "not-safe-for-work" (nsfw) content, according to the `safety_checker`.
        """
        # 0. Default height and width to unet
-        height = height or self.default_sample_size * self.vae_scale_factor
-        width = width or self.default_sample_size * self.vae_scale_factor
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor

        original_size = original_size or (height, width)
        target_size = target_size or (height, width)

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
-            prompt,
-            height,
-            width,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            pooled_prompt_embeds,
-            negative_pooled_prompt_embeds,
+            prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds
        )

        # 2. Define call parameters
@@ -704,8 +669,6 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

@@ -786,18 +749,15 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
        # make sure the VAE is in float32 mode, as it overflows in float16
        self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-            ),
-        )
+        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
+            AttnProcessor2_0,
+            XFormersAttnProcessor,
+            LoRAXFormersAttnProcessor,
+            LoRAAttnProcessor2_0,
+        ]
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
+        if not use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(latents.dtype)
            self.vae.decoder.conv_in.to(latents.dtype)
            self.vae.decoder.mid_block.to(latents.dtype)
@@ -805,19 +765,27 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin):
            latents = latents.float()

        if not output_type == "latent":
+            # CHECK there is problem here (PVP)
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            has_nsfw_concept = None
        else:
            image = latents
-            return StableDiffusionXLPipelineOutput(images=image)
+            has_nsfw_concept = None
+            return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.watermark.apply_watermark(image)
-        image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
-            return (image,)
+            return (image, has_nsfw_concept)

-        return StableDiffusionXLPipelineOutput(images=image)
+        return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@@ -140,7 +140,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):

        self.watermark = StableDiffusionXLWatermarker()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
    def enable_vae_slicing(self):
        r"""
        Enable sliced VAE decoding.
@@ -150,7 +149,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.enable_slicing()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
    def disable_vae_slicing(self):
        r"""
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously invoked, this method will go back to
@@ -158,7 +156,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.disable_slicing()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
    def enable_vae_tiling(self):
        r"""
        Enable tiled VAE decoding.
@@ -168,7 +165,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.enable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
    def disable_vae_tiling(self):
        r"""
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously invoked, this method will go back to
@@ -176,7 +172,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        """
        self.vae.disable_tiling()

-    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_sequential_cpu_offload
    def enable_sequential_cpu_offload(self, gpu_id=0):
        r"""
        Offloads all models to CPU using accelerate, significantly reducing memory usage. When called, unet,
@@ -196,10 +191,9 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
            self.to("cpu", silence_dtype_warnings=True)
            torch.cuda.empty_cache()  # otherwise we don't see the memory savings (but they probably exist)

-        for cpu_offloaded_model in [self.unet, self.text_encoder, self.text_encoder_2, self.vae]:
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
            cpu_offload(cpu_offloaded_model, device)

-    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.enable_model_cpu_offload
    def enable_model_cpu_offload(self, gpu_id=0):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
@@ -231,7 +225,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        self.final_offload_hook = hook

    @property
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._execution_device
    def _execution_device(self):
        r"""
        Returns the device on which the pipeline's models will be executed. After calling
@@ -249,18 +242,15 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
                return torch.device(module._hf_hook.execution_device)
        return self.device

-    # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt
    def encode_prompt(
        self,
        prompt,
-        device: Optional[torch.device] = None,
-        num_images_per_prompt: int = 1,
-        do_classifier_free_guidance: bool = True,
+        device,
+        num_images_per_prompt,
+        do_classifier_free_guidance,
        negative_prompt=None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        lora_scale: Optional[float] = None,
    ):
        r"""
@@ -286,18 +276,9 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
            lora_scale (`float`, *optional*):
                A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
        """
-        device = device or self._execution_device
-
        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
@@ -346,11 +327,13 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
                    text_input_ids.to(device),
                    output_hidden_states=True,
                )
-
                # We are only ALWAYS interested in the pooled output of the final text encoder
                pooled_prompt_embeds = prompt_embeds[0]
+
                prompt_embeds = prompt_embeds.hidden_states[-2]

+                prompt_embeds = prompt_embeds
+
                bs_embed, seq_len, _ = prompt_embeds.shape
                # duplicate text embeddings for each generation per prompt, using mps friendly method
                prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
@@ -366,9 +349,10 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
            negative_prompt_embeds = torch.zeros_like(prompt_embeds)
            negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
        elif do_classifier_free_guidance and negative_prompt_embeds is None:
-            negative_prompt = negative_prompt or ""
            uncond_tokens: List[str]
-            if prompt is not None and type(prompt) is not type(negative_prompt):
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif prompt is not None and type(prompt) is not type(negative_prompt):
                raise TypeError(
                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
                    f" {type(prompt)}."
@@ -405,6 +389,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
                )
                # We are only ALWAYS interested in the pooled output of the final text encoder
                negative_pooled_prompt_embeds = negative_prompt_embeds[0]
+
                negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]

                if do_classifier_free_guidance:
@@ -426,7 +411,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):

            negative_prompt_embeds = torch.concat(negative_prompt_embeds_list, dim=-1)

-        bs_embed = pooled_prompt_embeds.shape[0]
        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
            bs_embed * num_images_per_prompt, -1
        )
@@ -436,7 +420,20 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
+    def run_safety_checker(self, image, device, dtype):
+        if self.safety_checker is None:
+            has_nsfw_concept = None
+        else:
+            if torch.is_tensor(image):
+                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
+            else:
+                feature_extractor_input = self.image_processor.numpy_to_pil(image)
+            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
+            )
+        return image, has_nsfw_concept
+
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
@@ -627,8 +624,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -688,13 +683,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -771,8 +759,6 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
            negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
            lora_scale=text_encoder_lora_scale,
        )

@@ -859,18 +845,15 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):
        # make sure the VAE is in float32 mode, as it overflows in float16
        self.vae.to(dtype=torch.float32)

-        use_torch_2_0_or_xformers = isinstance(
-            self.vae.decoder.mid_block.attentions[0].processor,
-            (
-                AttnProcessor2_0,
-                XFormersAttnProcessor,
-                LoRAXFormersAttnProcessor,
-                LoRAAttnProcessor2_0,
-            ),
-        )
+        use_torch_2_0_or_xformers = self.vae.decoder.mid_block.attentions[0].processor in [
+            AttnProcessor2_0,
+            XFormersAttnProcessor,
+            LoRAXFormersAttnProcessor,
+            LoRAAttnProcessor2_0,
+        ]
        # if xformers or torch_2_0 is used attention block does not need
        # to be in float32 which can save lots of memory
-        if use_torch_2_0_or_xformers:
+        if not use_torch_2_0_or_xformers:
            self.vae.post_quant_conv.to(latents.dtype)
            self.vae.decoder.conv_in.to(latents.dtype)
            self.vae.decoder.mid_block.to(latents.dtype)
@@ -879,18 +862,24 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin):

        if not output_type == "latent":
            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
+            has_nsfw_concept = None
        else:
            image = latents
-            return StableDiffusionXLPipelineOutput(images=image)
+            return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=None)
+
+        if has_nsfw_concept is None:
+            do_denormalize = [True] * image.shape[0]
+        else:
+            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]

        image = self.watermark.apply_watermark(image)
-        image = self.image_processor.postprocess(image, output_type=output_type)
+        image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)

        # Offload last model to CPU
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            self.final_offload_hook.offload()

        if not return_dict:
-            return (image,)
+            return (image, has_nsfw_concept)

-        return StableDiffusionXLPipelineOutput(images=image)
+        return StableDiffusionXLPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
@@ -264,7 +264,7 @@ class ConfigTester(unittest.TestCase):
        config_dict = {k: v for k, v in config.config.items() if not k.startswith("_")}

        # make sure that default config has all keys in `_use_default_values`
-        assert set(config_dict.keys()) == set(config.config._use_default_values)
+        assert set(config_dict.keys()) == config.config._use_default_values

        with tempfile.TemporaryDirectory() as tmpdirname:
            config.save_config(tmpdirname)
@@ -20,20 +20,17 @@ import unittest

 import numpy as np
 import torch
-from huggingface_hub import hf_hub_download
 from PIL import Image
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
-    DDIMScheduler,
    DPMSolverMultistepScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionInpaintPipeline,
    UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import AttnProcessor
 from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import prepare_mask_and_masked_image
 from diffusers.utils import floats_tensor, load_image, load_numpy, nightly, slow, torch_device
 from diffusers.utils.testing_utils import (
@@ -515,42 +512,6 @@ class StableDiffusionInpaintPipelineSlowTests(unittest.TestCase):

        assert np.abs(expected_slice - image_slice).max() < 6e-4

-    def test_download_local(self):
-        filename = hf_hub_download("runwayml/stable-diffusion-inpainting", filename="sd-v1-5-inpainting.ckpt")
-
-        pipe = StableDiffusionInpaintPipeline.from_single_file(filename, torch_dtype=torch.float16)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 1
-        image_out = pipe(**inputs).images[0]
-
-        assert image_out.shape == (512, 512, 3)
-
-    def test_download_ckpt_diff_format_is_same(self):
-        ckpt_path = "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt"
-
-        pipe = StableDiffusionInpaintPipeline.from_single_file(ckpt_path)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 5
-        image_ckpt = pipe(**inputs).images[0]
-
-        pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting")
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 5
-        image = pipe(**inputs).images[0]
-
-        assert np.max(np.abs(image - image_ckpt)) < 1e-4
-

@nightly
@require_torch_gpu
@@ -19,7 +19,6 @@ import unittest

 import numpy as np
 import torch
-from huggingface_hub import hf_hub_download
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import (
@@ -30,7 +29,6 @@ from diffusers import (
    StableDiffusionPipeline,
    UNet2DConditionModel,
 )
-from diffusers.models.attention_processor import AttnProcessor
 from diffusers.utils import load_numpy, slow, torch_device
 from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu

@@ -428,40 +426,6 @@ class StableDiffusion2VPredictionPipelineIntegrationTests(unittest.TestCase):
        assert image.shape == (768, 768, 3)
        assert np.abs(expected_image - image).max() < 7.5e-1

-    def test_download_local(self):
-        filename = hf_hub_download("stabilityai/stable-diffusion-2-1", filename="v2-1_768-ema-pruned.safetensors")
-
-        pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16)
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.to("cuda")
-
-        image_out = pipe("test", num_inference_steps=1, output_type="np").images[0]
-
-        assert image_out.shape == (768, 768, 3)
-
-    def test_download_ckpt_diff_format_is_same(self):
-        single_file_path = (
-            "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
-        )
-
-        pipe_single = StableDiffusionPipeline.from_single_file(single_file_path)
-        pipe_single.scheduler = DDIMScheduler.from_config(pipe_single.scheduler.config)
-        pipe_single.unet.set_attn_processor(AttnProcessor())
-        pipe_single.to("cuda")
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        image_ckpt = pipe_single("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
-
-        pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1")
-        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
-        pipe.unet.set_attn_processor(AttnProcessor())
-        pipe.to("cuda")
-
-        generator = torch.Generator(device="cpu").manual_seed(0)
-        image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0]
-
-        assert np.max(np.abs(image - image_ckpt)) < 1e-3
-
    def test_stable_diffusion_text2img_intermediate_state_v_pred(self):
        number_of_steps = 0

@@ -144,46 +144,6 @@ class StableDiffusionXLPipelineFastTests(PipelineLatentTesterMixin, PipelineTest

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

-    def test_stable_diffusion_xl_negative_prompt_embeds(self):
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionXLPipeline(**components)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        # forward without prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        inputs["negative_prompt"] = negative_prompt
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-
-        output = sd_pipe(**inputs)
-        image_slice_1 = output.images[0, -3:, -3:, -1]
-
-        # forward with prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        prompt = 3 * [inputs.pop("prompt")]
-
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-            pooled_prompt_embeds,
-            negative_pooled_prompt_embeds,
-        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
-
-        output = sd_pipe(
-            **inputs,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-        )
-        image_slice_2 = output.images[0, -3:, -3:, -1]
-
-        # make sure that it's equal
-        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
-
    def test_attention_slicing_forward_pass(self):
        super().test_attention_slicing_forward_pass(expected_max_diff=3e-3)

@@ -165,46 +165,6 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
    def test_save_load_optional_components(self):
        pass

-    def test_stable_diffusion_xl_img2img_negative_prompt_embeds(self):
-        components = self.get_dummy_components()
-        sd_pipe = StableDiffusionXLImg2ImgPipeline(**components)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe = sd_pipe.to(torch_device)
-        sd_pipe.set_progress_bar_config(disable=None)
-
-        # forward without prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        inputs["negative_prompt"] = negative_prompt
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-
-        output = sd_pipe(**inputs)
-        image_slice_1 = output.images[0, -3:, -3:, -1]
-
-        # forward with prompt embeds
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        prompt = 3 * [inputs.pop("prompt")]
-
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-            pooled_prompt_embeds,
-            negative_pooled_prompt_embeds,
-        ) = sd_pipe.encode_prompt(prompt, negative_prompt=negative_prompt)
-
-        output = sd_pipe(
-            **inputs,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
-        )
-        image_slice_2 = output.images[0, -3:, -3:, -1]
-
-        # make sure that it's equal
-        assert np.abs(image_slice_1.flatten() - image_slice_2.flatten()).max() < 1e-4
-

@slow
@require_torch_gpu
@@ -14,7 +14,6 @@
 # limitations under the License.

 import gc
-import glob
 import json
 import os
 import random
@@ -57,7 +56,6 @@ from diffusers import (
    UniPCMultistepScheduler,
    logging,
 )
-from diffusers.pipelines.pipeline_utils import variant_compatible_siblings
 from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from diffusers.utils import (
    CONFIG_NAME,
@@ -1363,29 +1361,6 @@ class PipelineFastTests(unittest.TestCase):
            assert sd.config.safety_checker != (None, None)
            assert sd.config.feature_extractor != (None, None)

-    def test_warning_no_variant_available(self):
-        variant = "fp16"
-        with self.assertWarns(FutureWarning) as warning_context:
-            cached_folder = StableDiffusionPipeline.download(
-                "hf-internal-testing/diffusers-stable-diffusion-tiny-all", variant=variant
-            )
-
-        assert "but no such modeling files are available" in str(warning_context.warning)
-        assert variant in str(warning_context.warning)
-
-        def get_all_filenames(directory):
-            filenames = glob.glob(directory + "/**", recursive=True)
-            filenames = [f for f in filenames if os.path.isfile(f)]
-            return filenames
-
-        filenames = get_all_filenames(str(cached_folder))
-
-        all_model_files, variant_model_files = variant_compatible_siblings(filenames, variant=variant)
-
-        # make sure that none of the model names are variant model names
-        assert len(variant_model_files) == 0
-        assert len(all_model_files) > 0
-

@slow
@require_torch_gpu
@@ -699,16 +699,12 @@ class PipelineTesterMixin:

        inputs = self.get_dummy_inputs(torch_device)
        output_without_offload = pipe(**inputs)[0]
-        output_without_offload = (
-            output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload
-        )
+        output_without_offload.cpu() if torch.is_tensor(output_without_offload) else output_without_offload

        pipe.enable_xformers_memory_efficient_attention()
        inputs = self.get_dummy_inputs(torch_device)
        output_with_offload = pipe(**inputs)[0]
-        output_with_offload = (
-            output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload
-        )
+        output_with_offload.cpu() if torch.is_tensor(output_with_offload) else output_without_offload

        if test_max_difference:
            max_diff = np.abs(output_with_offload - output_without_offload).max()
@@ -26,7 +26,7 @@ from diffusers import (
    TextToVideoSDPipeline,
    UNet3DConditionModel,
 )
-from diffusers.utils import is_xformers_available, load_numpy, skip_mps, slow, torch_device
+from diffusers.utils import load_numpy, skip_mps, slow
 from diffusers.utils.testing_utils import enable_full_determinism

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
@@ -143,13 +143,6 @@ class TextToVideoSDPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    def test_attention_slicing_forward_pass(self):
        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False, expected_max_diff=3e-3)

-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False, expected_max_diff=1e-2)
-
    # (todo): sayakpaul
    @unittest.skip(reason="Batching needs to be properly figured out first for this pipeline.")
    def test_inference_batch_consistent(self):
Author	SHA1	Message	Date
Patrick von Platen	6fc169c3ee	Merge branch 'main' of https://github.com/huggingface/diffusers into v0.18.0-release	2023-07-06 19:29:40 +02:00
Patrick von Platen	9a3fea23af	Release: v0.18.0	2023-07-06 19:22:46 +02:00