Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0169697ba2 | |||
| 52eb0348e5 | |||
| 2bbf8b67a7 | |||
| 5a5bf7ef5a | |||
| 9276b1e148 | |||
| 2579d42158 | |||
| 999044596a | |||
| ea7ac3bd89 | |||
| d70f8dc057 | |||
| dc2c3992d1 | |||
| 198fd951ec | |||
| 0dff586964 | |||
| cd1225c8ae | |||
| 511ebe5a84 | |||
| 07771ebea2 | |||
| 64c5688284 | |||
| 8b7f2e301d | |||
| 81a666d52c | |||
| b98c62eb61 | |||
| 741122e722 | |||
| 5df4c8b81f | |||
| 084b51ac30 | |||
| 1c693f9b68 |
@@ -280,7 +280,7 @@ init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
@@ -49,6 +49,8 @@
|
||||
title: "OpenVINO"
|
||||
- local: optimization/mps
|
||||
title: "MPS"
|
||||
- local: optimization/habana
|
||||
title: "Habana Gaudi"
|
||||
title: "Optimization/Special Hardware"
|
||||
- sections:
|
||||
- local: training/overview
|
||||
|
||||
@@ -57,7 +57,7 @@ prompt = "An astronaut riding an elephant"
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.8,
|
||||
@@ -83,7 +83,7 @@ torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.85,
|
||||
|
||||
@@ -149,7 +149,7 @@ init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
@@ -0,0 +1,70 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# How to use Stable Diffusion on Habana Gaudi
|
||||
|
||||
🤗 Diffusers is compatible with Habana Gaudi through 🤗 [Optimum Habana](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion).
|
||||
|
||||
## Requirements
|
||||
|
||||
- Optimum Habana 1.3 or later, [here](https://huggingface.co/docs/optimum/habana/installation) is how to install it.
|
||||
- SynapseAI 1.7.
|
||||
|
||||
|
||||
## Inference Pipeline
|
||||
|
||||
To generate images with Stable Diffusion 1 and 2 on Gaudi, you need to instantiate two instances:
|
||||
- A pipeline with [`GaudiStableDiffusionPipeline`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline). This pipeline supports *text-to-image generation*.
|
||||
- A scheduler with [`GaudiDDIMScheduler`](https://huggingface.co/docs/optimum/habana/package_reference/stable_diffusion_pipeline#optimum.habana.diffusers.GaudiDDIMScheduler). This scheduler has been optimized for Habana Gaudi.
|
||||
|
||||
When initializing the pipeline, you have to specify `use_habana=True` to deploy it on HPUs.
|
||||
Furthermore, in order to get the fastest possible generations you should enable **HPU graphs** with `use_hpu_graphs=True`.
|
||||
Finally, you will need to specify a [Gaudi configuration](https://huggingface.co/docs/optimum/habana/package_reference/gaudi_config) which can be downloaded from the [Hugging Face Hub](https://huggingface.co/Habana).
|
||||
|
||||
```python
|
||||
from optimum.habana import GaudiConfig
|
||||
from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline
|
||||
|
||||
model_name = "stabilityai/stable-diffusion-2-base"
|
||||
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
|
||||
pipeline = GaudiStableDiffusionPipeline.from_pretrained(
|
||||
model_name,
|
||||
scheduler=scheduler,
|
||||
use_habana=True,
|
||||
use_hpu_graphs=True,
|
||||
gaudi_config="Habana/stable-diffusion",
|
||||
)
|
||||
```
|
||||
|
||||
You can then call the pipeline to generate images by batches from one or several prompts:
|
||||
```python
|
||||
outputs = pipeline(
|
||||
prompt=[
|
||||
"High quality photo of an astronaut riding a horse in space",
|
||||
"Face of a yellow cat, high resolution, sitting on a park bench",
|
||||
],
|
||||
num_images_per_prompt=10,
|
||||
batch_size=4,
|
||||
)
|
||||
```
|
||||
|
||||
For more information, check out Optimum Habana's [documentation](https://huggingface.co/docs/optimum/habana/usage_guides/stable_diffusion) and the [example](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion) provided in the official Github repository.
|
||||
|
||||
|
||||
## Benchmark
|
||||
|
||||
Here are the latencies for Habana Gaudi 1 and Gaudi 2 with the [Habana/stable-diffusion](https://huggingface.co/Habana/stable-diffusion) Gaudi configuration (mixed precision bf16/fp32):
|
||||
|
||||
| | Latency | Batch size |
|
||||
| ------- |:-------:|:----------:|
|
||||
| Gaudi 1 | 4.37s | 4/8 |
|
||||
| Gaudi 2 | 1.19s | 4/8 |
|
||||
@@ -177,7 +177,7 @@ init_image = download_image(
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
### Inpainting
|
||||
|
||||
@@ -187,7 +187,7 @@ init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
```
|
||||
|
||||
As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
|
||||
|
||||
@@ -37,7 +37,7 @@ init_image.thumbnail((768, 768))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
@@ -166,7 +166,7 @@ init_image = download_image("https://raw.githubusercontent.com/CompVis/stable-di
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe.img2img(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe.img2img(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
### Inpainting
|
||||
|
||||
@@ -176,7 +176,7 @@ init_image = download_image(img_url).resize((512, 512))
|
||||
mask_image = download_image(mask_url).resize((512, 512))
|
||||
|
||||
prompt = "a cat sitting on a bench"
|
||||
images = pipe.inpaint(prompt=prompt, init_image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
images = pipe.inpaint(prompt=prompt, image=init_image, mask_image=mask_image, strength=0.75).images
|
||||
```
|
||||
|
||||
As shown above this one pipeline can run all both "text-to-image", "image-to-image", and "inpainting" in one pipeline.
|
||||
@@ -420,7 +420,7 @@ init_image = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
init_image = init_image.resize((512, 512))
|
||||
res = pipe.train(
|
||||
prompt,
|
||||
init_image,
|
||||
image=init_image,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=50,
|
||||
generator=generator)
|
||||
|
||||
@@ -17,7 +17,7 @@ from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import logging
|
||||
from diffusers.utils import deprecate, logging
|
||||
|
||||
# TODO: remove and import from diffusers.utils when the new version of diffusers is released
|
||||
from packaging import version
|
||||
@@ -133,7 +133,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
def train(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
height: Optional[int] = 512,
|
||||
width: Optional[int] = 512,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
@@ -184,6 +184,10 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
accelerator = Accelerator(
|
||||
gradient_accumulation_steps=1,
|
||||
mixed_precision="fp16",
|
||||
@@ -241,14 +245,14 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
lr=embedding_learning_rate,
|
||||
)
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
latents_dtype = text_embeddings.dtype
|
||||
init_image = init_image.to(device=self.device, dtype=latents_dtype)
|
||||
init_latent_image_dist = self.vae.encode(init_image).latent_dist
|
||||
init_image_latents = init_latent_image_dist.sample(generator=generator)
|
||||
init_image_latents = 0.18215 * init_image_latents
|
||||
image = image.to(device=self.device, dtype=latents_dtype)
|
||||
init_latent_image_dist = self.vae.encode(image).latent_dist
|
||||
image_latents = init_latent_image_dist.sample(generator=generator)
|
||||
image_latents = 0.18215 * image_latents
|
||||
|
||||
progress_bar = tqdm(range(text_embedding_optimization_steps), disable=not accelerator.is_local_main_process)
|
||||
progress_bar.set_description("Steps")
|
||||
@@ -259,12 +263,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
for _ in range(text_embedding_optimization_steps):
|
||||
with accelerator.accumulate(text_embeddings):
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
|
||||
noise = torch.randn(image_latents.shape).to(image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=image_latents.device)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
|
||||
noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
|
||||
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
|
||||
@@ -301,12 +305,12 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline):
|
||||
for _ in range(model_fine_tuning_optimization_steps):
|
||||
with accelerator.accumulate(self.unet.parameters()):
|
||||
# Sample noise that we'll add to the latents
|
||||
noise = torch.randn(init_image_latents.shape).to(init_image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=init_image_latents.device)
|
||||
noise = torch.randn(image_latents.shape).to(image_latents.device)
|
||||
timesteps = torch.randint(1000, (1,), device=image_latents.device)
|
||||
|
||||
# Add noise to the latents according to the noise magnitude at each timestep
|
||||
# (this is the forward diffusion process)
|
||||
noisy_latents = self.scheduler.add_noise(init_image_latents, noise, timesteps)
|
||||
noisy_latents = self.scheduler.add_noise(image_latents, noise, timesteps)
|
||||
|
||||
# Predict the noise residual
|
||||
noise_pred = self.unet(noisy_latents, timesteps, text_embeddings).sample
|
||||
|
||||
@@ -555,7 +555,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
@@ -583,11 +583,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
@@ -605,11 +605,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
||||
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
@@ -648,6 +648,9 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
@@ -714,7 +717,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
mask = None
|
||||
noise = None
|
||||
|
||||
if init_image is None:
|
||||
if image is None:
|
||||
# get the initial random noise unless the user supplied it
|
||||
|
||||
# Unlike in other pipelines, latents need to be generated in the target device
|
||||
@@ -753,11 +756,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
else:
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess_image(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess_image(image)
|
||||
# encode the init image into latents and scale the latents
|
||||
init_image = init_image.to(device=self.device, dtype=latents_dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
image = image.to(device=self.device, dtype=latents_dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
init_latents = torch.cat([init_latents] * batch_size * num_images_per_prompt, dim=0)
|
||||
@@ -772,7 +775,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
|
||||
# check sizes
|
||||
if not mask.shape == init_latents.shape:
|
||||
raise ValueError("The mask and init_image should be the same size!")
|
||||
raise ValueError("The mask and image should be the same size!")
|
||||
|
||||
# get the original timestep using init_timestep
|
||||
offset = self.scheduler.config.get("steps_offset", 0)
|
||||
@@ -961,7 +964,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
|
||||
def img2img(
|
||||
self,
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
strength: float = 0.8,
|
||||
@@ -980,7 +983,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Function for image-to-image generation.
|
||||
Args:
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
prompt (`str` or `List[str]`):
|
||||
@@ -989,11 +992,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
||||
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -1035,7 +1038,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
strength=strength,
|
||||
@@ -1052,7 +1055,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
|
||||
def inpaint(
|
||||
self,
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -1072,11 +1075,11 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Function for inpaint.
|
||||
Args:
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process. This is the image whose masked region will be inpainted.
|
||||
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
@@ -1088,7 +1091,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
||||
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
||||
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
|
||||
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
|
||||
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
||||
@@ -1131,7 +1134,7 @@ class StableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
|
||||
@@ -10,7 +10,7 @@ from diffusers.onnx_utils import OnnxRuntimeModel
|
||||
from diffusers.pipeline_utils import DiffusionPipeline
|
||||
from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
|
||||
from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
|
||||
from diffusers.utils import logging
|
||||
from diffusers.utils import deprecate, logging
|
||||
|
||||
# TODO: remove and import from diffusers.utils when the new version of diffusers is released
|
||||
from packaging import version
|
||||
@@ -441,7 +441,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image] = None,
|
||||
image: Union[np.ndarray, PIL.Image.Image] = None,
|
||||
mask_image: Union[np.ndarray, PIL.Image.Image] = None,
|
||||
height: int = 512,
|
||||
width: int = 512,
|
||||
@@ -469,11 +469,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
mask_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
@@ -491,11 +491,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
||||
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
@@ -533,6 +533,9 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
@@ -598,7 +601,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
mask = None
|
||||
noise = None
|
||||
|
||||
if init_image is None:
|
||||
if image is None:
|
||||
latents_shape = (
|
||||
batch_size * num_images_per_prompt,
|
||||
4,
|
||||
@@ -616,11 +619,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
# scale the initial noise by the standard deviation required by the scheduler
|
||||
latents = latents * self.scheduler.init_noise_sigma
|
||||
else:
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess_image(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess_image(image)
|
||||
# encode the init image into latents and scale the latents
|
||||
init_image = init_image.astype(latents_dtype)
|
||||
init_latents = self.vae_encoder(sample=init_image)[0]
|
||||
image = image.astype(latents_dtype)
|
||||
init_latents = self.vae_encoder(sample=image)[0]
|
||||
init_latents = 0.18215 * init_latents
|
||||
init_latents = np.concatenate([init_latents] * batch_size * num_images_per_prompt)
|
||||
init_latents_orig = init_latents
|
||||
@@ -635,7 +638,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
# check sizes
|
||||
if not mask.shape == init_latents.shape:
|
||||
print(mask.shape, init_latents.shape)
|
||||
raise ValueError("The mask and init_image should be the same size!")
|
||||
raise ValueError("The mask and image should be the same size!")
|
||||
|
||||
# get the original timestep using init_timestep
|
||||
offset = self.scheduler.config.get("steps_offset", 0)
|
||||
@@ -828,7 +831,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
|
||||
def img2img(
|
||||
self,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
image: Union[np.ndarray, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
strength: float = 0.8,
|
||||
@@ -847,7 +850,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Function for image-to-image generation.
|
||||
Args:
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or ndarray representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
prompt (`str` or `List[str]`):
|
||||
@@ -856,11 +859,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
|
||||
if `guidance_scale` is less than `1`).
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1.
|
||||
`image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -901,7 +904,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
strength=strength,
|
||||
@@ -918,7 +921,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
|
||||
def inpaint(
|
||||
self,
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
image: Union[np.ndarray, PIL.Image.Image],
|
||||
mask_image: Union[np.ndarray, PIL.Image.Image],
|
||||
prompt: Union[str, List[str]],
|
||||
negative_prompt: Optional[Union[str, List[str]]] = None,
|
||||
@@ -938,11 +941,11 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
r"""
|
||||
Function for inpaint.
|
||||
Args:
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process. This is the image whose masked region will be inpainted.
|
||||
mask_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
@@ -954,7 +957,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
||||
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
||||
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
|
||||
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more
|
||||
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
||||
@@ -996,7 +999,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(DiffusionPipeline):
|
||||
return self.__call__(
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
mask_image=mask_image,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
|
||||
@@ -121,7 +121,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
|
||||
def inpaint(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
@@ -138,7 +138,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
|
||||
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
|
||||
return StableDiffusionInpaintPipelineLegacy(**self.components)(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
mask_image=mask_image,
|
||||
strength=strength,
|
||||
num_inference_steps=num_inference_steps,
|
||||
@@ -156,7 +156,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
|
||||
def img2img(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -173,7 +173,7 @@ class StableDiffusionMegaPipeline(DiffusionPipeline):
|
||||
# For more information on how this function works, please see: https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion#diffusers.StableDiffusionImg2ImgPipeline
|
||||
return StableDiffusionImg2ImgPipeline(**self.components)(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=image,
|
||||
strength=strength,
|
||||
num_inference_steps=num_inference_steps,
|
||||
guidance_scale=guidance_scale,
|
||||
|
||||
@@ -14,7 +14,6 @@ from datasets import load_dataset
|
||||
from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel, __version__
|
||||
from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import EMAModel
|
||||
from diffusers.utils import deprecate
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from packaging import version
|
||||
from torchvision.transforms import (
|
||||
@@ -417,11 +416,7 @@ def main(args):
|
||||
scheduler=noise_scheduler,
|
||||
)
|
||||
|
||||
deprecate("todo: remove this check", "0.10.0", "when the most used version is >= 0.8.0")
|
||||
if diffusers_version < version.parse("0.8.0"):
|
||||
generator = torch.manual_seed(0)
|
||||
else:
|
||||
generator = torch.Generator(device=pipeline.device).manual_seed(0)
|
||||
generator = torch.Generator(device=pipeline.device).manual_seed(0)
|
||||
# run pipeline in inference (sample random noise and denoise)
|
||||
images = pipeline(
|
||||
generator=generator,
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import argparse
|
||||
import math
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@@ -9,9 +11,9 @@ from accelerate import Accelerator
|
||||
from accelerate.logging import get_logger
|
||||
from datasets import load_dataset
|
||||
from diffusers import DDPMPipeline, DDPMScheduler, UNet2DModel
|
||||
from diffusers.hub_utils import init_git_repo, push_to_hub
|
||||
from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import EMAModel
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from onnxruntime.training.ortmodule import ORTModule
|
||||
from torchvision.transforms import (
|
||||
CenterCrop,
|
||||
@@ -28,6 +30,16 @@ from tqdm.auto import tqdm
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
def get_full_repo_name(model_id: str, organization: Optional[str] = None, token: Optional[str] = None):
|
||||
if token is None:
|
||||
token = HfFolder.get_token()
|
||||
if organization is None:
|
||||
username = whoami(token)["name"]
|
||||
return f"{username}/{model_id}"
|
||||
else:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def main(args):
|
||||
logging_dir = os.path.join(args.output_dir, args.logging_dir)
|
||||
accelerator = Accelerator(
|
||||
@@ -113,8 +125,22 @@ def main(args):
|
||||
|
||||
ema_model = EMAModel(model, inv_gamma=args.ema_inv_gamma, power=args.ema_power, max_value=args.ema_max_decay)
|
||||
|
||||
if args.push_to_hub:
|
||||
repo = init_git_repo(args, at_init=True)
|
||||
# Handle the repository creation
|
||||
if accelerator.is_main_process:
|
||||
if args.push_to_hub:
|
||||
if args.hub_model_id is None:
|
||||
repo_name = get_full_repo_name(Path(args.output_dir).name, token=args.hub_token)
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
repo = Repository(args.output_dir, clone_from=repo_name)
|
||||
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
||||
if "step_*" not in gitignore:
|
||||
gitignore.write("step_*\n")
|
||||
if "epoch_*" not in gitignore:
|
||||
gitignore.write("epoch_*\n")
|
||||
elif args.output_dir is not None:
|
||||
os.makedirs(args.output_dir, exist_ok=True)
|
||||
|
||||
if accelerator.is_main_process:
|
||||
run = os.path.split(__file__)[-1].split(".")[0]
|
||||
@@ -186,10 +212,9 @@ def main(args):
|
||||
|
||||
if epoch % args.save_model_epochs == 0 or epoch == args.num_epochs - 1:
|
||||
# save the model
|
||||
pipeline.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
push_to_hub(args, pipeline, repo, commit_message=f"Epoch {epoch}", blocking=False)
|
||||
else:
|
||||
pipeline.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Epoch {epoch}", blocking=False)
|
||||
accelerator.wait_for_everyone()
|
||||
|
||||
accelerator.end_training()
|
||||
|
||||
@@ -214,7 +214,7 @@ install_requires = [
|
||||
|
||||
setup(
|
||||
name="diffusers",
|
||||
version="0.9.0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
version="0.10.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
|
||||
description="Diffusers",
|
||||
long_description=open("README.md", "r", encoding="utf-8").read(),
|
||||
long_description_content_type="text/markdown",
|
||||
|
||||
@@ -9,7 +9,7 @@ from .utils import (
|
||||
)
|
||||
|
||||
|
||||
__version__ = "0.9.0"
|
||||
__version__ = "0.10.0.dev0"
|
||||
|
||||
from .configuration_utils import ConfigMixin
|
||||
from .onnx_utils import OnnxRuntimeModel
|
||||
|
||||
+2
-118
@@ -15,16 +15,15 @@
|
||||
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional, Union
|
||||
from uuid import uuid4
|
||||
|
||||
from huggingface_hub import HfFolder, Repository, whoami
|
||||
from huggingface_hub import HfFolder, whoami
|
||||
|
||||
from . import __version__
|
||||
from .utils import ENV_VARS_TRUE_VALUES, deprecate, logging
|
||||
from .utils import ENV_VARS_TRUE_VALUES, logging
|
||||
from .utils.import_utils import (
|
||||
_flax_version,
|
||||
_jax_version,
|
||||
@@ -83,121 +82,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
|
||||
return f"{organization}/{model_id}"
|
||||
|
||||
|
||||
def init_git_repo(args, at_init: bool = False):
|
||||
"""
|
||||
Args:
|
||||
Initializes a git repo in `args.hub_model_id`.
|
||||
at_init (`bool`, *optional*, defaults to `False`):
|
||||
Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
|
||||
and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
|
||||
"""
|
||||
deprecation_message = (
|
||||
"Please use `huggingface_hub.Repository`. "
|
||||
"See `examples/unconditional_image_generation/train_unconditional.py` for an example."
|
||||
)
|
||||
deprecate("init_git_repo()", "0.10.0", deprecation_message)
|
||||
|
||||
if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
|
||||
return
|
||||
hub_token = args.hub_token if hasattr(args, "hub_token") else None
|
||||
use_auth_token = True if hub_token is None else hub_token
|
||||
if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
|
||||
repo_name = Path(args.output_dir).absolute().name
|
||||
else:
|
||||
repo_name = args.hub_model_id
|
||||
if "/" not in repo_name:
|
||||
repo_name = get_full_repo_name(repo_name, token=hub_token)
|
||||
|
||||
try:
|
||||
repo = Repository(
|
||||
args.output_dir,
|
||||
clone_from=repo_name,
|
||||
use_auth_token=use_auth_token,
|
||||
private=args.hub_private_repo,
|
||||
)
|
||||
except EnvironmentError:
|
||||
if args.overwrite_output_dir and at_init:
|
||||
# Try again after wiping output_dir
|
||||
shutil.rmtree(args.output_dir)
|
||||
repo = Repository(
|
||||
args.output_dir,
|
||||
clone_from=repo_name,
|
||||
use_auth_token=use_auth_token,
|
||||
)
|
||||
else:
|
||||
raise
|
||||
|
||||
repo.git_pull()
|
||||
|
||||
# By default, ignore the checkpoint folders
|
||||
if not os.path.exists(os.path.join(args.output_dir, ".gitignore")):
|
||||
with open(os.path.join(args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
|
||||
writer.writelines(["checkpoint-*/"])
|
||||
|
||||
return repo
|
||||
|
||||
|
||||
def push_to_hub(
|
||||
args,
|
||||
pipeline,
|
||||
repo: Repository,
|
||||
commit_message: Optional[str] = "End of training",
|
||||
blocking: bool = True,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""
|
||||
Parameters:
|
||||
Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
|
||||
commit_message (`str`, *optional*, defaults to `"End of training"`):
|
||||
Message to commit while pushing.
|
||||
blocking (`bool`, *optional*, defaults to `True`):
|
||||
Whether the function should return only when the `git push` has finished.
|
||||
kwargs:
|
||||
Additional keyword arguments passed along to [`create_model_card`].
|
||||
Returns:
|
||||
The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
|
||||
commit and an object to track the progress of the commit if `blocking=True`
|
||||
"""
|
||||
deprecation_message = (
|
||||
"Please use `huggingface_hub.Repository` and `Repository.push_to_hub()`. "
|
||||
"See `examples/unconditional_image_generation/train_unconditional.py` for an example."
|
||||
)
|
||||
deprecate("push_to_hub()", "0.10.0", deprecation_message)
|
||||
|
||||
if not hasattr(args, "hub_model_id") or args.hub_model_id is None:
|
||||
model_name = Path(args.output_dir).name
|
||||
else:
|
||||
model_name = args.hub_model_id.split("/")[-1]
|
||||
|
||||
output_dir = args.output_dir
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
logger.info(f"Saving pipeline checkpoint to {output_dir}")
|
||||
pipeline.save_pretrained(output_dir)
|
||||
|
||||
# Only push from one node.
|
||||
if hasattr(args, "local_rank") and args.local_rank not in [-1, 0]:
|
||||
return
|
||||
|
||||
# Cancel any async push in progress if blocking=True. The commits will all be pushed together.
|
||||
if (
|
||||
blocking
|
||||
and len(repo.command_queue) > 0
|
||||
and repo.command_queue[-1] is not None
|
||||
and not repo.command_queue[-1].is_done
|
||||
):
|
||||
repo.command_queue[-1]._process.kill()
|
||||
|
||||
git_head_commit_url = repo.push_to_hub(commit_message=commit_message, blocking=blocking, auto_lfs_prune=True)
|
||||
# push separately the model card to be independent from the rest of the model
|
||||
create_model_card(args, model_name=model_name)
|
||||
try:
|
||||
repo.push_to_hub(commit_message="update model card README.md", blocking=blocking, auto_lfs_prune=True)
|
||||
except EnvironmentError as exc:
|
||||
logger.error(f"Error pushing update to the model card. Please read logs and retry.\n${exc}")
|
||||
|
||||
return git_head_commit_url
|
||||
|
||||
|
||||
def create_model_card(args, model_name):
|
||||
if not is_modelcards_available:
|
||||
raise ValueError(
|
||||
|
||||
@@ -666,20 +666,6 @@ class ModelMixin(torch.nn.Module):
|
||||
return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
|
||||
|
||||
|
||||
def unwrap_model(model: torch.nn.Module) -> torch.nn.Module:
|
||||
"""
|
||||
Recursively unwraps a model from potential containers (as used in distributed training).
|
||||
|
||||
Args:
|
||||
model (`torch.nn.Module`): The model to unwrap.
|
||||
"""
|
||||
# since there could be multiple levels of wrapping, unwrap recursively
|
||||
if hasattr(model, "module"):
|
||||
return unwrap_model(model.module)
|
||||
else:
|
||||
return model
|
||||
|
||||
|
||||
def _get_model_file(
|
||||
pretrained_model_name_or_path,
|
||||
*,
|
||||
|
||||
@@ -290,11 +290,19 @@ class AttentionBlock(nn.Module):
|
||||
self.rescale_output_factor = rescale_output_factor
|
||||
self.proj_attn = nn.Linear(channels, channels, 1)
|
||||
|
||||
def transpose_for_scores(self, projection: torch.Tensor) -> torch.Tensor:
|
||||
new_projection_shape = projection.size()[:-1] + (self.num_heads, -1)
|
||||
# move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
|
||||
new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
|
||||
return new_projection
|
||||
def reshape_heads_to_batch_dim(self, tensor):
|
||||
batch_size, seq_len, dim = tensor.shape
|
||||
head_size = self.num_heads
|
||||
tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
|
||||
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size * head_size, seq_len, dim // head_size)
|
||||
return tensor
|
||||
|
||||
def reshape_batch_dim_to_heads(self, tensor):
|
||||
batch_size, seq_len, dim = tensor.shape
|
||||
head_size = self.num_heads
|
||||
tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
|
||||
tensor = tensor.permute(0, 2, 1, 3).reshape(batch_size // head_size, seq_len, dim * head_size)
|
||||
return tensor
|
||||
|
||||
def forward(self, hidden_states):
|
||||
residual = hidden_states
|
||||
@@ -312,50 +320,28 @@ class AttentionBlock(nn.Module):
|
||||
|
||||
scale = 1 / math.sqrt(self.channels / self.num_heads)
|
||||
|
||||
# get scores
|
||||
if self.num_heads > 1:
|
||||
query_states = self.transpose_for_scores(query_proj)
|
||||
key_states = self.transpose_for_scores(key_proj)
|
||||
value_states = self.transpose_for_scores(value_proj)
|
||||
|
||||
# TODO: is there a way to perform batched matmul (e.g. baddbmm) on 4D tensors?
|
||||
# or reformulate this into a 3D problem?
|
||||
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
||||
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
||||
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
||||
attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) * scale
|
||||
else:
|
||||
query_states, key_states, value_states = query_proj, key_proj, value_proj
|
||||
|
||||
attention_scores = torch.baddbmm(
|
||||
torch.empty(
|
||||
query_states.shape[0],
|
||||
query_states.shape[1],
|
||||
key_states.shape[1],
|
||||
dtype=query_states.dtype,
|
||||
device=query_states.device,
|
||||
),
|
||||
query_states,
|
||||
key_states.transpose(-1, -2),
|
||||
beta=0,
|
||||
alpha=scale,
|
||||
)
|
||||
query_proj = self.reshape_heads_to_batch_dim(query_proj)
|
||||
key_proj = self.reshape_heads_to_batch_dim(key_proj)
|
||||
value_proj = self.reshape_heads_to_batch_dim(value_proj)
|
||||
|
||||
attention_scores = torch.baddbmm(
|
||||
torch.empty(
|
||||
query_proj.shape[0],
|
||||
query_proj.shape[1],
|
||||
key_proj.shape[1],
|
||||
dtype=query_proj.dtype,
|
||||
device=query_proj.device,
|
||||
),
|
||||
query_proj,
|
||||
key_proj.transpose(-1, -2),
|
||||
beta=0,
|
||||
alpha=scale,
|
||||
)
|
||||
attention_probs = torch.softmax(attention_scores.float(), dim=-1).type(attention_scores.dtype)
|
||||
hidden_states = torch.bmm(attention_probs, value_proj)
|
||||
|
||||
# compute attention output
|
||||
if self.num_heads > 1:
|
||||
# TODO: is there a way to perform batched matmul (e.g. bmm) on 4D tensors?
|
||||
# or reformulate this into a 3D problem?
|
||||
# TODO: measure whether on MPS device it would be faster to do this matmul via einsum
|
||||
# as some matmuls can be 1.94x slower than an equivalent einsum on MPS
|
||||
# https://gist.github.com/Birch-san/cba16789ec27bb20996a4b4831b13ce0
|
||||
hidden_states = torch.matmul(attention_probs, value_states)
|
||||
hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
|
||||
new_hidden_states_shape = hidden_states.size()[:-2] + (self.channels,)
|
||||
hidden_states = hidden_states.view(new_hidden_states_shape)
|
||||
else:
|
||||
hidden_states = torch.bmm(attention_probs, value_states)
|
||||
# reshape hidden_states
|
||||
hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
|
||||
|
||||
# compute next hidden_states
|
||||
hidden_states = self.proj_attn(hidden_states)
|
||||
|
||||
@@ -288,8 +288,16 @@ _kernels = {
|
||||
}
|
||||
|
||||
|
||||
class Downsample1d(nn.Module):
|
||||
def __init__(self, kernel="linear", pad_mode="reflect"):
|
||||
class KernelDownsample1D(nn.Module):
|
||||
"""
|
||||
A static downsample module that is not updated by the optimizer.
|
||||
|
||||
Parameters:
|
||||
kernel (`str`): `linear`, `cubic`, or `lanczos3` for different static kernels used in convolution.
|
||||
pad_mode (`str`): defaults to `reflect`, use with torch.nn.functional.pad.
|
||||
"""
|
||||
|
||||
def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
|
||||
super().__init__()
|
||||
self.pad_mode = pad_mode
|
||||
kernel_1d = torch.tensor(_kernels[kernel])
|
||||
@@ -304,8 +312,16 @@ class Downsample1d(nn.Module):
|
||||
return F.conv1d(hidden_states, weight, stride=2)
|
||||
|
||||
|
||||
class Upsample1d(nn.Module):
|
||||
def __init__(self, kernel="linear", pad_mode="reflect"):
|
||||
class KernelUpsample1D(nn.Module):
|
||||
"""
|
||||
A static upsample module that is not updated by the optimizer.
|
||||
|
||||
Parameters:
|
||||
kernel (`str`): `linear`, `cubic`, or `lanczos3` for different static kernels used in convolution.
|
||||
pad_mode (`str`): defaults to `reflect`, use with torch.nn.functional.pad.
|
||||
"""
|
||||
|
||||
def __init__(self, kernel: str = "linear", pad_mode: str = "reflect"):
|
||||
super().__init__()
|
||||
self.pad_mode = pad_mode
|
||||
kernel_1d = torch.tensor(_kernels[kernel]) * 2
|
||||
@@ -321,7 +337,7 @@ class Upsample1d(nn.Module):
|
||||
|
||||
|
||||
class SelfAttention1d(nn.Module):
|
||||
def __init__(self, in_channels, n_head=1, dropout_rate=0.0):
|
||||
def __init__(self, in_channels: int, n_head: int = 1, dropout_rate: float = 0.0):
|
||||
super().__init__()
|
||||
self.channels = in_channels
|
||||
self.group_norm = nn.GroupNorm(1, num_channels=in_channels)
|
||||
@@ -379,7 +395,7 @@ class SelfAttention1d(nn.Module):
|
||||
|
||||
|
||||
class ResConvBlock(nn.Module):
|
||||
def __init__(self, in_channels, mid_channels, out_channels, is_last=False):
|
||||
def __init__(self, in_channels: int, mid_channels: int, out_channels: int, is_last: bool = False):
|
||||
super().__init__()
|
||||
self.is_last = is_last
|
||||
self.has_conv_skip = in_channels != out_channels
|
||||
@@ -413,13 +429,12 @@ class ResConvBlock(nn.Module):
|
||||
|
||||
|
||||
class UNetMidBlock1D(nn.Module):
|
||||
def __init__(self, mid_channels, in_channels, out_channels=None):
|
||||
def __init__(self, mid_channels: int, in_channels: int, out_channels: int = None):
|
||||
super().__init__()
|
||||
|
||||
out_channels = in_channels if out_channels is None else out_channels
|
||||
|
||||
# there is always at least one resnet
|
||||
self.down = Downsample1d("cubic")
|
||||
self.down = KernelDownsample1D("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
@@ -436,7 +451,7 @@ class UNetMidBlock1D(nn.Module):
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
self.up = KernelUpsample1D(kernel="cubic")
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
@@ -453,21 +468,26 @@ class UNetMidBlock1D(nn.Module):
|
||||
|
||||
|
||||
class AttnDownBlock1D(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
def __init__(self, out_channels: int, in_channels: int, num_layers: int = 3, mid_channels: int = None):
|
||||
super().__init__()
|
||||
|
||||
if num_layers < 1:
|
||||
raise ValueError("AttnDownBlock1D requires added num_layers >= 1")
|
||||
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
self.down = Downsample1d("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
attentions = [
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
self.down = KernelDownsample1D("cubic")
|
||||
resnets = []
|
||||
attentions = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
attentions.append(SelfAttention1d(mid_channels, mid_channels // 32))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels))
|
||||
attentions.append(SelfAttention1d(out_channels, out_channels // 32))
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
@@ -483,16 +503,22 @@ class AttnDownBlock1D(nn.Module):
|
||||
|
||||
|
||||
class DownBlock1D(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
def __init__(self, out_channels: int, in_channels: int, mid_channels: int = None, num_layers: int = 3):
|
||||
super().__init__()
|
||||
if num_layers < 1:
|
||||
raise ValueError("DownBlock1D requires added num_layers >= 1")
|
||||
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
self.down = Downsample1d("cubic")
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
self.down = KernelDownsample1D("cubic")
|
||||
resnets = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels))
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
@@ -506,15 +532,21 @@ class DownBlock1D(nn.Module):
|
||||
|
||||
|
||||
class DownBlock1DNoSkip(nn.Module):
|
||||
def __init__(self, out_channels, in_channels, mid_channels=None):
|
||||
def __init__(self, out_channels: int, in_channels: int, mid_channels: int = None, num_layers: int = 3):
|
||||
super().__init__()
|
||||
if num_layers < 1:
|
||||
raise ValueError("DownBlock1DNoSkip requires added num_layers >= 1")
|
||||
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
resnets = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels))
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
@@ -527,24 +559,28 @@ class DownBlock1DNoSkip(nn.Module):
|
||||
|
||||
|
||||
class AttnUpBlock1D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
def __init__(self, in_channels: int, out_channels: int, mid_channels: int = None, num_layers: int = 3):
|
||||
super().__init__()
|
||||
if num_layers < 1:
|
||||
raise ValueError("AttnUpBlock1D requires added num_layers >= 1")
|
||||
|
||||
mid_channels = out_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
attentions = [
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(mid_channels, mid_channels // 32),
|
||||
SelfAttention1d(out_channels, out_channels // 32),
|
||||
]
|
||||
resnets = []
|
||||
attentions = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = 2 * in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
attentions.append(SelfAttention1d(mid_channels, mid_channels // 32))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels))
|
||||
attentions.append(SelfAttention1d(out_channels, out_channels // 32))
|
||||
|
||||
self.attentions = nn.ModuleList(attentions)
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
self.up = KernelUpsample1D(kernel="cubic")
|
||||
|
||||
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
|
||||
res_hidden_states = res_hidden_states_tuple[-1]
|
||||
@@ -560,18 +596,24 @@ class AttnUpBlock1D(nn.Module):
|
||||
|
||||
|
||||
class UpBlock1D(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
def __init__(self, in_channels: int, out_channels: int, mid_channels: int = None, num_layers: int = 3):
|
||||
super().__init__()
|
||||
if num_layers < 1:
|
||||
raise ValueError("UpBlock1D requires added num_layers >= 1")
|
||||
|
||||
mid_channels = in_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels),
|
||||
]
|
||||
resnets = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = 2 * in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels))
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
self.up = Upsample1d(kernel="cubic")
|
||||
self.up = KernelUpsample1D(kernel="cubic")
|
||||
|
||||
def forward(self, hidden_states, res_hidden_states_tuple, temb=None):
|
||||
res_hidden_states = res_hidden_states_tuple[-1]
|
||||
@@ -586,15 +628,21 @@ class UpBlock1D(nn.Module):
|
||||
|
||||
|
||||
class UpBlock1DNoSkip(nn.Module):
|
||||
def __init__(self, in_channels, out_channels, mid_channels=None):
|
||||
def __init__(self, in_channels: int, out_channels: int, mid_channels: int = None, num_layers: int = 3):
|
||||
super().__init__()
|
||||
if num_layers < 1:
|
||||
raise ValueError("UpBlock1D requires added num_layers >= 1")
|
||||
|
||||
mid_channels = in_channels if mid_channels is None else mid_channels
|
||||
|
||||
resnets = [
|
||||
ResConvBlock(2 * in_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, mid_channels),
|
||||
ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True),
|
||||
]
|
||||
resnets = []
|
||||
|
||||
for i in range(num_layers):
|
||||
in_channels = 2 * in_channels if i == 0 else mid_channels
|
||||
if i < (num_layers - 1):
|
||||
resnets.append(ResConvBlock(in_channels, mid_channels, mid_channels))
|
||||
else:
|
||||
resnets.append(ResConvBlock(mid_channels, mid_channels, out_channels, is_last=True))
|
||||
|
||||
self.resnets = nn.ModuleList(resnets)
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ init_image = init_image.resize((768, 512))
|
||||
|
||||
prompt = "A fantasy landscape, trending on artstation"
|
||||
|
||||
images = pipe(prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
images = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images
|
||||
|
||||
images[0].save("fantasy_landscape.png")
|
||||
```
|
||||
|
||||
@@ -435,9 +435,9 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
init_image = init_image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
@@ -445,16 +445,16 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
# expand init_latents for batch_size
|
||||
deprecation_message = (
|
||||
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
||||
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
||||
" your script to pass as many init images as text prompts to suppress this warning."
|
||||
" your script to pass as many initial images as text prompts to suppress this warning."
|
||||
)
|
||||
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
||||
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
|
||||
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
)
|
||||
else:
|
||||
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
||||
@@ -472,7 +472,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -484,6 +484,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -491,15 +492,15 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -540,6 +541,10 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(prompt, strength, callback_steps)
|
||||
|
||||
@@ -557,8 +562,8 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
)
|
||||
|
||||
# 4. Preprocess image
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
# 5. set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
@@ -567,7 +572,7 @@ class AltDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
|
||||
# 6. Prepare latent variables
|
||||
latents = self.prepare_latents(
|
||||
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
)
|
||||
|
||||
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
|
||||
@@ -73,7 +73,7 @@ class DDPMPipeline(DiffusionPipeline):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
|
||||
if predict_epsilon is not None:
|
||||
new_config = dict(self.scheduler.config)
|
||||
|
||||
+15
-14
@@ -17,7 +17,7 @@ from ...schedulers import (
|
||||
LMSDiscreteScheduler,
|
||||
PNDMScheduler,
|
||||
)
|
||||
from ...utils import PIL_INTERPOLATION
|
||||
from ...utils import PIL_INTERPOLATION, deprecate
|
||||
|
||||
|
||||
def preprocess(image):
|
||||
@@ -66,7 +66,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
|
||||
@torch.no_grad()
|
||||
def __call__(
|
||||
self,
|
||||
init_image: Union[torch.Tensor, PIL.Image.Image],
|
||||
image: Union[torch.Tensor, PIL.Image.Image],
|
||||
batch_size: Optional[int] = 1,
|
||||
num_inference_steps: Optional[int] = 100,
|
||||
eta: Optional[float] = 0.0,
|
||||
@@ -77,7 +77,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
|
||||
) -> Union[Tuple, ImagePipelineOutput]:
|
||||
r"""
|
||||
Args:
|
||||
init_image (`torch.Tensor` or `PIL.Image.Image`):
|
||||
image (`torch.Tensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
batch_size (`int`, *optional*, defaults to 1):
|
||||
@@ -102,20 +102,21 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
|
||||
`return_dict` is True, otherwise a `tuple. When returning a tuple, the first element is a list with the
|
||||
generated images.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
batch_size = 1
|
||||
elif isinstance(init_image, torch.Tensor):
|
||||
batch_size = init_image.shape[0]
|
||||
elif isinstance(image, torch.Tensor):
|
||||
batch_size = image.shape[0]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"`init_image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(init_image)}"
|
||||
)
|
||||
raise ValueError(f"`image` has to be of type `PIL.Image.Image` or `torch.Tensor` but is {type(image)}")
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
height, width = init_image.shape[-2:]
|
||||
height, width = image.shape[-2:]
|
||||
|
||||
# in_channels should be 6: 3 for latents, 3 for low resolution image
|
||||
latents_shape = (batch_size, self.unet.in_channels // 2, height, width)
|
||||
@@ -128,7 +129,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
|
||||
else:
|
||||
latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
|
||||
|
||||
init_image = init_image.to(device=self.device, dtype=latents_dtype)
|
||||
image = image.to(device=self.device, dtype=latents_dtype)
|
||||
|
||||
# set timesteps and move to the correct device
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=self.device)
|
||||
@@ -148,7 +149,7 @@ class LDMSuperResolutionPipeline(DiffusionPipeline):
|
||||
|
||||
for t in self.progress_bar(timesteps_tensor):
|
||||
# concat latents and low resolution image in the channel dimension.
|
||||
latents_input = torch.cat([latents, init_image], dim=1)
|
||||
latents_input = torch.cat([latents, image], dim=1)
|
||||
latents_input = self.scheduler.scale_model_input(latents_input, t)
|
||||
# predict the noise residual
|
||||
noise_pred = self.unet(latents_input, t).sample
|
||||
|
||||
@@ -138,7 +138,7 @@ prompt = "An astronaut riding an elephant"
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.8,
|
||||
@@ -164,7 +164,7 @@ torch.manual_seed(0)
|
||||
image = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.85,
|
||||
|
||||
@@ -477,9 +477,9 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
init_image = init_image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
@@ -487,16 +487,16 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
# expand init_latents for batch_size
|
||||
deprecation_message = (
|
||||
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
||||
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
||||
" your script to pass as many init images as text prompts to suppress this warning."
|
||||
" your script to pass as many initial images as text prompts to suppress this warning."
|
||||
)
|
||||
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
||||
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
|
||||
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
)
|
||||
else:
|
||||
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
||||
@@ -516,7 +516,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
source_prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -528,6 +528,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -535,15 +536,15 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -584,6 +585,10 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(prompt, strength, callback_steps)
|
||||
|
||||
@@ -602,8 +607,8 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
)
|
||||
|
||||
# 4. Preprocess image
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
# 5. Prepare timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
@@ -612,7 +617,7 @@ class CycleDiffusionPipeline(DiffusionPipeline):
|
||||
|
||||
# 6. Prepare latent variables
|
||||
latents, clean_latents = self.prepare_latents(
|
||||
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
)
|
||||
source_latents = latents
|
||||
|
||||
|
||||
+20
-15
@@ -229,7 +229,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
image: Union[np.ndarray, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -241,6 +241,7 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -248,15 +249,15 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`np.ndarray` or `PIL.Image.Image`):
|
||||
image (`np.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -296,6 +297,10 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
@@ -320,8 +325,8 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
@@ -333,9 +338,9 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
)
|
||||
|
||||
latents_dtype = text_embeddings.dtype
|
||||
init_image = init_image.astype(latents_dtype)
|
||||
image = image.astype(latents_dtype)
|
||||
# encode the init image into latents and scale the latents
|
||||
init_latents = self.vae_encoder(sample=init_image)[0]
|
||||
init_latents = self.vae_encoder(sample=image)[0]
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
if isinstance(prompt, str):
|
||||
@@ -344,16 +349,16 @@ class OnnxStableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
# expand init_latents for batch_size
|
||||
deprecation_message = (
|
||||
f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
||||
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
||||
" your script to pass as many init images as text prompts to suppress this warning."
|
||||
" your script to pass as many initial images as text prompts to suppress this warning."
|
||||
)
|
||||
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
additional_image_per_prompt = len(prompt) // init_latents.shape[0]
|
||||
init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0)
|
||||
elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
|
||||
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts."
|
||||
)
|
||||
else:
|
||||
init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0)
|
||||
|
||||
+18
-13
@@ -228,7 +228,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[np.ndarray, PIL.Image.Image],
|
||||
image: Union[np.ndarray, PIL.Image.Image],
|
||||
mask_image: Union[np.ndarray, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
@@ -241,6 +241,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, np.ndarray], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -248,20 +249,20 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`nd.ndarray` or `PIL.Image.Image`):
|
||||
image (`nd.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process. This is the image whose masked region will be inpainted.
|
||||
mask_image (`nd.ndarray` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.uu
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -301,6 +302,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
if isinstance(prompt, str):
|
||||
batch_size = 1
|
||||
elif isinstance(prompt, list):
|
||||
@@ -325,8 +330,8 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
# set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
@@ -338,10 +343,10 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
)
|
||||
|
||||
latents_dtype = text_embeddings.dtype
|
||||
init_image = init_image.astype(latents_dtype)
|
||||
image = image.astype(latents_dtype)
|
||||
|
||||
# encode the init image into latents and scale the latents
|
||||
init_latents = self.vae_encoder(sample=init_image)[0]
|
||||
init_latents = self.vae_encoder(sample=image)[0]
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
# Expand init_latents for batch_size and num_images_per_prompt
|
||||
@@ -356,7 +361,7 @@ class OnnxStableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
|
||||
# check sizes
|
||||
if not mask.shape == init_latents.shape:
|
||||
raise ValueError("The mask and init_image should be the same size!")
|
||||
raise ValueError("The mask and image should be the same size!")
|
||||
|
||||
# get the original timestep using init_timestep
|
||||
offset = self.scheduler.config.get("steps_offset", 0)
|
||||
|
||||
@@ -444,9 +444,9 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
init_image = init_image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None):
|
||||
image = image.to(device=device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
@@ -454,16 +454,16 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
# expand init_latents for batch_size
|
||||
deprecation_message = (
|
||||
f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial"
|
||||
" images (`init_image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" images (`image`). Initial images are now duplicating to match the number of text prompts. Note"
|
||||
" that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update"
|
||||
" your script to pass as many init images as text prompts to suppress this warning."
|
||||
" your script to pass as many initial images as text prompts to suppress this warning."
|
||||
)
|
||||
deprecate("len(prompt) != len(init_image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False)
|
||||
additional_image_per_prompt = batch_size // init_latents.shape[0]
|
||||
init_latents = torch.cat([init_latents] * additional_image_per_prompt * num_images_per_prompt, dim=0)
|
||||
elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0:
|
||||
raise ValueError(
|
||||
f"Cannot duplicate `init_image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
|
||||
)
|
||||
else:
|
||||
init_latents = torch.cat([init_latents] * num_images_per_prompt, dim=0)
|
||||
@@ -481,7 +481,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
guidance_scale: Optional[float] = 7.5,
|
||||
@@ -493,6 +493,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -500,15 +501,15 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to transform the reference `init_image`. Must be between 0 and 1.
|
||||
`init_image` will be used as a starting point, adding more noise to it the larger the `strength`. The
|
||||
number of denoising steps depends on the amount of noise initially added. When `strength` is 1, added
|
||||
noise will be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `init_image`.
|
||||
Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
|
||||
will be used as a starting point, adding more noise to it the larger the `strength`. The number of
|
||||
denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
|
||||
be maximum and the denoising process will run for the full number of iterations specified in
|
||||
`num_inference_steps`. A value of 1, therefore, essentially ignores `image`.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference. This parameter will be modulated by `strength`.
|
||||
@@ -549,6 +550,10 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(prompt, strength, callback_steps)
|
||||
|
||||
@@ -566,8 +571,8 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
)
|
||||
|
||||
# 4. Preprocess image
|
||||
if isinstance(init_image, PIL.Image.Image):
|
||||
init_image = preprocess(init_image)
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
image = preprocess(image)
|
||||
|
||||
# 5. set timesteps
|
||||
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
||||
@@ -576,7 +581,7 @@ class StableDiffusionImg2ImgPipeline(DiffusionPipeline):
|
||||
|
||||
# 6. Prepare latent variables
|
||||
latents = self.prepare_latents(
|
||||
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
)
|
||||
|
||||
# 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
|
||||
|
||||
+16
-11
@@ -459,9 +459,9 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
|
||||
return timesteps, num_inference_steps - t_start
|
||||
|
||||
def prepare_latents(self, init_image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
|
||||
init_image = init_image.to(device=self.device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(init_image).latent_dist
|
||||
def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator):
|
||||
image = image.to(device=self.device, dtype=dtype)
|
||||
init_latent_dist = self.vae.encode(image).latent_dist
|
||||
init_latents = init_latent_dist.sample(generator=generator)
|
||||
init_latents = 0.18215 * init_latents
|
||||
|
||||
@@ -479,7 +479,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
def __call__(
|
||||
self,
|
||||
prompt: Union[str, List[str]],
|
||||
init_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
mask_image: Union[torch.FloatTensor, PIL.Image.Image],
|
||||
strength: float = 0.8,
|
||||
num_inference_steps: Optional[int] = 50,
|
||||
@@ -492,6 +492,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
return_dict: bool = True,
|
||||
callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
|
||||
callback_steps: Optional[int] = 1,
|
||||
**kwargs,
|
||||
):
|
||||
r"""
|
||||
Function invoked when calling the pipeline for generation.
|
||||
@@ -499,19 +500,19 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
Args:
|
||||
prompt (`str` or `List[str]`):
|
||||
The prompt or prompts to guide the image generation.
|
||||
init_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, that will be used as the starting point for the
|
||||
process. This is the image whose masked region will be inpainted.
|
||||
mask_image (`torch.FloatTensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch, to mask `init_image`. White pixels in the mask will be
|
||||
`Image`, or tensor representing an image batch, to mask `image`. White pixels in the mask will be
|
||||
replaced by noise and therefore repainted, while black pixels will be preserved. If `mask_image` is a
|
||||
PIL image, it will be converted to a single channel (luminance) before use. If it's a tensor, it should
|
||||
contain one color channel (L) instead of 3, so the expected shape would be `(B, H, W, 1)`.
|
||||
strength (`float`, *optional*, defaults to 0.8):
|
||||
Conceptually, indicates how much to inpaint the masked area. Must be between 0 and 1. When `strength`
|
||||
is 1, the denoising process will be run on the masked area for the full number of iterations specified
|
||||
in `num_inference_steps`. `init_image` will be used as a reference for the masked area, adding more
|
||||
noise to that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
||||
in `num_inference_steps`. `image` will be used as a reference for the masked area, adding more noise to
|
||||
that region the larger the `strength`. If `strength` is 0, no inpainting will occur.
|
||||
num_inference_steps (`int`, *optional*, defaults to 50):
|
||||
The reference number of denoising steps. More denoising steps usually lead to a higher quality image at
|
||||
the expense of slower inference. This parameter will be modulated by `strength`, as explained above.
|
||||
@@ -552,6 +553,10 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
|
||||
(nsfw) content, according to the `safety_checker`.
|
||||
"""
|
||||
message = "Please use `image` instead of `init_image`."
|
||||
init_image = deprecate("init_image", "0.12.0", message, take_from=kwargs)
|
||||
image = init_image or image
|
||||
|
||||
# 1. Check inputs
|
||||
self.check_inputs(prompt, strength, callback_steps)
|
||||
|
||||
@@ -569,8 +574,8 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
)
|
||||
|
||||
# 4. Preprocess image and mask
|
||||
if not isinstance(init_image, torch.FloatTensor):
|
||||
init_image = preprocess_image(init_image)
|
||||
if not isinstance(image, torch.FloatTensor):
|
||||
image = preprocess_image(image)
|
||||
|
||||
if not isinstance(mask_image, torch.FloatTensor):
|
||||
mask_image = preprocess_mask(mask_image, self.vae_scale_factor)
|
||||
@@ -583,7 +588,7 @@ class StableDiffusionInpaintPipelineLegacy(DiffusionPipeline):
|
||||
# 6. Prepare latent variables
|
||||
# encode the init image into latents and scale the latents
|
||||
latents, init_latents_orig, noise = self.prepare_latents(
|
||||
init_image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
image, latent_timestep, batch_size, num_images_per_prompt, text_embeddings.dtype, device, generator
|
||||
)
|
||||
|
||||
# 7. Prepare mask latent
|
||||
|
||||
@@ -134,7 +134,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" DDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
|
||||
@@ -138,7 +138,7 @@ class FlaxDDIMScheduler(FlaxSchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" FlaxDDIMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
|
||||
@@ -125,7 +125,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
@@ -255,7 +255,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" DDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
new_config = dict(self.config)
|
||||
new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
|
||||
|
||||
@@ -132,7 +132,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
@@ -239,7 +239,7 @@ class FlaxDDPMScheduler(FlaxSchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" FlaxDDPMScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
new_config = dict(self.config)
|
||||
new_config["prediction_type"] = "epsilon" if predict_epsilon else "sample"
|
||||
|
||||
@@ -142,7 +142,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" DPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
|
||||
@@ -177,7 +177,7 @@ class FlaxDPMSolverMultistepScheduler(FlaxSchedulerMixin, ConfigMixin):
|
||||
"Please make sure to instantiate your scheduler with `prediction_type` instead. E.g. `scheduler ="
|
||||
" FlaxDPMSolverMultistepScheduler.from_pretrained(<model_id>, prediction_type='epsilon')`."
|
||||
)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.10.0", message, take_from=kwargs)
|
||||
predict_epsilon = deprecate("predict_epsilon", "0.11.0", message, take_from=kwargs)
|
||||
if predict_epsilon is not None:
|
||||
self.register_to_config(prediction_type="epsilon" if predict_epsilon else "sample")
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ def deprecate(*args, take_from: Optional[Union[Dict, Any]] = None, standard_warn
|
||||
|
||||
if warning is not None:
|
||||
warning = warning + " " if standard_warn else ""
|
||||
warnings.warn(warning + message, FutureWarning)
|
||||
warnings.warn(warning + message, FutureWarning, stacklevel=2)
|
||||
|
||||
if isinstance(deprecated_kwargs, dict) and len(deprecated_kwargs) > 0:
|
||||
call_frame = inspect.getouterframes(inspect.currentframe())[1]
|
||||
|
||||
@@ -141,7 +141,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
)
|
||||
|
||||
image = output.images
|
||||
@@ -153,7 +153,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
@@ -204,7 +204,7 @@ class AltDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.TestCas
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
).images
|
||||
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
@@ -243,7 +243,7 @@ class AltDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
|
||||
@@ -69,7 +69,7 @@ class DDPMPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
|
||||
|
||||
def test_inference_deprecated_predict_epsilon(self):
|
||||
deprecate("remove this test", "0.10.0", "remove")
|
||||
deprecate("remove this test", "0.11.0", "remove")
|
||||
unet = self.dummy_uncond_unet
|
||||
scheduler = DDPMScheduler(predict_epsilon=False)
|
||||
|
||||
|
||||
@@ -79,7 +79,7 @@ class LDMSuperResolutionPipelineFastTests(PipelineTesterMixin, unittest.TestCase
|
||||
init_image = self.dummy_image.to(device)
|
||||
|
||||
generator = torch.Generator(device=device).manual_seed(0)
|
||||
image = ldm(init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
|
||||
image = ldm(image=init_image, generator=generator, num_inference_steps=2, output_type="numpy").images
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
|
||||
@@ -124,7 +124,7 @@ class LDMSuperResolutionPipelineIntegrationTests(unittest.TestCase):
|
||||
ldm.set_progress_bar_config(disable=None)
|
||||
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
image = ldm(init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
|
||||
image = ldm(image=init_image, generator=generator, num_inference_steps=20, output_type="numpy").images
|
||||
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
|
||||
|
||||
@@ -186,7 +186,7 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
source_prompt=source_prompt,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
eta=0.1,
|
||||
strength=0.8,
|
||||
guidance_scale=3,
|
||||
@@ -244,7 +244,7 @@ class CycleDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
|
||||
source_prompt=source_prompt,
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
eta=0.1,
|
||||
strength=0.8,
|
||||
guidance_scale=3,
|
||||
@@ -297,7 +297,7 @@ class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.85,
|
||||
@@ -336,7 +336,7 @@ class CycleDiffusionPipelineIntegrationTests(unittest.TestCase):
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
source_prompt=source_prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_inference_steps=100,
|
||||
eta=0.1,
|
||||
strength=0.85,
|
||||
|
||||
@@ -72,7 +72,7 @@ class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = np.random.RandomState(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=10,
|
||||
@@ -110,7 +110,7 @@ class OnnxStableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = np.random.RandomState(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
num_inference_steps=10,
|
||||
|
||||
@@ -80,7 +80,7 @@ class StableDiffusionOnnxInpaintLegacyPipelineIntegrationTests(unittest.TestCase
|
||||
generator = np.random.RandomState(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
|
||||
@@ -188,7 +188,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
)
|
||||
|
||||
image = output.images
|
||||
@@ -200,7 +200,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
|
||||
@@ -245,7 +245,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
)
|
||||
image = output.images
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
@@ -285,7 +285,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
)
|
||||
|
||||
image = output.images
|
||||
@@ -328,7 +328,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
)
|
||||
image = output.images
|
||||
|
||||
@@ -339,7 +339,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
return_dict=False,
|
||||
)
|
||||
image_from_tuple = output[0]
|
||||
@@ -382,7 +382,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
prompt,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
).images
|
||||
|
||||
assert images.shape == (1, 32, 32, 3)
|
||||
@@ -393,7 +393,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
[prompt] * batch_size,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
).images
|
||||
|
||||
assert images.shape == (batch_size, 32, 32, 3)
|
||||
@@ -404,7 +404,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
prompt,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
|
||||
@@ -416,7 +416,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
[prompt] * batch_size,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
|
||||
@@ -458,7 +458,7 @@ class StableDiffusionImg2ImgPipelineFastTests(PipelineTesterMixin, unittest.Test
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
).images
|
||||
|
||||
assert image.shape == (1, 32, 32, 3)
|
||||
@@ -497,7 +497,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
@@ -535,7 +535,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
@@ -572,7 +572,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
@@ -626,7 +626,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
with torch.autocast(torch_device):
|
||||
pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=7.5,
|
||||
@@ -663,7 +663,7 @@ class StableDiffusionImg2ImgPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
_ = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
generator=generator,
|
||||
|
||||
@@ -191,7 +191,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
)
|
||||
|
||||
@@ -204,7 +204,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
return_dict=False,
|
||||
)[0]
|
||||
@@ -252,7 +252,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
guidance_scale=6.0,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
)
|
||||
|
||||
@@ -295,7 +295,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
prompt,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
).images
|
||||
|
||||
@@ -307,7 +307,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
[prompt] * batch_size,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
).images
|
||||
|
||||
@@ -319,7 +319,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
prompt,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
@@ -332,7 +332,7 @@ class StableDiffusionInpaintLegacyPipelineFastTests(PipelineTesterMixin, unittes
|
||||
[prompt] * batch_size,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
num_images_per_prompt=num_images_per_prompt,
|
||||
).images
|
||||
@@ -374,7 +374,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
@@ -416,7 +416,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
|
||||
generator = torch.Generator(device=torch_device).manual_seed(0)
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
strength=0.75,
|
||||
guidance_scale=7.5,
|
||||
@@ -474,7 +474,7 @@ class StableDiffusionInpaintLegacyPipelineIntegrationTests(unittest.TestCase):
|
||||
with torch.autocast(torch_device):
|
||||
pipe(
|
||||
prompt=prompt,
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
strength=0.75,
|
||||
num_inference_steps=50,
|
||||
|
||||
@@ -203,7 +203,7 @@ class ConfigTester(unittest.TestCase):
|
||||
ddpm_2 = DDPMScheduler.from_pretrained("google/ddpm-celebahq-256", beta_start=88)
|
||||
|
||||
with CaptureLogger(logger) as cap_logger:
|
||||
deprecate("remove this case", "0.10.0", "remove")
|
||||
deprecate("remove this case", "0.11.0", "remove")
|
||||
ddpm_3 = DDPMScheduler.from_pretrained(
|
||||
"hf-internal-testing/tiny-stable-diffusion-torch",
|
||||
subfolder="scheduler",
|
||||
|
||||
@@ -411,7 +411,7 @@ class PipelineFastTests(unittest.TestCase):
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
).images
|
||||
image_img2img = img2img(
|
||||
@@ -419,7 +419,7 @@ class PipelineFastTests(unittest.TestCase):
|
||||
generator=generator,
|
||||
num_inference_steps=2,
|
||||
output_type="np",
|
||||
init_image=init_image,
|
||||
image=init_image,
|
||||
).images
|
||||
image_text2img = text2img(
|
||||
[prompt],
|
||||
|
||||
@@ -639,12 +639,12 @@ class DDPMSchedulerTest(SchedulerCommonTest):
|
||||
self.check_over_configs(prediction_type=prediction_type)
|
||||
|
||||
def test_deprecated_predict_epsilon(self):
|
||||
deprecate("remove this test", "0.10.0", "remove")
|
||||
deprecate("remove this test", "0.11.0", "remove")
|
||||
for predict_epsilon in [True, False]:
|
||||
self.check_over_configs(predict_epsilon=predict_epsilon)
|
||||
|
||||
def test_deprecated_epsilon(self):
|
||||
deprecate("remove this test", "0.10.0", "remove")
|
||||
deprecate("remove this test", "0.11.0", "remove")
|
||||
scheduler_class = self.scheduler_classes[0]
|
||||
scheduler_config = self.get_scheduler_config()
|
||||
|
||||
|
||||
@@ -626,12 +626,12 @@ class FlaxDDIMSchedulerTest(FlaxSchedulerCommonTest):
|
||||
self.check_over_configs(prediction_type=prediction_type)
|
||||
|
||||
def test_deprecated_predict_epsilon(self):
|
||||
deprecate("remove this test", "0.10.0", "remove")
|
||||
deprecate("remove this test", "0.11.0", "remove")
|
||||
for predict_epsilon in [True, False]:
|
||||
self.check_over_configs(predict_epsilon=predict_epsilon)
|
||||
|
||||
def test_deprecated_predict_epsilon_to_prediction_type(self):
|
||||
deprecate("remove this test", "0.10.0", "remove")
|
||||
deprecate("remove this test", "0.11.0", "remove")
|
||||
for scheduler_class in self.scheduler_classes:
|
||||
scheduler_config = self.get_scheduler_config(predict_epsilon=True)
|
||||
scheduler = scheduler_class.from_config(scheduler_config)
|
||||
|
||||
@@ -162,3 +162,9 @@ class DeprecateTester(unittest.TestCase):
|
||||
deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
|
||||
|
||||
assert str(warning.warning) == "This message is better!!!"
|
||||
|
||||
def test_deprecate_stacklevel(self):
|
||||
with self.assertWarns(FutureWarning) as warning:
|
||||
deprecate(("deprecated_arg", self.higher_version, "This message is better!!!"), standard_warn=False)
|
||||
assert str(warning.warning) == "This message is better!!!"
|
||||
assert "diffusers/tests/test_utils.py" in warning.filename
|
||||
|
||||
Reference in New Issue
Block a user