30 changes: 29 additions & 1 deletion docs/distilled_sd.md
@@ -83,7 +83,7 @@ python convert_diffusers_to_original_stable_diffusion.py \
The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.


### Another available .ckpt file:
##### Another available .ckpt file:

* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt

@@ -97,3 +97,31 @@ for key, value in ckpt['state_dict'].items():
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```


### SDXS-512

Another very tiny and **incredibly fast** model is SDXS by IDKiro et al., which the authors describe as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details, read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed further blocks from the U-Net, and unlike other SD1 models, SDXS uses an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
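
A quick, optional way to confirm these architectural differences is to load the diffusers pipeline and print the classes of its components. This is only a sketch for inspection; the reported class names depend on what the Hugging Face repository declares in its `model_index.json`:

```python
# Optional sanity check: inspect which component classes the SDXS repo ships.
# Assumes network access to the "IDKiro/sdxs-512-dreamshaper" repository.
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
print(type(pipe.unet).__name__)  # the reduced U-Net
print(type(pipe.vae).__name__)   # expected: a tiny autoencoder rather than the usual AutoencoderKL
```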

##### 1. Download the diffusers model from Hugging Face using Python:

```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file

```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```

##### 3. Run the model as follows:

```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```

Both options, `--cfg-scale 1` and `--steps 1`, are mandatory here.
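
As a further optional check, you can verify that the converted checkpoint looks like an SDXS model to sd.cpp. The detection added in model.cpp below keys off the absence of both the U-Net middle block and output_blocks.7.1. A minimal sketch of the same heuristic in Python, assuming the `safetensors` package and the `sdxs.safetensors` file from step 2:

```python
# Minimal sketch: reproduce sd.cpp's SDXS detection heuristic on the converted file.
from safetensors import safe_open

with safe_open("sdxs.safetensors", framework="pt", device="cpu") as f:
    names = list(f.keys())

has_middle_block_1 = any("model.diffusion_model.middle_block.1." in n for n in names)
has_output_block_71 = any("model.diffusion_model.output_blocks.7.1" in n for n in names)

# For SDXS both should be False; sd.cpp then reports the model as VERSION_SDXS.
print("middle_block.1 present:   ", has_middle_block_1)
print("output_blocks.7.1 present:", has_output_block_71)
```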
7 changes: 7 additions & 0 deletions model.cpp
@@ -1038,6 +1038,7 @@ SDVersion ModelLoader::get_sd_version() {
int64_t patch_embedding_channels = 0;
bool has_img_emb = false;
bool has_middle_block_1 = false;
bool has_output_block_71 = false;

for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
@@ -1094,6 +1095,9 @@ SDVersion ModelLoader::get_sd_version() {
tensor_storage.name.find("unet.mid_block.resnets.1.") != std::string::npos) {
has_middle_block_1 = true;
}
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos) {
has_output_block_71 = true;
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1155,6 +1159,9 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD1_PIX2PIX;
}
if (!has_middle_block_1) {
if (!has_output_block_71) {
return VERSION_SDXS;
}
return VERSION_SD1_TINY_UNET;
}
return VERSION_SD1;
3 changes: 2 additions & 1 deletion model.h
@@ -28,6 +28,7 @@ enum SDVersion {
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
@@ -50,7 +51,7 @@ enum SDVersion {
};

static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
return true;
}
return false;
21 changes: 13 additions & 8 deletions stable-diffusion.cpp
@@ -31,6 +31,7 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
"SDXS",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
@@ -114,8 +115,8 @@ class StableDiffusionGGML {
std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd or wan2.1 i2v
std::shared_ptr<DiffusionModel> diffusion_model;
std::shared_ptr<DiffusionModel> high_noise_diffusion_model;
std::shared_ptr<VAE> first_stage_model;
std::shared_ptr<TinyAutoEncoder> tae_first_stage;
std::shared_ptr<VAE> first_stage_model = nullptr;
std::shared_ptr<TinyAutoEncoder> tae_first_stage = nullptr;
std::shared_ptr<ControlNet> control_net;
std::shared_ptr<PhotoMakerIDEncoder> pmid_model;
std::shared_ptr<LoraModel> pmid_lora;
@@ -591,7 +592,7 @@ class StableDiffusionGGML {
vae_backend = backend;
}

if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || sd_ctx_params->tae_preview_only) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
first_stage_model = std::make_shared<WAN::WanVAERunner>(vae_backend,
offload_params_to_cpu,
@@ -629,8 +630,7 @@ class StableDiffusionGGML {
first_stage_model->get_param_tensors(tensors, "first_stage_model");
}
}

if (use_tiny_autoencoder) {
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (sd_version_is_wan(version) || sd_version_is_qwen_image(version)) {
tae_first_stage = std::make_shared<TinyVideoAutoEncoder>(vae_backend,
offload_params_to_cpu,
@@ -645,6 +645,10 @@
"decoder.layers",
vae_decode_only,
version);
if (version == VERSION_SDXS) {
tae_first_stage->alloc_params_buffer();
tae_first_stage->get_param_tensors(tensors,"first_stage_model");
}
}
if (sd_ctx_params->vae_conv_direct) {
LOG_INFO("Using Conv2d direct in the tae model");
@@ -782,13 +786,14 @@ class StableDiffusionGGML {
unet_params_mem_size += high_noise_diffusion_model->get_params_buffer_size();
}
size_t vae_params_mem_size = 0;
if (!use_tiny_autoencoder || sd_ctx_params->tae_preview_only) {
if (!(use_tiny_autoencoder || version == VERSION_SDXS) || sd_ctx_params->tae_preview_only) {
vae_params_mem_size = first_stage_model->get_params_buffer_size();
}
if (use_tiny_autoencoder) {
if (!tae_first_stage->load_from_file(taesd_path, n_threads)) {
if (use_tiny_autoencoder || version == VERSION_SDXS) {
if (use_tiny_autoencoder && !tae_first_stage->load_from_file(taesd_path, n_threads)) {
return false;
}
use_tiny_autoencoder = true; // now the processing is identical for VERSION_SDXS
vae_params_mem_size = tae_first_stage->get_params_buffer_size();
}
size_t control_net_params_mem_size = 0;
9 changes: 9 additions & 0 deletions tae.hpp
@@ -506,6 +506,7 @@ struct TinyAutoEncoder : public GGMLRunner {
struct ggml_context* output_ctx = nullptr) = 0;

virtual bool load_from_file(const std::string& file_path, int n_threads) = 0;
virtual void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) = 0;
};

struct TinyImageAutoEncoder : public TinyAutoEncoder {
@@ -555,6 +556,10 @@ struct TinyImageAutoEncoder : public TinyAutoEncoder {
return success;
}

void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
taesd.get_param_tensors(tensors,prefix);
}

struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
@@ -624,6 +629,10 @@ struct TinyVideoAutoEncoder : public TinyAutoEncoder {
return success;
}

void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
taehv.get_param_tensors(tensors,prefix);
}

struct ggml_cgraph* build_graph(struct ggml_tensor* z, bool decode_graph) {
struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
z = to_backend(z);
5 changes: 4 additions & 1 deletion unet.hpp
@@ -215,10 +215,13 @@ class UnetModelBlock : public GGMLBlock {
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET) {
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
if (version == VERSION_SDXS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
}

// dims is always 2