stduhpf · wbruna · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -176,6 +176,7 @@ jobs:
 
   build-and-push-docker-images:
     name: Build and push container images
+    if: ${{ github.event_name != 'pull_request' }}
     runs-on: ubuntu-latest
 
     permissions:

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -11,6 +11,10 @@ endif()
 if (MSVC)
     add_compile_definitions(_CRT_SECURE_NO_WARNINGS)
     add_compile_definitions(_SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING)
+    add_compile_options(
+        $<$<COMPILE_LANGUAGE:C>:/MP>
+        $<$<COMPILE_LANGUAGE:CXX>:/MP>
+    )
 endif()
 
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@@ -152,10 +156,12 @@ endif()
 
 set(SD_LIB stable-diffusion)
 
-file(GLOB SD_LIB_SOURCES
+file(GLOB SD_LIB_SOURCES CONFIGURE_DEPENDS
     "src/*.h"
     "src/*.cpp"
     "src/*.hpp"
+    "src/model_io/*.h"
+    "src/model_io/*.cpp"
     "src/tokenizers/*.h"
     "src/tokenizers/*.cpp"
     "src/tokenizers/vocab/*.h"

diff --git a/README.md b/README.md
@@ -57,6 +57,7 @@ API and command-line option may change frequently.***
     - [Z-Image](./docs/z_image.md)
     - [Ovis-Image](./docs/ovis_image.md)
     - [Anima](./docs/anima.md)
+    - [ERNIE-Image](./docs/ernie_image.md)
   - Image Edit Models
     - [FLUX.1-Kontext-dev](./docs/kontext.md)
     - [Qwen Image Edit series](./docs/qwen_image_edit.md)
@@ -76,9 +77,10 @@ API and command-line option may change frequently.***
   - OpenCL
   - SYCL
 - Supported weight formats
-  - Pytorch checkpoint (`.ckpt` or `.pth`)
+  - Pytorch checkpoint (`.ckpt`, `.pth`, or `.pt`)
   - Safetensors (`.safetensors`)
   - GGUF (`.gguf`)
+- Convert mode supports converting model weights to `.gguf` or `.safetensors`
 - Supported platforms
     - Linux
     - Mac OS
@@ -96,6 +98,7 @@ API and command-line option may change frequently.***
     - `DPM++ 2M`
     - [`DPM++ 2M v2`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/discussions/8457)
     - `DPM++ 2S a`
+    - `ER-SDE`
     - [`LCM`](https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/13952)
 - Cross-platform reproducibility
     - `--rng cuda`, default, consistent with the `stable-diffusion-webui GPU RNG`
@@ -144,6 +147,7 @@ If you want to improve performance or reduce VRAM/RAM usage, please refer to [pe
 - [🔥Z-Image](./docs/z_image.md)
 - [Ovis-Image](./docs/ovis_image.md)
 - [Anima](./docs/anima.md)
+- [ERNIE-Image](./docs/ernie_image.md)
 - [LoRA](./docs/lora.md)
 - [LCM/LCM-LoRA](./docs/lcm.md)
 - [Using PhotoMaker to personalize image generation](./docs/photo_maker.md)

diff --git a/assets/ernie_image/example.png b/assets/ernie_image/example.png
diff --git a/assets/ernie_image/turbo_example.png b/assets/ernie_image/turbo_example.png
diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
@@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
       --model_path  ./segmindtiny-sd \
-      --checkpoint_path ./segmind_tiny-sd.ckpt --half
+      --checkpoint_path ./segmind_tiny-sd.safetensors  --half --use_safetensors
 ```
 
-The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
+The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
 
 
-##### Another available .ckpt file:
-
- * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
-
-To use this file, you must first adjust its non-contiguous tensors:
-
-```python
-import torch
-ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
-for key, value in ckpt['state_dict'].items():
-    if isinstance(value, torch.Tensor):
-        ckpt['state_dict'][key] = value.contiguous()
-torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
-```
-
-
-### SDXS-512
+### SDXS-512-DreamShaper
 
 Another very tiny and **incredibly fast**  model is SDXS by IDKiro et al.  The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
+##### Some ready-to-run SDXS-512 model files are available online, such as:
 
-##### 1. Download the diffusers model from  Hugging Face using Python:
-
-```python
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
-pipe.save_pretrained(save_directory="sdxs")
-```
-##### 2. Create a safetensors file
-
-```bash
-python convert_diffusers_to_original_stable_diffusion.py \
-    --model_path  sdxs  --checkpoint_path sdxs.safetensors --half --use_safetensors
-```
-
-##### 3. Run the model as follows:
+* https://huggingface.co/akleine/sdxs-512
+* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
 
+##### Run the model as follows:
 ```bash
 ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
   --cfg-scale 1 --steps 1
 ```
+Both options, ``` --cfg-scale 1 ``` and ``` --steps 1 ```, are mandatory here.
+
+### SDXS-512-0.9
+
+Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is a *completely different* model that is also **incredibly fast**. It is sometimes preferred, so try it yourself.
+##### Download a ready-to-run file from here:
+
+* https://huggingface.co/akleine/sdxs-09
 
-Both options: ``` --cfg-scale 1 ``` and  ``` --steps 1 ``` are mandatory here.                                                 
+As with SDXS-512-DreamShaper, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory for this model.
diff --git a/docs/ernie_image.md b/docs/ernie_image.md
@@ -0,0 +1,35 @@
+# How to Use
+
+You can run ERNIE-Image with stable-diffusion.cpp on GPUs with 4GB of VRAM — or even less.
+
+## Download weights
+
+- Download ERNIE-Image-Turbo
+    - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
+    - gguf: https://huggingface.co/unsloth/ERNIE-Image-Turbo-GGUF/tree/main
+- Download ERNIE-Image
+    - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/diffusion_models
+    - gguf: https://huggingface.co/unsloth/ERNIE-Image-GGUF/tree/main
+- Download VAE
+    - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/vae
+- Download Ministral 3 3B (text encoder)
+    - safetensors: https://huggingface.co/Comfy-Org/ERNIE-Image/tree/main/text_encoders
+    - gguf: https://huggingface.co/unsloth/Ministral-3-3B-Instruct-2512-GGUF/tree/main
+
+## Examples
+
+### ERNIE-Image-Turbo
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ernie-image-turbo.safetensors --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 1.0 --steps 8 -v --offload-to-cpu --diffusion-fa
+```
+
+<img width="256" alt="ERNIE-Image Turbo example" src="../assets/ernie_image/turbo_example.png" />
+
+### ERNIE-Image
+
+```
+.\bin\Release\sd-cli.exe --diffusion-model  ..\..\ComfyUI\models\diffusion_models\ernie-image-UD-Q4_K_M.gguf --vae ..\..\ComfyUI\models\vae\flux2_ae.safetensors  --llm ..\..\ComfyUI\models\text_encoders\ministral-3-3b.safetensors -p "a lovely cat" --cfg-scale 5.0 -v --offload-to-cpu --diffusion-fa
+```
+
+<img width="256" alt="ERNIE-Image example" src="../assets/ernie_image/example.png" />
diff --git a/examples/cli/README.md b/examples/cli/README.md
@@ -14,6 +14,9 @@ CLI Options:
   --metadata-format <string>  metadata output format, one of [text, json] (default: text)
   --canny                     apply canny preprocessor (edge detection)
   --convert-name              convert tensor name (for convert mode)
+                              convert mode writes `.gguf` or `.safetensors` based on the output extension.
+                              `.safetensors` export currently supports f16, bf16, f32, and i32 tensor types only.
+                              i32 is passthrough only; no f32 <-> i32 conversion is performed
   -v, --verbose               print extra info
   --color                     colors the logging tags according to level
   --taesd-preview-only        prevents usage of taesd for decoding the final image. (for use with --preview tae)
@@ -114,15 +117,15 @@ Generation Options:
                                            medium
   --skip-layer-start <float>               SLG enabling point (default: 0.01)
   --skip-layer-end <float>                 SLG disabling point (default: 0.2)
-  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)
+  --eta <float>                            noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
   --flow-shift <float>                     shift value for Flow models like SD3.x or WAN (default: auto)
   --high-noise-cfg-scale <float>           (high noise) unconditional guidance scale: (default: 7.0)
   --high-noise-img-cfg-scale <float>       (high noise) image guidance scale for inpaint or instruct-pix2pix models (default: same as --cfg-scale)
   --high-noise-guidance <float>            (high noise) distilled guidance scale for models with guidance input (default: 3.5)
   --high-noise-slg-scale <float>           (high noise) skip layer guidance (SLG) scale, only for DiT models: (default: 0)
   --high-noise-skip-layer-start <float>    (high noise) SLG enabling point (default: 0.01)
   --high-noise-skip-layer-end <float>      (high noise) SLG disabling point (default: 0.2)
-  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)
+  --high-noise-eta <float>                 (high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)
   --strength <float>                       strength for noising/unnoising (default: 0.75)
   --pm-style-strength <float>
   --control-strength <float>               strength to apply Control Net (default: 0.9). 1.0 corresponds to full destruction of information in init image
@@ -133,10 +136,10 @@ Generation Options:
   --disable-image-metadata                 do not embed generation metadata on image files
   -s, --seed                               RNG seed (default: 42, use random seed for < 0)
   --sampling-method                        sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing,
-                                           tcd, res_multistep, res_2s] (default: euler for Flux/SD3/Wan, euler_a
+                                           tcd, res_multistep, res_2s, er_sde] (default: euler for Flux/SD3/Wan, euler_a
                                            otherwise)
   --high-noise-sampling-method             (high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm,
-                                           ddim_trailing, tcd, res_multistep, res_2s] default: euler for Flux/SD3/Wan,
+                                           ddim_trailing, tcd, res_multistep, res_2s, er_sde] default: euler for Flux/SD3/Wan,
                                            euler_a otherwise
   --scheduler                              denoiser sigma scheduler, one of [discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple,
                                            kl_optimal, lcm, bong_tangent], default: discrete

diff --git a/examples/common/common.cpp b/examples/common/common.cpp
@@ -841,6 +841,22 @@ ArgOptions SDGenerationParams::get_options() {
          "--guidance",
          "distilled guidance scale for models with guidance input (default: 3.5)",
          &sample_params.guidance.distilled_guidance},
+        {"",
+         "--apg-eta",
+         "parallel projected guidance scale for APG (default: 1.0, recommended: between 0 and 1)",
+         &sample_params.guidance.apg.eta},
+        {"",
+         "--apg-momentum",
+         "momentum for guidance adjustments with APG (default: 0, recommended: around -0.5 (negative))",
+         &sample_params.guidance.apg.momentum},
+        {"",
+         "--apg-nt",
+         "APG norm threshold: Upper bound allowed for the amplitude (L2 norm) of guidance updates (default: 0 = disabled, recommended: 4-15)",
+         &sample_params.guidance.apg.norm_threshold},
+        {"",
+         "--apg-nt-smoothing",
+         "Norm threshold smoothing for APG, smoothly decrease the amplitude of the guidance update if it gets too close to the norm threshold (experimental; default: 0 = disabled)",
+         &sample_params.guidance.apg.norm_threshold_smoothing},
         {"",
          "--slg-scale",
          "skip layer guidance (SLG) scale, only for DiT models: (default: 0). 0 means disabled, a value of 2.5 is nice for sd3.5 medium",
@@ -855,7 +871,7 @@ ArgOptions SDGenerationParams::get_options() {
          &sample_params.guidance.slg.layer_end},
         {"",
          "--eta",
-         "noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)",
+         "noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)",
          &sample_params.eta},
         {"",
          "--flow-shift",
@@ -887,7 +903,7 @@ ArgOptions SDGenerationParams::get_options() {
          &high_noise_sample_params.guidance.slg.layer_end},
         {"",
          "--high-noise-eta",
-         "(high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a and dpm++2s_a)",
+         "(high noise) noise multiplier (default: 0 for ddim_trailing, tcd, res_multistep and res_2s; 1 for euler_a, er_sde and dpm++2s_a)",
          &high_noise_sample_params.eta},
         {"",
          "--strength",
@@ -931,6 +947,18 @@ ArgOptions SDGenerationParams::get_options() {
          "do not embed generation metadata on image files",
          false,
          &embed_image_metadata},
+        {"",
+         "--slg-uncond",
+         "use CFG's forward pass for skip layer guidance (SLG) instead of a separate pass, only for DiT models (recommended to keep slg-scale at 0 if enabled)",
+         true,
+         &sample_params.guidance.slg.uncond
+        },
+        {"",
+         "--high-noise-slg-uncond",
+         "(high noise) use CFG's forward pass for skip layer guidance (SLG) instead of a separate pass, only for DiT models (recommended to keep slg-scale at 0 if enabled)",
+         true,
+         &high_noise_sample_params.guidance.slg.uncond
+        },
         {"",
          "--vae-tiling",
          "process vae in tiles to reduce memory usage",
@@ -1185,12 +1213,12 @@ ArgOptions SDGenerationParams::get_options() {
          on_seed_arg},
         {"",
          "--sampling-method",
-         "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s] "
+         "sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde] "
          "(default: euler for Flux/SD3/Wan, euler_a otherwise)",
          on_sample_method_arg},
         {"",
          "--high-noise-sampling-method",
-         "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s]"
+         "(high noise) sampling method, one of [euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, res_multistep, res_2s, er_sde]"
          " default: euler for Flux/SD3/Wan, euler_a otherwise",
          on_high_noise_sample_method_arg},
         {"",
@@ -1531,6 +1559,21 @@ bool SDGenerationParams::from_json_str(
             if (guidance_json.contains("distilled_guidance") && guidance_json["distilled_guidance"].is_number()) {
                 target_params.guidance.distilled_guidance = guidance_json["distilled_guidance"];
             }
+            if (guidance_json.contains("apg") && guidance_json["apg"].is_object()) {
+                const json& apg_json = guidance_json["apg"];
+                if (apg_json.contains("eta") && apg_json["eta"].is_number()) {
+                    target_params.guidance.apg.eta = apg_json["eta"];
+                }
+                if (apg_json.contains("momentum") && apg_json["momentum"].is_number()) {
+                    target_params.guidance.apg.momentum = apg_json["momentum"];
+                }
+                if (apg_json.contains("norm_threshold") && apg_json["norm_threshold"].is_number()) {
+                    target_params.guidance.apg.norm_threshold = apg_json["norm_threshold"];
+                }
+                if (apg_json.contains("norm_threshold_smoothing") && apg_json["norm_threshold_smoothing"].is_number()) {
+                    target_params.guidance.apg.norm_threshold_smoothing = apg_json["norm_threshold_smoothing"];
+                }
+            }
             if (guidance_json.contains("slg") && guidance_json["slg"].is_object()) {
                 const json& slg_json = guidance_json["slg"];
                 if (slg_json.contains("layers") && slg_json["layers"].is_array()) {
@@ -1545,6 +1588,9 @@ bool SDGenerationParams::from_json_str(
                 if (slg_json.contains("scale") && slg_json["scale"].is_number()) {
                     target_params.guidance.slg.scale = slg_json["scale"];
                 }
+                if (slg_json.contains("uncond") && slg_json["uncond"].is_boolean()) {  // flag matching --slg-uncond
+                    target_params.guidance.slg.uncond = slg_json["uncond"].get<bool>();
+                }
             }
         }
     };
@@ -1589,10 +1635,18 @@ bool SDGenerationParams::from_json_str(
         LOG_ERROR("invalid init_image");
         return false;
     }
+    if (!parse_image_json_field(j, "end_image", 3, width, height, end_image)) {
+        LOG_ERROR("invalid end_image");
+        return false;
+    }
     if (!parse_image_array_json_field(j, "ref_images", 3, width, height, ref_images)) {
         LOG_ERROR("invalid ref_images");
         return false;
     }
+    if (!parse_image_array_json_field(j, "control_frames", 3, width, height, control_frames)) {
+        LOG_ERROR("invalid control_frames");
+        return false;
+    }
     if (!parse_image_json_field(j, "mask_image", 1, width, height, mask_image)) {
         LOG_ERROR("invalid mask_image");
         return false;
@@ -2097,6 +2151,8 @@ std::string version_string() {
 }
 
 std::string get_image_params(const SDContextParams& ctx_params, const SDGenerationParams& gen_params, int64_t seed) {
+    sd_img_gen_params_t defaults;
+    sd_img_gen_params_init(&defaults);
     std::string parameter_string;
     if (gen_params.prompt_with_lora.size() != 0) {
         parameter_string += gen_params.prompt_with_lora + "\n";
@@ -2108,6 +2164,22 @@ std::string get_image_params(const SDContextParams& ctx_params, const SDGenerati
     }
     parameter_string += "Steps: " + std::to_string(gen_params.sample_params.sample_steps) + ", ";
     parameter_string += "CFG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
+    {
+        auto & apg = gen_params.sample_params.guidance.apg;
+        auto & def_apg = defaults.sample_params.guidance.apg;
+        if (apg.eta != def_apg.eta) {
+            parameter_string += "APG eta: " + std::to_string(apg.eta) + ", ";
+        }
+        if (apg.momentum != def_apg.momentum) {
+            parameter_string += "APG momentum: " + std::to_string(apg.momentum) + ", ";
+        }
+        if (apg.norm_threshold != def_apg.norm_threshold) {
+            parameter_string += "APG norm threshold: " + std::to_string(apg.norm_threshold) + ", ";
+            if (apg.norm_threshold > 0 && apg.norm_threshold_smoothing != def_apg.norm_threshold_smoothing && apg.norm_threshold_smoothing > 0) {
+                parameter_string += "APG norm threshold smoothing: " + std::to_string(apg.norm_threshold_smoothing) + ", ";
+            }
+        }
+    }
     if (gen_params.sample_params.guidance.slg.scale != 0 && gen_params.skip_layers.size() != 0) {
         parameter_string += "SLG scale: " + std::to_string(gen_params.sample_params.guidance.txt_cfg) + ", ";
         parameter_string += "Skip layers: [";