From 17a42e5877b02452b52c85497a21c49ecab2197c Mon Sep 17 00:00:00 2001
From: Mathieu Croquelois <croquelois@gmail.com>
Date: Mon, 19 May 2025 05:06:23 +0100
Subject: [PATCH 1/6] Add BF16 to GGUF (#2877)

---
 backend/operations_gguf.py       | 1 +
 packages_3rdparty/gguf/quants.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/backend/operations_gguf.py b/backend/operations_gguf.py
index f30ef7dd..468e4991 100644
--- a/backend/operations_gguf.py
+++ b/backend/operations_gguf.py
@@ -13,6 +13,7 @@ quants_mapping = {
     gguf.GGMLQuantizationType.Q5_K: gguf.Q5_K,
     gguf.GGMLQuantizationType.Q6_K: gguf.Q6_K,
     gguf.GGMLQuantizationType.Q8_0: gguf.Q8_0,
+    gguf.GGMLQuantizationType.BF16: gguf.BF16,
 }
 
 
diff --git a/packages_3rdparty/gguf/quants.py b/packages_3rdparty/gguf/quants.py
index abe52d54..cfd4d21b 100644
--- a/packages_3rdparty/gguf/quants.py
+++ b/packages_3rdparty/gguf/quants.py
@@ -268,6 +268,9 @@ class BF16(__Quant, qtype=GGMLQuantizationType.BF16):
     def dequantize_blocks(cls, blocks: np.ndarray) -> np.ndarray:
         return (blocks.view(np.int16).astype(np.int32) << 16).view(np.float32)
 
+    @classmethod
+    def dequantize_blocks_pytorch(cls, blocks, block_size, type_size, parameter) -> torch.Tensor:
+        return (blocks.view(torch.int16).to(torch.int32) << 16).view(torch.float32)
 
 class Q4_0(__Quant, qtype=GGMLQuantizationType.Q4_0):
     @classmethod

From 4f670347940a9b3629303e0bd55e9dcef105ff96 Mon Sep 17 00:00:00 2001
From: spawner <spawner6@outlook.com>
Date: Mon, 19 May 2025 15:33:03 +0800
Subject: [PATCH 2/6] Update args.py

---
 backend/args.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/backend/args.py b/backend/args.py
index 6758bb89..0aeddf2d 100644
--- a/backend/args.py
+++ b/backend/args.py
@@ -31,6 +31,8 @@ attn_group = parser.add_mutually_exclusive_group()
 attn_group.add_argument("--attention-split", action="store_true")
 attn_group.add_argument("--attention-quad", action="store_true")
 attn_group.add_argument("--attention-pytorch", action="store_true")
+attn_group.add_argument("--use-sage-attention", action="store_true", help="Use sage attention.")
+attn_group.add_argument("--use-flash-attention", action="store_true", help="Use FlashAttention.")
 
 upcast = parser.add_mutually_exclusive_group()
 upcast.add_argument("--force-upcast-attention", action="store_true")

From d88f325f32a7e2438ccb7df6130f8c4cd3b2162b Mon Sep 17 00:00:00 2001
From: spawner <spawner6@outlook.com>
Date: Mon, 19 May 2025 15:33:33 +0800
Subject: [PATCH 3/6] sage and flash

---
 backend/attention.py | 120 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 119 insertions(+), 1 deletion(-)

diff --git a/backend/attention.py b/backend/attention.py
index 58fee278..077111ee 100644
--- a/backend/attention.py
+++ b/backend/attention.py
@@ -18,6 +18,22 @@ if memory_management.xformers_enabled():
     except:
         pass
 
+if memory_management.sage_attention_enabled():
+    try:
+        from sageattention import sageattn
+    except ModuleNotFoundError:
+        print(f"\n\nTo use the `--use-sage-attention` feature, the `sageattention` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install sageattention")
+        exit(-1)
+
+if memory_management.flash_attention_enabled():
+    try:
+        from flash_attn import flash_attn_func
+    except ModuleNotFoundError:
+        print(f"\n\nTo use the `--use-flash-attention` feature, the `flash-attn` package must be installed first.\ncommand:\n\t{sys.executable} -m pip install flash-attn")
+        exit(-1)
+
+import backend.operations
+ops = backend.operations.ForgeOperations
 
 FORCE_UPCAST_ATTENTION_DTYPE = memory_management.force_upcast_attention_dtype()
 
@@ -338,6 +354,102 @@ def attention_pytorch(q, k, v, heads, mask=None, attn_precision=None, skip_resha
     )
     return out
 
+def attention_sage(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    # sageattn doesn't work with sd1.5
+    if q.shape[-1] // heads not in [64, 96, 128]:
+        if memory_management.flash_attention_enabled():
+            return attention_flash(q, k, v, heads, mask=mask, attn_precision=attn_precision, skip_reshape=skip_reshape)
+        elif memory_management.xformers_enabled():
+            return attention_xformers(q, k, v, heads, mask=mask, attn_precision=attn_precision, skip_reshape=skip_reshape)
+        return attention_pytorch(q, k, v, heads, mask=mask, attn_precision=attn_precision, skip_reshape=skip_reshape)
+    if skip_reshape:
+        b, _, _, dim_head = q.shape
+        tensor_layout="HND"
+    else:
+        b, _, dim_head = q.shape
+        dim_head //= heads
+        q, k, v = map(
+            lambda t: t.view(b, -1, heads, dim_head),
+            (q, k, v),
+        )
+        tensor_layout="NHD"
+
+    if mask is not None:
+        # add a batch dimension if there isn't already one
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a heads dimension if there isn't already one
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+
+    out = sageattn(q, k, v, attn_mask=mask, is_causal=False, tensor_layout=tensor_layout)
+    if tensor_layout == "HND":
+        if not skip_output_reshape:
+            out = (
+                out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+            )
+    else:
+        if skip_output_reshape:
+            out = out.transpose(1, 2)
+        else:
+            out = out.reshape(b, -1, heads * dim_head)
+    return out
+
+try:
+    @torch.library.custom_op("flash_attention::flash_attn", mutates_args=())
+    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
+        return flash_attn_func(q, k, v, dropout_p=dropout_p, causal=causal)
+
+
+    @flash_attn_wrapper.register_fake
+    def flash_attn_fake(q, k, v, dropout_p=0.0, causal=False):
+        # Output shape is the same as q
+        return q.new_empty(q.shape)
+except AttributeError as error:
+    FLASH_ATTN_ERROR = error
+
+    def flash_attn_wrapper(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                    dropout_p: float = 0.0, causal: bool = False) -> torch.Tensor:
+        assert False, f"Could not define flash_attn_wrapper: {FLASH_ATTN_ERROR}"
+
+
+def attention_flash(q, k, v, heads, mask=None, attn_precision=None, skip_reshape=False, skip_output_reshape=False):
+    if skip_reshape:
+        b, _, _, dim_head = q.shape
+    else:
+        b, _, dim_head = q.shape
+        dim_head //= heads
+        q, k, v = map(
+            lambda t: t.view(b, -1, heads, dim_head).transpose(1, 2),
+            (q, k, v),
+        )
+
+    if mask is not None:
+        # add a batch dimension if there isn't already one
+        if mask.ndim == 2:
+            mask = mask.unsqueeze(0)
+        # add a heads dimension if there isn't already one
+        if mask.ndim == 3:
+            mask = mask.unsqueeze(1)
+
+    try:
+        assert mask is None
+        out = flash_attn_wrapper(
+            q.transpose(1, 2),
+            k.transpose(1, 2),
+            v.transpose(1, 2),
+            dropout_p=0.0,
+            causal=False,
+        ).transpose(1, 2)
+    except Exception as e:
+        print(f"Flash Attention failed, using default SDPA: {e}")
+        out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False)
+    if not skip_output_reshape:
+        out = (
+            out.transpose(1, 2).reshape(b, -1, heads * dim_head)
+        )
+    return out
 
 def slice_attention_single_head_spatial(q, k, v):
     r1 = torch.zeros_like(k, device=q.device)
@@ -427,7 +539,13 @@ def pytorch_attention_single_head_spatial(q, k, v):
     return out
 
 
-if memory_management.xformers_enabled():
+if memory_management.sage_attention_enabled():
+    print("Using sage attention")
+    attention_function = attention_sage
+elif memory_management.flash_attention_enabled():
+    print("Using Flash Attention")
+    attention_function = attention_flash
+elif memory_management.xformers_enabled():
     print("Using xformers cross attention")
     attention_function = attention_xformers
 elif memory_management.pytorch_attention_enabled():

From df925305e80b6fb2ce8e12276b32337ff3ff421a Mon Sep 17 00:00:00 2001
From: spawner <spawner6@outlook.com>
Date: Mon, 19 May 2025 15:34:50 +0800
Subject: [PATCH 4/6] flash and sage

---
 backend/memory_management.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/backend/memory_management.py b/backend/memory_management.py
index 5f0c8312..56d19b7c 100644
--- a/backend/memory_management.py
+++ b/backend/memory_management.py
@@ -955,6 +955,11 @@ def cast_to_device(tensor, device, dtype, copy=False):
     else:
         return tensor.to(device, dtype, copy=copy, non_blocking=non_blocking)
 
+def sage_attention_enabled():
+    return args.use_sage_attention
+
+def flash_attention_enabled():
+    return args.use_flash_attention
 
 def xformers_enabled():
     global directml_enabled

From ae1d2449a5c4caa5b3da454cde55e332b9a3e57d Mon Sep 17 00:00:00 2001
From: spawner <spawner6@outlook.com>
Date: Mon, 19 May 2025 15:38:13 +0800
Subject: [PATCH 5/6] support noob ctrlnet inpaint model
 https://civitai.com/models/1376234/noobai-inpainting-controlnet

---
 .../scripts/preprocessor_inpaint.py           | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/extensions-builtin/forge_preprocessor_inpaint/scripts/preprocessor_inpaint.py b/extensions-builtin/forge_preprocessor_inpaint/scripts/preprocessor_inpaint.py
index 21f222d9..f4c5e8d5 100644
--- a/extensions-builtin/forge_preprocessor_inpaint/scripts/preprocessor_inpaint.py
+++ b/extensions-builtin/forge_preprocessor_inpaint/scripts/preprocessor_inpaint.py
@@ -159,9 +159,53 @@ class PreprocessorInpaintLama(PreprocessorInpaintOnly):
         process.modified_noise = original_noise + self.latent.to(original_noise) / sigma_max.to(original_noise)
         return cond, mask
 
+class PreprocessorInpaintNoobAIXL(Preprocessor): # support noob ctrlnet inpaint model https://civitai.com/models/1376234/noobai-inpainting-controlnet
+    def __init__(self):
+        super().__init__()
+        self.name = 'inpaint_noobai_xl'
+        self.tags = ['Inpaint']
+        self.model_filename_filters = ['inpaint', 'noobai']
+        self.slider_resolution = PreprocessorParameter(visible=False)
+        self.fill_mask_with_one_when_resize_and_fill = True
+        self.expand_mask_when_resize_and_fill = True
+
+    def __call__(self, input_image, resolution=512, slider_1=None, slider_2=None, slider_3=None, input_mask=None, **kwargs):
+        if input_mask is None:
+            return input_image
+
+        if not isinstance(input_image, np.ndarray):
+            input_image = np.array(input_image)
+        if not isinstance(input_mask, np.ndarray):
+            input_mask = np.array(input_mask)
+
+        mask = input_mask.astype(np.float32) / 255.0
+        mask = (mask > 0.5).astype(np.float32)
+
+        # Create a copy of the input image
+        result = input_image.copy()
+
+        # Convert mask to proper shape if needed
+        if mask.ndim == 2:
+            mask = np.expand_dims(mask, axis=-1)
+        if mask.shape[-1] == 1:
+            mask = np.repeat(mask, 3, axis=-1)
+
+        mask_indices = mask > 0.5
+        result[mask_indices] = 0.0
+
+        return result
+
+    def process_before_every_sampling(self, process, cond, mask, *args, **kwargs):
+        mask = mask.round()
+        mixed_cond = cond.clone()
+        mixed_cond = mixed_cond * (1.0 - mask)
+
+        return mixed_cond, None
 
 add_supported_preprocessor(PreprocessorInpaint())
 
 add_supported_preprocessor(PreprocessorInpaintOnly())
 
 add_supported_preprocessor(PreprocessorInpaintLama())
+
+add_supported_preprocessor(PreprocessorInpaintNoobAIXL())

From 2ed324da6a1f16bce911db2b807bbbbdd80ec032 Mon Sep 17 00:00:00 2001
From: spawner <spawner6@outlook.com>
Date: Mon, 19 May 2025 15:39:13 +0800
Subject: [PATCH 6/6] automaticly add local themes

---
 modules/shared_gradio_themes.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/modules/shared_gradio_themes.py b/modules/shared_gradio_themes.py
index b4e3f32b..061ab161 100644
--- a/modules/shared_gradio_themes.py
+++ b/modules/shared_gradio_themes.py
@@ -40,6 +40,23 @@ gradio_hf_hub_themes = [
     "NoCrypt/miku"
 ]
 
+# automaticly add local themes
+theme_dir = "tmp/gradio_themes"
+json_files = []
+
+try:
+    if os.path.exists(theme_dir):
+        json_files = [f for f in os.listdir(theme_dir) if f.endswith('.json')]
+    else:
+        print(f"Directory {theme_dir} does not exist. No new themes will be added.")
+except OSError as e:
+    print(f"Error accessing directory {theme_dir}: {e}. No new themes will be added.")
+
+for json_file in json_files:
+    theme_name = json_file.replace('.json', '')
+    theme_name = theme_name.replace('_', '/')
+    if theme_name not in gradio_hf_hub_themes:
+        gradio_hf_hub_themes.append(theme_name)
 
 def reload_gradio_theme(theme_name=None):
     if not theme_name: