From 79f9c6e854ca510c61df33d480fe2044a0546318 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Wed, 19 Mar 2025 11:52:10 -0400
Subject: [PATCH] Model: Remove num_experts_per_token

This shouldn't even be an exposed option since changing it always
breaks inference with the model. Let the model's config.json handle it.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav2/model.py        |  6 ------
 colab/TabbyAPI_Colab_Example.ipynb |  5 -----
 common/config_models.py            | 10 ----------
 config_sample.yml                  |  6 ------
 docs/02.-Server-options.md         |  1 -
 endpoints/core/types/model.py      |  2 --
 6 files changed, 30 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index bb88365..4ffded7 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -344,11 +344,6 @@ class ExllamaV2Container:
                 "template wasn't provided or auto-detected."
             )
 
-        # Set num of experts per token if provided
-        num_experts_override = kwargs.get("num_experts_per_token")
-        if num_experts_override:
-            self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
-
         # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
@@ -502,7 +497,6 @@ class ExllamaV2Container:
             "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
-            "num_experts_per_token": self.config.num_experts_per_token,
             "use_vision": self.use_vision,
         }
 
diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb
index d2d8b6a..7af5a57 100644
--- a/colab/TabbyAPI_Colab_Example.ipynb
+++ b/colab/TabbyAPI_Colab_Example.ipynb
@@ -194,11 +194,6 @@
     "  # NOTE: Only works with chat completion message lists!\n",
     "  prompt_template: {PromptTemplate}\n",
     "\n",
-    "  # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)\n",
-    "  # WARNING: Don't set this unless you know what you're doing!\n",
-    "  # NOTE: For MoE models (ex. Mixtral) only!\n",
-    "  num_experts_per_token: {NumExpertsPerToken}\n",
-    "\n",
     "  # Options for draft models (speculative decoding). This will use more VRAM!\n",
     "  draft:\n",
     "    # Overrides the directory to look for draft (default: models)\n",
diff --git a/common/config_models.py b/common/config_models.py
index f78408f..97381bc 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -294,16 +294,6 @@ class ModelConfig(BaseConfigModel):
             "Enables vision support if the model supports it. (default: False)"
         ),
     )
-    num_experts_per_token: Optional[int] = Field(
-        None,
-        description=(
-            "Number of experts to use per token.\n"
-            "Fetched from the model's config.json if empty.\n"
-            "NOTE: For MoE models only.\n"
-            "WARNING: Don't set this unless you know what you're doing!"
-        ),
-        ge=1,
-    )
     _metadata: Metadata = PrivateAttr(Metadata())
     model_config = ConfigDict(protected_namespaces=())
 
diff --git a/config_sample.yml b/config_sample.yml
index a13e64e..b6f362d 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -139,12 +139,6 @@ model:
   # Enables vision support if the model supports it. (default: False)
   vision: false
 
-  # Number of experts to use per token.
-  # Fetched from the model's config.json if empty.
-  # NOTE: For MoE models only.
-  # WARNING: Don't set this unless you know what you're doing!
-  num_experts_per_token:
-
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
 draft_model:
diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
index b319f76..d97c7bd 100644
--- a/docs/02.-Server-options.md
+++ b/docs/02.-Server-options.md
@@ -75,7 +75,6 @@ Note: Most of the options here will only apply on initial model load/startup (ep
 | max_batch_size | Int (None) | The absolute maximum amount of prompts to process at one time. This value is automatically adjusted based on cache size. |
 | prompt_template | String (None) | Name of a jinja2 chat template to apply for this model. Must be located in the `templates` directory. |
 | vision | Bool (False) | Enable vision support for the provided model (if it exists). |
-| num_experts_per_token | Int (None) | Number of experts to use per-token for MoE models. Pulled from the config.json if not specified. |
 
 ### Draft Model Options
 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index cc42dce..02213f9 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -22,7 +22,6 @@ class ModelCardParameters(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
    prompt_template_content: Optional[str] = None
-    num_experts_per_token: Optional[int] = None
     use_vision: Optional[bool] = False
 
     # Draft is another model, so include it in the card params
@@ -114,7 +113,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = None
     prompt_template: Optional[str] = None
     vision: Optional[bool] = None
-    num_experts_per_token: Optional[int] = None
 
     # Non-config arguments
     draft_model: Optional[DraftModelLoadRequest] = Field(
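The commit message argues that the experts-per-token count belongs to the model itself rather than to the server config. For illustration only, here is a minimal sketch (assuming a Hugging Face style config.json next to the weights; this is not TabbyAPI or ExLlamaV2 code, and the field names are assumptions based on common MoE checkpoint layouts, e.g. Mixtral-style configs use "num_experts_per_tok") of reading that value straight from the checkpoint with no user-facing override:

# Minimal sketch, not TabbyAPI code: read the experts-per-token value from the
# model's own config.json. Field names are assumptions based on common MoE
# checkpoints (Mixtral-style configs declare "num_experts_per_tok").
import json
from pathlib import Path
from typing import Optional


def experts_per_token_from_config(model_dir: str) -> Optional[int]:
    """Return the experts-per-token count the model itself declares, if any."""
    config = json.loads((Path(model_dir) / "config.json").read_text())

    # Dense (non-MoE) models carry neither key, so None means "not applicable"
    # rather than an error.
    for key in ("num_experts_per_tok", "num_experts_per_token"):
        if key in config:
            return int(config[key])
    return None


if __name__ == "__main__":
    # Example: a Mixtral-style config.json containing {"num_experts_per_tok": 2}
    # prints 2, with no override coming from the server configuration.
    print(experts_per_token_from_config("/path/to/model"))

Reading the value this way keeps the checkpoint as the single source of truth and avoids the failure mode the commit describes, where a mismatched override breaks inference.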