From 79f9c6e854ca510c61df33d480fe2044a0546318 Mon Sep 17 00:00:00 2001
From: kingbri <8082010+kingbri1@users.noreply.github.com>
Date: Wed, 19 Mar 2025 11:52:10 -0400
Subject: [PATCH] Model: Remove num_experts_per_token

This shouldn't even be an exposed option since changing it always
breaks inference with the model. Let the model's config.json handle it.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
---
 backends/exllamav2/model.py        |  6 ------
 colab/TabbyAPI_Colab_Example.ipynb |  5 -----
 common/config_models.py            | 10 ----------
 config_sample.yml                  |  6 ------
 docs/02.-Server-options.md         |  1 -
 endpoints/core/types/model.py      |  2 --
 6 files changed, 30 deletions(-)

diff --git a/backends/exllamav2/model.py b/backends/exllamav2/model.py
index bb88365..4ffded7 100644
--- a/backends/exllamav2/model.py
+++ b/backends/exllamav2/model.py
@@ -344,11 +344,6 @@ class ExllamaV2Container:
                 "template wasn't provided or auto-detected."
             )
 
-        # Set num of experts per token if provided
-        num_experts_override = kwargs.get("num_experts_per_token")
-        if num_experts_override:
-            self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
-
         # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
@@ -502,7 +497,6 @@ class ExllamaV2Container:
             "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
-            "num_experts_per_token": self.config.num_experts_per_token,
             "use_vision": self.use_vision,
         }
 
diff --git a/colab/TabbyAPI_Colab_Example.ipynb b/colab/TabbyAPI_Colab_Example.ipynb
index d2d8b6a..7af5a57 100644
--- a/colab/TabbyAPI_Colab_Example.ipynb
+++ b/colab/TabbyAPI_Colab_Example.ipynb
@@ -194,11 +194,6 @@
     "  # NOTE: Only works with chat completion message lists!\n",
     "  prompt_template: {PromptTemplate}\n",
     "\n",
-    "  # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)\n",
-    "  # WARNING: Don't set this unless you know what you're doing!\n",
-    "  # NOTE: For MoE models (ex. Mixtral) only!\n",
-    "  num_experts_per_token: {NumExpertsPerToken}\n",
-    "\n",
     "  # Options for draft models (speculative decoding). This will use more VRAM!\n",
     "  draft:\n",
     "    # Overrides the directory to look for draft (default: models)\n",
diff --git a/common/config_models.py b/common/config_models.py
index f78408f..97381bc 100644
--- a/common/config_models.py
+++ b/common/config_models.py
@@ -294,16 +294,6 @@ class ModelConfig(BaseConfigModel):
             "Enables vision support if the model supports it. (default: False)"
         ),
     )
-    num_experts_per_token: Optional[int] = Field(
-        None,
-        description=(
-            "Number of experts to use per token.\n"
-            "Fetched from the model's config.json if empty.\n"
-            "NOTE: For MoE models only.\n"
-            "WARNING: Don't set this unless you know what you're doing!"
-        ),
-        ge=1,
-    )
     _metadata: Metadata = PrivateAttr(Metadata())
     model_config = ConfigDict(protected_namespaces=())
 
diff --git a/config_sample.yml b/config_sample.yml
index a13e64e..b6f362d 100644
--- a/config_sample.yml
+++ b/config_sample.yml
@@ -139,12 +139,6 @@ model:
   # Enables vision support if the model supports it. (default: False)
   vision: false
 
-  # Number of experts to use per token.
-  # Fetched from the model's config.json if empty.
-  # NOTE: For MoE models only.
-  # WARNING: Don't set this unless you know what you're doing!
-  num_experts_per_token:
-
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
 draft_model:
diff --git a/docs/02.-Server-options.md b/docs/02.-Server-options.md
index b319f76..d97c7bd 100644
--- a/docs/02.-Server-options.md
+++ b/docs/02.-Server-options.md
@@ -75,7 +75,6 @@ Note: Most of the options here will only apply on initial model load/startup (ep
 | max_batch_size | Int (None) | The absolute maximum amount of prompts to process at one time. This value is automatically adjusted based on cache size. |
 | prompt_template | String (None) | Name of a jinja2 chat template to apply for this model. Must be located in the `templates` directory. |
 | vision | Bool (False) | Enable vision support for the provided model (if it exists). |
-| num_experts_per_token | Int (None) | Number of experts to use per-token for MoE models. Pulled from the config.json if not specified. |
 
 ### Draft Model Options
 
diff --git a/endpoints/core/types/model.py b/endpoints/core/types/model.py
index cc42dce..02213f9 100644
--- a/endpoints/core/types/model.py
+++ b/endpoints/core/types/model.py
@@ -22,7 +22,6 @@ class ModelCardParameters(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
    prompt_template_content: Optional[str] = None
-    num_experts_per_token: Optional[int] = None
     use_vision: Optional[bool] = False
 
     # Draft is another model, so include it in the card params
@@ -114,7 +113,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = None
     prompt_template: Optional[str] = None
     vision: Optional[bool] = None
-    num_experts_per_token: Optional[int] = None
 
     # Non-config arguments
     draft_model: Optional[DraftModelLoadRequest] = Field(
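The commit message argues that the experts-per-token count belongs to the model itself rather than to the server config. For illustration only, here is a minimal sketch (assuming a Hugging Face style config.json next to the weights; this is not TabbyAPI or ExLlamaV2 code, and the field names are assumptions based on common MoE checkpoint layouts, e.g. Mixtral-style configs use "num_experts_per_tok") of reading that value straight from the checkpoint with no user-facing override:

# Minimal sketch, not TabbyAPI code: read the experts-per-token value from the
# model's own config.json. Field names are assumptions based on common MoE
# checkpoints (Mixtral-style configs declare "num_experts_per_tok").
import json
from pathlib import Path
from typing import Optional


def experts_per_token_from_config(model_dir: str) -> Optional[int]:
    """Return the experts-per-token count the model itself declares, if any."""
    config = json.loads((Path(model_dir) / "config.json").read_text())

    # Dense (non-MoE) models carry neither key, so None means "not applicable"
    # rather than an error.
    for key in ("num_experts_per_tok", "num_experts_per_token"):
        if key in config:
            return int(config[key])
    return None


if __name__ == "__main__":
    # Example: a Mixtral-style config.json containing {"num_experts_per_tok": 2}
    # prints 2, with no override coming from the server configuration.
    print(experts_per_token_from_config("/path/to/model"))

Reading the value this way keeps the checkpoint as the single source of truth and avoids the failure mode the commit describes, where a mismatched override breaks inference.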