Model: Remove num_experts_per_token

This shouldn't even be an exposed option since changing it always
breaks inference with the model. Let the model's config.json handle
it.

Signed-off-by: kingbri <8082010+kingbri1@users.noreply.github.com>
kingbri 2025-03-19 11:52:10 -04:00
parent 698d8339cb
commit 79f9c6e854
6 changed files with 0 additions and 30 deletions
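
For reference, the expert count already ships with the checkpoint: Mixtral-style MoE models store it in config.json as num_experts_per_tok, and ExLlamaV2 picks it up when the model config is prepared, which is why tabbyAPI no longer needs an override. A minimal sketch of that load path, assuming the ExLlamaV2Config API of the time and a hypothetical model directory:

    from exllamav2 import ExLlamaV2Config

    config = ExLlamaV2Config()
    config.model_dir = "/models/Mixtral-8x7B-Instruct-exl2"  # hypothetical path
    config.prepare()  # parses config.json, including the experts-per-token value

    # Already populated from config.json; forcing a different top-k than the
    # router was trained with is what breaks inference.
    print(config.num_experts_per_token)  # e.g. 2 for Mixtral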

View file

@@ -344,11 +344,6 @@ class ExllamaV2Container:
                 "template wasn't provided or auto-detected."
             )
 
-        # Set num of experts per token if provided
-        num_experts_override = kwargs.get("num_experts_per_token")
-        if num_experts_override:
-            self.config.num_experts_per_token = kwargs.get("num_experts_per_token")
-
         # Make sure chunk size is >= 256, keep near or below max seq len
         user_chunk_size = unwrap(kwargs.get("chunk_size"), 2048)
         chunk_size = sorted((256, user_chunk_size, self.config.max_seq_len))[1]
@@ -502,7 +497,6 @@ class ExllamaV2Container:
             "cache_size": self.cache_size,
             "cache_mode": self.cache_mode,
             "chunk_size": self.config.max_input_len,
-            "num_experts_per_token": self.config.num_experts_per_token,
             "use_vision": self.use_vision,
         }

View file

@@ -194,11 +194,6 @@
 " # NOTE: Only works with chat completion message lists!\n",
 " prompt_template: {PromptTemplate}\n",
 "\n",
-" # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)\n",
-" # WARNING: Don't set this unless you know what you're doing!\n",
-" # NOTE: For MoE models (ex. Mixtral) only!\n",
-" num_experts_per_token: {NumExpertsPerToken}\n",
-"\n",
 " # Options for draft models (speculative decoding). This will use more VRAM!\n",
 " draft:\n",
 " # Overrides the directory to look for draft (default: models)\n",

View file

@@ -294,16 +294,6 @@ class ModelConfig(BaseConfigModel):
             "Enables vision support if the model supports it. (default: False)"
         ),
     )
-    num_experts_per_token: Optional[int] = Field(
-        None,
-        description=(
-            "Number of experts to use per token.\n"
-            "Fetched from the model's config.json if empty.\n"
-            "NOTE: For MoE models only.\n"
-            "WARNING: Don't set this unless you know what you're doing!"
-        ),
-        ge=1,
-    )
 
     _metadata: Metadata = PrivateAttr(Metadata())
     model_config = ConfigDict(protected_namespaces=())

View file

@@ -139,12 +139,6 @@ model:
   # Enables vision support if the model supports it. (default: False)
   vision: false
 
-  # Number of experts to use per token.
-  # Fetched from the model's config.json if empty.
-  # NOTE: For MoE models only.
-  # WARNING: Don't set this unless you know what you're doing!
-  num_experts_per_token:
-
 # Options for draft models (speculative decoding)
 # This will use more VRAM!
 draft_model:

View file

@@ -75,7 +75,6 @@ Note: Most of the options here will only apply on initial model load/startup (ep
 | max_batch_size | Int (None) | The absolute maximum amount of prompts to process at one time. This value is automatically adjusted based on cache size. |
 | prompt_template | String (None) | Name of a jinja2 chat template to apply for this model. Must be located in the `templates` directory. |
 | vision | Bool (False) | Enable vision support for the provided model (if it exists). |
-| num_experts_per_token | Int (None) | Number of experts to use per-token for MoE models. Pulled from the config.json if not specified. |
 
 ### Draft Model Options

View file

@@ -22,7 +22,6 @@ class ModelCardParameters(BaseModel):
     chunk_size: Optional[int] = 2048
     prompt_template: Optional[str] = None
     prompt_template_content: Optional[str] = None
-    num_experts_per_token: Optional[int] = None
     use_vision: Optional[bool] = False
 
     # Draft is another model, so include it in the card params
@@ -114,7 +113,6 @@ class ModelLoadRequest(BaseModel):
     chunk_size: Optional[int] = None
     prompt_template: Optional[str] = None
     vision: Optional[bool] = None
-    num_experts_per_token: Optional[int] = None
 
     # Non-config arguments
     draft_model: Optional[DraftModelLoadRequest] = Field(
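
With the field gone from ModelLoadRequest, a load request simply omits it and the server uses whatever the checkpoint's config.json declares. A rough sketch of such a request; the /v1/model/load route, x-admin-key header, and exact field names are assumptions here, and the host, key, and model name are placeholders:

    import requests

    response = requests.post(
        "http://127.0.0.1:5000/v1/model/load",  # assumed default host/port
        headers={"x-admin-key": "example-admin-key"},  # placeholder admin key
        json={
            "model_name": "Mixtral-8x7B-Instruct-exl2",  # placeholder; field name illustrative
            "max_seq_len": 8192,
            # num_experts_per_token is no longer part of the request body;
            # the value comes from the model's config.json
        },
    )
    print(response.status_code)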