Merge pull request #322 from theroyallab/model-rewrite

Model rewrite
Brian 2025-04-26 02:15:48 -04:00 committed by GitHub
commit b081aa9fa3
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 758 additions and 535 deletions


@@ -0,0 +1,244 @@
import abc
import asyncio
import pathlib
from loguru import logger
from typing import (
Any,
AsyncIterator,
Dict,
List,
Optional,
)
from common.multimodal import MultimodalEmbeddingWrapper
from common.sampling import BaseSamplerRequest
from common.templating import PromptTemplate
from common.transformers_utils import GenerationConfig
from endpoints.core.types.model import ModelCard
class BaseModelContainer(abc.ABC):
"""Abstract base class for model containers."""
# Exposed model information
model_dir: pathlib.Path = pathlib.Path("models")
prompt_template: Optional[PromptTemplate] = None
generation_config: Optional[GenerationConfig] = None
# Load synchronization
# The bool is a master switch for accepting requests
# The lock keeps load tasks sequential
# The condition notifies any waiting tasks
active_job_ids: Dict[str, Any] = {}
loaded: bool = False
load_lock: asyncio.Lock
load_condition: asyncio.Condition
# Required methods
@classmethod
@abc.abstractmethod
async def create(cls, model_directory: pathlib.Path, **kwargs):
"""
Asynchronously creates and initializes a model container instance.
Args:
model_directory: Path to the model files.
**kwargs: Backend-specific configuration options.
Returns:
An instance of the implementing class.
"""
pass
@abc.abstractmethod
async def load(self, progress_callback=None, **kwargs):
"""
Loads the model into memory.
Args:
progress_callback: Optional callback for progress updates.
**kwargs: Additional loading options.
"""
pass
# NOTE: Might be an optional method
@abc.abstractmethod
async def load_gen(self, progress_callback=None, **kwargs) -> AsyncIterator[Any]:
"""
Loads the model into memory, yielding progress updates.
Args:
progress_callback: Optional callback for progress updates.
**kwargs: Additional loading options.
Yields:
Progress updates
"""
if False:
yield
@abc.abstractmethod
async def unload(self, loras_only: bool = False, **kwargs):
"""
Unloads the model and associated resources from memory.
Args:
loras_only: If True, only unload LoRAs.
**kwargs: Additional unloading options (e.g., shutdown).
"""
pass
@abc.abstractmethod
def encode_tokens(self, text: str, **kwargs) -> List[int]:
"""
Encodes a string of text into a list of token IDs.
Args:
text: The input text string.
**kwargs: Backend-specific encoding options (e.g., add_bos_token).
Returns:
A list of integer token IDs.
"""
pass
@abc.abstractmethod
def decode_tokens(self, ids: List[int], **kwargs) -> str:
"""
Decodes a list of token IDs back into a string.
Args:
ids: A list of integer token IDs.
**kwargs: Backend-specific decoding options (e.g., decode_special_tokens).
Returns:
The decoded text string.
"""
pass
@abc.abstractmethod
def get_special_tokens(self, **kwargs) -> Dict[str, Any]:
"""
Gets special tokens used by the model/tokenizer.
Args:
**kwargs: Options like add_bos_token, ban_eos_token.
Returns:
A dictionary mapping special token names (e.g., 'bos_token', 'eos_token')
to their string or ID representation.
"""
pass
@abc.abstractmethod
async def generate(
self,
request_id: str,
prompt: str,
params: BaseSamplerRequest,
abort_event: Optional[asyncio.Event] = None,
mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
) -> Dict[str, Any]:
"""
Generates a complete response for a given prompt and parameters.
Args:
request_id: Unique identifier for the generation request.
prompt: The input prompt string.
params: Sampling and generation parameters.
abort_event: An asyncio Event to signal cancellation.
mm_embeddings: Optional multimodal embeddings.
Returns:
A dictionary containing the generation info
"""
pass
@abc.abstractmethod
async def stream_generate(
self,
request_id: str,
prompt: str,
params: BaseSamplerRequest,
abort_event: Optional[asyncio.Event] = None,
mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
) -> AsyncIterator[Dict[str, Any]]:
"""
Generates a response iteratively (streaming) for a given prompt.
Args:
request_id: Unique identifier for the generation request.
prompt: The input prompt string.
params: Sampling and generation parameters.
abort_event: An asyncio Event to signal cancellation.
mm_embeddings: Optional multimodal embeddings.
Yields:
Generation chunks
"""
if False:
yield
@abc.abstractmethod
def model_info(self) -> ModelCard:
"""
Returns a ModelCard describing the current model's configuration.
Returns:
A ModelCard with parameters provided by the backend
"""
pass
@abc.abstractmethod
async def wait_for_jobs(self, skip_wait: bool = False):
"""
Waits for any active generation jobs to complete.
Args:
skip_wait: If True, cancel jobs immediately instead of waiting.
"""
pass
# Optional methods
async def load_loras(
self, lora_directory: pathlib.Path, **kwargs
) -> Dict[str, List[str]]:
"""
Loads LoRA adapters. The base implementation logs a warning and reports every requested LoRA as a failure.
Args:
lora_directory: Path to the directory containing LoRA files.
**kwargs: LoRA configuration (e.g., list of loras, scaling).
Returns:
A dictionary indicating success/failure for each LoRA.
"""
logger.warning("LoRA loading not implemented for this backend.") # type: ignore
return {
"success": [],
"failure": [
lora.get("name", "unknown") for lora in kwargs.get("loras", [])
],
}
def get_loras(self) -> List[Any]:
"""
Gets the currently loaded LoRA adapters. The base implementation returns an empty list.
Returns:
A list representing the loaded LoRAs (backend-specific format).
"""
return []
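
To illustrate how a backend is expected to plug into this interface, here is a minimal, hypothetical container that satisfies the abstract methods above. The EchoContainer name, its toy character-level tokenizer, and its echo-style generation are purely illustrative and not part of this PR; a real backend (such as the ExllamaV2 container) wraps an actual inference engine.

# A minimal, hypothetical sketch (not part of this PR) of a concrete BaseModelContainer
class EchoContainer(BaseModelContainer):
    @classmethod
    async def create(cls, model_directory: pathlib.Path, **kwargs):
        self = cls()
        self.model_dir = model_directory
        self.load_lock = asyncio.Lock()
        self.load_condition = asyncio.Condition()
        return self

    async def load(self, progress_callback=None, **kwargs):
        # Drain the generator variant so both entry points share one code path
        async for _ in self.load_gen(progress_callback, **kwargs):
            pass

    async def load_gen(self, progress_callback=None, **kwargs):
        async with self.load_lock:
            yield 1, 1  # a single "module" loaded
            self.loaded = True
        async with self.load_condition:
            self.load_condition.notify_all()

    async def unload(self, loras_only: bool = False, **kwargs):
        self.loaded = False

    def encode_tokens(self, text: str, **kwargs) -> List[int]:
        return [ord(char) for char in text]  # toy character-level "tokenizer"

    def decode_tokens(self, ids: List[int], **kwargs) -> str:
        return "".join(chr(token_id) for token_id in ids)

    def get_special_tokens(self, **kwargs) -> Dict[str, Any]:
        return {"bos_token": "<s>", "eos_token": "</s>"}

    async def generate(
        self,
        request_id: str,
        prompt: str,
        params: BaseSamplerRequest,
        abort_event: Optional[asyncio.Event] = None,
        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
    ) -> Dict[str, Any]:
        # Echo the prompt back as the "generation"
        return {"text": prompt, "prompt_tokens": len(self.encode_tokens(prompt))}

    async def stream_generate(
        self,
        request_id: str,
        prompt: str,
        params: BaseSamplerRequest,
        abort_event: Optional[asyncio.Event] = None,
        mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
    ) -> AsyncIterator[Dict[str, Any]]:
        for word in prompt.split():
            yield {"text": word}

    def model_info(self) -> ModelCard:
        return ModelCard(id=self.model_dir.name)

    async def wait_for_jobs(self, skip_wait: bool = False):
        while self.active_job_ids:
            await asyncio.sleep(0.1)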

File diff suppressed because it is too large.


@@ -14,8 +14,7 @@ if dependencies.extras:
class InfinityContainer:
model_dir: pathlib.Path
model_is_loading: bool = False
model_loaded: bool = False
loaded: bool = False
# Use a runtime type hint here
engine: Optional["AsyncEmbeddingEngine"] = None
@@ -24,8 +23,6 @@ class InfinityContainer:
self.model_dir = model_directory
async def load(self, **kwargs):
self.model_is_loading = True
# Use cpu by default
device = unwrap(kwargs.get("embeddings_device"), "cpu")
@@ -40,7 +37,7 @@ class InfinityContainer:
self.engine = AsyncEmbeddingEngine.from_args(engine_args)
await self.engine.astart()
self.model_loaded = True
self.loaded = True
logger.info("Embedding model successfully loaded.")
async def unload(self):


@@ -4,24 +4,29 @@ Manages the storage and utility of model containers.
Containers exist as a common interface for backends.
"""
import aiofiles
import pathlib
from enum import Enum
from fastapi import HTTPException
from loguru import logger
from ruamel.yaml import YAML
from typing import Optional
from backends.base_model_container import BaseModelContainer
from common.logger import get_loading_progress_bar
from common.networking import handle_request_error
from common.tabby_config import config
from common.optional_dependencies import dependencies
from common.utils import unwrap
# Global variables for model container
container: Optional[BaseModelContainer] = None
embeddings_container = None
# FIXME: Possibly use this solely when creating the model
if dependencies.exllamav2:
from backends.exllamav2.model import ExllamaV2Container
# Global model container
container: Optional[ExllamaV2Container] = None
embeddings_container = None
if dependencies.extras:
from backends.infinity.model import InfinityContainer
@@ -41,6 +46,36 @@ def load_progress(module, modules):
yield module, modules
async def apply_inline_overrides(model_dir: pathlib.Path, **kwargs):
"""Sets overrides from a model folder's config yaml."""
override_config_path = model_dir / "tabby_config.yml"
if not override_config_path.exists():
return kwargs
async with aiofiles.open(
override_config_path, "r", encoding="utf8"
) as override_config_file:
contents = await override_config_file.read()
# Create a temporary YAML parser
yaml = YAML(typ="safe")
override_args = unwrap(yaml.load(contents), {})
# Merge draft overrides beforehand
draft_override_args = unwrap(override_args.get("draft_model"), {})
if draft_override_args:
kwargs["draft_model"] = {
**draft_override_args,
**unwrap(kwargs.get("draft_model"), {}),
}
# Merge the override and model kwargs
merged_kwargs = {**override_args, **kwargs}
return merged_kwargs
async def unload_model(skip_wait: bool = False, shutdown: bool = False):
"""Unloads a model"""
global container
@@ -57,7 +92,7 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
if container and container.model:
loaded_model_name = container.model_dir.name
if loaded_model_name == model_path.name and container.model_loaded:
if loaded_model_name == model_path.name and container.loaded:
raise ValueError(
f'Model "{loaded_model_name}" is already loaded! Aborting.'
)
@@ -65,22 +100,34 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
logger.info("Unloading existing model.")
await unload_model()
# Merge with config defaults
# Reset to prepare for a new container
container = None
# Model_dir is already provided
if "model_dir" in kwargs:
kwargs.pop("model_dir")
# Merge with config and inline defaults
# TODO: Figure out a way to do this with Pydantic validation
# and ModelLoadRequest. Pydantic doesn't have async validators
kwargs = {**config.model_defaults, **kwargs}
kwargs = await apply_inline_overrides(model_path, **kwargs)
# Create a new container
container = await ExllamaV2Container.create(model_path.resolve(), False, **kwargs)
new_container = await ExllamaV2Container.create(
model_path.resolve(), False, **kwargs
)
# Add possible types of models that can be loaded
model_type = [ModelType.MODEL]
if container.use_vision:
if new_container.use_vision:
model_type.insert(0, ModelType.VISION)
if container.draft_config:
if new_container.draft_config:
model_type.insert(0, ModelType.DRAFT)
load_status = container.load_gen(load_progress, **kwargs)
load_status = new_container.load_gen(load_progress, **kwargs)
progress = get_loading_progress_bar()
progress.start()
@@ -104,6 +151,8 @@ async def load_model_gen(model_path: pathlib.Path, **kwargs):
progress.stop()
else:
index += 1
container = new_container
finally:
progress.stop()
@@ -142,7 +191,7 @@ async def load_embedding_model(model_path: pathlib.Path, **kwargs):
if embeddings_container and embeddings_container.engine:
loaded_model_name = embeddings_container.model_dir.name
if loaded_model_name == model_path.name and embeddings_container.model_loaded:
if loaded_model_name == model_path.name and embeddings_container.loaded:
raise ValueError(
f'Embeddings model "{loaded_model_name}" is already loaded! Aborting.'
)
@@ -150,8 +199,13 @@ async def load_embedding_model(model_path: pathlib.Path, **kwargs):
logger.info("Unloading existing embeddings model.")
await unload_embedding_model()
embeddings_container = InfinityContainer(model_path)
await embeddings_container.load(**kwargs)
# Reset to prepare for a new container
embeddings_container = None
new_embeddings_container = InfinityContainer(model_path)
await new_embeddings_container.load(**kwargs)
embeddings_container = new_embeddings_container
async def unload_embedding_model():
@@ -164,13 +218,13 @@ async def unload_embedding_model():
async def check_model_container():
"""FastAPI depends that checks if a model isn't loaded or currently loading."""
if container is None or not (container.model_is_loading or container.model_loaded):
if container is None:
error_message = handle_request_error(
"No models are currently loaded.",
exc_info=False,
).error.message
raise HTTPException(400, error_message)
raise HTTPException(503, error_message)
async def check_embeddings_container():
@@ -180,12 +234,10 @@ async def check_embeddings_container():
This is the same as the model container check, but with embeddings instead.
"""
if embeddings_container is None or not (
embeddings_container.model_is_loading or embeddings_container.model_loaded
):
if embeddings_container is None:
error_message = handle_request_error(
"No embedding models are currently loaded.",
exc_info=False,
).error.message
raise HTTPException(400, error_message)
raise HTTPException(503, error_message)
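
One detail worth calling out in the new apply_inline_overrides helper above: as written in this diff, the caller's kwargs are unpacked last, so values from a model folder's tabby_config.yml only fill in settings that were not already supplied by the request or config defaults. A small illustration with hypothetical keys and values:

# Hypothetical values illustrating the merge precedence in apply_inline_overrides
inline_overrides = {"max_seq_len": 8192, "rope_alpha": 2.5}  # from tabby_config.yml
caller_kwargs = {"max_seq_len": 4096}  # from the request merged with config defaults

merged = {**inline_overrides, **caller_kwargs}
assert merged == {"max_seq_len": 4096, "rope_alpha": 2.5}  # caller value wins on conflict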


@@ -41,12 +41,6 @@ class BaseSamplerRequest(BaseModel):
ge=0,
)
generate_window: Optional[int] = Field(
default_factory=lambda: get_default_sampler_value("generate_window"),
examples=[512],
ge=0,
)
stop: Optional[Union[str, List[Union[str, int]]]] = Field(
default_factory=lambda: get_default_sampler_value("stop", []),
validation_alias=AliasChoices("stop", "stop_sequence"),
@@ -165,7 +159,7 @@ class BaseSamplerRequest(BaseModel):
"rep_pen_range",
),
description=(
"Aliases: repetition_range, repetition_penalty_range, " "rep_pen_range"
"Aliases: repetition_range, repetition_penalty_range, rep_pen_range"
),
)
@@ -281,6 +275,11 @@ class BaseSamplerRequest(BaseModel):
ge=0,
)
logprobs: Optional[int] = Field(
default_factory=lambda: get_default_sampler_value("logprobs", 0),
ge=0,
)
@field_validator("top_k", mode="before")
def convert_top_k(cls, v):
"""Fixes instance if Top-K is -1."""


@@ -1,5 +1,6 @@
"""Small replication of AutoTokenizer's chat template system for efficiency"""
import traceback
import aiofiles
import json
import pathlib
@@ -211,3 +212,56 @@ def find_template_from_model(model_path: pathlib.Path):
return template_name
else:
raise TemplateLoadError("Could not find template from model name.")
async def find_prompt_template(template_name, model_dir: pathlib.Path):
"""Tries to find a prompt template using various methods."""
logger.info("Attempting to load a prompt template if present.")
find_template_functions = [
lambda: PromptTemplate.from_model_json(
model_dir / "chat_template.json",
key="chat_template",
),
lambda: PromptTemplate.from_model_json(
model_dir / "tokenizer_config.json",
key="chat_template",
),
lambda: PromptTemplate.from_file(find_template_from_model(model_dir)),
]
# Find the template in the model directory if it exists
model_dir_template_path = model_dir / "tabby_template.jinja"
if model_dir_template_path.exists():
find_template_functions[:0] = [
lambda: PromptTemplate.from_file(model_dir_template_path)
]
# Add lookup from prompt template name if provided
if template_name:
find_template_functions[:0] = [
lambda: PromptTemplate.from_file(pathlib.Path("templates") / template_name),
lambda: PromptTemplate.from_model_json(
model_dir / "tokenizer_config.json",
key="chat_template",
name=template_name,
),
]
# Try each lookup function in order, continuing to the next one on failure
for template_func in find_template_functions:
try:
prompt_template = await template_func()
if prompt_template is not None:
return prompt_template
except TemplateLoadError as e:
logger.warning(f"TemplateLoadError: {str(e)}")
continue
except Exception:
logger.error(traceback.format_exc())
logger.warning(
"An unexpected error happened when trying to load the template. "
"Trying other methods."
)
continue
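
A minimal usage sketch of the new helper (the template name and model path below are hypothetical): the lookup functions are tried in the order built above, and the helper falls through to None when every method fails.

template = await find_prompt_template("chatml", pathlib.Path("models/my-model"))
if template is None:
    logger.warning("No prompt template found for this model.")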


@@ -85,3 +85,24 @@ def unwrap_optional_type(type_hint) -> Type:
return arg
return type_hint
def calculate_rope_alpha(base_seq_len: int, target_seq_len: int):
"""
Converts a given max sequence length to a rope alpha value.
Args:
base_seq_len: The model's configured sequence length.
target_seq_len: The user-specified max sequence length.
"""
# Get the ratio of the target sequence length to the model's base length
ratio = target_seq_len / base_seq_len
# Default to an alpha of 1 when the target doesn't exceed the base length (ratio <= 1)
if ratio <= 1.0:
alpha = 1
else:
alpha = -0.13436 + 0.80541 * ratio + 0.28833 * ratio**2
return alpha
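
A quick worked example of the curve above, assuming a model configured for 4096 tokens:

# ratio = 8192 / 4096 = 2.0
# alpha = -0.13436 + 0.80541 * 2.0 + 0.28833 * 2.0**2 ≈ 2.63
print(calculate_rope_alpha(4096, 8192))  # ~2.6298
print(calculate_rope_alpha(4096, 2048))  # 1, since ratio <= 1 clamps alpha to 1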


@@ -122,7 +122,7 @@ async def get_tokencount(data: TokenCountRequest) -> TokenCountResponse:
async def get_max_length() -> MaxLengthResponse:
"""Fetches the max length of the model."""
max_length = model.container.get_model_parameters().get("max_seq_len")
max_length = model.container.model_info().parameters.max_seq_len
return {"value": max_length}


@@ -52,7 +52,7 @@ async def _stream_collector(data: GenerateRequest, request: Request):
try:
logger.info(f"Received Kobold generation request {data.genkey}")
generator = model.container.generate_gen(
generator = model.container.stream_generate(
request_id=data.genkey, abort_event=abort_event, **data.model_dump()
)
async for generation in generator:


@@ -32,10 +32,6 @@ class CommonCompletionRequest(BaseSamplerRequest):
# Generation info (remainder is in BaseSamplerRequest superclass)
stream: Optional[bool] = False
stream_options: Optional[ChatCompletionStreamOptions] = None
logprobs: Optional[int] = Field(
default_factory=lambda: get_default_sampler_value("logprobs", 0),
ge=0,
)
response_format: Optional[CompletionResponseFormat] = Field(
default_factory=CompletionResponseFormat
)


@@ -333,11 +333,11 @@ async def stream_generate_chat_completion(
_stream_collector(
n,
gen_queue,
prompt,
request.state.id,
prompt,
task_gen_params,
abort_event,
embeddings=embeddings,
**task_gen_params.model_dump(exclude={"prompt"}),
mm_embeddings=embeddings,
)
)
@@ -422,10 +422,10 @@ async def generate_chat_completion(
gen_tasks.append(
asyncio.create_task(
model.container.generate(
prompt,
request.state.id,
embeddings=embeddings,
**data.model_dump(exclude={"prompt"}),
prompt,
data,
mm_embeddings=embeddings,
)
)
)
@@ -465,7 +465,6 @@ async def generate_tool_calls(
# FIXME: May not be necessary depending on how the codebase evolves
tool_data = data.model_copy(deep=True)
tool_data.json_schema = tool_data.tool_call_schema
gen_params = tool_data.model_dump()
for idx, gen in enumerate(generations):
if gen["stop_str"] in tool_data.tool_call_start:
@@ -488,10 +487,10 @@ async def generate_tool_calls(
gen_tasks.append(
asyncio.create_task(
model.container.generate(
pre_tool_prompt,
request.state.id,
pre_tool_prompt,
tool_data,
embeddings=mm_embeddings,
**gen_params,
)
)
)


@@ -8,12 +8,12 @@ import asyncio
import pathlib
from asyncio import CancelledError
from fastapi import HTTPException, Request
from typing import List, Union
from loguru import logger
from typing import List, Optional, Union
from common import model
from common.auth import get_key_permission
from common.multimodal import MultimodalEmbeddingWrapper
from common.networking import (
get_generator_error,
handle_request_disconnect,
@@ -86,16 +86,21 @@ def _create_response(
async def _stream_collector(
task_idx: int,
gen_queue: asyncio.Queue,
prompt: str,
request_id: str,
prompt: str,
params: CompletionRequest,
abort_event: asyncio.Event,
**kwargs,
mm_embeddings: Optional[MultimodalEmbeddingWrapper] = None,
):
"""Collects a stream and places results in a common queue"""
try:
new_generation = model.container.generate_gen(
prompt, request_id, abort_event, **kwargs
new_generation = model.container.stream_generate(
request_id,
prompt,
params,
abort_event,
mm_embeddings,
)
async for generation in new_generation:
generation["index"] = task_idx
@@ -115,7 +120,7 @@ async def load_inline_model(model_name: str, request: Request):
if (
model.container
and model.container.model_dir.name == model_name
and model.container.model_loaded
and model.container.loaded
):
return
@@ -195,10 +200,10 @@ async def stream_generate_completion(
_stream_collector(
n,
gen_queue,
data.prompt,
request.state.id,
data.prompt,
task_gen_params,
abort_event,
**task_gen_params.model_dump(exclude={"prompt"}),
)
)
@@ -256,9 +261,9 @@ async def generate_completion(
gen_tasks.append(
asyncio.create_task(
model.container.generate(
data.prompt,
request.state.id,
**task_gen_params.model_dump(exclude={"prompt"}),
data.prompt,
task_gen_params,
)
)
)
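
Taken together, the endpoint changes above settle on one calling convention for both generation modes: request_id, prompt, and the sampler params object are passed positionally, with multimodal embeddings as a keyword where applicable. A rough sketch of how an endpoint now drives a container (variable names below are illustrative, not code from this PR):

# One-shot generation
result = await model.container.generate(
    request.state.id,
    data.prompt,
    data,  # a BaseSamplerRequest subclass, e.g. CompletionRequest
)

# Streaming generation
async for chunk in model.container.stream_generate(
    request.state.id,
    data.prompt,
    data,
    abort_event,
    mm_embeddings,  # optional MultimodalEmbeddingWrapper, may be None
):
    ...  # each chunk is a dict of generation info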


@@ -5,10 +5,8 @@ from typing import Optional
from common import model
from common.networking import get_generator_error, handle_request_disconnect
from common.tabby_config import config
from common.utils import unwrap
from endpoints.core.types.model import (
ModelCard,
ModelCardParameters,
ModelList,
ModelLoadRequest,
ModelLoadResponse,
@@ -64,30 +62,7 @@ async def get_current_model_list(model_type: str = "model"):
def get_current_model():
"""Gets the current model with all parameters."""
model_params = model.container.get_model_parameters()
draft_model_params = model_params.pop("draft", {})
if draft_model_params:
model_params["draft"] = ModelCard(
id=unwrap(draft_model_params.get("name"), "unknown"),
parameters=ModelCardParameters.model_validate(draft_model_params),
)
else:
draft_model_params = None
model_card = ModelCard(
id=unwrap(model_params.pop("name", None), "unknown"),
parameters=ModelCardParameters.model_validate(model_params),
logging=config.logging,
)
if draft_model_params:
draft_card = ModelCard(
id=unwrap(draft_model_params.pop("name", None), "unknown"),
parameters=ModelCardParameters.model_validate(draft_model_params),
)
model_card.parameters.draft = draft_card
model_card = model.container.model_info()
return model_card


@@ -40,8 +40,6 @@ dependencies = [
"uvloop ; platform_system == 'Linux' and platform_machine == 'x86_64'",
"winloop ; platform_system == 'Windows'",
"numpy < 2.0.0",
# For python 3.12
"setuptools ; python_version >= '3.12'"
]
@@ -60,55 +58,55 @@ dev = [
]
cu121 = [
# Torch (Extra index URLs not support in pyproject.toml)
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/cu128/torch-2.7.0%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
# Exl2
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+cu124.torch2.6.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/kingbri1/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+cu128.torch2.7.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
# Windows FA2 from https://github.com/kingbri1/flash-attention/releases
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu124torch2.6.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp313-cp313-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.13'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp312-cp312-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.12'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.11'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp310-cp310-win_amd64.whl ; platform_system == 'Windows' and python_version == '3.10'",
# Linux FA2 from https://github.com/Dao-AILab/flash-attention/releases
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
# Linux FA2 from https://github.com/kingbri1/flash-attention/releases
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"flash_attn @ https://github.com/kingbri1/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu128torch2.7.0cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
amd = [
# Torch triton for ROCm
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.2.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.2.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.2.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.2.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.3.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.3.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.3.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"pytorch_triton_rocm @ https://download.pytorch.org/whl/pytorch_triton_rocm-3.3.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
# Torch
"torch @ https://download.pytorch.org/whl/rocm6.2.4/torch-2.6.0%2Brocm6.2.4-cp313-cp313-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/rocm6.2.4/torch-2.6.0%2Brocm6.2.4-cp312-cp312-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/rocm6.2.4/torch-2.6.0%2Brocm6.2.4-cp311-cp311-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.2.4/torch-2.6.0%2Brocm6.2.4-cp310-cp310-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"torch @ https://download.pytorch.org/whl/rocm6.3/torch-2.7.0%2Brocm6.3-cp313-cp313-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"torch @ https://download.pytorch.org/whl/rocm6.3/torch-2.7.0%2Brocm6.3-cp312-cp312-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"torch @ https://download.pytorch.org/whl/rocm6.3/torch-2.7.0%2Brocm6.3-cp311-cp311-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"torch @ https://download.pytorch.org/whl/rocm6.3/torch-2.7.0%2Brocm6.3-cp310-cp310-manylinux_2_28_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
# Exl2
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.2.4.torch2.6.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.2.4.torch2.6.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.2.4.torch2.6.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.8/exllamav2-0.2.8+rocm6.2.4.torch2.6.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.3.torch2.7.0-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.13'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.3.torch2.7.0-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.12'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.3.torch2.7.0-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.11'",
"exllamav2 @ https://github.com/turboderp-org/exllamav2/releases/download/v0.2.9/exllamav2-0.2.9+rocm6.3.torch2.7.0-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and platform_machine == 'x86_64' and python_version == '3.10'",
]
# MARK: Ruff options


@@ -14,9 +14,6 @@ max_tokens:
min_tokens:
override: 0
force: false
generate_window:
override: 512
force: false
stop:
override: []
force: false