tabbyAPI-ollama/common/config_models.py

from inspect import getdoc
from pathlib import Path
from pydantic import BaseModel, ConfigDict, Field, PrivateAttr
from textwrap import dedent
from typing import List, Literal, Optional, Union

from pydantic_core import PydanticUndefined

CACHE_SIZES = Literal["FP16", "Q8", "Q6", "Q4"]


class Metadata(BaseModel):
    """metadata model for config options"""

    include_in_config: Optional[bool] = Field(True)


class BaseConfigModel(BaseModel):
    """Base model for config models with added metadata"""

    _metadata: Metadata = PrivateAttr(Metadata())


class ConfigOverrideConfig(BaseConfigModel):
    """Model for overriding a provided config file."""

    # TODO: convert this to a pathlib.path?
    config: Optional[str] = Field(
        None, description=("Path to an overriding config.yml file")
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class UtilityActions(BaseConfigModel):
    """Model used for arg actions."""

    # YAML export options
    export_config: Optional[str] = Field(
        None, description="generate a template config file"
    )
    config_export_path: Optional[Path] = Field(
        "config_sample.yml", description="path to export configuration file to"
    )

    # OpenAPI JSON export options
    export_openapi: Optional[bool] = Field(
        False, description="export openapi schema files"
    )
    openapi_export_path: Optional[Path] = Field(
        "openapi.json", description="path to export openapi schema to"
    )

    _metadata: Metadata = PrivateAttr(Metadata(include_in_config=False))


class NetworkConfig(BaseConfigModel):
    """Options for networking"""

    host: Optional[str] = Field(
        "127.0.0.1",
        description=(
            "The IP to host on (default: 127.0.0.1).\n"
            "Use 0.0.0.0 to expose on all network adapters."
        ),
    )
    port: Optional[int] = Field(
        5000, description=("The port to host on (default: 5000).")
    )
    disable_auth: Optional[bool] = Field(
        False,
        description=(
            "Disable HTTP token authentication with requests.\n"
            "WARNING: This will make your instance vulnerable!\n"
            "Turn on this option if you are ONLY connecting from localhost."
        ),
    )
    send_tracebacks: Optional[bool] = Field(
        False,
        description=(
            "Send tracebacks over the API (default: False).\n"
            "NOTE: Only enable this for debug purposes."
        ),
    )
    api_servers: Optional[List[Literal["OAI", "Kobold"]]] = Field(
        default_factory=list,
        description=(
            'Select API servers to enable (default: ["OAI"]).\n'
            "Possible values: OAI, Kobold."
        ),
    )


# TODO: Migrate config.yml to have the log_ prefix
# This is a breaking change.
class LoggingConfig(BaseConfigModel):
    """Options for logging"""

    log_prompt: Optional[bool] = Field(
        False,
        description=("Enable prompt logging (default: False)."),
    )
    log_generation_params: Optional[bool] = Field(
        False,
        description=("Enable generation parameter logging (default: False)."),
    )
    log_requests: Optional[bool] = Field(
        False,
        description=(
            "Enable request logging (default: False).\n"
            "NOTE: Only use this for debugging!"
        ),
    )


class ModelConfig(BaseConfigModel):
    """
    Options for model overrides and loading
    Please read the comments to understand how arguments are handled
    between initial and API loads
    """

    # TODO: convert this to a pathlib.path?
    model_dir: str = Field(
        "models",
        description=(
            "Directory to look for models (default: models).\n"
            "Windows users, do NOT put this path in quotes!"
        ),
    )
    inline_model_loading: Optional[bool] = Field(
        True,
        description=(
            "Allow direct loading of models "
            "from a completion or chat completion request (default: False)."
        ),
    )
    use_dummy_models: Optional[bool] = Field(
        False,
        description=(
            "Sends dummy model names when the models endpoint is queried.\n"
            "Enable this if the client is looking for specific OAI models."
        ),
    )
    model_name: Optional[str] = Field(
        None,
        description=(
            "An initial model to load.\n"
            "Make sure the model is located in the model directory!\n"
            "REQUIRED: This must be filled out to load a model on startup."
        ),
    )
    use_as_default: List[str] = Field(
        default_factory=list,
        description=(
            "Names of args to use as a fallback for API load requests (default: []).\n"
            "For example, if you always want cache_mode to be Q4 "
            'instead of on the inital model load, add "cache_mode" to this array.\n'
            "Example: ['max_seq_len', 'cache_mode']."
        ),
    )
    max_seq_len: Optional[int] = Field(
        None,
        description=(
            "Max sequence length (default: Empty).\n"
            "Fetched from the model's base sequence length in config.json by default."
        ),
        ge=0,
    )
    override_base_seq_len: Optional[int] = Field(
        None,
        description=(
            "Overrides base model context length (default: Empty).\n"
            "WARNING: Don't set this unless you know what you're doing!\n"
            "Again, do NOT use this for configuring context length, "
            "use max_seq_len above ^"
        ),
        ge=0,
    )
    tensor_parallel: Optional[bool] = Field(
        False,
        description=(
            "Load model with tensor parallelism.\n"
            "Falls back to autosplit if GPU split isn't provided.\n"
            "This ignores the gpu_split_auto value."
        ),
    )
    gpu_split_auto: Optional[bool] = Field(
        True,
        description=(
            "Automatically allocate resources to GPUs (default: True).\n"
            "Not parsed for single GPU users."
        ),
    )
    autosplit_reserve: List[int] = Field(
        [96],
        description=(
            "Reserve VRAM used for autosplit loading (default: 96 MB on GPU 0).\n"
            "Represented as an array of MB per GPU."
        ),
    )
    gpu_split: List[float] = Field(
        default_factory=list,
        description=(
            "An integer array of GBs of VRAM to split between GPUs (default: []).\n"
            "Used with tensor parallelism."
        ),
    )
    rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the model was trained on long context with rope.\n"
            "Leave blank to pull the value from the model."
        ),
    )
    rope_alpha: Optional[Union[float, Literal["auto"]]] = Field(
        1.0,
        description=(
            "Rope alpha (default: 1.0).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.'
        ),
    )
    cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Enable different cache modes for VRAM savings (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )
    cache_size: Optional[int] = Field(
        None,
        description=(
            "Size of the prompt cache to allocate (default: max_seq_len).\n"
            "Must be a multiple of 256 and can't be less than max_seq_len.\n"
            "For CFG, set this to 2 * max_seq_len."
        ),
        multiple_of=256,
        gt=0,
    )
    chunk_size: Optional[int] = Field(
        2048,
        description=(
            "Chunk size for prompt ingestion (default: 2048).\n"
            "A lower value reduces VRAM usage but decreases ingestion speed.\n"
            "NOTE: Effects vary depending on the model.\n"
            "An ideal value is between 512 and 4096."
        ),
        gt=0,
    )
    max_batch_size: Optional[int] = Field(
        None,
        description=(
            "Set the maximum number of prompts to process at one time "
            "(default: None/Automatic).\n"
            "Automatically calculated if left blank.\n"
            "NOTE: Only available for Nvidia ampere (30 series) and above GPUs."
        ),
        ge=1,
    )
    prompt_template: Optional[str] = Field(
        None,
        description=(
            "Set the prompt template for this model. (default: None)\n"
            "If empty, attempts to look for the model's chat template.\n"
            "If a model contains multiple templates in its tokenizer_config.json,\n"
            "set prompt_template to the name of the template you want to use.\n"
            "NOTE: Only works with chat completion message lists!"
        ),
    )
    num_experts_per_token: Optional[int] = Field(
        None,
        description=(
            "Number of experts to use per token.\n"
            "Fetched from the model's config.json if empty.\n"
            "NOTE: For MoE models only.\n"
            "WARNING: Don't set this unless you know what you're doing!"
        ),
        ge=1,
    )
    fasttensors: Optional[bool] = Field(
        False,
        description=(
            "Enables fasttensors to possibly increase model loading speeds "
            "(default: False)."
        ),
    )

    _metadata: Metadata = PrivateAttr(Metadata())
    model_config = ConfigDict(protected_namespaces=())


class DraftModelConfig(BaseConfigModel):
    """
    Options for draft models (speculative decoding)
    This will use more VRAM!
    """

    # TODO: convert this to a pathlib.path?
    draft_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for draft models (default: models)"),
    )
    draft_model_name: Optional[str] = Field(
        None,
        description=(
            "An initial draft model to load.\n"
            "Ensure the model is in the model directory."
        ),
    )
    draft_rope_scale: Optional[float] = Field(
        1.0,
        description=(
            "Rope scale for draft models (default: 1.0).\n"
            "Same as compress_pos_emb.\n"
            "Use if the draft model was trained on long context with rope."
        ),
    )
    draft_rope_alpha: Optional[float] = Field(
        None,
        description=(
            "Rope alpha for draft models (default: None).\n"
            'Same as alpha_value. Set to "auto" to auto-calculate.'
        ),
    )
    draft_cache_mode: Optional[CACHE_SIZES] = Field(
        "FP16",
        description=(
            "Cache mode for draft models to save VRAM (default: FP16).\n"
            f"Possible values: {str(CACHE_SIZES)[15:-1]}."
        ),
    )


class LoraInstanceModel(BaseConfigModel):
    """Model representing an instance of a Lora."""

    name: str = Field(..., description=("Name of the LoRA model."))
    scaling: float = Field(
        1.0,
        description=("Scaling factor for the LoRA model (default: 1.0)."),
        ge=0,
    )


class LoraConfig(BaseConfigModel):
    """Options for Loras"""

    # TODO: convert this to a pathlib.path?
    lora_dir: Optional[str] = Field(
        "loras", description=("Directory to look for LoRAs (default: loras).")
    )
    loras: Optional[List[LoraInstanceModel]] = Field(
        None,
        description=(
            "List of LoRAs to load and associated scaling factors "
            "(default scale: 1.0)."
        ),
    )


class EmbeddingsConfig(BaseConfigModel):
    """
    Options for embedding models and loading.
    NOTE: Embeddings requires the "extras" feature to be installed
    Install it via "pip install .[extras]"
    """

    # TODO: convert this to a pathlib.path?
    embedding_model_dir: Optional[str] = Field(
        "models",
        description=("Directory to look for embedding models (default: models)."),
    )
    embeddings_device: Optional[Literal["cpu", "auto", "cuda"]] = Field(
        "cpu",
        description=(
            "Device to load embedding models on (default: cpu).\n"
            "Possible values: cpu, auto, cuda.\n"
            "NOTE: It's recommended to load embedding models on the CPU.\n"
            "If using an AMD GPU, set this value to 'cuda'."
        ),
    )
    embedding_model_name: Optional[str] = Field(
        None,
        description=("An initial embedding model to load on the infinity backend."),
    )


class SamplingConfig(BaseConfigModel):
    """Options for Sampling"""

    override_preset: Optional[str] = Field(
        None,
        description=(
            "Select a sampler override preset (default: None).\n"
            "Find this in the sampler-overrides folder.\n"
            "This overrides default fallbacks for sampler values "
            "that are passed to the API."
        ),
    )


class DeveloperConfig(BaseConfigModel):
    """Options for development and experimentation"""

    unsafe_launch: Optional[bool] = Field(
        False,
        description=(
            "Skip Exllamav2 version check (default: False).\n"
            "WARNING: It's highly recommended to update your dependencies rather "
            "than enabling this flag."
        ),
    )
    disable_request_streaming: Optional[bool] = Field(
        False, description=("Disable API request streaming (default: False).")
    )
    cuda_malloc_backend: Optional[bool] = Field(
        False, description=("Enable the torch CUDA malloc backend (default: False).")
    )
    uvloop: Optional[bool] = Field(
        False,
        description=(
            "Run asyncio using Uvloop or Winloop which can improve performance.\n"
            "NOTE: It's recommended to enable this, but if something breaks "
            "turn this off."
        ),
    )
    realtime_process_priority: Optional[bool] = Field(
        False,
        description=(
            "Set process to use a higher priority.\n"
            "For realtime process priority, run as administrator or sudo.\n"
            "Otherwise, the priority will be set to high."
        ),
    )


class TabbyConfigModel(BaseModel):
    """Base model for a TabbyConfig."""

    config: ConfigOverrideConfig = Field(
        default_factory=ConfigOverrideConfig.model_construct
    )
    network: NetworkConfig = Field(default_factory=NetworkConfig.model_construct)
    logging: LoggingConfig = Field(default_factory=LoggingConfig.model_construct)
    model: ModelConfig = Field(default_factory=ModelConfig.model_construct)
    draft_model: DraftModelConfig = Field(
        default_factory=DraftModelConfig.model_construct
    )
    lora: LoraConfig = Field(default_factory=LoraConfig.model_construct)
    embeddings: EmbeddingsConfig = Field(
        default_factory=EmbeddingsConfig.model_construct
    )
    sampling: SamplingConfig = Field(default_factory=SamplingConfig.model_construct)
    developer: DeveloperConfig = Field(default_factory=DeveloperConfig.model_construct)
    actions: UtilityActions = Field(default_factory=UtilityActions.model_construct)

    model_config = ConfigDict(validate_assignment=True, protected_namespaces=())


# TODO: Possibly switch to ruamel.yaml for a more native implementation
def generate_config_file(
    model: BaseConfigModel = None,
    filename: str = "config_sample.yml",
    indentation: int = 2,
) -> None:
    """Creates a config.yml file from Pydantic models."""

    # Add a preamble
    yaml = dedent("""
    # Sample YAML file for configuration.
    # Comment and uncomment values as needed.
    # Every value has a default within the application.
    # This file serves to be a drop in for config.yml

    # Unless specified in the comments, DO NOT put these options in quotes!
    # You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n
    """)

    schema = model if model else TabbyConfigModel()

    # TODO: Make the disordered iteration look cleaner
    iter_once = False
    for field, field_data in schema.model_fields.items():
        subfield_model = field_data.default_factory()

        if not subfield_model._metadata.include_in_config:
            continue

        # Since the list is out of order with the length
        # Add newlines from the beginning once one iteration finishes
        # This is a sanity check for formatting
        if iter_once:
            yaml += "\n"
        else:
            iter_once = True

        for line in getdoc(subfield_model).splitlines():
            yaml += f"# {line}\n"

        yaml += f"{field}:\n"

        sub_iter_once = False
        for subfield, subfield_data in subfield_model.model_fields.items():
            # Same logic as iter_once
            if sub_iter_once:
                yaml += "\n"
            else:
                sub_iter_once = True

            if subfield_data.default_factory:
                value = subfield_data.default_factory()
            else:
                value = subfield_data.default

            value = value if value is not None else ""
            value = value if value is not PydanticUndefined else ""

            for line in subfield_data.description.splitlines():
                yaml += f"{' ' * indentation}# {line}\n"

            yaml += f"{' ' * indentation}{subfield}: {value}\n"

    with open(filename, "w") as f:
        f.write(yaml)