Move common functions into their own folder and refactor the backends to use their own folders as well. Also clean up imports and alphabetize the import statements themselves. Finally, move colab and docker into their own folders as well.

Signed-off-by: kingbri <bdashore3@proton.me>
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NcgQp3r7BS-q"
      },
      "source": [
        "# **TabbyAPI Colab**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zU0omxLLm8AN"
      },
      "outputs": [],
      "source": [
        "#@title # Keep this widget playing to prevent Colab from disconnecting you { display-mode: \"form\" }\n",
        "#@markdown Press play on the audio player that will appear below:\n",
        "%%html\n",
        "<audio src=\"https://oobabooga.github.io/silence.m4a\" controls>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ThqTbSnPalJl"
      },
      "outputs": [],
      "source": [
        "# @title # Install and download model { display-mode: \"form\" }\n",
        "# @markdown ---\n",
        "# @markdown Select model:\n",
        "repo_id = \"Kooten/FlatOrcamaid-13b-v0.2-5bpw-exl2\" # @param {type:\"string\"}\n",
        "revision = \"main\" # @param {type:\"string\"}\n",
        "if revision == \"\": revision = \"main\"\n",
        "# @markdown ---\n",
        "# @markdown Select draft model (optional, for speculative decoding):\n",
        "draft_repo_id = \"\" # @param {type:\"string\"}\n",
        "draft_revision = \"\" # @param {type:\"string\"}\n",
        "if draft_revision == \"\": draft_revision = \"main\"\n",
        "# @markdown ---\n",
        "# @markdown Select lora (optional):\n",
        "lora_repo_id = \"\" # @param {type:\"string\"}\n",
        "lora_revision = \"\" # @param {type:\"string\"}\n",
        "if lora_revision == \"\": lora_revision = \"main\"\n",
        "# @markdown ---\n",
        "\n",
        "# Install tabbyAPI\n",
        "%cd /content/\n",
        "\n",
        "!git clone https://github.com/theroyallab/tabbyAPI\n",
        "%cd tabbyAPI\n",
        "\n",
        "# Install cuda requirements\n",
        "!pip install -r requirements.txt -q\n",
        "!pip install huggingface-hub -q\n",
        "\n",
        "# Download cloudflared tunnel\n",
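        "# (Colab can't expose ports to the internet directly, so a Cloudflare quick tunnel provides a public URL)\n",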
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
"!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared\n",
|
|
"!chmod a+x cloudflared\n",
|
|
"\n",
|
|
"# Download model repo\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
"from huggingface_hub import snapshot_download\n",
|
|
"\n",
|
|
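        "# HF repo IDs contain a slash, so replace '/' with '_' to form a flat local folder name\n",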
"snapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./models/{repo_id.replace('/', '_')}\")\n",
|
|
"model = repo_id.replace('/', '_')\n",
|
|
"\n",
|
|
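        "# Draft model and lora downloads are optional; an empty repo ID skips the download\n",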
"if len(draft_repo_id) > 0: snapshot_download(repo_id=draft_repo_id, revision=draft_revision, local_dir=f\"./models/{draft_repo_id.replace('/', '_')}\")\n",
|
|
"draft_model = draft_repo_id.replace('/', '_')\n",
|
|
"\n",
|
|
"if len(lora_repo_id) > 0: snapshot_download(repo_id=lora_repo_id, revision=lora_revision, local_dir=f\"./loras/{lora_repo_id.replace('/', '_')}\")\n",
|
|
"lora = lora_repo_id.replace('/', '_')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "aD7Rz8ZCeShD"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# @title # Configure and launch API { display-mode: \"form\" }\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Model parameters:\n",
|
|
"ContextSize = 4096 # @param {type:\"integer\"}\n",
|
|
"OverrideBaseSeqLen = 4096 # @param {type:\"integer\"}\n",
|
|
"RopeScale = 1.0 # @param {type:\"number\"}\n",
|
|
"RopeAlpha = 1.0 # @param {type:\"number\"}\n",
|
|
"NumExpertsPerToken = 2 # @param {type:\"integer\"}\n",
|
|
"PromptTemplate = \"\" # @param {type:\"string\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Draft model parameters (optional, for speculative decoding):\n",
|
|
"DraftRopeScale = 1.0 # @param {type:\"number\"}\n",
|
|
"DraftRopeAlpha = 1.0 # @param {type:\"number\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Lora parameters (optional, for loras):\n",
|
|
"LoraScaling = 1.0 # @param {type:\"number\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Logging options:\n",
|
|
"PromptLogging = False # @param {type:\"boolean\"}\n",
|
|
"GenParamLogging = False # @param {type:\"boolean\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Misc options:\n",
|
|
"CacheMode = \"FP16\" # @param [\"FP8\", \"FP16\"] {type:\"string\"}\n",
|
|
"UseDummyModels = False # @param {type:\"boolean\"}\n",
|
|
"NoFlashAttention = False # @param {type:\"boolean\"}\n",
|
|
"DisableAuth = False # @param {type:\"boolean\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown To connect, make note of the cloudflared URL and your auto-generated API key after launching and provide it to your preferred frontend.\n",
|
|
"\n",
|
|
"# Setup Config\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
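        "# Interpolate the form values above into a config.yml template for tabbyAPI\n",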
"write = f'''\n",
|
|
"# Sample YAML file for configuration.\n",
|
|
"# Comment out values as needed. Every value has a default within the application.\n",
|
|
"\n",
|
|
"# Unless specified in the comments, DO NOT put these options in quotes!\n",
|
|
"# You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n",
|
|
"\n",
|
|
"# Options for networking\n",
|
|
"network:\n",
|
|
" # The IP to host on (default: 127.0.0.1).\n",
|
|
" # Use 0.0.0.0 to expose on all network adapters\n",
|
|
" host: 0.0.0.0\n",
|
|
"\n",
|
|
" # The port to host on (default: 5000)\n",
|
|
" port: 5000\n",
|
|
"\n",
|
|
" # Disable HTTP token authenticaion with requests\n",
|
|
" # WARNING: This will make your instance vulnerable!\n",
|
|
" # Turn on this option if you are ONLY connecting from localhost\n",
|
|
" disable_auth: {DisableAuth}\n",
|
|
"\n",
|
|
"# Options for logging\n",
|
|
"logging:\n",
|
|
" # Enable prompt logging (default: False)\n",
|
|
" prompt: {PromptLogging}\n",
|
|
"\n",
|
|
" # Enable generation parameter logging (default: False)\n",
|
|
" generation_params: {GenParamLogging}\n",
|
|
"\n",
|
|
"# Options for model overrides and loading\n",
|
|
"model:\n",
|
|
" # Overrides the directory to look for models (default: models)\n",
|
|
" # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.\n",
|
|
" model_dir: models\n",
|
|
"\n",
|
|
" # An initial model to load. Make sure the model is located in the model directory!\n",
|
|
" # A model can be loaded later via the API.\n",
|
|
" model_name: {model}\n",
|
|
"\n",
|
|
" # Sends dummy model names when the models endpoint is queried\n",
|
|
" # Enable this if the program is looking for a specific OAI model\n",
|
|
" use_dummy_models: {UseDummyModels}\n",
|
|
"\n",
|
|
" # The below parameters apply only if model_name is set\n",
|
|
"\n",
|
|
" # Max sequence length (default: None)\n",
|
|
" # Fetched from the model's base sequence length in config.json by default\n",
|
|
" max_seq_len: {ContextSize}\n",
|
|
"\n",
|
|
" # Overrides base model context length (default: None)\n",
|
|
" # WARNING: Don't set this unless you know what you're doing!\n",
|
|
" # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)\n",
|
|
" override_base_seq_len: {OverrideBaseSeqLen}\n",
|
|
"\n",
|
|
" # Automatically allocate resources to GPUs (default: True)\n",
|
|
" gpu_split_auto: True\n",
|
|
"\n",
|
|
" # An integer array of GBs of vram to split between GPUs (default: [])\n",
|
|
" # gpu_split: [20.6, 24]\n",
|
|
"\n",
|
|
" # Rope scale (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your model was trained on long context with rope (check config.json)\n",
|
|
" rope_scale: {RopeScale}\n",
|
|
"\n",
|
|
" # Rope scale (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your model was trained on long context with rope (check config.json)\n",
|
|
" rope_alpha: {RopeAlpha}\n",
|
|
"\n",
|
|
" # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n",
|
|
" no_flash_attention: {NoFlashAttention}\n",
|
|
"\n",
|
|
" # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n",
|
|
" cache_mode: {CacheMode}\n",
|
|
"\n",
|
|
" # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n",
|
|
" # NOTE: Only works with chat completion message lists!\n",
|
|
" prompt_template: {PromptTemplate}\n",
|
|
"\n",
|
|
" # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)\n",
|
|
" # WARNING: Don't set this unless you know what you're doing!\n",
|
|
" # NOTE: For MoE models (ex. Mixtral) only!\n",
|
|
" num_experts_per_token: {NumExpertsPerToken}\n",
|
|
"\n",
|
|
" # Options for draft models (speculative decoding). This will use more VRAM!\n",
|
|
" draft:\n",
|
|
" # Overrides the directory to look for draft (default: models)\n",
|
|
" draft_model_dir: models\n",
|
|
"\n",
|
|
" # An initial draft model to load. Make sure this model is located in the model directory!\n",
|
|
" # A draft model can be loaded later via the API.\n",
|
|
" #draft_model_name: {draft_model}\n",
|
|
"\n",
|
|
" # Rope scale for draft models (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your draft model was trained on long context with rope (check config.json)\n",
|
|
" draft_rope_scale: {DraftRopeScale}\n",
|
|
"\n",
|
|
" # Rope alpha for draft model (default: 1.0)\n",
|
|
" # Same thing as alpha_value\n",
|
|
" # Leave blank to automatically calculate alpha value\n",
|
|
" draft_rope_alpha: {DraftRopeAlpha}\n",
|
|
"\n",
|
|
" # Options for loras\n",
|
|
" lora:\n",
|
|
" # Overrides the directory to look for loras (default: loras)\n",
|
|
" lora_dir: loras\n",
|
|
"\n",
|
|
" # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n",
|
|
" loras:\n",
|
|
" - name: {lora}\n",
|
|
" scaling: {LoraScaling}\n",
|
|
"'''\n",
|
|
"with open(\"./config.yml\", \"w\") as file:\n",
|
|
" file.write(write)\n",
|
|
"!cat config.yml\n",
|
|
"\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
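        "# Start the API server in the background, then open a Cloudflare quick tunnel that prints a public URL for port 5000\n",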
"!python main.py & ./cloudflared tunnel --url localhost:5000"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"gpuType": "T4",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
} |