# **TabbyAPI Colab**

In [None]:
#@title # Keep this widget playing to prevent Colab from disconnecting you { display-mode: "form" }
#@markdown Press play on the audio player that will appear below:
%%html
<!-- Audio player for the silent track; the element is closed explicitly because <audio> is not a void element. -->
<audio src="https://oobabooga.github.io/silence.m4a" controls></audio>

In [None]:
# @title # Install and download model { display-mode: "form" }
# @markdown ---
# @markdown Select model:
repo_id = "Kooten/FlatOrcamaid-13b-v0.2-5bpw-exl2" # @param {type:"string"}
revision = "main" # @param {type:"string"}
# A blank revision field falls back to the "main" branch.
revision = revision or "main"
# @markdown ---
# @markdown Select draft model (optional, for speculative decoding):
draft_repo_id = "" # @param {type:"string"}
draft_revision = "" # @param {type:"string"}
# Same fallback for the draft model revision.
draft_revision = draft_revision or "main"
# @markdown ---
# @markdown Select lora (optional):
lora_repo_id = "" # @param {type:"string"}
lora_revision = "" # @param {type:"string"}
# Same fallback for the lora revision.
lora_revision = lora_revision or "main"
# @markdown ---

# Install tabbyAPI
%cd /content/

!git clone https://github.com/theroyallab/tabbyAPI
%cd tabbyAPI

# Install cuda requirements
!pip install .[cu121] -q
!pip install huggingface-hub -q

# Download cloudflared tunnel
%cd /content/tabbyAPI/

!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod a+x cloudflared

# Download model repo
%cd /content/tabbyAPI/

from huggingface_hub import snapshot_download

snapshot_download(repo_id=repo_id, revision=revision, local_dir=f"./models/{repo_id.replace('/', '_')}")
model = repo_id.replace('/', '_')

if len(draft_repo_id) > 0: snapshot_download(repo_id=draft_repo_id, revision=draft_revision, local_dir=f"./models/{draft_repo_id.replace('/', '_')}")
draft_model = draft_repo_id.replace('/', '_')

if len(lora_repo_id) > 0: snapshot_download(repo_id=lora_repo_id, revision=lora_revision, local_dir=f"./loras/{lora_repo_id.replace('/', '_')}")
lora = lora_repo_id.replace('/', '_')

In [None]:
# @title # Configure and launch API { display-mode: "form" }
# Form inputs for the launch step. The code further down in this cell
# interpolates these values into the text it writes to config.yml.
# @markdown ---
# @markdown Model parameters:
ContextSize = 4096 # @param {type:"integer"}
RopeScale = 1.0 # @param {type:"number"}
RopeAlpha = 1.0 # @param {type:"number"}
NumExpertsPerToken = 2 # @param {type:"integer"}
# An empty PromptTemplate leaves the prompt_template value blank in config.yml.
PromptTemplate = "" # @param {type:"string"}
# @markdown ---
# @markdown Draft model parameters (optional, for speculative decoding):
DraftRopeScale = 1.0 # @param {type:"number"}
DraftRopeAlpha = 1.0 # @param {type:"number"}
# @markdown ---
# @markdown Lora parameters (optional, for loras):
LoraScaling = 1.0 # @param {type:"number"}
# @markdown ---
# @markdown Logging options:
PromptLogging = False # @param {type:"boolean"}
GenParamLogging = False # @param {type:"boolean"}
# @markdown ---
# @markdown Misc options:
CacheMode = "FP16" # @param ["FP8", "FP16"] {type:"string"}
UseDummyModels = False # @param {type:"boolean"}
NoFlashAttention = False # @param {type:"boolean"}
DisableAuth = False # @param {type:"boolean"}
# @markdown ---
# @markdown To connect, make note of the cloudflared URL and your auto-generated API key after launching and provide it to your preferred frontend.

# Setup Config
%cd /content/tabbyAPI/

write = f'''
# Sample YAML file for configuration.
# Comment out values as needed. Every value has a default within the application.

# Unless specified in the comments, DO NOT put these options in quotes!
# You can use https://www.yamllint.com/ if you want to check your YAML formatting.

# Options for networking
network:
  # The IP to host on (default: 127.0.0.1).
  # Use 0.0.0.0 to expose on all network adapters
  host: 0.0.0.0

  # The port to host on (default: 5000)
  port: 5000

  # Disable HTTP token authenticaion with requests
  # WARNING: This will make your instance vulnerable!
  # Turn on this option if you are ONLY connecting from localhost
  disable_auth: {DisableAuth}

# Options for logging
logging:
  # Enable prompt logging (default: False)
  prompt: {PromptLogging}

  # Enable generation parameter logging (default: False)
  generation_params: {GenParamLogging}

# Options for model overrides and loading
model:
  # Overrides the directory to look for models (default: models)
  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.
  model_dir: models

  # An initial model to load. Make sure the model is located in the model directory!
  # A model can be loaded later via the API.
  model_name: {model}

  # Sends dummy model names when the models endpoint is queried
  # Enable this if the program is looking for a specific OAI model
  use_dummy_models: {UseDummyModels}

  # The below parameters apply only if model_name is set

  # Max sequence length (default: None)
  # Fetched from the model's base sequence length in config.json by default
  max_seq_len: {ContextSize}

  # Automatically allocate resources to GPUs (default: True)
  gpu_split_auto: True

  # An integer array of GBs of vram to split between GPUs (default: [])
  # gpu_split: [20.6, 24]

  # Rope scale (default: 1.0)
  # Same thing as compress_pos_emb
  # Only use if your model was trained on long context with rope (check config.json)
  rope_scale: {RopeScale}

  # Rope scale (default: 1.0)
  # Same thing as compress_pos_emb
  # Only use if your model was trained on long context with rope (check config.json)
  rope_alpha: {RopeAlpha}

  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)
  no_flash_attention: {NoFlashAttention}

  # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)
  cache_mode: {CacheMode}

  # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)
  # NOTE: Only works with chat completion message lists!
  prompt_template: {PromptTemplate}

  # Options for draft models (speculative decoding). This will use more VRAM!
  draft:
    # Overrides the directory to look for draft (default: models)
    draft_model_dir: models

    # An initial draft model to load. Make sure this model is located in the model directory!
    # A draft model can be loaded later via the API.
    #draft_model_name: {draft_model}

    # Rope scale for draft models (default: 1.0)
    # Same thing as compress_pos_emb
    # Only use if your draft model was trained on long context with rope (check config.json)
    draft_rope_scale: {DraftRopeScale}

    # Rope alpha for draft model (default: 1.0)
    # Same thing as alpha_value
    # Leave blank to automatically calculate alpha value
    draft_rope_alpha: {DraftRopeAlpha}

  # Options for loras
  lora:
    # Overrides the directory to look for loras (default: loras)
    lora_dir: loras

    # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.
    loras:
    - name: {lora}
      scaling: {LoraScaling}
'''
with open("./config.yml", "w") as file:
    file.write(write)
!cat config.yml

%cd /content/tabbyAPI/

!python main.py & ./cloudflared tunnel --url localhost:5000