Move common functions into their own folder and refactor the backends to use their own folders as well. Also clean up imports and alphabetize the import statements themselves. Finally, move colab and docker into their own folders as well.

Signed-off-by: kingbri <bdashore3@proton.me>
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "NcgQp3r7BS-q"
      },
      "source": [
        "# **TabbyAPI Colab**"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "zU0omxLLm8AN"
      },
      "outputs": [],
      "source": [
        "#@title # Keep this widget playing to prevent Colab from disconnecting you { display-mode: \"form\" }\n",
        "#@markdown Press play on the audio player that will appear below:\n",
        "%%html\n",
        "<audio src=\"https://oobabooga.github.io/silence.m4a\" controls>"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "ThqTbSnPalJl"
      },
      "outputs": [],
      "source": [
        "# @title # Install and download model { display-mode: \"form\" }\n",
        "# @markdown ---\n",
        "# @markdown Select model:\n",
        "repo_id = \"Kooten/FlatOrcamaid-13b-v0.2-5bpw-exl2\" # @param {type:\"string\"}\n",
        "revision = \"main\" # @param {type:\"string\"}\n",
        "if revision == \"\": revision = \"main\"\n",
        "# @markdown ---\n",
        "# @markdown Select draft model (optional, for speculative decoding):\n",
        "draft_repo_id = \"\" # @param {type:\"string\"}\n",
        "draft_revision = \"\" # @param {type:\"string\"}\n",
        "if draft_revision == \"\": draft_revision = \"main\"\n",
        "# @markdown ---\n",
        "# @markdown Select lora (optional):\n",
        "lora_repo_id = \"\" # @param {type:\"string\"}\n",
        "lora_revision = \"\" # @param {type:\"string\"}\n",
        "if lora_revision == \"\": lora_revision = \"main\"\n",
        "# @markdown ---\n",
        "\n",
        "# Install tabbyAPI\n",
        "%cd /content/\n",
        "\n",
        "!git clone https://github.com/theroyallab/tabbyAPI\n",
        "%cd tabbyAPI\n",
        "\n",
        "# Install cuda requirements\n",
        "!pip install -r requirements.txt -q\n",
        "!pip install huggingface-hub -q\n",
        "\n",
        "# Download cloudflared tunnel\n",
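        "# (Colab can't expose ports to the internet directly, so a Cloudflare quick tunnel provides a public URL)\n",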
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
"!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared\n",
|
|
"!chmod a+x cloudflared\n",
|
|
"\n",
|
|
"# Download model repo\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
"from huggingface_hub import snapshot_download\n",
|
|
"\n",
|
|
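        "# HF repo IDs contain a slash, so replace '/' with '_' to form a flat local folder name\n",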
"snapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./models/{repo_id.replace('/', '_')}\")\n",
|
|
"model = repo_id.replace('/', '_')\n",
|
|
"\n",
|
|
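        "# Draft model and lora downloads are optional; an empty repo ID skips the download\n",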
"if len(draft_repo_id) > 0: snapshot_download(repo_id=draft_repo_id, revision=draft_revision, local_dir=f\"./models/{draft_repo_id.replace('/', '_')}\")\n",
|
|
"draft_model = draft_repo_id.replace('/', '_')\n",
|
|
"\n",
|
|
"if len(lora_repo_id) > 0: snapshot_download(repo_id=lora_repo_id, revision=lora_revision, local_dir=f\"./loras/{lora_repo_id.replace('/', '_')}\")\n",
|
|
"lora = lora_repo_id.replace('/', '_')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {
|
|
"id": "aD7Rz8ZCeShD"
|
|
},
|
|
"outputs": [],
|
|
"source": [
|
|
"# @title # Configure and launch API { display-mode: \"form\" }\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Model parameters:\n",
|
|
"ContextSize = 4096 # @param {type:\"integer\"}\n",
|
|
"OverrideBaseSeqLen = 4096 # @param {type:\"integer\"}\n",
|
|
"RopeScale = 1.0 # @param {type:\"number\"}\n",
|
|
"RopeAlpha = 1.0 # @param {type:\"number\"}\n",
|
|
"NumExpertsPerToken = 2 # @param {type:\"integer\"}\n",
|
|
"PromptTemplate = \"\" # @param {type:\"string\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Draft model parameters (optional, for speculative decoding):\n",
|
|
"DraftRopeScale = 1.0 # @param {type:\"number\"}\n",
|
|
"DraftRopeAlpha = 1.0 # @param {type:\"number\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Lora parameters (optional, for loras):\n",
|
|
"LoraScaling = 1.0 # @param {type:\"number\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Logging options:\n",
|
|
"PromptLogging = False # @param {type:\"boolean\"}\n",
|
|
"GenParamLogging = False # @param {type:\"boolean\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown Misc options:\n",
|
|
"CacheMode = \"FP16\" # @param [\"FP8\", \"FP16\"] {type:\"string\"}\n",
|
|
"UseDummyModels = False # @param {type:\"boolean\"}\n",
|
|
"NoFlashAttention = False # @param {type:\"boolean\"}\n",
|
|
"DisableAuth = False # @param {type:\"boolean\"}\n",
|
|
"# @markdown ---\n",
|
|
"# @markdown To connect, make note of the cloudflared URL and your auto-generated API key after launching and provide it to your preferred frontend.\n",
|
|
"\n",
|
|
"# Setup Config\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
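        "# Interpolate the form values above into a config.yml template for tabbyAPI\n",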
"write = f'''\n",
|
|
"# Sample YAML file for configuration.\n",
|
|
"# Comment out values as needed. Every value has a default within the application.\n",
|
|
"\n",
|
|
"# Unless specified in the comments, DO NOT put these options in quotes!\n",
|
|
"# You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n",
|
|
"\n",
|
|
"# Options for networking\n",
|
|
"network:\n",
|
|
" # The IP to host on (default: 127.0.0.1).\n",
|
|
" # Use 0.0.0.0 to expose on all network adapters\n",
|
|
" host: 0.0.0.0\n",
|
|
"\n",
|
|
" # The port to host on (default: 5000)\n",
|
|
" port: 5000\n",
|
|
"\n",
|
|
" # Disable HTTP token authenticaion with requests\n",
|
|
" # WARNING: This will make your instance vulnerable!\n",
|
|
" # Turn on this option if you are ONLY connecting from localhost\n",
|
|
" disable_auth: {DisableAuth}\n",
|
|
"\n",
|
|
"# Options for logging\n",
|
|
"logging:\n",
|
|
" # Enable prompt logging (default: False)\n",
|
|
" prompt: {PromptLogging}\n",
|
|
"\n",
|
|
" # Enable generation parameter logging (default: False)\n",
|
|
" generation_params: {GenParamLogging}\n",
|
|
"\n",
|
|
"# Options for model overrides and loading\n",
|
|
"model:\n",
|
|
" # Overrides the directory to look for models (default: models)\n",
|
|
" # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.\n",
|
|
" model_dir: models\n",
|
|
"\n",
|
|
" # An initial model to load. Make sure the model is located in the model directory!\n",
|
|
" # A model can be loaded later via the API.\n",
|
|
" model_name: {model}\n",
|
|
"\n",
|
|
" # Sends dummy model names when the models endpoint is queried\n",
|
|
" # Enable this if the program is looking for a specific OAI model\n",
|
|
" use_dummy_models: {UseDummyModels}\n",
|
|
"\n",
|
|
" # The below parameters apply only if model_name is set\n",
|
|
"\n",
|
|
" # Max sequence length (default: None)\n",
|
|
" # Fetched from the model's base sequence length in config.json by default\n",
|
|
" max_seq_len: {ContextSize}\n",
|
|
"\n",
|
|
" # Overrides base model context length (default: None)\n",
|
|
" # WARNING: Don't set this unless you know what you're doing!\n",
|
|
" # Only use this if the model's base sequence length in config.json is incorrect (ex. Mistral/Mixtral models)\n",
|
|
" override_base_seq_len: {OverrideBaseSeqLen}\n",
|
|
"\n",
|
|
" # Automatically allocate resources to GPUs (default: True)\n",
|
|
" gpu_split_auto: True\n",
|
|
"\n",
|
|
" # An integer array of GBs of vram to split between GPUs (default: [])\n",
|
|
" # gpu_split: [20.6, 24]\n",
|
|
"\n",
|
|
" # Rope scale (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your model was trained on long context with rope (check config.json)\n",
|
|
" rope_scale: {RopeScale}\n",
|
|
"\n",
|
|
" # Rope scale (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your model was trained on long context with rope (check config.json)\n",
|
|
" rope_alpha: {RopeAlpha}\n",
|
|
"\n",
|
|
" # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n",
|
|
" no_flash_attention: {NoFlashAttention}\n",
|
|
"\n",
|
|
" # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n",
|
|
" cache_mode: {CacheMode}\n",
|
|
"\n",
|
|
" # Set the prompt template for this model. If empty, chat completions will be disabled. (default: None)\n",
|
|
" # NOTE: Only works with chat completion message lists!\n",
|
|
" prompt_template: {PromptTemplate}\n",
|
|
"\n",
|
|
" # Number of experts to use per token. Loads from the model's config.json if not specified (default: None)\n",
|
|
" # WARNING: Don't set this unless you know what you're doing!\n",
|
|
" # NOTE: For MoE models (ex. Mixtral) only!\n",
|
|
" num_experts_per_token: {NumExpertsPerToken}\n",
|
|
"\n",
|
|
" # Options for draft models (speculative decoding). This will use more VRAM!\n",
|
|
" draft:\n",
|
|
" # Overrides the directory to look for draft (default: models)\n",
|
|
" draft_model_dir: models\n",
|
|
"\n",
|
|
" # An initial draft model to load. Make sure this model is located in the model directory!\n",
|
|
" # A draft model can be loaded later via the API.\n",
|
|
" #draft_model_name: {draft_model}\n",
|
|
"\n",
|
|
" # Rope scale for draft models (default: 1.0)\n",
|
|
" # Same thing as compress_pos_emb\n",
|
|
" # Only use if your draft model was trained on long context with rope (check config.json)\n",
|
|
" draft_rope_scale: {DraftRopeScale}\n",
|
|
"\n",
|
|
" # Rope alpha for draft model (default: 1.0)\n",
|
|
" # Same thing as alpha_value\n",
|
|
" # Leave blank to automatically calculate alpha value\n",
|
|
" draft_rope_alpha: {DraftRopeAlpha}\n",
|
|
"\n",
|
|
" # Options for loras\n",
|
|
" lora:\n",
|
|
" # Overrides the directory to look for loras (default: loras)\n",
|
|
" lora_dir: loras\n",
|
|
"\n",
|
|
" # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n",
|
|
" loras:\n",
|
|
" - name: {lora}\n",
|
|
" scaling: {LoraScaling}\n",
|
|
"'''\n",
|
|
"with open(\"./config.yml\", \"w\") as file:\n",
|
|
" file.write(write)\n",
|
|
"!cat config.yml\n",
|
|
"\n",
|
|
"%cd /content/tabbyAPI/\n",
|
|
"\n",
|
|
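        "# Start the API server in the background, then open a Cloudflare quick tunnel that prints a public URL for port 5000\n",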
"!python main.py & ./cloudflared tunnel --url localhost:5000"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"accelerator": "GPU",
|
|
"colab": {
|
|
"gpuType": "T4",
|
|
"provenance": []
|
|
},
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 0
|
|
} |