diff --git a/TabbyAPI_Colab_Example.ipynb b/TabbyAPI_Colab_Example.ipynb
new file mode 100644
index 0000000..3c246ca
--- /dev/null
+++ b/TabbyAPI_Colab_Example.ipynb
@@ -0,0 +1,168 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "provenance": [],
+      "gpuType": "T4"
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    },
+    "language_info": {
+      "name": "python"
+    },
+    "accelerator": "GPU"
+  },
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ThqTbSnPalJl"
+      },
+      "outputs": [],
+      "source": [
+        "# Install tabbyAPI\n",
+        "%cd /content/\n",
+        "\n",
+        "!git clone https://github.com/theroyallab/tabbyAPI\n",
+        "%cd tabbyAPI\n",
+        "!pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.10/exllamav2-0.0.10+cu118-cp310-cp310-linux_x86_64.whl -q\n",
+        "!pip install -r requirements.txt -q\n",
+        "!pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.3.6/flash_attn-2.3.6+cu118torch2.1cxx11abiFALSE-cp310-cp310-linux_x86_64.whl -q\n",
+        "!pip install huggingface-hub -q"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Download cloudflared tunnel\n",
+        "%cd /content/tabbyAPI/\n",
+        "\n",
+        "!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared\n",
+        "!chmod a+x cloudflared"
+      ],
+      "metadata": {
+        "id": "tpcfGFPzf1C0"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Download model repo\n",
+        "%cd /content/tabbyAPI/\n",
+        "\n",
+        "# Select model and branch\n",
+        "repo_id = \"royallab/airoboros-mistral2.2-7b-exl2\"\n",
+        "revision = \"6bpw\"\n",
+        "\n",
+        "from huggingface_hub import snapshot_download\n",
+        "snapshot_download(repo_id=repo_id, revision=revision, local_dir=f\"./models/{repo_id.replace('/', '_')}\")\n",
+        "\n",
+        "print(f\"Model dir: './models/{repo_id.replace('/', '_')}'\")"
+      ],
+      "metadata": {
+        "id": "ZOZwXx0cc4l1"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Setup Config - edit parameters to fit your needs\n",
+        "%cd /content/tabbyAPI/\n",
+        "\n",
+        "write = '''\n",
+        "# Sample YAML file for configuration.\n",
+        "# Comment out values as needed. Every value has a default within the application.\n",
+        "\n",
+        "# Unless specified in the comments, DO NOT put these options in quotes!\n",
+        "# You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n",
+        "\n",
+        "# Options for networking\n",
+        "network:\n",
+        "  # The IP to host on (default: 127.0.0.1).\n",
+        "  # Use 0.0.0.0 to expose on all network adapters\n",
+        "  host: 0.0.0.0\n",
+        "\n",
+        "  # The port to host on (default: 5000)\n",
+        "  port: 5000\n",
+        "\n",
+        "# Options for model overrides and loading\n",
+        "model:\n",
+        "  # Overrides the directory to look for models (default: models)\n",
+        "  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.\n",
+        "  model_dir: models\n",
+        "\n",
+        "  # An initial model to load. Make sure the model is located in the model directory!\n",
+        "  # A model can be loaded later via the API.\n",
+        "  model_name: royallab_airoboros-mistral2.2-7b-exl2\n",
+        "\n",
+        "  # Sends dummy model names when the models endpoint is queried\n",
+        "  # Enable this if the program is looking for a specific OAI model\n",
+        "  use_dummy_models: False\n",
+        "\n",
+        "  # The below parameters apply only if model_name is set\n",
+        "\n",
+        "  # Maximum model context length (default: 4096)\n",
+        "  max_seq_len: 4096\n",
+        "\n",
+        "  # Automatically allocate resources to GPUs (default: True)\n",
+        "  gpu_split_auto: True\n",
+        "\n",
+        "  # An array of GBs of VRAM to split between GPUs (default: [])\n",
+        "  # gpu_split: [20.6, 24]\n",
+        "\n",
+        "  # Rope scaling parameters (default: 1.0)\n",
+        "  rope_scale: 1.0\n",
+        "  rope_alpha: 1.0\n",
+        "\n",
+        "  # Disable Flash-attention 2. Set to True for GPUs older than Nvidia's 3000 series. (default: False)\n",
+        "  no_flash_attention: False\n",
+        "\n",
+        "  # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values: FP16, FP8. (default: FP16)\n",
+        "  cache_mode: FP16\n",
+        "\n",
+        "  # Options for draft models (speculative decoding). This will use more VRAM!\n",
+        "  draft:\n",
+        "    # Overrides the directory to look for draft models (default: models)\n",
+        "    # draft_model_dir: Your draft model directory path\n",
+        "\n",
+        "    # An initial draft model to load. Make sure this model is located in the model directory!\n",
+        "    # A draft model can be loaded later via the API.\n",
+        "    # draft_model_name: A model name\n",
+        "\n",
+        "    # Rope parameters for draft models (default: 1.0)\n",
+        "    # draft_rope_alpha: 1.0\n",
+        "'''\n",
+        "with open(\"./config.yml\", \"w\") as file:\n",
+        "    file.write(write)\n",
+        "!cat config.yml"
+      ],
+      "metadata": {
+        "id": "aD7Rz8ZCeShD"
+      },
+      "execution_count": null,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "source": [
+        "# Start API\n",
+        "%cd /content/tabbyAPI/\n",
+        "\n",
+        "!python main.py & ./cloudflared tunnel --url localhost:5000"
+      ],
+      "metadata": {
+        "id": "ZhSK71UeeMJi"
+      },
+      "execution_count": null,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file
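Once the final cell is running, cloudflared prints a random *.trycloudflare.com URL that fronts the API on port 5000. Below is a minimal client-side sketch for smoke-testing the deployment from another machine. It assumes tabbyAPI's OpenAI-style /v1/completions route and x-api-key authentication, with the key taken from the api_tokens.yml that tabbyAPI generates on first launch; the tunnel URL and key below are placeholders to substitute with your own.

import requests

# Placeholders: cloudflared prints the real tunnel URL at startup, and the
# key comes from api_tokens.yml (generated by tabbyAPI on first launch).
TUNNEL_URL = "https://your-tunnel.trycloudflare.com"
API_KEY = "your-api-key"

# Query the OpenAI-compatible completions endpoint through the tunnel.
response = requests.post(
    f"{TUNNEL_URL}/v1/completions",
    headers={"x-api-key": API_KEY},
    json={"prompt": "Once upon a time", "max_tokens": 64},
    timeout=60,
)
response.raise_for_status()
print(response.json()["choices"][0]["text"])

The request omits a model field on the assumption that tabbyAPI serves whichever model config.yml loaded via model_name; check the tabbyAPI README if your build expects different endpoint paths or headers.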