tabbyAPI-ollama/TabbyAPI_Colab_Example.ipynb
kingbri 5fbb37405f Colab: Remove the pydantic hotfix
Requirements.txt is now pinned to install pydantic >= 2.0.0

Signed-off-by: kingbri <bdashore3@proton.me>
2023-12-20 00:01:58 -05:00

215 lines
8.5 KiB
Text

{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "NcgQp3r7BS-q"
},
"source": [
"# **TabbyAPI Colab**"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zU0omxLLm8AN"
},
"outputs": [],
"source": [
"#@title # Keep this widget playing to prevent Colab from disconnecting you { display-mode: \"form\" }\n",
"#@markdown Press play on the audio player that will appear below:\n",
"%%html\n",
"<!-- Audio player rendered by this cell; per the form title, keeping it playing prevents Colab from disconnecting. -->\n",
"<audio src=\"https://oobabooga.github.io/silence.m4a\" controls></audio>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ThqTbSnPalJl"
},
"outputs": [],
"source": [
"# @title # Install and download model { display-mode: \"form\" }\n",
"# @markdown ---\n",
"# @markdown Select model:\n",
"repo_id = \"royallab/Noromaid-13b-v0.1.1-exl2\" # @param {type:\"string\"}\n",
"revision = \"4bpw\" # @param {type:\"string\"}\n",
"if revision == \"\": revision = \"main\"\n",
"# @markdown ---\n",
"# @markdown Select draft model (optional, for speculative decoding):\n",
"draft_repo_id = \"\" # @param {type:\"string\"}\n",
"draft_revision = \"\" # @param {type:\"string\"}\n",
"if draft_revision == \"\": draft_revision = \"main\"\n",
"# @markdown ---\n",
"# @markdown Select lora (optional):\n",
"lora_repo_id = \"\" # @param {type:\"string\"}\n",
"lora_revision = \"\" # @param {type:\"string\"}\n",
"if lora_revision == \"\": lora_revision = \"main\"\n",
"# @markdown ---\n",
"\n",
"# Install tabbyAPI.\n",
"# The clone is skipped when the checkout already exists, so re-running this\n",
"# cell does not make git fail on a non-empty destination directory.\n",
"%cd /content/\n",
"!test -d tabbyAPI || git clone https://github.com/theroyallab/tabbyAPI\n",
"%cd tabbyAPI\n",
"\n",
"# Install cuda requirements.\n",
"# %pip (rather than !pip) installs into the environment of the running kernel.\n",
"%pip install -r requirements.txt -q\n",
"%pip install huggingface-hub -q\n",
"\n",
"# Download cloudflared tunnel (saved next to main.py as ./cloudflared)\n",
"!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared\n",
"!chmod a+x cloudflared\n",
"\n",
"# Download model repo (imported here because huggingface-hub is installed above)\n",
"from huggingface_hub import snapshot_download\n",
"\n",
"\n",
"def download_repo(hf_repo_id, hf_revision, target_dir, required=False):\n",
"    \"\"\"Download a Hugging Face repo snapshot into ./<target_dir>/<folder name>.\n",
"\n",
"    Args:\n",
"        hf_repo_id: Repo id such as 'owner/name'; may be blank for optional repos.\n",
"        hf_revision: Branch, tag or commit to download.\n",
"        target_dir: Directory (relative to the current one) that receives the snapshot.\n",
"        required: When True, a blank repo id raises instead of being skipped.\n",
"\n",
"    Returns:\n",
"        The local folder name (repo id with '/' replaced by '_'), or an empty\n",
"        string when an optional repo id was left blank.\n",
"\n",
"    Raises:\n",
"        ValueError: If required is True and hf_repo_id is blank.\n",
"    \"\"\"\n",
"    if len(hf_repo_id) == 0:\n",
"        if required:\n",
"            raise ValueError(\"repo_id is empty: enter the Hugging Face repo of the model to download.\")\n",
"        return \"\"\n",
"\n",
"    folder_name = hf_repo_id.replace('/', '_')\n",
"    snapshot_download(repo_id=hf_repo_id, revision=hf_revision, local_dir=f\"./{target_dir}/{folder_name}\")\n",
"    return folder_name\n",
"\n",
"\n",
"# These names are read by the configuration cell when it writes config.yml.\n",
"model = download_repo(repo_id, revision, \"models\", required=True)\n",
"draft_model = download_repo(draft_repo_id, draft_revision, \"models\")\n",
"lora = download_repo(lora_repo_id, lora_revision, \"loras\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "aD7Rz8ZCeShD"
},
"outputs": [],
"source": [
"# @title # Configure and launch API { display-mode: \"form\" }\n",
"# @markdown ---\n",
"# @markdown Model parameters:\n",
"ContextSize = 4096 # @param {type:\"integer\"}\n",
"RopeScale = 1.0 # @param {type:\"number\"}\n",
"RopeAlpha = 1.0 # @param {type:\"number\"}\n",
"# @markdown ---\n",
"# @markdown Draft model parameters (optional, for speculative decoding):\n",
"DraftRopeScale = 1.0 # @param {type:\"number\"}\n",
"DraftRopeAlpha = 1.0 # @param {type:\"number\"}\n",
"# @markdown ---\n",
"# @markdown Lora parameters (optional, for loras):\n",
"LoraScaling = 1.0 # @param {type:\"number\"}\n",
"# @markdown ---\n",
"# @markdown Misc options:\n",
"CacheMode = \"FP16\" # @param [\"FP8\", \"FP16\"] {type:\"string\"}\n",
"UseDummyModels = False # @param {type:\"boolean\"}\n",
"NoFlashAttention = False # @param {type:\"boolean\"}\n",
"# @markdown ---\n",
"# @markdown To connect, make note of the cloudflared URL and your auto-generated API key after launching and provide it to your preferred frontend.\n",
"\n",
"# Setup Config\n",
"%cd /content/tabbyAPI/\n",
"\n",
"# Render config.yml from the form values above and from the model, draft_model\n",
"# and lora names set by the install/download cell.\n",
"# YAML nesting is significant: the draft and lora sections are children of\n",
"# model, and their keys must be indented one level deeper than the section name.\n",
"config_yaml = f'''\n",
"# Sample YAML file for configuration.\n",
"# Comment out values as needed. Every value has a default within the application.\n",
"\n",
"# Unless specified in the comments, DO NOT put these options in quotes!\n",
"# You can use https://www.yamllint.com/ if you want to check your YAML formatting.\n",
"\n",
"# Options for networking\n",
"network:\n",
"  # The IP to host on (default: 127.0.0.1).\n",
"  # Use 0.0.0.0 to expose on all network adapters\n",
"  host: 0.0.0.0\n",
"\n",
"  # The port to host on (default: 5000)\n",
"  port: 5000\n",
"\n",
"# Options for model overrides and loading\n",
"model:\n",
"  # Overrides the directory to look for models (default: models)\n",
"  # Windows users, DO NOT put this path in quotes! This directory will be invalid otherwise.\n",
"  model_dir: models\n",
"\n",
"  # An initial model to load. Make sure the model is located in the model directory!\n",
"  # A model can be loaded later via the API.\n",
"  model_name: {model}\n",
"\n",
"  # Sends dummy model names when the models endpoint is queried\n",
"  # Enable this if the program is looking for a specific OAI model\n",
"  use_dummy_models: {UseDummyModels}\n",
"\n",
"  # The below parameters apply only if model_name is set\n",
"\n",
"  # Maximum model context length (default: 4096)\n",
"  max_seq_len: {ContextSize}\n",
"\n",
"  # Automatically allocate resources to GPUs (default: True)\n",
"  gpu_split_auto: True\n",
"\n",
"  # An integer array of GBs of vram to split between GPUs (default: [])\n",
"  # gpu_split: [20.6, 24]\n",
"\n",
"  # Rope scaling parameters (default: 1.0)\n",
"  rope_scale: {RopeScale}\n",
"  rope_alpha: {RopeAlpha}\n",
"\n",
"  # Disable Flash-attention 2. Set to True for GPUs lower than Nvidia's 3000 series. (default: False)\n",
"  no_flash_attention: {NoFlashAttention}\n",
"\n",
"  # Enable 8 bit cache mode for VRAM savings (slight performance hit). Possible values FP16, FP8. (default: FP16)\n",
"  cache_mode: {CacheMode}\n",
"\n",
"  # Options for draft models (speculative decoding). This will use more VRAM!\n",
"  draft:\n",
"    # Overrides the directory to look for draft (default: models)\n",
"    draft_model_dir: models\n",
"\n",
"    # An initial draft model to load. Make sure this model is located in the model directory!\n",
"    # A draft model can be loaded later via the API.\n",
"    draft_model_name: {draft_model}\n",
"\n",
"    # Rope parameters for draft models (default: 1.0)\n",
"    draft_rope_scale: {DraftRopeScale}\n",
"    draft_rope_alpha: {DraftRopeAlpha}\n",
"\n",
"  # Options for loras\n",
"  lora:\n",
"    # Overrides the directory to look for loras (default: loras)\n",
"    lora_dir: loras\n",
"\n",
"    # List of loras to load and associated scaling factors (default: 1.0). Comment out unused entries or add more rows as needed.\n",
"    loras:\n",
"    - name: {lora}\n",
"      scaling: {LoraScaling}\n",
"'''\n",
"with open(\"./config.yml\", \"w\") as config_file:\n",
"    config_file.write(config_yaml)\n",
"\n",
"# Echo the generated file so the settings in effect are visible in the cell output.\n",
"!cat config.yml\n",
"\n",
"%cd /content/tabbyAPI/\n",
"\n",
"# Start the API in the background and keep the cloudflared tunnel in the\n",
"# foreground; the tunnel forwards to the port set under network.port above.\n",
"!python main.py & ./cloudflared tunnel --url localhost:5000"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}