mirror of
https://github.com/fauxpilot/fauxpilot.git
synced 2025-03-12 04:36:10 -07:00
Merge branch 'main' into main
This commit is contained in:
commit
7ebb56c551
.dockerignore.editorconfig
.github
.gitignoreDockerfileREADME.mdconverter
copilot_proxy
docker-compose.yamldocumentation
example.envimg
launch.shpython_backend
setup.shshutdown.shtests/python_backend
@ -1,3 +1,6 @@
|
|||||||
|
# huggingface cache
|
||||||
|
.hf_cache/
|
||||||
|
|
||||||
.idea/**/workspace.xml
|
.idea/**/workspace.xml
|
||||||
.idea/**/tasks.xml
|
.idea/**/tasks.xml
|
||||||
.idea/**/usage.statistics.xml
|
.idea/**/usage.statistics.xml
|
||||||
@ -319,4 +322,4 @@ dmypy.json
|
|||||||
.pytype/
|
.pytype/
|
||||||
cython_debug/
|
cython_debug/
|
||||||
*.md
|
*.md
|
||||||
.git*
|
.git*
|
||||||
|
49
.editorconfig
Normal file
49
.editorconfig
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
# EditorConfig is awesome: https://EditorConfig.org
|
||||||
|
|
||||||
|
# top-most EditorConfig file
|
||||||
|
root = true
|
||||||
|
|
||||||
|
[*]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
tab_width = 4
|
||||||
|
# end_of_line = crlf
|
||||||
|
charset = utf-8
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
insert_final_newline = false
|
||||||
|
|
||||||
|
# Markdown
|
||||||
|
[*.{md}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Serialized data
|
||||||
|
[*.{yml,yaml,json,pbtxt}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Shell script
|
||||||
|
[*.{sh,bash,bashrc,zsh,fish,ksh,csh}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Python
|
||||||
|
[*.py]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
[*.env]
|
||||||
|
insert_final_newline = false
|
||||||
|
|
||||||
|
# Python requirements
|
||||||
|
[requirements.txt]
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Dockerfile
|
||||||
|
[Dockerfile]
|
||||||
|
insert_final_newline = true
|
18
.github/CODEOWNERS
vendored
Normal file
18
.github/CODEOWNERS
vendored
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
# This is a comment. Each line is a file pattern followed by one or more owners.
|
||||||
|
# For more details, visit https://help.github.com/articles/about-codeowners/
|
||||||
|
#
|
||||||
|
# Note that the current version of github.com (community edition) does not
|
||||||
|
# translate "@org/team-name" correctly, although the GitHub webpage depicts how
|
||||||
|
# to use "@org/team-name" as well as "@username" format. So, we need to append
|
||||||
|
# all member IDs directly to avoid unexpected situations.
|
||||||
|
|
||||||
|
# In order that all members of a repository are supposed to review each other
|
||||||
|
* @moyix @thakkarparth007 @fdegier
|
||||||
|
converter @moyix
|
||||||
|
copilot_proxy @moyix @thakkarparth007
|
||||||
|
python_backend @moyix @thakkarparth007
|
||||||
|
tests @moyix
|
||||||
|
documentation @moyix @thakkarparth007 @fdegier
|
||||||
|
docker-compose.yaml @moyix @thakkarparth007 @fdegier
|
||||||
|
Dockerfile @moyix @thakkarparth007 @fdegier
|
||||||
|
copilot_proxy/Dockerfile @moyix @thakkarparth007 @fdegier
|
31
.github/ISSUE_TEMPLATE/Bug_report.md
vendored
Normal file
31
.github/ISSUE_TEMPLATE/Bug_report.md
vendored
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
---
|
||||||
|
name: Bug report
|
||||||
|
about: Create a report to help us improve
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Having problems with the source code of a GitHub repository?
|
||||||
|
|
||||||
|
Having problems with the FauxPilot that controls the build process?
|
||||||
|
|
||||||
|
Good to go? Then please remove these lines above, including this one, and help us understand your issue by answering the following:
|
||||||
|
|
||||||
|
# Issue Description
|
||||||
|
A clear and concise description of what the bug is.
|
||||||
|
|
||||||
|
Expected Result
|
||||||
|
============
|
||||||
|
A clear and concise description of what you expected to happen.
|
||||||
|
|
||||||
|
How to Reproduce
|
||||||
|
===============
|
||||||
|
1. Go to '...'
|
||||||
|
2. Click on '....'
|
||||||
|
3. Scroll down to '....'
|
||||||
|
4. See error
|
||||||
|
|
||||||
|
|
||||||
|
Further Information
|
||||||
|
===============
|
||||||
|
* A link to an output result showing the issue
|
||||||
|
* Exact OS version
|
17
.github/ISSUE_TEMPLATE/Feature_request.md
vendored
Normal file
17
.github/ISSUE_TEMPLATE/Feature_request.md
vendored
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
---
|
||||||
|
name: Feature request
|
||||||
|
about: Suggest an idea for this project
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
**Is your feature request related to a problem? Please describe.**
|
||||||
|
A clear and concise description of what the problem is. For example, I'm always frustrated when [...]
|
||||||
|
|
||||||
|
**Describe the solution you'd like**
|
||||||
|
A clear and concise description of what you want to happen.
|
||||||
|
|
||||||
|
**Describe alternatives you've considered**
|
||||||
|
A clear and concise description of any alternative solutions or features you've considered.
|
||||||
|
|
||||||
|
**Additional context**
|
||||||
|
Add any other context or screenshots about the feature request here.
|
13
.github/ISSUE_TEMPLATE/Support_request.md
vendored
Normal file
13
.github/ISSUE_TEMPLATE/Support_request.md
vendored
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
---
|
||||||
|
name: Support Request
|
||||||
|
about: Report a problem with our project source code
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|

|
||||||
|
|
||||||
|
Please only create issues/feature requests for the project here.
|
||||||
|
|
||||||
|
For support, contact our project maintainer(s); they meet online in the 'Issues' list.
|
||||||
|
There you can ask questions if you have trouble understanding something, seek advice and mingle with other project members.
|
||||||
|
For further information see 'Wiki' page.
|
27
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
27
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
---
|
||||||
|
# [Template] PR Description
|
||||||
|
|
||||||
|
In general, the github system duplicates your commit message automatically for your convenience.
|
||||||
|
After composing your own PR description using this template, please remove any unneeded portions.
|
||||||
|
```bash
|
||||||
|
## 1. General Description
|
||||||
|
The commit title must begin with one of the eleven given options.
|
||||||
|
Build, chore, CI, documentation, task, fix, performance, refactor, revert, style, and test are some examples.
|
||||||
|
For more details, please see [HERE](https://www.conventionalcommits.org/en/v1.0.0/).
|
||||||
|
Summarize changes in no more than 50 characters ASAP for readability and maintenance.
|
||||||
|
|
||||||
|
## 2. Changes proposed in this PR:
|
||||||
|
- Bulleted lists are also acceptable.
|
||||||
|
- Typically, a hyphen or asterisk before the bullet, followed by a single space.
|
||||||
|
|
||||||
|
Resolves: #{GitHub-Issue-Number}
|
||||||
|
See also: #{GitHub-Issue-Number}
|
||||||
|
|
||||||
|
|
||||||
|
## 3. How to evaluate:
|
||||||
|
1. Describe how to evaluate such that it may be reproduced by the reviewer (s).
|
||||||
|
2. Self assessment:
|
||||||
|
- Build test: [ ]Passed [ ]Failed [*]Skipped
|
||||||
|
- Run test: [ ]Passed [ ]Failed [* ]Skipped
|
||||||
|
```
|
||||||
|
|
22
.github/workflows/python_backend_tests.yaml
vendored
Normal file
22
.github/workflows/python_backend_tests.yaml
vendored
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
name: FauxPilot Python Backend Tests
|
||||||
|
|
||||||
|
on: [push]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v3
|
||||||
|
- uses: actions/setup-python@v4
|
||||||
|
- name: Install dependencies
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install -r tests/python_backend/requirements.txt
|
||||||
|
- name: Build container
|
||||||
|
run: |
|
||||||
|
cp tests/python_backend/runner.env .env &&
|
||||||
|
docker compose build &&
|
||||||
|
rm -f .env
|
||||||
|
- name: Run tests
|
||||||
|
run: pytest tests
|
||||||
|
|
21
.github/workflows/welcome-new-contributor.yml
vendored
Normal file
21
.github/workflows/welcome-new-contributor.yml
vendored
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
name: 'Welcome New Contributors'
|
||||||
|
|
||||||
|
on:
|
||||||
|
issues:
|
||||||
|
types: [opened]
|
||||||
|
pull_request_target:
|
||||||
|
types: [opened]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
welcome-new-contributor:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: 'Greet the contributor'
|
||||||
|
uses: garg3133/welcome-new-contributors@v1.2
|
||||||
|
with:
|
||||||
|
token: ${{ secrets.BOT_ACCESS_TOKEN }}
|
||||||
|
is-oauth-token: true
|
||||||
|
issue-message: 'Hello there, thanks for opening your first issue. We
|
||||||
|
welcome you to the FauxPilot community!'
|
||||||
|
pr-message: 'Hello there, thanks for opening your first Pull
|
||||||
|
Request. Someone will review it soon.'
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,7 +1,7 @@
|
|||||||
config.env
|
|
||||||
|
|
||||||
# Codegen Models
|
# Codegen Models
|
||||||
models/*
|
models/*
|
||||||
|
# huggingface cache
|
||||||
|
.hf_cache/
|
||||||
|
|
||||||
.vscode/*
|
.vscode/*
|
||||||
!.vscode/settings.json
|
!.vscode/settings.json
|
||||||
|
5
Dockerfile
Normal file
5
Dockerfile
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
FROM moyix/triton_with_ft:22.09
|
||||||
|
|
||||||
|
# Install dependencies: torch
|
||||||
|
RUN python3 -m pip install --disable-pip-version-check -U torch --extra-index-url https://download.pytorch.org/whl/cu116
|
||||||
|
RUN python3 -m pip install --disable-pip-version-check -U transformers bitsandbytes accelerate
|
@ -1,7 +1,11 @@
|
|||||||
|
|
||||||
# FauxPilot
|
# FauxPilot
|
||||||
|
|
||||||
This is an attempt to build a locally hosted version of [GitHub Copilot](https://copilot.github.com/). It uses the [SalesForce CodeGen](https://github.com/salesforce/CodeGen) models inside of NVIDIA's [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server) with the [FasterTransformer backend](https://github.com/triton-inference-server/fastertransformer_backend/).
|
This is an attempt to build a locally hosted version of [GitHub Copilot](https://copilot.github.com/). It uses the [SalesForce CodeGen](https://github.com/salesforce/CodeGen) models inside of NVIDIA's [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server) with the [FasterTransformer backend](https://github.com/triton-inference-server/fastertransformer_backend/).
|
||||||
|
|
||||||
|
<p align="right">
|
||||||
|
<img width="50%" align="right" src="./img/fauxpilot.png">
|
||||||
|
</p>
|
||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
|
9
converter/README.md
Normal file
9
converter/README.md
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
This section describes the Python scripts necessary for converting deep learning model files:
|
||||||
|
|
||||||
|
* `Dockerfile`: A Docker file used to construct an image based on Ubuntu 20.04 that includes the Transformer library.
|
||||||
|
* `download_and_convert_model.sh`: A shell script that converts model codegen-6B-multi with the provided number of GPUs.
|
||||||
|
* `codegen_gptj_convert.py`: A Python script for converting SalesForce CodeGen models to GPT-J (e.g., Salesforce/codegen-350M-multi).
|
||||||
|
* `huggingface_gptj_convert.py`: A Python script for converting the HF model to the GPT-J format (e.g., GPTJForCausalLM model)
|
||||||
|
* `triton_config_gen.py`: A Python script that creates a config and weight file for running a CodeGen model with Triton.
|
||||||
|
* `config_template.pbtxt`: A template file for defining the config file's data format.
|
||||||
|
|
@ -20,6 +20,7 @@ parser.add_argument('--template', default=CONFIG_TEMPLATE_PATH, help='Path to th
|
|||||||
parser.add_argument('--model_store', required=True, help='Path to the Triton model store')
|
parser.add_argument('--model_store', required=True, help='Path to the Triton model store')
|
||||||
parser.add_argument('--hf_model_dir', required=True, help='Path to HF model directory')
|
parser.add_argument('--hf_model_dir', required=True, help='Path to HF model directory')
|
||||||
parser.add_argument('--tokenizer', default='Salesforce/codegen-16B-multi', help='Name or path to the tokenizer')
|
parser.add_argument('--tokenizer', default='Salesforce/codegen-16B-multi', help='Name or path to the tokenizer')
|
||||||
|
parser.add_argument('--rebase', default=None, help='Path to rebase the model store to (e.g. for Docker)')
|
||||||
parser.add_argument('-n', '--num_gpu', help='Number of GPUs to use', type=int, default=1)
|
parser.add_argument('-n', '--num_gpu', help='Number of GPUs to use', type=int, default=1)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
@ -61,8 +62,8 @@ params['is_half'] = is_half
|
|||||||
params['head_num'] = config.n_head
|
params['head_num'] = config.n_head
|
||||||
params['size_per_head'] = config.n_embd // config.n_head
|
params['size_per_head'] = config.n_embd // config.n_head
|
||||||
params['inter_size'] = 4*config.n_embd
|
params['inter_size'] = 4*config.n_embd
|
||||||
# Vocab size gets rounded up to a multiple of 1024
|
# Vocab size *sometimes* gets rounded up to a multiple of 1024
|
||||||
params['vocab_size'] = round_up(tokenizer.vocab_size, 1024)
|
params['vocab_size'] = tokenizer.vocab_size+len(tokenizer.get_added_vocab()) # round_up(tokenizer.vocab_size, 1024)
|
||||||
params['start_id'] = tokenizer.eos_token_id
|
params['start_id'] = tokenizer.eos_token_id
|
||||||
params['end_id'] = tokenizer.eos_token_id
|
params['end_id'] = tokenizer.eos_token_id
|
||||||
params['decoder_layers'] = config.n_layer
|
params['decoder_layers'] = config.n_layer
|
||||||
@ -70,7 +71,14 @@ params['rotary_embedding'] = config.rotary_dim
|
|||||||
# NOTE: this assumes that the model dir follows the format used by the other conversion scripts
|
# NOTE: this assumes that the model dir follows the format used by the other conversion scripts
|
||||||
model_dir = os.path.join(args.model_store, f'{model_name}-{args.num_gpu}gpu')
|
model_dir = os.path.join(args.model_store, f'{model_name}-{args.num_gpu}gpu')
|
||||||
weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.num_gpu}-gpu')
|
weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.num_gpu}-gpu')
|
||||||
params['checkpoint_path'] = weights_path
|
if args.rebase:
|
||||||
|
rebased_model_dir = os.path.join(args.rebase, f'{model_name}-{args.num_gpu}gpu')
|
||||||
|
rebased_weights_path = os.path.join(args.rebase, 'fastertransformer', f'{version}', f'{args.num_gpu}-gpu')
|
||||||
|
else:
|
||||||
|
rebased_model_dir = model_dir
|
||||||
|
rebased_weights_path = weights_path
|
||||||
|
|
||||||
|
params['checkpoint_path'] = rebased_weights_path
|
||||||
triton_config = template.substitute(params)
|
triton_config = template.substitute(params)
|
||||||
assert '${' not in triton_config
|
assert '${' not in triton_config
|
||||||
|
|
||||||
@ -84,6 +92,10 @@ with open(config_path, 'w') as f:
|
|||||||
|
|
||||||
print('==========================================================')
|
print('==========================================================')
|
||||||
print(f'Created config file for {model_name}')
|
print(f'Created config file for {model_name}')
|
||||||
print(f' Config: {config_path}')
|
print(f' Config: {config_path}')
|
||||||
print(f' Weights: {weights_path}')
|
print(f' Weights: {weights_path}')
|
||||||
print('==========================================================')
|
print(f' Store: {args.model_store}')
|
||||||
|
print(f' Rebase: {model_dir} => {args.rebase}')
|
||||||
|
print(f' Weights: {rebased_weights_path}')
|
||||||
|
print(f' Num GPU: {args.num_gpu}')
|
||||||
|
print('==========================================================')
|
||||||
|
@ -10,4 +10,4 @@ COPY . .
|
|||||||
|
|
||||||
EXPOSE 5000
|
EXPOSE 5000
|
||||||
|
|
||||||
CMD [ "uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
|
CMD ["uvicorn", "--host", "0.0.0.0", "--port", "5000", "app:app"]
|
||||||
|
@ -1,11 +1,17 @@
|
|||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI, Response
|
from fastapi import FastAPI, Request, Response
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
from sse_starlette.sse import EventSourceResponse
|
from sse_starlette.sse import EventSourceResponse
|
||||||
|
|
||||||
|
from config.log_config import uvicorn_logger
|
||||||
from models import OpenAIinput
|
from models import OpenAIinput
|
||||||
from utils.codegen import CodeGenProxy
|
from utils.codegen import CodeGenProxy
|
||||||
|
from utils.errors import FauxPilotException
|
||||||
|
|
||||||
|
logging.config.dictConfig(uvicorn_logger)
|
||||||
|
|
||||||
codegen = CodeGenProxy(
|
codegen = CodeGenProxy(
|
||||||
host=os.environ.get("TRITON_HOST", "triton"),
|
host=os.environ.get("TRITON_HOST", "triton"),
|
||||||
@ -21,24 +27,50 @@ app = FastAPI(
|
|||||||
swagger_ui_parameters={"defaultModelsExpandDepth": -1}
|
swagger_ui_parameters={"defaultModelsExpandDepth": -1}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@app.exception_handler(FauxPilotException)
|
||||||
|
async def fauxpilot_handler(request: Request, exc: FauxPilotException):
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=400,
|
||||||
|
content=exc.json()
|
||||||
|
)
|
||||||
|
|
||||||
@app.post("/v1/engines/codegen/completions", status_code=200)
|
# Used to support copilot.vim
|
||||||
@app.post("/v1/completions", status_code=200)
|
@app.get("/copilot_internal/v2/token")
|
||||||
|
def get_copilot_token():
|
||||||
|
content = {'token': '1', 'expires_at': 2600000000, 'refresh_in': 900}
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=200,
|
||||||
|
content=content
|
||||||
|
)
|
||||||
|
|
||||||
|
@app.post("/v1/engines/codegen/completions")
|
||||||
|
# Used to support copilot.vim
|
||||||
|
@app.post("/v1/engines/copilot-codex/completions")
|
||||||
|
@app.post("/v1/completions")
|
||||||
async def completions(data: OpenAIinput):
|
async def completions(data: OpenAIinput):
|
||||||
data = data.dict()
|
data = data.dict()
|
||||||
print(data)
|
try:
|
||||||
|
content = codegen(data=data)
|
||||||
|
except codegen.TokensExceedsMaximum as E:
|
||||||
|
raise FauxPilotException(
|
||||||
|
message=str(E),
|
||||||
|
type="invalid_request_error",
|
||||||
|
param=None,
|
||||||
|
code=None,
|
||||||
|
)
|
||||||
|
|
||||||
if data.get("stream") is not None:
|
if data.get("stream") is not None:
|
||||||
return EventSourceResponse(
|
return EventSourceResponse(
|
||||||
content=codegen(data=data),
|
content=content,
|
||||||
status_code=200,
|
status_code=200,
|
||||||
media_type="text/event-stream"
|
media_type="text/event-stream"
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
return Response(
|
return Response(
|
||||||
status_code=200,
|
status_code=200,
|
||||||
content=codegen(data=data),
|
content=content,
|
||||||
media_type="application/json"
|
media_type="application/json"
|
||||||
)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
uvicorn.run("app:app", host=os.environ.get("API_HOST", "0.0.0.0"), port=os.environ.get("API_PORT", 5000))
|
uvicorn.run("app:app", host="0.0.0.0", port=5000)
|
||||||
|
0
copilot_proxy/config/__init__.py
Normal file
0
copilot_proxy/config/__init__.py
Normal file
27
copilot_proxy/config/log_config.py
Normal file
27
copilot_proxy/config/log_config.py
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# The uvicorn_logger is used to add timestamps
|
||||||
|
|
||||||
|
uvicorn_logger = {
|
||||||
|
"version": 1,
|
||||||
|
"disable_existing_loggers": False,
|
||||||
|
"formatters": {
|
||||||
|
"access": {
|
||||||
|
"()": "uvicorn.logging.AccessFormatter",
|
||||||
|
"fmt": '%(levelprefix)s %(asctime)s :: %(client_addr)s - "%(request_line)s" %(status_code)s',
|
||||||
|
"use_colors": True
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"handlers": {
|
||||||
|
"access": {
|
||||||
|
"formatter": "access",
|
||||||
|
"class": "logging.StreamHandler",
|
||||||
|
"stream": "ext://sys.stdout",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"loggers": {
|
||||||
|
"uvicorn.access": {
|
||||||
|
"handlers": ["access"],
|
||||||
|
# "level": "INFO",
|
||||||
|
"propagate": False
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
@ -1,10 +1,10 @@
|
|||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel, constr
|
||||||
|
|
||||||
|
|
||||||
class OpenAIinput(BaseModel):
|
class OpenAIinput(BaseModel):
|
||||||
model: str
|
model: constr(regex="^(fastertransformer|py-model)$") = "fastertransformer"
|
||||||
prompt: Optional[str]
|
prompt: Optional[str]
|
||||||
suffix: Optional[str]
|
suffix: Optional[str]
|
||||||
max_tokens: Optional[int] = 16
|
max_tokens: Optional[int] = 16
|
||||||
|
@ -6,14 +6,14 @@ import time
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import tritonclient.grpc as client_util
|
import tritonclient.grpc as client_util
|
||||||
from tokenizers import Tokenizer
|
from tokenizers import Tokenizer
|
||||||
from tritonclient.utils import np_to_triton_dtype
|
from tritonclient.utils import np_to_triton_dtype, InferenceServerException
|
||||||
|
|
||||||
np.finfo(np.dtype("float32"))
|
np.finfo(np.dtype("float32"))
|
||||||
np.finfo(np.dtype("float64"))
|
np.finfo(np.dtype("float64"))
|
||||||
|
|
||||||
|
|
||||||
class CodeGenProxy:
|
class CodeGenProxy:
|
||||||
def __init__(self, host: str = 'localhost', port: int = 8001, verbose: bool = False):
|
def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
|
||||||
self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
|
self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
|
||||||
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
|
self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
|
||||||
self.PAD_CHAR = 50256
|
self.PAD_CHAR = 50256
|
||||||
@ -21,6 +21,9 @@ class CodeGenProxy:
|
|||||||
# Max number of tokens the model can handle
|
# Max number of tokens the model can handle
|
||||||
self.MAX_MODEL_LEN = 2048
|
self.MAX_MODEL_LEN = 2048
|
||||||
|
|
||||||
|
class TokensExceedsMaximum(Exception):
|
||||||
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def prepare_tensor(name: str, tensor_input):
|
def prepare_tensor(name: str, tensor_input):
|
||||||
t = client_util.InferInput(
|
t = client_util.InferInput(
|
||||||
@ -70,20 +73,31 @@ class CodeGenProxy:
|
|||||||
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
|
return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
|
||||||
|
|
||||||
def generate(self, data):
|
def generate(self, data):
|
||||||
model_name = "fastertransformer"
|
|
||||||
prompt = data['prompt']
|
prompt = data['prompt']
|
||||||
n = data.get('n', 1)
|
n = data.get('n', 1)
|
||||||
|
model_name = data["model"]
|
||||||
|
# ugly hack to set the data type correctly. Huggingface models want int32, but fastertransformer needs uint32
|
||||||
|
# i could've done the conversion from uint32 to int32 in the model but that'd be inefficient.
|
||||||
|
np_type = np.int32 if model_name.startswith("py-") else np.uint32
|
||||||
|
|
||||||
input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
|
input_start_ids = np.expand_dims(self.tokenizer.encode(prompt).ids, 0)
|
||||||
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np.uint32)
|
input_start_ids = np.repeat(input_start_ids, n, axis=0).astype(np_type)
|
||||||
prompt_len = input_start_ids.shape[1]
|
prompt_len = input_start_ids.shape[1]
|
||||||
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
input_len = prompt_len * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||||
max_tokens = data.get('max_tokens', 16)
|
max_tokens = data.get('max_tokens', 16)
|
||||||
if max_tokens + input_len[0][0] > self.MAX_MODEL_LEN:
|
prompt_tokens: int = input_len[0][0]
|
||||||
raise ValueError("Max tokens + prompt length exceeds maximum model length")
|
requested_tokens = max_tokens + prompt_tokens
|
||||||
output_len = np.ones_like(input_len).astype(np.uint32) * max_tokens
|
if requested_tokens > self.MAX_MODEL_LEN:
|
||||||
|
print(1)
|
||||||
|
raise self.TokensExceedsMaximum(
|
||||||
|
f"This model's maximum context length is {self.MAX_MODEL_LEN}, however you requested "
|
||||||
|
f"{requested_tokens} tokens ({prompt_tokens} in your prompt; {max_tokens} for the completion). "
|
||||||
|
f"Please reduce your prompt; or completion length."
|
||||||
|
)
|
||||||
|
output_len = np.ones_like(input_len).astype(np_type) * max_tokens
|
||||||
num_logprobs = data.get('logprobs', -1)
|
num_logprobs = data.get('logprobs', -1)
|
||||||
if num_logprobs is None:
|
if num_logprobs is None:
|
||||||
num_logprobs = 1
|
num_logprobs = -1
|
||||||
want_logprobs = num_logprobs > 0
|
want_logprobs = num_logprobs > 0
|
||||||
|
|
||||||
temperature = data.get('temperature', 0.2)
|
temperature = data.get('temperature', 0.2)
|
||||||
@ -95,7 +109,7 @@ class CodeGenProxy:
|
|||||||
|
|
||||||
top_p = data.get('top_p', 1.0)
|
top_p = data.get('top_p', 1.0)
|
||||||
frequency_penalty = data.get('frequency_penalty', 1.0)
|
frequency_penalty = data.get('frequency_penalty', 1.0)
|
||||||
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
runtime_top_k = top_k * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||||
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
runtime_top_p = top_p * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||||
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
beam_search_diversity_rate = 0.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||||
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
|
random_seed = np.random.randint(0, 2 ** 31 - 1, (input_start_ids.shape[0], 1), dtype=np.int32)
|
||||||
@ -103,9 +117,9 @@ class CodeGenProxy:
|
|||||||
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
len_penalty = 1.0 * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||||
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
repetition_penalty = frequency_penalty * np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
|
||||||
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
|
is_return_log_probs = want_logprobs * np.ones([input_start_ids.shape[0], 1]).astype(np.bool_)
|
||||||
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np.uint32)
|
beam_width = (1 * np.ones([input_start_ids.shape[0], 1])).astype(np_type)
|
||||||
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
start_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||||
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np.uint32)
|
end_ids = self.PAD_CHAR * np.ones([input_start_ids.shape[0], 1]).astype(np_type)
|
||||||
|
|
||||||
stop_words = data.get('stop', [])
|
stop_words = data.get('stop', [])
|
||||||
if stop_words is None:
|
if stop_words is None:
|
||||||
@ -220,8 +234,8 @@ class CodeGenProxy:
|
|||||||
for c in choices:
|
for c in choices:
|
||||||
completion['id'] = self.random_completion_id()
|
completion['id'] = self.random_completion_id()
|
||||||
completion['choices'] = [c]
|
completion['choices'] = [c]
|
||||||
yield f'data: {json.dumps(completion)}\n\n'
|
yield f'{json.dumps(completion)}'
|
||||||
yield 'data: [DONE]\n\n'
|
yield '[DONE]'
|
||||||
|
|
||||||
def non_streamed_response(self, completion, choices) -> str:
|
def non_streamed_response(self, completion, choices) -> str:
|
||||||
completion['id'] = self.random_completion_id()
|
completion['id'] = self.random_completion_id()
|
||||||
@ -230,7 +244,19 @@ class CodeGenProxy:
|
|||||||
|
|
||||||
def __call__(self, data: dict):
|
def __call__(self, data: dict):
|
||||||
st = time.time()
|
st = time.time()
|
||||||
completion, choices = self.generate(data)
|
try:
|
||||||
|
completion, choices = self.generate(data)
|
||||||
|
except InferenceServerException as exc:
|
||||||
|
# status: unavailable -- this happens if the `model` string is invalid
|
||||||
|
print(exc)
|
||||||
|
if exc.status() == 'StatusCode.UNAVAILABLE':
|
||||||
|
print(
|
||||||
|
f"WARNING: Model '{data['model']}' is not available. Please ensure that "
|
||||||
|
"`model` is set to either 'fastertransformer' or 'py-model' depending on "
|
||||||
|
"your installation"
|
||||||
|
)
|
||||||
|
completion = {}
|
||||||
|
choices = []
|
||||||
ed = time.time()
|
ed = time.time()
|
||||||
print(f"Returned completion in {(ed - st) * 1000} ms")
|
print(f"Returned completion in {(ed - st) * 1000} ms")
|
||||||
if data.get('stream', False):
|
if data.get('stream', False):
|
||||||
|
19
copilot_proxy/utils/errors.py
Normal file
19
copilot_proxy/utils/errors.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from typing import *
|
||||||
|
|
||||||
|
class FauxPilotException(Exception):
|
||||||
|
def __init__(self, message: str, type: Optional[str] = None, param: Optional[str] = None, code: Optional[int] = None):
|
||||||
|
super().__init__(message)
|
||||||
|
self.message = message
|
||||||
|
self.type = type
|
||||||
|
self.param = param
|
||||||
|
self.code = code
|
||||||
|
|
||||||
|
def json(self):
|
||||||
|
return {
|
||||||
|
'error': {
|
||||||
|
'message': self.message,
|
||||||
|
'type': self.type,
|
||||||
|
'param': self.param,
|
||||||
|
'code': self.code
|
||||||
|
}
|
||||||
|
}
|
@ -1,14 +1,17 @@
|
|||||||
version: '3.3'
|
version: '3.3'
|
||||||
services:
|
services:
|
||||||
triton:
|
triton:
|
||||||
image: moyix/triton_with_ft:22.09
|
build:
|
||||||
|
context: .
|
||||||
|
dockerfile: Dockerfile
|
||||||
command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
|
command: bash -c "CUDA_VISIBLE_DEVICES=${GPUS} mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
|
||||||
shm_size: '2gb'
|
shm_size: '2gb'
|
||||||
volumes:
|
volumes:
|
||||||
- ${MODEL_DIR}:/model
|
- ${MODEL_DIR}:/model
|
||||||
|
- ${HF_CACHE_DIR}:/root/.cache/huggingface
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8000:8000"
|
||||||
- "8001:8001"
|
- "${TRITON_PORT}:8001"
|
||||||
- "8002:8002"
|
- "8002:8002"
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
@ -23,11 +26,11 @@ services:
|
|||||||
# command: python3 -m flask run --host=0.0.0.0 --port=5000
|
# command: python3 -m flask run --host=0.0.0.0 --port=5000
|
||||||
# For local build
|
# For local build
|
||||||
build:
|
build:
|
||||||
context: ./copilot_proxy
|
context: .
|
||||||
dockerfile: Dockerfile
|
dockerfile: copilot_proxy/Dockerfile
|
||||||
command: uvicorn app:app --host 0.0.0.0 --port 5000
|
command: uvicorn app:app --host 0.0.0.0 --port 5000
|
||||||
env_file:
|
env_file:
|
||||||
# You can modify this env file to configure your proxy environment
|
# Automatically created via ./setup.sh
|
||||||
- example.env
|
- .env
|
||||||
ports:
|
ports:
|
||||||
- "5000:5000"
|
- "${API_EXTERNAL_PORT}:5000"
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
We offer some ways to connect to FAUXPILOT Server. For example, you can create a client by how to open the Openai API, Copilot Plugin, REST API.
|
There are several ways to connect to the FauxPilot server: you can build a client on top of the OpenAI API, use the Copilot plugin, or call the REST API directly.
|
||||||
|
|
||||||
## API
|
## API
|
||||||
|
|
||||||
@ -64,3 +64,9 @@ And you should be able to use Copilot with your own locally hosted suggestions!
|
|||||||
Another issue with using the Copilot plugin is that its tokenizer (the component that turns text into a sequence of integers for the model) is slightly different from the one used by CodeGen, so the plugin will sometimes send a request that is longer than CodeGen can handle. You can work around this by replacing the `vocab.bpe` and `tokenizer.json` found in the Copilot extension (something like `.vscode/extensions/github.copilot-[version]/dist/`) with the ones found [here](https://github.com/moyix/fauxpilot/tree/main/copilot_proxy/cgtok/openai_format).
|
Another issue with using the Copilot plugin is that its tokenizer (the component that turns text into a sequence of integers for the model) is slightly different from the one used by CodeGen, so the plugin will sometimes send a request that is longer than CodeGen can handle. You can work around this by replacing the `vocab.bpe` and `tokenizer.json` found in the Copilot extension (something like `.vscode/extensions/github.copilot-[version]/dist/`) with the ones found [here](https://github.com/moyix/fauxpilot/tree/main/copilot_proxy/cgtok/openai_format).
|
||||||
|
|
||||||
Have fun!
|
Have fun!
|
||||||
|
|
||||||
|
## GitLab - VS Code extension
|
||||||
|
|
||||||
|
Another option is to use the [GitLab VS Code extension](https://marketplace.visualstudio.com/items?itemName=GitLab.gitlab-workflow) which has support for FauxPilot.
|
||||||
|
|
||||||
|
Contributions are encouraged :smile: https://gitlab.com/gitlab-org/gitlab-vscode-extension
|
||||||
|
@ -1,4 +0,0 @@
|
|||||||
TRITON_HOST=triton
|
|
||||||
TRITON_PORT=8001
|
|
||||||
API_HOST=0.0.0.0
|
|
||||||
API_PORT=5000
|
|
BIN
img/fauxpilot.png
Normal file
BIN
img/fauxpilot.png
Normal file
Binary file not shown.
After ![]() (image error) Size: 2.3 MiB |
47
launch.sh
47
launch.sh
@ -1,19 +1,40 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
# Read in config.env file; error if not found
|
# Read in .env file; error if not found
|
||||||
if [ ! -f config.env ]; then
|
if [ ! -f .env ]; then
|
||||||
echo "config.env not found, please run setup.sh"
|
echo ".env not found, running setup.sh"
|
||||||
exit 1
|
bash setup.sh
|
||||||
fi
|
fi
|
||||||
source config.env
|
source .env
|
||||||
|
|
||||||
export NUM_GPUS=${NUM_GPUS}
|
function showhelp () {
|
||||||
export MODEL_DIR="${MODEL_DIR}"/"${MODEL}-${NUM_GPUS}gpu"
|
# Display Help
|
||||||
export GPUS=$(seq 0 $(( NUM_GPUS - 1 )) | paste -sd ',')
|
echo
|
||||||
|
echo "Usage: $0 [option...]"
|
||||||
|
echo "options:"
|
||||||
|
echo " -h Print this help."
|
||||||
|
echo " -d Start in daemon mode."
|
||||||
|
echo
|
||||||
|
}
|
||||||
|
|
||||||
# On newer versions, docker-compose is docker compose
|
while getopts "hd" option; do
|
||||||
if command -v docker-compose > /dev/null; then
|
case $option in
|
||||||
docker compose up
|
h)
|
||||||
|
showhelp
|
||||||
|
exit;;
|
||||||
|
d)
|
||||||
|
options="-d"
|
||||||
|
;;
|
||||||
|
\?) # incorrect option
|
||||||
|
echo "Error: Invalid option"
|
||||||
|
exit;;
|
||||||
|
esac
|
||||||
|
done
|
||||||
|
|
||||||
|
# On Docker 20.10.2 and newer, docker-compose is invoked as `docker compose`
|
||||||
|
smaller=$(printf "$(docker --version | egrep -o '[0-9]+\.[0-9]+\.[0-9]+')\n20.10.2" | sort -V | head -n1)
|
||||||
|
if [[ "$smaller" == "20.10.2" ]]; then
|
||||||
|
docker compose up $options --remove-orphans --build
|
||||||
else
|
else
|
||||||
docker-compose up
|
docker-compose up $options --remove-orphans --build
|
||||||
fi
|
fi;
|
||||||
|
180
python_backend/config_template.pbtxt
Normal file
180
python_backend/config_template.pbtxt
Normal file
@ -0,0 +1,180 @@
|
|||||||
|
name: "py-model"
|
||||||
|
backend: "python"
|
||||||
|
max_batch_size: 4
|
||||||
|
input [
|
||||||
|
{
|
||||||
|
name: "input_ids"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ -1 ]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "start_id"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "end_id"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "input_lengths"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "request_output_len"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ -1 ]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "runtime_top_k"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "runtime_top_p"
|
||||||
|
data_type: TYPE_FP32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "beam_search_diversity_rate"
|
||||||
|
data_type: TYPE_FP32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "temperature"
|
||||||
|
data_type: TYPE_FP32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "len_penalty"
|
||||||
|
data_type: TYPE_FP32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "repetition_penalty"
|
||||||
|
data_type: TYPE_FP32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "random_seed"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "is_return_log_probs"
|
||||||
|
data_type: TYPE_BOOL
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "beam_width"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 1 ]
|
||||||
|
reshape: { shape: [ ] }
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "bad_words_list"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 2, -1 ]
|
||||||
|
optional: true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
# UNUSED
|
||||||
|
name: "stop_words_list"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ 2, -1 ]
|
||||||
|
optional: true
|
||||||
|
}
|
||||||
|
]
|
||||||
|
output [
|
||||||
|
{
|
||||||
|
name: "output_ids"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ -1, -1, -1 ]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "sequence_length"
|
||||||
|
data_type: TYPE_INT32
|
||||||
|
dims: [ -1, -1 ]
|
||||||
|
} #,
|
||||||
|
# Following is currently unsupported, but should be supported in the future
|
||||||
|
# {
|
||||||
|
# name: "cum_log_probs"
|
||||||
|
# data_type: TYPE_FP32
|
||||||
|
# dims: [ -1 ]
|
||||||
|
# },
|
||||||
|
# {
|
||||||
|
# name: "output_log_probs"
|
||||||
|
# data_type: TYPE_FP32
|
||||||
|
# dims: [ -1, -1 ]
|
||||||
|
# }
|
||||||
|
]
|
||||||
|
# Instance placement: a single model instance of kind KIND_CPU
|
||||||
|
instance_group [
|
||||||
|
{
|
||||||
|
count: 1
|
||||||
|
kind: KIND_CPU
|
||||||
|
}
|
||||||
|
]
|
||||||
|
parameters {
  key: "use_half"
  value: {
    # Filled in by init_model.py from its --use_half argument. This used to be
    # hard-coded to "1", which made that argument a no-op.
    string_value: "${use_half}" # e.g. "0" or "1"
  }
}
|
||||||
|
parameters {
|
||||||
|
key: "model_name"
|
||||||
|
value: {
|
||||||
|
string_value: "${model_name}" # e.g. "codegen-350M-multi"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parameters {
|
||||||
|
key: "org_name"
|
||||||
|
value: {
|
||||||
|
string_value: "${org_name}" # e.g. "Salesforce"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parameters {
|
||||||
|
key: "use_int8",
|
||||||
|
value: {
|
||||||
|
string_value: "${use_int8}" # e.g. "0" or "1"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
parameters {
|
||||||
|
key: "use_auto_device_map",
|
||||||
|
value: {
|
||||||
|
string_value: "${use_auto_device_map}" # e.g. "0" or "1"
|
||||||
|
}
|
||||||
|
}
|
44
python_backend/init_model.py
Normal file
44
python_backend/init_model.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
"""
|
||||||
|
A simple script that sets up the model directory of a given model for Triton.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from string import Template
|
||||||
|
|
||||||
|
# The config template lives next to this script.
SCRIPT_DIR = Path(__file__).parent
CONFIG_TEMPLATE_PATH = os.path.join(SCRIPT_DIR, 'config_template.pbtxt')

parser = argparse.ArgumentParser()
# Where to create the model directory and which model it is for
parser.add_argument("--model_dir", type=str, required=True)
parser.add_argument("--model_name", type=str, required=True)
parser.add_argument("--org_name", type=str, required=True)
# Flags forwarded verbatim (as strings) into the generated config
parser.add_argument("--use_half", type=str, default="1")
parser.add_argument("--use_int8", type=str, default="0")
parser.add_argument("--use_auto_device_map", type=str, default="1")
args = parser.parse_args()


# Step 1: create <model_dir>/py-<org>-<model>/py-model/1
model_dir_path = Path(args.model_dir) / f"py-{args.org_name}-{args.model_name}/py-model/1"
model_dir_path.mkdir(parents=True, exist_ok=True)

# Step 2: put a copy of model.py into that directory
shutil.copy(SCRIPT_DIR / 'model.py', model_dir_path / 'model.py')

# Step 3: render the template and write it one level above the "1" directory
with open(CONFIG_TEMPLATE_PATH, 'r') as f:
    template = Template(f.read())

config = template.substitute(
    use_auto_device_map=args.use_auto_device_map,
    use_int8=args.use_int8,
    use_half=args.use_half,
    model_name=args.model_name,
    org_name=args.org_name,
)
with open(os.path.join(model_dir_path, '../config.pbtxt'), 'w') as f:
    f.write(config)
    print(f"Config written to {os.path.abspath(f.name)}")
|
102
python_backend/model.py
Normal file
102
python_backend/model.py
Normal file
@ -0,0 +1,102 @@
|
|||||||
|
import json
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import triton_python_backend_utils as pb_utils
|
||||||
|
# Using dlpack causes segfaults on some machines, so not using it for now
|
||||||
|
# But it supports zero copy transfer from triton tensors to torch tensors,
|
||||||
|
# so worth investigating further
|
||||||
|
# from torch.utils.dlpack import to_dlpack, from_dlpack
|
||||||
|
from transformers import AutoModelForCausalLM
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
def pb2torch(request, name):
    """Look up the input tensor called *name* on *request* and return it as a torch tensor."""
    array = pb_utils.get_input_tensor_by_name(request, name).as_numpy()
    # DLPack alternative, currently disabled: from_dlpack(pb_tensor.to_dlpack())
    return torch.from_numpy(array)
|
||||||
|
|
||||||
|
|
||||||
|
def torch2pb(name, tensor):
    """Wrap a torch tensor as a Triton tensor called *name*.

    The tensor is detached and copied to host memory first, because
    ``Tensor.numpy()`` raises for tensors that live on a CUDA device or that
    require grad. Both calls are no-ops for a plain CPU tensor.
    """
    # DLPack alternative, currently disabled:
    # pb_utils.Tensor.from_dlpack(name, to_dlpack(tensor))
    return pb_utils.Tensor(name, tensor.detach().cpu().numpy())
|
||||||
|
|
||||||
|
|
||||||
|
class TritonPythonModel:
    """Triton Python-backend model that serves a HuggingFace causal LM."""

    def initialize(self, args):
        """Load the model and tokenizer named in the Triton model config.

        ``args["model_config"]`` is parsed as JSON. Its ``parameters`` section
        supplies ``org_name`` (defaults to "Salesforce"), ``model_name`` and the
        ``use_half`` / ``use_int8`` / ``use_auto_device_map`` flags.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        org_name = model_config["parameters"].get("org_name", {"string_value": "Salesforce"})["string_value"]
        model_name = org_name + "/" + model_config["parameters"]["model_name"]["string_value"]

        def get_bool(x):
            # Parameters arrive as strings; "1" and "true" (any case) mean enabled.
            return model_config["parameters"][x]["string_value"].lower() in ["1", "true"]

        # Every flag below only takes effect when CUDA is available.
        is_half = get_bool("use_half") and torch.cuda.is_available()
        # This will make inference marginally slower, but will allow bigger models to fit in GPU
        int8 = get_bool("use_int8") and torch.cuda.is_available()
        auto_device_map = get_bool("use_auto_device_map") and torch.cuda.is_available()

        print("Cuda available?", torch.cuda.is_available())
        print(f"is_half: {is_half}, int8: {int8}, auto_device_map: {auto_device_map}")
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if is_half else ("auto" if torch.cuda.is_available() else torch.float32),
            load_in_8bit=int8,
            device_map="auto" if auto_device_map else None,
            low_cpu_mem_usage=True,
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        print(f"Model {model_name} Loaded. Footprint: {self.model.get_memory_footprint()}")

        # set max_batch_size
        self.max_batch_size = 0  # model_config["max_batch_size"]

    @staticmethod
    def _optional_scalar(request, name):
        """Return the first element of input *name*, or None if the request omits it."""
        tensor = pb_utils.get_input_tensor_by_name(request, name)
        if tensor is None:
            return None
        return tensor.as_numpy().tolist()[0]

    def execute(self, requests):
        """Run sampling-based generation for each request and return one response per request.

        Each response carries ``output_ids`` (with a singleton axis inserted at
        position 1) and ``sequence_length`` (the per-row count of tokens that
        differ from the model's EOS token id).
        """
        # TODO: don't just loop over requests. batch them up

        responses = []

        for request in requests:
            input_ids_torch = pb2torch(request, "input_ids")
            input_lengths_torch = pb2torch(request, "input_lengths")
            request_output_len_torch = pb2torch(request, "request_output_len")

            # Attention mask: only needed when the rows have different lengths
            attention_mask = None
            if input_lengths_torch.min() != input_lengths_torch.max():
                attention_mask = torch.zeros(input_ids_torch.shape, dtype=torch.long)
                for i, length in enumerate(input_lengths_torch):
                    attention_mask[i, :length] = 1

            # Output length, as a plain int rather than a 0-dim tensor
            max_new_tokens = int(request_output_len_torch[0][0])

            # These inputs are marked optional in the model config, so a request
            # may omit them. Only forward the ones that were actually sent and
            # let generate() use its own defaults for the rest.
            sampling_kwargs = {
                key: value
                for key, value in (
                    ("top_k", self._optional_scalar(request, "runtime_top_k")),
                    ("top_p", self._optional_scalar(request, "runtime_top_p")),
                    ("temperature", self._optional_scalar(request, "temperature")),
                )
                if value is not None
            }
            # n_samples = pb_utils.get_input_tensor_by_name(request, "n")
            n_samples = 1  # TODO: client doesn't send this yet. instead it duplicates the request n times

            # Generate
            output_ids = self.model.generate(
                input_ids=input_ids_torch, attention_mask=attention_mask,
                max_new_tokens=max_new_tokens, do_sample=True, num_return_sequences=n_samples,
                **sampling_kwargs,
            )

            # client wants batch x beam_width x seq_len and we don't support beam_width yet
            output_ids = output_ids.unsqueeze(1)

            # create output tensors
            out_tensor_pb = torch2pb("output_ids", output_ids)

            # calculate sequence_length
            sequence_length = torch.zeros(output_ids.shape[:2], dtype=torch.int32)
            for i in range(output_ids.shape[0]):
                sequence_length[i, 0] = torch.sum(output_ids[i, 0] != self.model.config.eos_token_id).item()
            sequence_length_pb = torch2pb("sequence_length", sequence_length)

            # create response
            response = pb_utils.InferenceResponse([out_tensor_pb, sequence_length_pb])
            responses.append(response)

        return responses
|
225
setup.sh
225
setup.sh
@ -1,9 +1,15 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
if [ -f config.env ]; then
|
if [ -f .env ]; then
|
||||||
echo "config.env already exists, skipping"
|
read -rp ".env already exists, do you want to delete .env and recreate it? [y/n] " DELETE
|
||||||
echo "Please delete config.env if you want to re-run this script"
|
if [[ ${DELETE:-y} =~ ^[Yy]$ ]]
|
||||||
exit 1
|
then
|
||||||
|
echo "Deleting .env"
|
||||||
|
rm .env
|
||||||
|
else
|
||||||
|
echo "Exiting"
|
||||||
|
exit 0
|
||||||
|
fi;
|
||||||
fi
|
fi
|
||||||
|
|
||||||
function check_dep(){
|
function check_dep(){
|
||||||
@ -17,73 +23,170 @@ check_dep curl
|
|||||||
check_dep zstd
|
check_dep zstd
|
||||||
check_dep docker
|
check_dep docker
|
||||||
|
|
||||||
|
############### Common configuration ###############
|
||||||
echo "Models available:"
|
|
||||||
echo "[1] codegen-350M-mono (2GB total VRAM required; Python-only)"
|
|
||||||
echo "[2] codegen-350M-multi (2GB total VRAM required; multi-language)"
|
|
||||||
echo "[3] codegen-2B-mono (7GB total VRAM required; Python-only)"
|
|
||||||
echo "[4] codegen-2B-multi (7GB total VRAM required; multi-language)"
|
|
||||||
echo "[5] codegen-6B-mono (13GB total VRAM required; Python-only)"
|
|
||||||
echo "[6] codegen-6B-multi (13GB total VRAM required; multi-language)"
|
|
||||||
echo "[7] codegen-16B-mono (32GB total VRAM required; Python-only)"
|
|
||||||
echo "[8] codegen-16B-multi (32GB total VRAM required; multi-language)"
|
|
||||||
# Read their choice
|
|
||||||
read -p "Enter your choice [6]: " MODEL_NUM
|
|
||||||
|
|
||||||
# Convert model number to model name
|
|
||||||
case $MODEL_NUM in
|
|
||||||
1) MODEL="codegen-350M-mono" ;;
|
|
||||||
2) MODEL="codegen-350M-multi" ;;
|
|
||||||
3) MODEL="codegen-2B-mono" ;;
|
|
||||||
4) MODEL="codegen-2B-multi" ;;
|
|
||||||
5) MODEL="codegen-6B-mono" ;;
|
|
||||||
6) MODEL="codegen-6B-multi" ;;
|
|
||||||
7) MODEL="codegen-16B-mono" ;;
|
|
||||||
8) MODEL="codegen-16B-multi" ;;
|
|
||||||
*) MODEL="codegen-6B-multi" ;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# Read number of GPUs
|
# Read number of GPUs
|
||||||
read -p "Enter number of GPUs [1]: " NUM_GPUS
|
read -rp "Enter number of GPUs [1]: " NUM_GPUS
|
||||||
NUM_GPUS=${NUM_GPUS:-1}
|
NUM_GPUS=${NUM_GPUS:-1}
|
||||||
|
|
||||||
# Read model directory
|
read -rp "External port for the API [5000]: " API_EXTERNAL_PORT
|
||||||
read -p "Where do you want to save the model [$(pwd)/models]? " MODEL_DIR
|
API_EXTERNAL_PORT=${API_EXTERNAL_PORT:-5000}
|
||||||
if [ -z "$MODEL_DIR" ]; then
|
|
||||||
MODEL_DIR="$(pwd)/models"
|
read -rp "Address for Triton [triton]: " TRITON_HOST
|
||||||
|
TRITON_HOST=${TRITON_HOST:-triton}
|
||||||
|
|
||||||
|
read -rp "Port of Triton host [8001]: " TRITON_PORT
|
||||||
|
TRITON_PORT=${TRITON_PORT:-8001}
|
||||||
|
|
||||||
|
# Read models root directory (all models go under this)
|
||||||
|
read -rp "Where do you want to save your models [$(pwd)/models]? " MODELS_ROOT_DIR
|
||||||
|
if [ -z "$MODELS_ROOT_DIR" ]; then
|
||||||
|
MODELS_ROOT_DIR="$(pwd)/models"
|
||||||
else
|
else
|
||||||
MODEL_DIR="$(readlink -m "${MODEL_DIR}")"
|
MODELS_ROOT_DIR="$(readlink -m "${MODELS_ROOT_DIR}")"
|
||||||
fi
|
fi
|
||||||
|
mkdir -p "$MODELS_ROOT_DIR"
|
||||||
|
|
||||||
# Write config.env
|
# Write .env
|
||||||
echo "MODEL=${MODEL}" > config.env
|
echo "NUM_GPUS=${NUM_GPUS}" >> .env
|
||||||
echo "NUM_GPUS=${NUM_GPUS}" >> config.env
|
echo "GPUS=$(seq 0 $(( NUM_GPUS - 1)) | paste -s -d ',' -)" >> .env
|
||||||
echo "MODEL_DIR=${MODEL_DIR}" >> config.env
|
echo "API_EXTERNAL_PORT=${API_EXTERNAL_PORT}" >> .env
|
||||||
|
echo "TRITON_HOST=${TRITON_HOST}" >> .env
|
||||||
|
echo "TRITON_PORT=${TRITON_PORT}" >> .env
|
||||||
|
|
||||||
if [ -d "$MODEL_DIR"/"${MODEL}"-${NUM_GPUS}gpu ]; then
|
############### Backend specific configuration ###############
|
||||||
echo "Converted model for ${MODEL}-${NUM_GPUS}gpu already exists."
|
|
||||||
read -p "Do you want to re-use it? y/n: " REUSE_CHOICE
|
function fastertransformer_backend(){
|
||||||
if [ "${REUSE_CHOICE^^}" = "Y" ]; then
|
echo "Models available:"
|
||||||
exit 0
|
echo "[1] codegen-350M-mono (2GB total VRAM required; Python-only)"
|
||||||
|
echo "[2] codegen-350M-multi (2GB total VRAM required; multi-language)"
|
||||||
|
echo "[3] codegen-2B-mono (7GB total VRAM required; Python-only)"
|
||||||
|
echo "[4] codegen-2B-multi (7GB total VRAM required; multi-language)"
|
||||||
|
echo "[5] codegen-6B-mono (13GB total VRAM required; Python-only)"
|
||||||
|
echo "[6] codegen-6B-multi (13GB total VRAM required; multi-language)"
|
||||||
|
echo "[7] codegen-16B-mono (32GB total VRAM required; Python-only)"
|
||||||
|
echo "[8] codegen-16B-multi (32GB total VRAM required; multi-language)"
|
||||||
|
# Read their choice
|
||||||
|
read -rp "Enter your choice [6]: " MODEL_NUM
|
||||||
|
|
||||||
|
# Convert model number to model name
|
||||||
|
case $MODEL_NUM in
|
||||||
|
1) MODEL="codegen-350M-mono" ;;
|
||||||
|
2) MODEL="codegen-350M-multi" ;;
|
||||||
|
3) MODEL="codegen-2B-mono" ;;
|
||||||
|
4) MODEL="codegen-2B-multi" ;;
|
||||||
|
5) MODEL="codegen-6B-mono" ;;
|
||||||
|
6) MODEL="codegen-6B-multi" ;;
|
||||||
|
7) MODEL="codegen-16B-mono" ;;
|
||||||
|
8) MODEL="codegen-16B-multi" ;;
|
||||||
|
*) MODEL="codegen-6B-multi" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
echo "MODEL=${MODEL}" >> .env
|
||||||
|
echo "MODEL_DIR=${MODELS_ROOT_DIR}/${MODEL}-${NUM_GPUS}gpu" >> .env
|
||||||
|
|
||||||
|
if (test -d "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu ); then
|
||||||
|
echo "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
|
||||||
|
echo "Converted model for ${MODEL}-${NUM_GPUS}gpu already exists."
|
||||||
|
read -rp "Do you want to re-use it? y/n: " REUSE_CHOICE
|
||||||
|
if [[ ${REUSE_CHOICE:-y} =~ ^[Yy]$ ]]
|
||||||
|
then
|
||||||
|
DOWNLOAD_MODEL=n
|
||||||
|
echo "Re-using model"
|
||||||
|
else
|
||||||
|
DOWNLOAD_MODEL=y
|
||||||
|
rm -rf "$MODELS_ROOT_DIR"/"${MODEL}"-"${NUM_GPUS}"gpu
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
DOWNLOAD_MODEL=y
|
||||||
fi
|
fi
|
||||||
fi
|
|
||||||
|
|
||||||
# Create model directory
|
if [[ ${DOWNLOAD_MODEL:-y} =~ ^[Yy]$ ]]
|
||||||
mkdir -p "${MODEL_DIR}"
|
then
|
||||||
|
if [ "$NUM_GPUS" -le 2 ]; then
|
||||||
|
echo "Downloading the model from HuggingFace, this will take a while..."
|
||||||
|
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
|
||||||
|
DEST="${MODEL}-${NUM_GPUS}gpu"
|
||||||
|
ARCHIVE="${MODELS_ROOT_DIR}/${DEST}.tar.zst"
|
||||||
|
cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODELS_ROOT_DIR}"
|
||||||
|
curl -L "https://huggingface.co/moyix/${MODEL}-gptj/resolve/main/${MODEL}-${NUM_GPUS}gpu.tar.zst" \
|
||||||
|
-o "$ARCHIVE"
|
||||||
|
zstd -dc "$ARCHIVE" | tar -xf - -C "${MODELS_ROOT_DIR}"
|
||||||
|
rm -f "$ARCHIVE"
|
||||||
|
else
|
||||||
|
echo "Downloading and converting the model, this will take a while..."
|
||||||
|
docker run --rm -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" moyix/model_converter:latest
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
# For some of the models we can download it preconverted.
|
# Not used for this backend but needs to be present
|
||||||
if [ $NUM_GPUS -le 2 ]; then
|
HF_CACHE_DIR="$(pwd)/.hf_cache"
|
||||||
echo "Downloading the model from HuggingFace, this will take a while..."
|
mkdir -p "$HF_CACHE_DIR"
|
||||||
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
|
echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env
|
||||||
DEST="${MODEL}-${NUM_GPUS}gpu"
|
}
|
||||||
ARCHIVE="${MODEL_DIR}/${DEST}.tar.zst"
|
|
||||||
cp -r "$SCRIPT_DIR"/converter/models/"$DEST" "${MODEL_DIR}"
|
function python_backend(){
|
||||||
curl -L "https://huggingface.co/moyix/${MODEL}-gptj/resolve/main/${MODEL}-${NUM_GPUS}gpu.tar.zst" \
|
echo "Models available:"
|
||||||
-o "$ARCHIVE"
|
echo "[1] codegen-350M-mono (1GB total VRAM required; Python-only)"
|
||||||
zstd -dc "$ARCHIVE" | tar -xf - -C "${MODEL_DIR}"
|
echo "[2] codegen-350M-multi (1GB total VRAM required; multi-language)"
|
||||||
rm -f "$ARCHIVE"
|
echo "[3] codegen-2B-mono (4GB total VRAM required; Python-only)"
|
||||||
|
echo "[4] codegen-2B-multi (4GB total VRAM required; multi-language)"
|
||||||
|
|
||||||
|
read -rp "Enter your choice [4]: " MODEL_NUM
|
||||||
|
|
||||||
|
# Convert model number to model name
|
||||||
|
case $MODEL_NUM in
|
||||||
|
1) MODEL="codegen-350M-mono"; ORG="Salesforce" ;;
|
||||||
|
2) MODEL="codegen-350M-multi"; ORG="Salesforce" ;;
|
||||||
|
3) MODEL="codegen-2B-mono"; ORG="Salesforce" ;;
|
||||||
|
4) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
|
||||||
|
*) MODEL="codegen-2B-multi"; ORG="Salesforce" ;;
|
||||||
|
esac
|
||||||
|
|
||||||
|
# share huggingface cache? Should be safe to share, but permission issues may arise depending upon your docker setup
|
||||||
|
read -rp "Do you want to share your huggingface cache between host and docker container? y/n [n]: " SHARE_HF_CACHE
|
||||||
|
SHARE_HF_CACHE=${SHARE_HF_CACHE:-n}
|
||||||
|
if [[ ${SHARE_HF_CACHE:-y} =~ ^[Yy]$ ]]; then
|
||||||
|
read -rp "Enter your huggingface cache directory [$HOME/.cache/huggingface]: " HF_CACHE_DIR
|
||||||
|
HF_CACHE_DIR=${HF_CACHE_DIR:-$HOME/.cache/huggingface}
|
||||||
|
else
|
||||||
|
HF_CACHE_DIR="$(pwd)/.hf_cache"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# use int8? Allows larger models to fit in GPU but might be very marginally slower
|
||||||
|
read -rp "Do you want to use int8? y/n [y]: " USE_INT8
|
||||||
|
if [[ ${USE_INT8:-y} =~ ^[Nn]$ ]]; then
|
||||||
|
USE_INT8="0"
|
||||||
|
else
|
||||||
|
USE_INT8="1"
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Write .env
|
||||||
|
echo "MODEL=py-${MODEL}" >> .env
|
||||||
|
echo "MODEL_DIR=${MODELS_ROOT_DIR}/py-${ORG}-${MODEL}" >> .env # different format from fastertransformer backend
|
||||||
|
echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env
|
||||||
|
|
||||||
|
python3 ./python_backend/init_model.py --model_name "${MODEL}" --org_name "${ORG}" --model_dir "${MODELS_ROOT_DIR}" --use_int8 "${USE_INT8}"
|
||||||
|
bash -c "source .env ; docker compose build || docker-compose build"
|
||||||
|
}
|
||||||
|
|
||||||
|
# choose backend
|
||||||
|
echo "Choose your backend:"
|
||||||
|
echo "[1] FasterTransformer backend (faster, but limited models)"
|
||||||
|
echo "[2] Python backend (slower, but more models, and allows loading with int8)"
|
||||||
|
read -rp "Enter your choice [1]: " BACKEND_NUM
|
||||||
|
|
||||||
|
# An empty answer means the default backend (1); stderr is silenced so a
# non-numeric answer also falls through to the default without a test error.
if [ "${BACKEND_NUM:-1}" -eq 2 ] 2>/dev/null; then
|
||||||
|
python_backend
|
||||||
else
|
else
|
||||||
echo "Downloading and converting the model, this will take a while..."
|
fastertransformer_backend
|
||||||
docker run --rm -v ${MODEL_DIR}:/models -e MODEL=${MODEL} -e NUM_GPUS=${NUM_GPUS} moyix/model_converter:latest
|
fi
|
||||||
|
|
||||||
|
read -rp "Config complete, do you want to run FauxPilot? [y/n] " RUN
|
||||||
|
if [[ ${RUN:-y} =~ ^[Yy]$ ]]
|
||||||
|
then
|
||||||
|
bash ./launch.sh
|
||||||
|
else
|
||||||
|
echo "You can run ./launch.sh to start the FauxPilot server."
|
||||||
|
exit 0
|
||||||
fi
|
fi
|
||||||
echo "Done! Now run ./launch.sh to start the FauxPilot server."
|
|
||||||
|
6
shutdown.sh
Executable file
6
shutdown.sh
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
#!/usr/bin/env bash
|
||||||
|
|
||||||
|
source .env
|
||||||
|
|
||||||
|
# On newer versions, docker-compose is docker compose
|
||||||
|
docker compose down --remove-orphans || docker-compose down --remove-orphans
|
2
tests/python_backend/.gitignore
vendored
Normal file
2
tests/python_backend/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
test.env
|
||||||
|
models/*
|
35
tests/python_backend/docker-compose-with-gpus.yaml
Normal file
35
tests/python_backend/docker-compose-with-gpus.yaml
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
version: '3.3'
|
||||||
|
services:
|
||||||
|
triton:
|
||||||
|
build:
|
||||||
|
context: ../../
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
|
||||||
|
shm_size: '2gb'
|
||||||
|
volumes:
|
||||||
|
- ${MODEL_DIR}:/model
|
||||||
|
- ${HF_CACHE_DIR}:/root/.cache/huggingface
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
- "8001:8001"
|
||||||
|
- "8002:8002"
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: all
|
||||||
|
capabilities: [gpu]
|
||||||
|
copilot_proxy:
|
||||||
|
# For dockerhub version
|
||||||
|
# image: moyix/copilot_proxy:latest
|
||||||
|
# command: python3 -m flask run --host=0.0.0.0 --port=5000
|
||||||
|
# For local build
|
||||||
|
build:
|
||||||
|
context: ../../
|
||||||
|
dockerfile: copilot_proxy/Dockerfile
|
||||||
|
env_file:
|
||||||
|
# Automatically created via ./setup.sh
|
||||||
|
- test.env
|
||||||
|
ports:
|
||||||
|
- "${API_EXTERNAL_PORT}:5000"
|
28
tests/python_backend/docker-compose-without-gpus.yaml
Normal file
28
tests/python_backend/docker-compose-without-gpus.yaml
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
version: '3.3'
|
||||||
|
services:
|
||||||
|
triton:
|
||||||
|
build:
|
||||||
|
context: ../../
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
command: bash -c "CUDA_VISIBLE_DEVICES="${GPUS}" mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model"
|
||||||
|
shm_size: '2gb'
|
||||||
|
volumes:
|
||||||
|
- ${MODEL_DIR}:/model
|
||||||
|
- ${HF_CACHE_DIR}:/root/.cache/huggingface
|
||||||
|
ports:
|
||||||
|
- "8000:8000"
|
||||||
|
- "8001:8001"
|
||||||
|
- "8002:8002"
|
||||||
|
copilot_proxy:
|
||||||
|
# For dockerhub version
|
||||||
|
# image: moyix/copilot_proxy:latest
|
||||||
|
# command: python3 -m flask run --host=0.0.0.0 --port=5000
|
||||||
|
# For local build
|
||||||
|
build:
|
||||||
|
context: ../../
|
||||||
|
dockerfile: copilot_proxy/Dockerfile
|
||||||
|
env_file:
|
||||||
|
# Automatically created via ./setup.sh
|
||||||
|
- test.env
|
||||||
|
ports:
|
||||||
|
- "${API_EXTERNAL_PORT}:5000"
|
3
tests/python_backend/requirements.txt
Normal file
3
tests/python_backend/requirements.txt
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
pytest
|
||||||
|
pexpect
|
||||||
|
requests
|
8
tests/python_backend/runner.env
Normal file
8
tests/python_backend/runner.env
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
NUM_GPUS=1
|
||||||
|
GPUS=0
|
||||||
|
API_EXTERNAL_PORT=5000
|
||||||
|
TRITON_HOST=triton
|
||||||
|
TRITON_PORT=8001
|
||||||
|
MODEL=py-codegen-350M-mono
|
||||||
|
MODEL_DIR=${HOME}/models/py-Salesforce-codegen-350M-mono
|
||||||
|
HF_CACHE_DIR=${HOME}/.cache/huggingface
|
163
tests/python_backend/test_setup.py
Normal file
163
tests/python_backend/test_setup.py
Normal file
@ -0,0 +1,163 @@
|
|||||||
|
"Tests setup script (currently for Python backend)"
|
||||||
|
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import signal
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Union
|
||||||
|
|
||||||
|
import pexpect
|
||||||
|
import pytest
|
||||||
|
import requests
|
||||||
|
|
||||||
|
curdir = Path(__file__).parent
|
||||||
|
root = curdir.parent.parent
|
||||||
|
|
||||||
|
test_models_dir = curdir/"models"
|
||||||
|
|
||||||
|
|
||||||
|
def setup_module():
    """Check that setup.sh is present and move any existing .env aside to .env.bak."""
    setup_script = root / "setup.sh"
    assert setup_script.exists(), "setup.sh not found"

    env_file = root / ".env"
    if not env_file.exists():
        return
    shutil.move(str(env_file), str(root / ".env.bak"))
|
||||||
|
|
||||||
|
def teardown_module():
    """Restore the backed-up .env, then try to remove the test models directory."""
    backup = root / ".env.bak"
    if backup.exists():
        shutil.move(str(backup), str(root / ".env"))

    try:
        if test_models_dir.exists():
            shutil.rmtree(test_models_dir)
    except Exception as exc:  # best-effort cleanup: report the failure and carry on
        # The trailing space after "issues." keeps the two sentences from running together.
        print(
            f"WARNING: Couldn't delete `{test_models_dir}` most likely due to permission issues. "
            f"Run the tests with sudo to ensure this gets deleted automatically, or else delete manually. Exception: {exc}"
        )
|
||||||
|
|
||||||
|
def enter_input(proc: pexpect.spawn, expect: str, input_s: str, timeout: int = 5) -> str:
    """Wait for a prompt and answer it.

    Waits until `proc` emits output matching the pattern `expect`, prints the
    matched text, sends `input_s` followed by a newline, and returns the
    matched text.

    Raises AssertionError (chained from the pexpect TIMEOUT) when the prompt
    does not show up within `timeout` seconds.
    """
    try:
        proc.expect(expect, timeout=timeout)
    except pexpect.exceptions.TIMEOUT as exc:
        failure = (
            f"Timeout waiting for prompt: `{expect}`.\n"
            f"Output-before: `{proc.before}`\nOutput-after: `{proc.after}`"
        )
        raise AssertionError(failure) from exc

    consumed = str(proc.after)
    print(consumed)
    proc.sendline(input_s)
    return consumed
|
||||||
|
|
||||||
|
def run_common_setup_steps(n_gpus: int = 0) -> pexpect.spawn:
    """Spawn setup.sh and answer the prompts that come before the backend choice.

    Returns the still-running pexpect process so the caller can answer the
    remaining prompts.
    """
    setup_proc = pexpect.pty_spawn.spawn(
        "./setup.sh 2>&1", encoding="utf-8", cwd=str(root),
    )
    setup_proc.ignorecase = True

    # (prompt pattern, reply) pairs, in the order the prompts are answered.
    answers = (
        (r".*Enter number of GPUs[^:]+: ?", str(n_gpus)),
        (r".*port for the API[^:]+: ?", "5000"),
        (r".*Address for Triton[^:]+: ?", "triton"),
        (r".*Port of Triton[^:]+: ?", "8001"),
        (r".*save your models[^\?]+\? ?", str(test_models_dir.absolute())),
    )
    for prompt_pattern, reply in answers:
        enter_input(setup_proc, prompt_pattern, reply)

    return setup_proc
|
||||||
|
|
||||||
|
def load_test_env(env_file: Union[str, Path, None] = None) -> Dict[str, str]:
    """Return a copy of the process environment overlaid with KEY=VALUE pairs from a file.

    Args:
        env_file: Path of the env file to read. Defaults to `test.env` next to
            this module.

    Returns:
        A new dict; `os.environ` itself is not modified.

    Raises:
        ValueError: if a non-blank, non-comment line contains no `=`.
    """
    if env_file is None:
        env_file = curdir / "test.env"

    # Without loading default env vars, PATH won't be set correctly
    env = os.environ.copy()
    with open(env_file, "r", encoding="utf8") as test_env:
        for line in test_env:
            entry = line.strip()
            # Blank lines and comments carry no variable.
            if not entry or entry.startswith("#"):
                continue
            # Split on the first "=" only, so values may themselves contain "=".
            key, separator, val = entry.partition("=")
            if not separator:
                raise ValueError(f"Malformed line in {env_file}: {entry!r}")
            env[key] = val
    return env
|
||||||
|
|
||||||
|
def run_inference(
    prompt: str, model: str = "py-model", port: int = 5000, return_all: bool = False,
    *, timeout: float = 300.0, **kwargs
) -> Union[str, Dict]:
    """Invoke the copilot proxy with the given prompt and return the completion.

    Args:
        prompt: Text to complete.
        model: Value sent as the request's "model" field.
        port: Local port the request is sent to.
        return_all: When true, return the whole decoded JSON response instead
            of only the first choice's text.
        timeout: Seconds passed to `requests.post`, so a stalled server fails
            the test instead of hanging it indefinitely.
        **kwargs: Optional overrides for the remaining request fields
            (suffix, max_tokens, temperature, top_p, n, stream, logprobs,
            stop, echo, presence_penalty, frequency_penalty, best_of,
            logit_bias, user).

    Raises:
        requests.HTTPError: if the response status indicates an error.
    """
    endpoint = f"http://localhost:{port}/v1/engines/codegen/completions"
    data = {
        "model": model,
        "prompt": prompt,
        "suffix": kwargs.get("suffix", ""),
        "max_tokens": kwargs.get("max_tokens", 16),
        "temperature": kwargs.get("temperature", 0.0),
        "top_p": kwargs.get("top_p", 1.0),
        "n": kwargs.get("n", 1),
        "stream": kwargs.get("stream", None),  # it's not true/false. It's None or not None :[
        "logprobs": kwargs.get("logprobs", 0),
        "stop": kwargs.get("stop", ""),
        "echo": kwargs.get("echo", True),
        "presence_penalty": kwargs.get("presence_penalty", 0.0),
        "frequency_penalty": kwargs.get("frequency_penalty", 0.0),
        "best_of": kwargs.get("best_of", 1),
        "logit_bias": kwargs.get("logit_bias", {}),
        "user": kwargs.get("user", "test"),
    }

    response = requests.post(endpoint, json=data, timeout=timeout)
    response.raise_for_status()

    payload = response.json()
    if return_all:
        return payload
    return payload["choices"][0]["text"]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize("n_gpus", [0])  # we don't have a GPU on CI
def test_python_backend(n_gpus: int):
    """
    Step 1: run $root/setup.sh while passing appropriate options via stdin
    Step 2: run `docker compose up` with the variables from test.env
    Step 3: call :5000 with appropriate request
    """
    proc = run_common_setup_steps(n_gpus)

    choices = enter_input(proc, r".*Choose your backend.*Enter your choice[^:]+: ?", "2")
    assert "[2] Python backend" in choices, "Option 2 should be Python backend"

    choices = enter_input(proc, r".*Models available:.*Enter your choice[^:]+: ?", "1")
    assert "[1] codegen-350M-mono" in choices, "Option 1 should be codegen-350M-mono"

    enter_input(proc, r".*share (your )?huggingface cache[^:]+: ?", "y")
    enter_input(proc, r".*cache directory[^:]+: ?", "")  # default
    enter_input(proc, r".*use int8[^:]+: ?", "n")
    enter_input(proc, r".*run FauxPilot\? \[y/n\] ", "n", timeout=120)

    # copy $root/.env to $curdir/test.env
    shutil.copy(str(root/".env"), str(curdir/"test.env"))

    # run docker compose -f docker-compose-{without|with}-gpus.yaml up
    compose_file = f"docker-compose-with{'' if n_gpus > 0 else 'out'}-gpus.yaml"
    # Read test.env once; the same environment is used to start and to stop the stack.
    compose_env = load_test_env()
    docker_proc = None
    try:
        docker_proc = pexpect.pty_spawn.spawn(
            f"docker compose -f {compose_file} up",
            encoding="utf-8",
            cwd=curdir,
            env=compose_env,
        )

        print("Waiting for API to be ready...")
        docker_proc.expect(r".*Started GRPCInferenceService at 0.0.0.0:8001", timeout=120)

        print("API ready, sending request...")

        # Simple test 1: hello world prompt without bells and whistles
        response = run_inference("def hello_world():\n", max_tokens=16, return_all=True)
        # NOTE(review): the leading whitespace inside the expected text may have been
        # collapsed when this file was extracted - confirm against the model output.
        assert response["choices"][0]["text"].rstrip() == ' print("Hello World")\n\nhello_world()\n\n#'
        assert response["choices"][0]["finish_reason"] == "length"

    finally:
        if docker_proc is not None and docker_proc.isalive():
            docker_proc.kill(signal.SIGINT)

        # killing the `docker compose up` process doesn't bring down the containers.
        # explicitly stop the containers, using the same `docker compose` CLI that
        # started them so the test depends on a single Compose installation:
        subprocess.run(
            ["docker", "compose", "-f", compose_file, "down"],
            cwd=curdir, check=True, env=compose_env,
        )
|
Loading…
x
Reference in New Issue
Block a user