I'd been using DLPack for copying Triton tensors to Torch tensors, because it was advertised as performing zero-copy transfers. It turns out that only worked on my laptop and failed on other machines; I don't know why. For now, I'm copying the tensors as Triton <-> NumPy <-> Torch instead. That works on the VM where the earlier code was segfaulting.

Signed-off-by: Parth <thakkarparth007@gmail.com>
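For context, the NumPy round-trip described in the commit looks roughly like this inside a Triton Python backend's execute() method. This is a minimal sketch assuming the standard triton_python_backend_utils API; the helper names are illustrative, not taken from the repo:

    import torch
    import triton_python_backend_utils as pb_utils

    def triton_to_torch(request, name):
        # Triton tensor -> NumPy -> Torch, avoiding DLPack entirely.
        # torch.from_numpy shares memory with the NumPy array on CPU, so
        # an actual copy only happens if the result is moved to the GPU.
        np_arr = pb_utils.get_input_tensor_by_name(request, name).as_numpy()
        return torch.from_numpy(np_arr)

    def torch_to_triton(name, tensor):
        # Torch -> NumPy -> Triton tensor, for building the InferenceResponse.
        return pb_utils.Tensor(name, tensor.cpu().numpy())

This path is slower than a working zero-copy transfer, but it behaves the same on every machine, which is the trade-off the commit describes.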
181 lines · 3.0 KiB · Plaintext
name: "py-model"
|
|
backend: "python"
|
|
max_batch_size: 4
|
|
input [
|
|
{
|
|
name: "input_ids"
|
|
data_type: TYPE_INT32
|
|
dims: [ -1 ]
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "start_id"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "end_id"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
name: "input_lengths"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
},
|
|
{
|
|
name: "request_output_len"
|
|
data_type: TYPE_INT32
|
|
dims: [ -1 ]
|
|
},
|
|
{
|
|
name: "runtime_top_k"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
name: "runtime_top_p"
|
|
data_type: TYPE_FP32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "beam_search_diversity_rate"
|
|
data_type: TYPE_FP32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
name: "temperature"
|
|
data_type: TYPE_FP32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "len_penalty"
|
|
data_type: TYPE_FP32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "repetition_penalty"
|
|
data_type: TYPE_FP32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "random_seed"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "is_return_log_probs"
|
|
data_type: TYPE_BOOL
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "beam_width"
|
|
data_type: TYPE_INT32
|
|
dims: [ 1 ]
|
|
reshape: { shape: [ ] }
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "bad_words_list"
|
|
data_type: TYPE_INT32
|
|
dims: [ 2, -1 ]
|
|
optional: true
|
|
},
|
|
{
|
|
# UNUSED
|
|
name: "stop_words_list"
|
|
data_type: TYPE_INT32
|
|
dims: [ 2, -1 ]
|
|
optional: true
|
|
}
|
|
]
|
|
output [
|
|
{
|
|
name: "output_ids"
|
|
data_type: TYPE_INT32
|
|
dims: [ -1, -1, -1 ]
|
|
},
|
|
{
|
|
name: "sequence_length"
|
|
data_type: TYPE_INT32
|
|
dims: [ -1, -1 ]
|
|
} #,
|
|
# Following is currently unsupported, but should be supported in the future
|
|
# {
|
|
# name: "cum_log_probs"
|
|
# data_type: TYPE_FP32
|
|
# dims: [ -1 ]
|
|
# },
|
|
# {
|
|
# name: "output_log_probs"
|
|
# data_type: TYPE_FP32
|
|
# dims: [ -1, -1 ]
|
|
# }
|
|
]
|
|
# unsure what this is for
|
|
instance_group [
|
|
{
|
|
count: 1
|
|
kind: KIND_CPU
|
|
}
|
|
]
|
|
parameters {
|
|
key: "use_half"
|
|
value: {
|
|
string_value: "1"
|
|
}
|
|
}
|
|
parameters {
|
|
key: "model_name"
|
|
value: {
|
|
string_value: "${model_name}" # e.g. "codegen-350M-multi"
|
|
}
|
|
}
|
|
parameters {
|
|
key: "org_name"
|
|
value: {
|
|
string_value: "${org_name}" # e.g. "Salesforce"
|
|
}
|
|
}
|
|
parameters {
|
|
key: "use_int8",
|
|
value: {
|
|
string_value: "${use_int8}" # e.g. "0" or "1"
|
|
}
|
|
}
|
|
parameters {
|
|
key: "use_auto_device_map",
|
|
value: {
|
|
string_value: "${use_auto_device_map}" # e.g. "0" or "1"
|
|
}
|
|
}
|
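Note that the ${...} placeholders in the parameters blocks are template variables, presumably substituted with concrete values by the repo's setup tooling before Triton loads the config. For reference, a client request against this config might look like the following sketch. It exercises only the non-optional tensors (input_ids, input_lengths, request_output_len) and assumes the standard tritonclient HTTP API with a server at localhost:8000, which is an assumption rather than anything this file specifies:

    import numpy as np
    import tritonclient.http as httpclient

    client = httpclient.InferenceServerClient("localhost:8000")  # assumed address

    # One request in the batch (max_batch_size is 4). The dims in the config
    # exclude the batch dimension, which Triton adds implicitly.
    input_ids = np.array([[1, 2, 3, 4]], dtype=np.int32)              # dims: [ -1 ]
    input_lengths = np.array([[input_ids.shape[1]]], dtype=np.int32)  # dims: [ 1 ]
    output_len = np.array([[16]], dtype=np.int32)                     # dims: [ -1 ]

    inputs = []
    for name, arr in [("input_ids", input_ids),
                      ("input_lengths", input_lengths),
                      ("request_output_len", output_len)]:
        tensor = httpclient.InferInput(name, list(arr.shape), "INT32")
        tensor.set_data_from_numpy(arr)
        inputs.append(tensor)

    result = client.infer("py-model", inputs)
    print(result.as_numpy("output_ids"))       # dims [ -1, -1, -1 ]
    print(result.as_numpy("sequence_length"))  # dims [ -1, -1 ]

The optional inputs (temperature, runtime_top_k, and so on) can be added the same way when non-default sampling is wanted.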