返回模型

说明文档

启用键值缓存的 Phi 3.5 mini instruct ONNX fp16 格式模型

模型创建者：Microsoft
原始模型：Phi 3.5 mini instruct

描述

本仓库包含由 Esperanto Technologies 制作的 Phi 3.5 mini instruct ONNX 转换文件。模型采用 fp16 格式，并启用了 KVC（键值缓存）功能。

如何下载 ONNX 模型和权重文件

获取模型最简单的方法是克隆整个仓库。或者，你也可以使用 huggingface-hub Python 库来下载文件。

pip3 install huggingface-hub>=0.17.1

然后你可以使用如下命令将任意单个模型文件高速下载到当前目录：

huggingface-cli download Esperanto/phi-3.5-mini-instruct-kvc-fp16-onnx  --local-dir phi-3.5-mini-instruct-kvc-fp16-onnx  --local-dir-use-symlinks False

关于使用 huggingface-cli 下载的更多文档，请参阅：HF -> Hub Python Library -> Download files -> Download from the CLI。

如何使用 ONNXRuntime 从 Python 代码运行

此模型可以轻松在 CPU 上使用 ONNXRuntime 运行。

首先安装依赖包

pip3 install onnx==1.16.1
pip3 install onnxruntime==1.17.1

示例代码：使用此模型生成文本

我们定义一个使用贪婪解码的循环：

import numpy as np
import onnxruntime
import onnx
from transformers import AutoTokenizer
def generate_text(model_path, prompt, tokenizer, max_gen_tokens, total_sequence, window, context):
    model = onnx.load(model_path)
    #我们为第一次迭代创建输入
    input_tensor = tokenizer(prompt, return_tensors=\"pt\")
    prompt_size = len(input_tensor['input_ids'][0])
    actual_input = input_tensor['input_ids']
    if prompt_size < window:
        actual_input = np.concatenate((tokenizer.bos_token_id*np.ones([1, window - prompt_size], dtype = 'int64'),
                                       actual_input), axis=1)
    if prompt_size + max_gen_tokens > total_sequence:
        print(\"ERROR: Longer total sequence is needed!\")
        return
    first_attention = np.concatenate((np.zeros([1, total_sequence - window], dtype = 'int64'),
                                      np.ones((1, window), dtype = 'int64')), axis=1)
    max_gen_tokens += prompt_size #我们需要在解析提示词的基础上进行生成
    inputs_names =[node.name for node in model.graph.input]
    output_names =[node.name for node in model.graph.output]
    n_heads = 32 #kvc的gqa头数
    inputs_dict = {}
    inputs_dict['input_ids'] = actual_input[:, :window].reshape(1, window).numpy()
    inputs_dict['attention_mask'] = first_attention
    index_pos = sum(first_attention[0])
    inputs_dict['position_ids'] = np.concatenate((np.zeros([1, total_sequence - index_pos], dtype = 'int64'), np.arange(index_pos, dtype = 'int64').reshape(1, index_pos)), axis=1)
    inputs_dict['tree_attention'] = np.triu(-65504*np.ones(total_sequence), k= 1).astype('float16').reshape(1, 1, total_sequence, total_sequence)
    for name in inputs_names:
        if name == 'input_ids' or name == 'attention_mask' or name == 'position_ids' or name == 'tree_attention': continue
        inputs_dict[name] = np.zeros([1, n_heads, context-window, 96], dtype=\"float16\")
    index = 0
    new_token = np.array([10])
    next_index = window
    old_j = 0
    total_input = actual_input.numpy()
    rt_session = onnxruntime.InferenceSession(model_path)
    ## 运行推理
    while next_index < max_gen_tokens:
        if new_token.any() == tokenizer.eos_token_id:
            break
        #推理
        output = rt_session.run(output_names, inputs_dict)
        outs_dictionary = {name: content for (name, content) in zip (output_names, output)}
        #我们为下一次推理准备输入
        for name in inputs_names:
            if name == 'input_ids':
                old_j = next_index
                if next_index < prompt_size:
                    if prompt_size - next_index >= window: next_index += window
                    else: next_index = prompt_size 
                    j = next_index - window
                else:
                    next_index +=1
                    j = next_index - window
                    new_token = outs_dictionary['logits'].argmax(-1).reshape(1, window)
                    total_input = np.concatenate((total_input, new_token[: , -1:]), axis = 1)
                inputs_dict['input_ids']= total_input[:, j:next_index].reshape(1, window)
            elif name == 'attention_mask':
                inputs_dict['attention_mask'] = np.concatenate((np.zeros((1, total_sequence-next_index), dtype = 'int64'), np.ones((1, next_index), dtype = 'int64')), axis=1)
            elif name == 'position_ids':
                inputs_dict['position_ids'] = np.concatenate((np.zeros([1, total_sequence - next_index], dtype = 'int64'), np.arange(next_index, dtype = 'int64').reshape(1, next_index)), axis=1)
            elif name == 'tree_attention': continue
            else:
                old_name = name.replace(\"past_key_values\", \"present\")
                inputs_dict[name] = outs_dictionary[old_name][:, :, next_index-old_j:context-window+(next_index - old_j), :]
    answer = tokenizer.decode(total_input[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return answer

现在我们运行推理：

tokenizer = AutoTokenizer.from_pretrained(\"Esperanto/phi-3.5-mini-instruct-kvc-fp16-onnx\")
model_path = \"phi-3.5-mini-instruct-kvc-fp16-onnx/model.onnx\"
max_gen_tokens = 20    #我们想要生成的token数量
total_sequence = 128   #总序列长度
context = 1024         #用于扩展kvc的上下文
window = 16            #每次想要解析的token数量
messages = [
    {\"role\": \"system\", \"content\": \"You are a pirate chatbot who always responds in pirate speak!\"},
    {\"role\": \"user\", \"content\": \"Who are you?\"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
generated = generate_text(model_path, prompt, tokenizer, max_gen_tokens, total_sequence, window, context)
print(generated)

Esperanto/phi-3.5-mini-instruct-kvc-fp16-onnx

作者 Esperanto

text-generation

↓ 1 ♥ 0

创建时间: 2024-08-26 14:51:25+00:00

更新时间: 2024-12-11 14:23:12+00:00

在 Hugging Face 上查看

文件 (204)

.gitattributes

README.md

added_tokens.json

config.json

model.embed_tokens.weight

model.layers.0.input_layernorm.weight

model.layers.0.post_attention_layernorm.weight

model.layers.1.input_layernorm.weight

model.layers.1.post_attention_layernorm.weight

model.layers.10.input_layernorm.weight

model.layers.10.post_attention_layernorm.weight

model.layers.11.input_layernorm.weight

model.layers.11.post_attention_layernorm.weight

model.layers.12.input_layernorm.weight

model.layers.12.post_attention_layernorm.weight

model.layers.13.input_layernorm.weight

model.layers.13.post_attention_layernorm.weight

model.layers.14.input_layernorm.weight

model.layers.14.post_attention_layernorm.weight

model.layers.15.input_layernorm.weight

model.layers.15.post_attention_layernorm.weight

model.layers.16.input_layernorm.weight

model.layers.16.post_attention_layernorm.weight

model.layers.17.input_layernorm.weight

model.layers.17.post_attention_layernorm.weight

model.layers.18.input_layernorm.weight

model.layers.18.post_attention_layernorm.weight

model.layers.19.input_layernorm.weight

model.layers.19.post_attention_layernorm.weight

model.layers.2.input_layernorm.weight

model.layers.2.post_attention_layernorm.weight

model.layers.20.input_layernorm.weight

model.layers.20.post_attention_layernorm.weight

model.layers.21.input_layernorm.weight

model.layers.21.post_attention_layernorm.weight

model.layers.22.input_layernorm.weight

model.layers.22.post_attention_layernorm.weight

model.layers.23.input_layernorm.weight

model.layers.23.post_attention_layernorm.weight

model.layers.24.input_layernorm.weight

model.layers.24.post_attention_layernorm.weight

model.layers.25.input_layernorm.weight

model.layers.25.post_attention_layernorm.weight

model.layers.26.input_layernorm.weight

model.layers.26.post_attention_layernorm.weight

model.layers.27.input_layernorm.weight

model.layers.27.post_attention_layernorm.weight

model.layers.28.input_layernorm.weight

model.layers.28.post_attention_layernorm.weight

model.layers.29.input_layernorm.weight

model.layers.29.post_attention_layernorm.weight

model.layers.3.input_layernorm.weight

model.layers.3.post_attention_layernorm.weight

model.layers.30.input_layernorm.weight

model.layers.30.post_attention_layernorm.weight

model.layers.31.input_layernorm.weight

model.layers.31.post_attention_layernorm.weight

model.layers.4.input_layernorm.weight

model.layers.4.post_attention_layernorm.weight

model.layers.5.input_layernorm.weight

model.layers.5.post_attention_layernorm.weight

model.layers.6.input_layernorm.weight

model.layers.6.post_attention_layernorm.weight

model.layers.7.input_layernorm.weight

model.layers.7.post_attention_layernorm.weight

model.layers.8.input_layernorm.weight

model.layers.8.post_attention_layernorm.weight

model.layers.9.input_layernorm.weight

model.layers.9.post_attention_layernorm.weight

model.norm.weight

model.onnx ONNX

onnx__MatMul_6634

onnx__MatMul_6668

onnx__MatMul_6669

onnx__MatMul_6670

onnx__MatMul_6671

onnx__MatMul_6703

onnx__MatMul_6704

onnx__MatMul_6705

onnx__MatMul_6706

onnx__MatMul_6738

onnx__MatMul_6739

onnx__MatMul_6740

onnx__MatMul_6741

onnx__MatMul_6773

onnx__MatMul_6774

onnx__MatMul_6775

onnx__MatMul_6776

onnx__MatMul_6808

onnx__MatMul_6809

onnx__MatMul_6810

onnx__MatMul_6811

onnx__MatMul_6843

onnx__MatMul_6844

onnx__MatMul_6845

onnx__MatMul_6846

onnx__MatMul_6878

onnx__MatMul_6879

onnx__MatMul_6880

onnx__MatMul_6881

onnx__MatMul_6913

onnx__MatMul_6914

onnx__MatMul_6915

onnx__MatMul_6916

onnx__MatMul_6948

onnx__MatMul_6949

onnx__MatMul_6950

onnx__MatMul_6951

onnx__MatMul_6983

onnx__MatMul_6984

onnx__MatMul_6985

onnx__MatMul_6986

onnx__MatMul_7018

onnx__MatMul_7019

onnx__MatMul_7020

onnx__MatMul_7021

onnx__MatMul_7053

onnx__MatMul_7054

onnx__MatMul_7055

onnx__MatMul_7056

onnx__MatMul_7088

onnx__MatMul_7089

onnx__MatMul_7090

onnx__MatMul_7091

onnx__MatMul_7123

onnx__MatMul_7124

onnx__MatMul_7125

onnx__MatMul_7126

onnx__MatMul_7158

onnx__MatMul_7159

onnx__MatMul_7160

onnx__MatMul_7161

onnx__MatMul_7193

onnx__MatMul_7194

onnx__MatMul_7195

onnx__MatMul_7196

onnx__MatMul_7228

onnx__MatMul_7229

onnx__MatMul_7230

onnx__MatMul_7231

onnx__MatMul_7263

onnx__MatMul_7264

onnx__MatMul_7265

onnx__MatMul_7266

onnx__MatMul_7298

onnx__MatMul_7299

onnx__MatMul_7300

onnx__MatMul_7301

onnx__MatMul_7333

onnx__MatMul_7334

onnx__MatMul_7335

onnx__MatMul_7336

onnx__MatMul_7368

onnx__MatMul_7369

onnx__MatMul_7370

onnx__MatMul_7371

onnx__MatMul_7403

onnx__MatMul_7404

onnx__MatMul_7405

onnx__MatMul_7406

onnx__MatMul_7438

onnx__MatMul_7439

onnx__MatMul_7440

onnx__MatMul_7441

onnx__MatMul_7473

onnx__MatMul_7474

onnx__MatMul_7475

onnx__MatMul_7476

onnx__MatMul_7508

onnx__MatMul_7509

onnx__MatMul_7510

onnx__MatMul_7511

onnx__MatMul_7543

onnx__MatMul_7544

onnx__MatMul_7545

onnx__MatMul_7546

onnx__MatMul_7578

onnx__MatMul_7579

onnx__MatMul_7580

onnx__MatMul_7581

onnx__MatMul_7613

onnx__MatMul_7614

onnx__MatMul_7615

onnx__MatMul_7616

onnx__MatMul_7648

onnx__MatMul_7649

onnx__MatMul_7650

onnx__MatMul_7651

onnx__MatMul_7683

onnx__MatMul_7684

onnx__MatMul_7685

onnx__MatMul_7686

onnx__MatMul_7718

onnx__MatMul_7719

onnx__MatMul_7720

onnx__MatMul_7721

onnx__MatMul_7753

onnx__MatMul_7754

onnx__MatMul_7755

onnx__MatMul_7756

special_tokens_map.json

tokenizer.json

tokenizer.model

tokenizer_config.json