ONNX 模型库
返回模型

说明文档

Gemma 7B 指令模型,启用键值缓存,ONNX fp16 格式

<!-- description start -->

描述

本仓库包含由 Esperanto Technologies 完成的 Gemma 7B instruct ONNX 转换的 ONNX 文件。 该模型采用 fp16 格式,并启用了 KVC(键值缓存)。

<!-- description end -->

如何下载 ONNX 模型和权重文件

获取模型最简单的方法是克隆整个仓库。 或者,您也可以使用 huggingface-hub Python 库下载文件。

pip3 install huggingface-hub>=0.17.1

然后您可以使用如下命令将任意单个模型文件高速下载到当前目录:

huggingface-cli download Esperanto/gemma-7b-it-kvc-fp16-onnx --local-dir gemma-7b-it-kvc-fp16-onnx --local-dir-use-symlinks False

关于使用 huggingface-cli 下载的更多文档,请参阅:HF -> Hub Python Library -> Download files -> Download from the CLI

如何使用 ONNXRuntime 从 Python 代码运行

此模型可以使用 ONNXRuntime 在 CPU 上轻松运行。

首先安装依赖包

pip3 install onnx==1.16.1
pip3 install onnxruntime==1.17.1

示例代码:使用此模型生成文本

我们使用贪婪解码定义循环:

import numpy as np
import onnxruntime
import onnx
from transformers import AutoTokenizer
def generate_text(model_path, prompt, tokenizer, max_gen_tokens, total_sequence, window, context):
    model = onnx.load(model_path)
    #我们为第一次迭代创建输入
    input_tensor = tokenizer(prompt, return_tensors=\"pt\")
    prompt_size = len(input_tensor['input_ids'][0])
    actual_input = input_tensor['input_ids']
    if prompt_size < window:
        actual_input = np.concatenate((tokenizer.bos_token_id*np.ones([1, window - prompt_size], dtype = 'int64'),
                                       actual_input), axis=1)
    if prompt_size + max_gen_tokens > total_sequence:
        print(\"ERROR: Longer total sequence is needed!\")
        return
    first_attention = np.concatenate((np.zeros([1, total_sequence - window], dtype = 'int64'),
                                      np.ones((1, window), dtype = 'int64')), axis=1)
    max_gen_tokens += prompt_size #我们需要在解析提示词的基础上生成
    inputs_names =[node.name for node in model.graph.input]
    output_names =[node.name for node in model.graph.output]
    n_heads = 16 #kvc的gqa头数
    inputs_dict = {}
    inputs_dict['input_ids'] = actual_input[:, :window].reshape(1, window).numpy()
    inputs_dict['attention_mask'] = first_attention
    index_pos = sum(first_attention[0])
    inputs_dict['position_ids'] = np.concatenate((np.zeros([1, total_sequence - index_pos], dtype = 'int64'), np.arange(index_pos, dtype = 'int64').reshape(1, index_pos)), axis=1)
    inputs_dict['tree_attention'] = np.triu(-65504*np.ones(total_sequence), k= 1).astype('float16').reshape(1, 1, total_sequence, total_sequence)
    for name in inputs_names:
        if name == 'input_ids' or name == 'attention_mask' or name == 'position_ids' or name == 'tree_attention': continue
        inputs_dict[name] = np.zeros([1, n_heads, context-window, 256], dtype=\"float16\")
    index = 0
    new_token = np.array([10])
    next_index = window
    old_j = 0
    total_input = actual_input.numpy()
    rt_session = onnxruntime.InferenceSession(model_path)
    ## 我们运行推理
    while next_index < max_gen_tokens:
        if new_token.any() == tokenizer.eos_token_id:
            break
        #推理
        output = rt_session.run(output_names, inputs_dict)
        outs_dictionary = {name: content for (name, content) in zip (output_names, output)}
        #我们为下一次推理准备输入
        for name in inputs_names:
            if name == 'input_ids':
                old_j = next_index
                if next_index < prompt_size:
                    if prompt_size - next_index >= window: next_index += window
                    else: next_index = prompt_size 
                    j = next_index - window
                else:
                    next_index +=1
                    j = next_index - window
                    new_token = outs_dictionary['logits'].argmax(-1).reshape(1, window)
                    total_input = np.concatenate((total_input, new_token[: , -1:]), axis = 1)
                inputs_dict['input_ids']= total_input[:, j:next_index].reshape(1, window)
            elif name == 'attention_mask':
                inputs_dict['attention_mask'] = np.concatenate((np.zeros((1, total_sequence-next_index), dtype = 'int64'), np.ones((1, next_index), dtype = 'int64')), axis=1)
            elif name == 'position_ids':
                inputs_dict['position_ids'] = np.concatenate((np.zeros([1, total_sequence - next_index], dtype = 'int64'), np.arange(next_index, dtype = 'int64').reshape(1, next_index)), axis=1)
            elif name == 'tree_attention': continue
            else:
                old_name = name.replace(\"past_key_values\", \"present\")
                inputs_dict[name] = outs_dictionary[old_name][:, :, next_index-old_j:context-window+(next_index - old_j), :]
    answer = tokenizer.decode(total_input[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return answer

现在我们运行推理:

tokenizer = AutoTokenizer.from_pretrained(\"Esperanto/gemma-7b-it-kvc-fp16-onnx\")
model_path = \"gemma-7b-it-kvc-fp16-onnx/model.onnx\"
max_gen_tokens = 20    #我们想要生成的token数量
total_sequence = 128   #总序列长度
context = 1024         #扩展kvc的上下文
window = 16            #我们想要一次解析的token数量
messages = [
    {\"role\": \"system\", \"content\": \"You are a pirate chatbot who always responds in pirate speak!\"},
    {\"role\": \"user\", \"content\": \"Who are you?\"},
]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
generated = generate_text(model_path, prompt, tokenizer, max_gen_tokens, total_sequence, window, context)
print(generated)

Esperanto/gemma-7b-it-kvc-fp16-onnx

作者 Esperanto

text-generation
↓ 1 ♥ 0

创建时间: 2024-08-02 10:54:00+00:00

更新时间: 2024-12-11 14:16:18+00:00

在 Hugging Face 上查看

文件 (264)

.gitattributes
README.md
config.json
model.embed_tokens.weight
model.onnx ONNX
onnx__MatMul_5671
onnx__MatMul_5672
onnx__MatMul_5673
onnx__MatMul_5693
onnx__MatMul_5696
onnx__MatMul_5697
onnx__MatMul_5698
onnx__MatMul_5701
onnx__MatMul_5702
onnx__MatMul_5703
onnx__MatMul_5721
onnx__MatMul_5724
onnx__MatMul_5725
onnx__MatMul_5726
onnx__MatMul_5729
onnx__MatMul_5730
onnx__MatMul_5731
onnx__MatMul_5749
onnx__MatMul_5752
onnx__MatMul_5753
onnx__MatMul_5754
onnx__MatMul_5757
onnx__MatMul_5758
onnx__MatMul_5759
onnx__MatMul_5777
onnx__MatMul_5780
onnx__MatMul_5781
onnx__MatMul_5782
onnx__MatMul_5785
onnx__MatMul_5786
onnx__MatMul_5787
onnx__MatMul_5805
onnx__MatMul_5808
onnx__MatMul_5809
onnx__MatMul_5810
onnx__MatMul_5813
onnx__MatMul_5814
onnx__MatMul_5815
onnx__MatMul_5833
onnx__MatMul_5836
onnx__MatMul_5837
onnx__MatMul_5838
onnx__MatMul_5841
onnx__MatMul_5842
onnx__MatMul_5843
onnx__MatMul_5861
onnx__MatMul_5864
onnx__MatMul_5865
onnx__MatMul_5866
onnx__MatMul_5869
onnx__MatMul_5870
onnx__MatMul_5871
onnx__MatMul_5889
onnx__MatMul_5892
onnx__MatMul_5893
onnx__MatMul_5894
onnx__MatMul_5897
onnx__MatMul_5898
onnx__MatMul_5899
onnx__MatMul_5917
onnx__MatMul_5920
onnx__MatMul_5921
onnx__MatMul_5922
onnx__MatMul_5925
onnx__MatMul_5926
onnx__MatMul_5927
onnx__MatMul_5945
onnx__MatMul_5948
onnx__MatMul_5949
onnx__MatMul_5950
onnx__MatMul_5953
onnx__MatMul_5954
onnx__MatMul_5955
onnx__MatMul_5973
onnx__MatMul_5976
onnx__MatMul_5977
onnx__MatMul_5978
onnx__MatMul_5981
onnx__MatMul_5982
onnx__MatMul_5983
onnx__MatMul_6001
onnx__MatMul_6004
onnx__MatMul_6005
onnx__MatMul_6006
onnx__MatMul_6009
onnx__MatMul_6010
onnx__MatMul_6011
onnx__MatMul_6029
onnx__MatMul_6032
onnx__MatMul_6033
onnx__MatMul_6034
onnx__MatMul_6037
onnx__MatMul_6038
onnx__MatMul_6039
onnx__MatMul_6057
onnx__MatMul_6060
onnx__MatMul_6061
onnx__MatMul_6062
onnx__MatMul_6065
onnx__MatMul_6066
onnx__MatMul_6067
onnx__MatMul_6085
onnx__MatMul_6088
onnx__MatMul_6089
onnx__MatMul_6090
onnx__MatMul_6093
onnx__MatMul_6094
onnx__MatMul_6095
onnx__MatMul_6113
onnx__MatMul_6116
onnx__MatMul_6117
onnx__MatMul_6118
onnx__MatMul_6121
onnx__MatMul_6122
onnx__MatMul_6123
onnx__MatMul_6141
onnx__MatMul_6144
onnx__MatMul_6145
onnx__MatMul_6146
onnx__MatMul_6149
onnx__MatMul_6150
onnx__MatMul_6151
onnx__MatMul_6169
onnx__MatMul_6172
onnx__MatMul_6173
onnx__MatMul_6174
onnx__MatMul_6177
onnx__MatMul_6178
onnx__MatMul_6179
onnx__MatMul_6197
onnx__MatMul_6200
onnx__MatMul_6201
onnx__MatMul_6202
onnx__MatMul_6205
onnx__MatMul_6206
onnx__MatMul_6207
onnx__MatMul_6225
onnx__MatMul_6228
onnx__MatMul_6229
onnx__MatMul_6230
onnx__MatMul_6233
onnx__MatMul_6234
onnx__MatMul_6235
onnx__MatMul_6253
onnx__MatMul_6256
onnx__MatMul_6257
onnx__MatMul_6258
onnx__MatMul_6261
onnx__MatMul_6262
onnx__MatMul_6263
onnx__MatMul_6281
onnx__MatMul_6284
onnx__MatMul_6285
onnx__MatMul_6286
onnx__MatMul_6289
onnx__MatMul_6290
onnx__MatMul_6291
onnx__MatMul_6309
onnx__MatMul_6312
onnx__MatMul_6313
onnx__MatMul_6314
onnx__MatMul_6317
onnx__MatMul_6318
onnx__MatMul_6319
onnx__MatMul_6337
onnx__MatMul_6340
onnx__MatMul_6341
onnx__MatMul_6342
onnx__MatMul_6345
onnx__MatMul_6346
onnx__MatMul_6347
onnx__MatMul_6365
onnx__MatMul_6368
onnx__MatMul_6369
onnx__MatMul_6370
onnx__MatMul_6373
onnx__MatMul_6374
onnx__MatMul_6375
onnx__MatMul_6393
onnx__MatMul_6396
onnx__MatMul_6397
onnx__MatMul_6398
onnx__MatMul_6401
onnx__MatMul_6402
onnx__MatMul_6403
onnx__MatMul_6421
onnx__MatMul_6424
onnx__MatMul_6425
onnx__MatMul_6426
onnx__MatMul_6429
onnx__MatMul_6430
onnx__MatMul_6431
onnx__MatMul_6449
onnx__MatMul_6452
onnx__MatMul_6453
onnx__MatMul_6454
onnx__MatMul_6457
onnx__Mul_5670
onnx__Mul_5695
onnx__Mul_5700
onnx__Mul_5723
onnx__Mul_5728
onnx__Mul_5751
onnx__Mul_5756
onnx__Mul_5779
onnx__Mul_5784
onnx__Mul_5807
onnx__Mul_5812
onnx__Mul_5835
onnx__Mul_5840
onnx__Mul_5863
onnx__Mul_5868
onnx__Mul_5891
onnx__Mul_5896
onnx__Mul_5919
onnx__Mul_5924
onnx__Mul_5947
onnx__Mul_5952
onnx__Mul_5975
onnx__Mul_5980
onnx__Mul_6003
onnx__Mul_6008
onnx__Mul_6031
onnx__Mul_6036
onnx__Mul_6059
onnx__Mul_6064
onnx__Mul_6087
onnx__Mul_6092
onnx__Mul_6115
onnx__Mul_6120
onnx__Mul_6143
onnx__Mul_6148
onnx__Mul_6171
onnx__Mul_6176
onnx__Mul_6199
onnx__Mul_6204
onnx__Mul_6227
onnx__Mul_6232
onnx__Mul_6255
onnx__Mul_6260
onnx__Mul_6283
onnx__Mul_6288
onnx__Mul_6311
onnx__Mul_6316
onnx__Mul_6339
onnx__Mul_6344
onnx__Mul_6367
onnx__Mul_6372
onnx__Mul_6395
onnx__Mul_6400
onnx__Mul_6423
onnx__Mul_6428
onnx__Mul_6451
onnx__Mul_6456
special_tokens_map.json
token_id_to_str.json
tokenizer.json
tokenizer.model
tokenizer_config.json