miniInference.py
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

# Bare-minimum script to verify that inference works with the base LLaMA model.
# load_in_8bit=True requires the bitsandbytes package; device_map="auto" requires accelerate.
model_path = "decapoda-research/llama-7b-hf"  # change this if the model is stored at a local path
print("loading model, path:", model_path)
model = LlamaForCausalLM.from_pretrained(
    model_path,
    load_in_8bit=True,
    device_map="auto",
    low_cpu_mem_usage=True,
)
tokenizer = LlamaTokenizer.from_pretrained(model_path)
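# Optional: this file comes from an alpaca-lora fork, so after verifying that the
# base model runs you may want to apply fine-tuned LoRA weights on top of it.
# A minimal sketch, assuming the peft library is installed and that the upstream
# adapter "tloen/alpaca-lora-7b" (or your own adapter path) is what you want to load:
#
#   from peft import PeftModel
#   model = PeftModel.from_pretrained(model, "tloen/alpaca-lora-7b")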
print("Human:")
line = input()
while line:
inputs = 'Human: ' + line.strip() + '\n\nAssistant:'
input_ids = tokenizer(inputs, return_tensors="pt").input_ids
input_ids = input_ids.cuda()
outputs = model.generate(input_ids, max_new_tokens=100, do_sample = True, top_k = 30, top_p = 0.85, temperature = 0.5, repetition_penalty=1., eos_token_id=2, bos_token_id=1, pad_token_id=0)
rets = tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)
print("Assistant:\n" + rets[0].strip().replace(inputs, ""))
print("\n------------------------------------------------\nHuman:")
line = input()