# openai_eval.py (forked from EleutherAI/lm-evaluation-harness)
import argparse
import os
import time
import math
import openai
import json
from tqdm import tqdm
from datasets import load_dataset
from lm_eval.tasks.humaneval_ppl import tokenize_for_bleu_eval

# The private OpenAI API key needs to be an environment variable
openai.api_key = os.getenv('OPENAI_API_KEY')

# As instructed here: https://community.openai.com/t/token-logprobs-when-echo-is-true/9626/2
# "Transformer models don’t predict the probability of the first token. If you want to get the probability
# for your first token you can try to use <|endoftext|> as the first token as a workaround."
endoftext_token = '<|endoftext|>'

def oa_completion(**kwargs):
    """Query the OpenAI API for a completion.

    Retry with exponential back-off until the API responds.
    """
    backoff_time = 3
    while True:
        try:
            return openai.Completion.create(**kwargs)
        except openai.error.OpenAIError:
            import traceback
            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5

def call_codex(code_str):
    eos_code_str = endoftext_token + code_str
    # engine: 'cushman-codex' is used here; 'davinci-codex' is currently the most capable Codex model
    # max_tokens=0 means that we don't want the model to generate any additional tokens
    # logprobs=0 means that we don't want logprobs for alternative tokens, only for the actual prompt tokens
    # echo=True means that we want the model to echo our prompt, in addition to our (empty) completion
    completion = oa_completion(engine="cushman-codex", prompt=eos_code_str,
                               max_tokens=0,
                               temperature=0.0,
                               logprobs=0,
                               n=1,
                               echo=True)
    c = completion.choices[0]
    # skip the leading <|endoftext|> token when saving per-token logprobs
    saved_probs = {
        'text': code_str,
        'canonical_tokens': tokenize_for_bleu_eval(code_str),
        'tokens': c.logprobs.tokens[1:],
        'logprobs': c.logprobs.token_logprobs[1:],
        'sum_logprobs': sum(c.logprobs.token_logprobs[1:])
    }
    return saved_probs
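

# Illustrative sketch, not part of the original script: one way to use the two
# call_codex results saved below is to score only the canonical solution by
# dropping the prompt's token prefix from the full (prompt + solution) result.
# This assumes the prompt's tokenization is a prefix of the full string's
# tokenization, which can break at the prompt/solution boundary.
def solution_logprob(prompt_probs, full_probs):
    n_prompt_tokens = len(prompt_probs['tokens'])
    # tokens past the prompt prefix belong to the canonical solution
    solution_logprobs = full_probs['logprobs'][n_prompt_tokens:]
    return sum(solution_logprobs), len(solution_logprobs)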


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', type=str, required=False, default=os.devnull)
    args = parser.parse_args()

    results = {}

    # Load the HumanEval evaluation dataset
    human_eval = load_dataset("openai_humaneval")

    # Gather the prompt and canonical solution for every task in the evaluation set
    n_tasks = len(human_eval["test"])
    prompts = []
    solutions = []
    for task in range(n_tasks):
        prompts.append(human_eval["test"][task]["prompt"])
        solutions.append(human_eval["test"][task]["canonical_solution"])
    print('Loaded HumanEval:', len(prompts))

    log_probs_sum = 0
    tokens_count = 0
    ignored_files = []
    all_per_token_probs = []
    with open(args.output, 'w') as out_file:
        for idx, (prompt, solution) in tqdm(enumerate(zip(prompts, solutions))):
            # Score the prompt alone and the prompt + canonical solution, then write one JSON line per task
            prompt_per_token_probs = call_codex(prompt)
            full_per_token_probs = call_codex(prompt + solution)
            data_dict = {'prompt': prompt_per_token_probs, 'full': full_per_token_probs}
            out_file.write(json.dumps(data_dict) + '\n')
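
# Usage sketch (the output filename is illustrative): with OPENAI_API_KEY exported,
#   python openai_eval.py --output humaneval_codex_logprobs.jsonl
# writes one JSON object per HumanEval task, holding the per-token logprobs that
# call_codex returns for the bare prompt ('prompt') and for the prompt plus
# canonical solution ('full').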