forked from slee-lab/llama-recipes
-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathprocess_data.py
46 lines (37 loc) · 1.75 KB
/
process_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import json, jsonlines
import sys
# Location of the NERRE repository directory; the dataset paths in this script
# are built by concatenating sub-paths onto this string.
NERRErepo_dir = '../NERRE/'
# Plain string concatenation is used for paths, so make sure a trailing slash is present.
if not NERRErepo_dir.endswith('/'):
    NERRErepo_dir = NERRErepo_dir + '/'
# Number of leading records held out as the validation split. With 0 the
# validation split is an empty list and every record goes to training.
valsize=0
# process doping data for llama-2 finetuning
for mode in ['json','engextra','eng']:
    # Load only this scheme's records. `read` is rebound on every iteration so
    # that records from a previously processed scheme are not carried over
    # into the files written for the current scheme.
    with jsonlines.open(NERRErepo_dir+'doping/data/'+f'training_{mode}.jsonl') as f:
        read = list(f)
    # Rename the 'prompt'/'completion' keys to the 'input'/'output' layout that is written out.
    # Note that the doping 'prompt' has the instruction in it; feel free to explore
    # without it or with different ones.
    write=[{'input':dat['prompt'],'output':dat['completion']} for dat in read]
    # A validation set can be used for llama-2 finetuning, but with valsize=0 it is an empty list here.
    train_dat,val_dat = write[valsize:], write[:valsize]
    # One output file per split, written in the order train, then val.
    for ftmode,data in [('train',train_dat),('val',val_dat)]:
        with open(NERRErepo_dir+'doping/data/'+f"doping_data_forllama_{mode}scheme_{ftmode}.json","w") as file1:
            json.dump(data,file1)
# Convert the general and MOF fold data into .json files for llama-2 finetuning.
general_mof_root = NERRErepo_dir+'general_and_mofs/data/'
for taskdir in ['experiments_general','experiments_mof_gpt3']:
    for fold_idx in range(5):
        fold_path = general_mof_root+taskdir+'/'+f'fold_{fold_idx}/'
        # Note: val.jsonl is used as the test data.
        for split in ['train','val']:
            converted = []
            with jsonlines.open(fold_path+f'{split}.jsonl') as reader:
                for record in reader:
                    # No instruction is used for this task; feel free to test
                    # other instructions or formats here.
                    converted.append({'input':record['prompt'],'output':record['completion']})
            # The processed file is a .json file (one JSON array), not a .jsonl file.
            with open(fold_path+f"{split}.json","w") as out_file:
                json.dump(converted,out_file)