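"""evaluate_baselines.py

Compute baseline contamination-detection scores (Perplexity, TopKMin, and
Lowercase from constat.overlap, plus an optional reference-model perplexity)
for a language model on a benchmark dataset, and write the scores to a CSV
under data/baselines/.
"""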
import gc
import os

import torch
from transformers import set_seed

from constat.basic_model_loader import load_model
from constat.overlap import Perplexity, Lowercase, TopKMin
from prepare import *  # prepare_gsm8k, prepare_mmlu, prepare_arc, prepare_hellaswag

set_seed(42)

def run_model(model, tokenizer, df, output_path, batch_size,
              ref_model_name=None):
    set_seed(42)
    new_df = df.copy()

    # Perplexity of the output given the input, of the input alone, and of
    # the concatenated input+output. Skipped if scores are already present.
    if 'perplexity' not in new_df.columns:
        perplexity = Perplexity(model, tokenizer)
        new_df['perplexity_output'] = perplexity.batch_call(
            new_df['output'].tolist(), new_df['input'].tolist(), batch_size=batch_size)
        new_df['perplexity_input'] = perplexity.batch_call(
            new_df['input'].tolist(), batch_size=batch_size)
        new_df['perplexity_all'] = perplexity.batch_call(
            (new_df['input'] + new_df['output']).tolist(), batch_size=batch_size)

    # TopKMin baseline over the output and over input+output.
    if 'topkmin' not in new_df.columns:
        topkmin = TopKMin(model, tokenizer)
        new_df['topkmin'] = topkmin.batch_call(
            new_df['output'].tolist(), new_df['input'].tolist(), batch_size=batch_size)
        new_df['topkmin_all'] = topkmin.batch_call(
            (new_df['input'] + new_df['output']).tolist(), batch_size=batch_size)

    # Lowercase baseline over the output.
    if 'lowercase' not in new_df.columns:
        lowercase = Lowercase(model, tokenizer)
        new_df['lowercase'] = lowercase.batch_call(
            new_df['output'].tolist(), new_df['input'].tolist(), batch_size=batch_size)

    # Output perplexity under an independent reference model, used to
    # calibrate the main model's scores.
    if ref_model_name is not None and 'perplexity_ref' not in new_df.columns:
        model_ref, tokenizer_ref = load_model(ref_model_name, return_tokenizer=True)
        model_ref.eval()
        perplexity_ref = Perplexity(model_ref, tokenizer_ref)
        new_df['perplexity_ref'] = perplexity_ref.batch_call(
            new_df['output'].tolist(), new_df['input'].tolist(), batch_size=batch_size)
        # Free the reference model before writing results.
        del perplexity_ref, model_ref
        gc.collect()
        torch.cuda.empty_cache()

    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    new_df.to_csv(output_path, index=False)
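
# Example programmatic use (a minimal sketch; assumes the constat package,
# prepare.py, and a CUDA device are available; the output path is illustrative):
#   model, tokenizer = load_model('microsoft/phi-2', return_tokenizer=True)
#   model.eval()
#   df = prepare_gsm8k(no_cont=False, synthetic=False, few_shot=0)
#   run_model(model, tokenizer, df,
#             'data/baselines/microsoft/phi-2/gsm8k_False_0.csv',
#             batch_size=8, ref_model_name=None)
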
base_path = 'data/baselines'
base_path_data = 'data/contamination'

def main(model_name, benchmark_name, dataset_name,
         no_cont=False, batch_size=8, ref_model_name='microsoft/phi-2', synthetic=False):
    model, tokenizer = load_model(model_name, return_tokenizer=True, trust_remote_code=False)
    model.eval()

    # The file name encodes the no_cont flag; synthetic runs get their own suffix.
    output_path = os.path.join(base_path, model_name, dataset_name + f"_{no_cont}_0.csv")
    if synthetic:
        output_path = os.path.join(base_path, model_name, dataset_name + f"_{no_cont}_synthetic.csv")

    # All benchmarks are prepared zero-shot (few_shot=0).
    if benchmark_name == 'gsm8k':
        data = prepare_gsm8k(no_cont=no_cont, synthetic=synthetic, few_shot=0)
    elif benchmark_name == 'mmlu':
        data = prepare_mmlu(no_cont=no_cont, synthetic=synthetic, few_shot=0)
    elif benchmark_name == 'arc':
        data = prepare_arc(no_cont=no_cont, synthetic=synthetic, few_shot=0)
    elif benchmark_name == 'hellaswag':
        data = prepare_hellaswag(no_cont=no_cont, synthetic=synthetic, few_shot=0)
    else:
        raise ValueError(f"Unknown benchmark {benchmark_name}")

    run_model(model, tokenizer, data, output_path, batch_size, ref_model_name=ref_model_name)

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, required=True)
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--ref_model', type=str, default='microsoft/phi-2')
    parser.add_argument('--benchmark', type=str, required=True)
    parser.add_argument('--no-cont', action='store_true')  # parsed as args.no_cont
    parser.add_argument('--synthetic', action='store_true')
    args = parser.parse_args()

    # The benchmark name doubles as the dataset name.
    main(args.model_name, args.benchmark, args.benchmark,
         no_cont=args.no_cont, batch_size=args.batch_size, ref_model_name=args.ref_model,
         synthetic=args.synthetic)
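
# Example invocation (model name shown for illustration):
#   python evaluate_baselines.py --model_name microsoft/phi-2 \
#       --benchmark gsm8k --batch_size 8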