Commit

add plotting to benchmark
slobentanzer committed Feb 7, 2024
1 parent b1e3304 commit d46e4d8
Showing 11 changed files with 224 additions and 92 deletions.
24 changes: 12 additions & 12 deletions benchmark/results/preprocessed_for_frontend/overview-model.csv
@@ -1,12 +1,12 @@
Model name,Size,Mean Accuracy,SD
gpt-4,Unknown,0.94,0.12
gpt-3.5-turbo,175,0.9,0.18
llama-2-chat,70,0.66,0.29
llama-2-chat,7,0.65,0.26
llama-2-chat,13,0.63,0.27
openhermes-2.5,7,0.6,0.37
code-llama-instruct,7,0.49,0.38
chatglm3,6,0.43,0.35
mixtral-instruct-v0.1,"46,7",0.39,0.32
code-llama-instruct,34,0.38,0.38
code-llama-instruct,13,0.36,0.38
Model name,Size,Median Accuracy,SD
gpt-3.5-turbo,175,1.0,0.18
gpt-4,Unknown,1.0,0.12
openhermes-2.5,7,0.8,0.37
llama-2-chat,13,0.75,0.27
llama-2-chat,7,0.75,0.26
llama-2-chat,70,0.75,0.29
chatglm3,6,0.5,0.35
code-llama-instruct,7,0.5,0.38
code-llama-instruct,13,0.33,0.39
code-llama-instruct,34,0.33,0.38
mixtral-instruct-v0.1,"46,7",0.29,0.32
benchmark/results/preprocessed_for_frontend/overview-quantisation.csv
@@ -1,47 +1,47 @@
Model name,Size,Version,Quantisation,Mean Accuracy,SD
gpt-4,Unknown,,,0.94,0.12
gpt-3.5-turbo,175,,,0.9,0.18
Model name,Size,Version,Quantisation,Median Accuracy,SD
gpt-3.5-turbo,175,,,1.0,0.18
gpt-4,Unknown,,,1.0,0.12
llama-2-chat,7,ggufv2,Q5_K_M,0.88,0.12
openhermes-2.5,7,ggufv2,Q8_0,0.88,0.34
llama-2-chat,70,ggufv2,Q5_K_M,0.88,0.12
llama-2-chat,13,ggufv2,Q4_K_S,0.88,0.12
llama-2-chat,7,ggufv2,Q5_K_M,0.88,0.12
llama-2-chat,13,ggufv2,Q5_0,0.75,0.25
llama-2-chat,7,ggufv2,Q6_K,0.75,0.25
openhermes-2.5,7,ggufv2,Q4_K_M,0.8,0.39
code-llama-instruct,7,ggufv2,Q4_K_M,0.75,0.43
llama-2-chat,13,ggufv2,Q5_K_M,0.75,0.25
llama-2-chat,7,ggufv2,Q3_K_M,0.75,0.25
llama-2-chat,7,ggufv2,Q4_0,0.75,0.25
llama-2-chat,7,ggufv2,Q4_1,0.75,0.25
llama-2-chat,7,ggufv2,Q4_K_S,0.75,0.25
llama-2-chat,7,ggufv2,Q5_0,0.75,0.25
llama-2-chat,70,ggufv2,Q3_K_M,0.75,0.25
llama-2-chat,13,ggufv2,Q5_0,0.75,0.25
llama-2-chat,7,ggufv2,Q6_K,0.75,0.25
llama-2-chat,13,ggufv2,Q6_K,0.75,0.25
llama-2-chat,13,ggufv2,Q4_1,0.75,0.25
llama-2-chat,13,ggufv2,Q4_0,0.75,0.25
openhermes-2.5,7,ggufv2,Q8_0,0.71,0.33
llama-2-chat,70,ggufv2,Q4_K_M,0.75,0.38
llama-2-chat,70,ggufv2,Q3_K_M,0.75,0.25
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_0,0.71,0.29
llama-2-chat,13,ggufv2,Q3_K_M,0.67,0.17
openhermes-2.5,7,ggufv2,Q4_K_M,0.62,0.39
llama-2-chat,70,ggufv2,Q4_K_M,0.6,0.38
code-llama-instruct,7,ggufv2,Q4_K_M,0.59,0.43
chatglm3,6,ggmlv3,q4_0,0.5,0.35
llama-2-chat,7,ggufv2,Q4_K_M,0.5,0.36
llama-2-chat,70,ggufv2,Q2_K,0.5,0.41
code-llama-instruct,34,ggufv2,Q2_K,0.5,0.39
openhermes-2.5,7,ggufv2,Q2_K,0.5,0.39
llama-2-chat,7,ggufv2,Q8_0,0.5,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q6_K,0.5,0.5
openhermes-2.5,7,ggufv2,Q2_K,0.47,0.39
code-llama-instruct,7,ggufv2,Q2_K,0.45,0.3
llama-2-chat,7,ggufv2,Q4_K_M,0.45,0.36
chatglm3,6,ggmlv3,q4_0,0.43,0.35
code-llama-instruct,13,ggufv2,Q8_0,0.43,0.4
llama-2-chat,7,ggufv2,Q8_0,0.42,0.36
code-llama-instruct,7,ggufv2,Q8_0,0.42,0.4
llama-2-chat,70,ggufv2,Q2_K,0.42,0.41
code-llama-instruct,34,ggufv2,Q2_K,0.41,0.39
code-llama-instruct,34,ggufv2,Q8_0,0.4,0.37
llama-2-chat,13,ggufv2,Q4_K_M,0.38,0.39
llama-2-chat,13,ggufv2,Q8_0,0.38,0.38
code-llama-instruct,13,ggufv2,Q4_K_M,0.37,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_K_M,0.36,0.35
code-llama-instruct,34,ggufv2,Q4_K_M,0.33,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q2_K,0.3,0.29
mixtral-instruct-v0.1,"46,7",ggufv2,Q8_0,0.29,0.33
code-llama-instruct,13,ggufv2,Q2_K,0.29,0.37
code-llama-instruct,13,ggufv2,Q8_0,0.5,0.4
code-llama-instruct,7,ggufv2,Q8_0,0.5,0.4
llama-2-chat,13,ggufv2,Q8_0,0.41,0.38
code-llama-instruct,7,ggufv2,Q2_K,0.38,0.3
code-llama-instruct,13,ggufv2,Q4_K_M,0.33,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q2_K,0.33,0.3
code-llama-instruct,34,ggufv2,Q8_0,0.33,0.37
llama-2-chat,13,ggufv2,Q4_K_M,0.31,0.39
mixtral-instruct-v0.1,"46,7",ggufv2,Q5_0,0.29,0.21
llama-2-chat,13,ggufv2,Q2_K,0.28,0.37
llama-2-chat,7,ggufv2,Q2_K,0.28,0.3
code-llama-instruct,34,ggufv2,Q4_K_M,0.25,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_K_M,0.25,0.35
mixtral-instruct-v0.1,"46,7",ggufv2,Q3_K_M,0.25,0.25
llama-2-chat,7,ggufv2,Q2_K,0.16,0.3
code-llama-instruct,13,ggufv2,Q2_K,0.08,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q8_0,0.08,0.34
llama-2-chat,13,ggufv2,Q2_K,0.0,0.38
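The Model name, Size, Version, and Quantisation columns above are presumably obtained by splitting the colon-separated "Full model name" strings used in overview.csv below (per the "split 'Full model name' at :" comment in hooks.py); note that the mixtral size "46_7" in the full name appears as "46,7" in the Size column. A sketch:

```python
# sketch: decomposing a full model name into the four identifier columns,
# per the splitting comment in hooks.py (assumed, not shown in this diff)
full_name = "llama-2-chat:13:ggufv2:Q2_K"
model_name, size, version, quantisation = full_name.split(":")
print(model_name, size, version, quantisation)  # llama-2-chat 13 ggufv2 Q2_K
```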
62 changes: 31 additions & 31 deletions benchmark/results/preprocessed_for_frontend/overview.csv
@@ -1,47 +1,47 @@
Full model name,property_selection,query_generation,explicit_relevance_of_single_fragments,entity_selection,relationship_selection,end_to_end_query_generation,naive_query_generation_using_schema,implicit_relevance_of_multiple_fragments,property_exists,Mean Accuracy,SD
gpt-4,0.7647058823529411,1.0,1.0,1.0,1.0,1.0,0.6875,1.0,1.0,0.9391339869281047,0.11531479999158567
gpt-3.5-turbo,0.6470588235294118,1.0,1.0,1.0,1.0,0.9375,0.5,1.0,1.0,0.8982843137254901,0.1780633459505017
Full model name,property_selection,query_generation,explicit_relevance_of_single_fragments,entity_selection,relationship_selection,end_to_end_query_generation,naive_query_generation_using_schema,implicit_relevance_of_multiple_fragments,property_exists,Median Accuracy,SD
gpt-3.5-turbo,0.6470588235294118,1.0,1.0,1.0,1.0,0.9375,0.5,1.0,1.0,1.0,0.18094519406305737
gpt-4,0.7647058823529411,1.0,1.0,1.0,1.0,1.0,0.6875,1.0,1.0,1.0,0.11691009473876164
llama-2-chat:7:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
openhermes-2.5:7:ggufv2:Q8_0,0.4117647058823529,0.875,1.0,1.0,0.5,0.0,0.625,1.0,1.0,0.875,0.3385805640416911
llama-2-chat:70:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:13:ggufv2:Q4_K_S,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:7:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:13:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
openhermes-2.5:7:ggufv2:Q4_K_M,0.0,0.875,1.0,1.0,0.5,0.0,0.375,1.0,0.8,0.8,0.39496835316262996
code-llama-instruct:7:ggufv2:Q4_K_M,0.0,0.875,1.0,0.75,0.0,0.0,0.6875,1.0,1.0,0.75,0.4330628162031206
llama-2-chat:13:ggufv2:Q5_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_1,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_K_S,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:70:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q4_1,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q4_0,,,1.0,,,,,0.5,,0.75,0.25
openhermes-2.5:7:ggufv2:Q8_0,0.4117647058823529,0.875,1.0,1.0,0.5,0.0,0.625,1.0,1.0,0.7124183006535948,0.33465432531278183
llama-2-chat:70:ggufv2:Q4_K_M,0.0,0.75,1.0,0.75,0.5,0.0,0.4375,1.0,1.0,0.75,0.38011557087578274
llama-2-chat:70:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
mixtral-instruct-v0.1:46_7:ggufv2:Q4_0,,,0.4166666666666667,,,,,1.0,,0.7083333333333334,0.2916666666666667
llama-2-chat:13:ggufv2:Q3_K_M,,,0.8333333333333334,,,,,0.5,,0.6666666666666667,0.16666666666666669
openhermes-2.5:7:ggufv2:Q4_K_M,0.0,0.875,1.0,1.0,0.5,0.0,0.375,1.0,0.8,0.6166666666666667,0.3906902723243681
llama-2-chat:70:ggufv2:Q4_K_M,0.0,0.75,1.0,0.75,0.5,0.0,0.4375,1.0,1.0,0.6041666666666666,0.377307714089059
code-llama-instruct:7:ggufv2:Q4_K_M,0.0,0.875,1.0,0.75,0.0,0.0,0.6875,1.0,1.0,0.5902777777777778,0.4301072935333895
chatglm3:6:ggmlv3:q4_0,0.7647058823529411,0.34375,0.75,0.5,0.0,0.0,0.5,1.0,0.0,0.5,0.35210878703471027
llama-2-chat:7:ggufv2:Q4_K_M,0.0,0.78125,1.0,0.75,0.0,0.0,0.3125,0.5,0.7,0.5,0.36453035329341543
llama-2-chat:70:ggufv2:Q2_K,0.0,0.75,1.0,0.0,0.0,0.0,0.5,0.5,1.0,0.5,0.40909792362111924
code-llama-instruct:34:ggufv2:Q2_K,0.0,0.8125,0.5,0.0,0.0,0.0,0.625,1.0,0.75,0.5,0.3886966290623633
openhermes-2.5:7:ggufv2:Q2_K,0.0,1.0,1.0,0.5,0.0,0.0,0.375,0.5,0.875,0.5,0.39440531887330776
llama-2-chat:7:ggufv2:Q8_0,0.0,0.8125,1.0,0.75,0.0,0.0,0.1875,0.5,0.5,0.5,0.3653860211398959
mixtral-instruct-v0.1:46_7:ggufv2:Q6_K,,,0.0,,,,,1.0,,0.5,0.5
openhermes-2.5:7:ggufv2:Q2_K,0.0,1.0,1.0,0.5,0.0,0.0,0.375,0.5,0.875,0.4722222222222222,0.39430748796051085
code-llama-instruct:7:ggufv2:Q2_K,0.1176470588235294,0.875,0.3333333333333333,0.25,0.5,0.0,0.375,0.75,0.875,0.45288671023965144,0.3029245580367224
llama-2-chat:7:ggufv2:Q4_K_M,0.0,0.78125,1.0,0.75,0.0,0.0,0.3125,0.5,0.7,0.44930555555555557,0.3641776843828698
chatglm3:6:ggmlv3:q4_0,0.7647058823529411,0.34375,0.75,0.5,0.0,0.0,0.5,1.0,0.0,0.4287173202614379,0.3513865049542796
code-llama-instruct:13:ggufv2:Q8_0,0.0,0.875,0.8333333333333334,0.0,0.0,0.0,0.625,0.5,1.0,0.42592592592592593,0.40386719421415535
llama-2-chat:7:ggufv2:Q8_0,0.0,0.8125,1.0,0.75,0.0,0.0,0.1875,0.5,0.5,0.4166666666666667,0.3644344934278313
code-llama-instruct:7:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.5,0.5,0.875,0.4166666666666667,0.4039733214513608
llama-2-chat:70:ggufv2:Q2_K,0.0,0.75,1.0,0.0,0.0,0.0,0.5,0.5,1.0,0.4166666666666667,0.408248290463863
code-llama-instruct:34:ggufv2:Q2_K,0.0,0.8125,0.5,0.0,0.0,0.0,0.625,1.0,0.75,0.4097222222222222,0.38764682602646844
code-llama-instruct:34:ggufv2:Q8_0,0.0,0.875,0.3333333333333333,0.25,0.0,0.0,0.375,0.75,1.0,0.39814814814814814,0.36752378309239514
llama-2-chat:13:ggufv2:Q4_K_M,0.0,0.875,1.0,0.0,0.0,0.0,0.3125,0.5,0.75,0.3819444444444444,0.3891368257256083
llama-2-chat:13:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.40625,0.5,0.625,0.3784722222222222,0.377435506713922
code-llama-instruct:13:ggufv2:Q4_K_M,0.0,0.875,0.3333333333333333,0.0,0.0,0.0,0.625,0.5,1.0,0.37037037037037035,0.3775348710717613
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,0.0,0.8125,0.1666666666666666,0.25,0.0,0.0,0.5,1.0,0.5,0.3587962962962963,0.3483007939593339
code-llama-instruct:34:ggufv2:Q4_K_M,0.0,0.875,0.5,0.0,0.0,0.0,0.375,0.25,1.0,0.3333333333333333,0.3679900360969936
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,0.0,0.65625,0.3333333333333333,0.0,0.0,0.0,0.59375,0.75,0.375,0.3009259259259259,0.29485053088372465
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,0.0,0.8125,0.0833333333333333,0.0,0.0,0.0,0.375,0.75,0.625,0.29398148148148145,0.33049200551064345
code-llama-instruct:13:ggufv2:Q2_K,0.0,0.875,0.0833333333333333,0.0,0.0,0.0,0.4375,0.25,1.0,0.29398148148148145,0.3724766497316793
code-llama-instruct:13:ggufv2:Q8_0,0.0,0.875,0.8333333333333334,0.0,0.0,0.0,0.625,0.5,1.0,0.5,0.4045459274389095
code-llama-instruct:7:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.5,0.5,0.875,0.5,0.4048319267163706
llama-2-chat:13:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.40625,0.5,0.625,0.40625,0.37753770966914313
code-llama-instruct:7:ggufv2:Q2_K,0.1176470588235294,0.875,0.3333333333333333,0.25,0.5,0.0,0.375,0.75,0.875,0.375,0.3039242040788116
code-llama-instruct:13:ggufv2:Q4_K_M,0.0,0.875,0.3333333333333333,0.0,0.0,0.0,0.625,0.5,1.0,0.3333333333333333,0.3777164982978897
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,0.0,0.65625,0.3333333333333333,0.0,0.0,0.0,0.59375,0.75,0.375,0.3333333333333333,0.29502857415494727
code-llama-instruct:34:ggufv2:Q8_0,0.0,0.875,0.3333333333333333,0.25,0.0,0.0,0.375,0.75,1.0,0.3333333333333333,0.36809486163283217
llama-2-chat:13:ggufv2:Q4_K_M,0.0,0.875,1.0,0.0,0.0,0.0,0.3125,0.5,0.75,0.3125,0.3897559777889522
mixtral-instruct-v0.1:46_7:ggufv2:Q5_0,,,0.0833333333333333,,,,,0.5,,0.29166666666666663,0.20833333333333334
llama-2-chat:13:ggufv2:Q2_K,0.0,0.1875,0.8333333333333334,0.0,0.0,0.0,0.46875,1.0,0.0,0.2766203703703704,0.3742633940614961
llama-2-chat:7:ggufv2:Q2_K,0.0,0.625,0.8333333333333334,0.0,0.0,0.0,0.15625,0.5,0.375,0.2766203703703704,0.3002663712272593
code-llama-instruct:34:ggufv2:Q4_K_M,0.0,0.875,0.5,0.0,0.0,0.0,0.375,0.25,1.0,0.25,0.3689323936863109
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,0.0,0.8125,0.1666666666666666,0.25,0.0,0.0,0.5,1.0,0.5,0.25,0.34999586637770663
mixtral-instruct-v0.1:46_7:ggufv2:Q3_K_M,,,0.0,,,,,0.5,,0.25,0.25
llama-2-chat:7:ggufv2:Q2_K,0.0,0.625,0.8333333333333334,0.0,0.0,0.0,0.15625,0.5,0.375,0.15625,0.3026694505500948
code-llama-instruct:13:ggufv2:Q2_K,0.0,0.875,0.0833333333333333,0.0,0.0,0.0,0.4375,0.25,1.0,0.0833333333333333,0.37838620327274153
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,0.0,0.8125,0.0833333333333333,0.0,0.0,0.0,0.375,0.75,0.625,0.0833333333333333,0.33713829497443865
llama-2-chat:13:ggufv2:Q2_K,0.0,0.1875,0.8333333333333334,0.0,0.0,0.0,0.46875,1.0,0.0,0.0,0.3843500631778468
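Each row's Median Accuracy is the row-wise median over its scored task columns; empty tasks are NaN and are skipped. For example, llama-2-chat:7:ggufv2:Q5_K_M above has scores for only two tasks, so its median is their midpoint:

```python
import pandas as pd

# row-wise median over task columns, as in overview.median(axis=1) in hooks.py;
# NaN (unscored) tasks are skipped
row = pd.Series(
    {
        "explicit_relevance_of_single_fragments": 1.0,
        "implicit_relevance_of_multiple_fragments": 0.75,
    }
)
print(row.median())  # 0.875, matching the overview.csv row above
```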
4 changes: 4 additions & 0 deletions docs/benchmark.md
@@ -7,13 +7,17 @@ Click the column names to reorder.

{{ read_csv('benchmark/results/preprocessed_for_frontend/overview-model.csv', colalign=("left","right")) }}

![Boxplot Model](boxplot-per-model.png)

## Scores per quantisation

Table sorted by median score in descending order.
Click the column names to reorder.

{{ read_csv('benchmark/results/preprocessed_for_frontend/overview-quantisation.csv', colalign=("left","right")) }}

![Boxplot Quantisation](boxplot-per-quantisation.png)

## Scores of all tasks

Wide table; you may need to scroll horizontally to see all columns.
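The `read_csv` call in these templates is presumably the macro provided by a mkdocs table plugin (e.g. mkdocs-table-reader-plugin), which inlines the CSV as a table. A rough standalone equivalent of that rendering step, as an illustration only:

```python
import pandas as pd

# roughly what the read_csv macro renders (assumption: it tabulates the CSV);
# to_markdown forwards colalign to tabulate (requires the tabulate package)
df = pd.read_csv("benchmark/results/preprocessed_for_frontend/overview-model.csv")
print(df.to_markdown(index=False, colalign=("left", "right")))
```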
Binary file added docs/boxplot-per-model.png
Binary file added docs/boxplot-per-quantisation.png
Binary file added docs/boxplot-per-task.png
126 changes: 116 additions & 10 deletions docs/scripts/hooks.py
@@ -1,5 +1,10 @@
import os
import re
import seaborn as sns
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

import pandas as pd

@@ -121,9 +126,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
subtask_result = subtask_result[file_name_without_extension]
subtask_results.append(subtask_result)
overview = pd.concat(subtask_results, axis=1)
overview["Mean Accuracy"] = overview.mean(axis=1)
overview["Median Accuracy"] = overview.median(axis=1)
overview["SD"] = overview.std(axis=1)
overview = overview.sort_values(by="Mean Accuracy", ascending=False)
overview = overview.sort_values(by="Median Accuracy", ascending=False)
# split "Full model name" at : to get Model name, size, version, and quantisation
overview.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview.csv",
@@ -158,15 +163,17 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
"Size",
"Version",
"Quantisation",
"Mean Accuracy",
"Median Accuracy",
"SD",
]
]
# round median accuracy and SD to 2 decimal places
overview_per_quantisation["Mean Accuracy"] = overview_per_quantisation[
"Mean Accuracy"
overview_per_quantisation.loc[:, "Median Accuracy"] = (
overview_per_quantisation["Median Accuracy"].round(2)
)
overview_per_quantisation.loc[:, "SD"] = overview_per_quantisation[
"SD"
].round(2)
overview_per_quantisation["SD"] = overview_per_quantisation["SD"].round(2)
overview_per_quantisation.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview-quantisation.csv",
index=False,
@@ -178,24 +185,123 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
["Model name", "Size"]
).agg(
{
"Mean Accuracy": "mean",
"Median Accuracy": "median",
"SD": "mean",
}
)
# round median accuracy and SD to 2 decimal places
overview_per_size["Mean Accuracy"] = overview_per_size[
"Mean Accuracy"
overview_per_size["Median Accuracy"] = overview_per_size[
"Median Accuracy"
].round(2)
overview_per_size["SD"] = overview_per_size["SD"].round(2)
# sort by median, descending
overview_per_size = overview_per_size.sort_values(
by="Mean Accuracy", ascending=False
by="Median Accuracy", ascending=False
)
overview_per_size.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview-model.csv",
index=True,
)

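    # render the boxplot PNGs into docs/ for embedding in the benchmark page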
    plot_accuracy_per_model(overview)
    plot_accuracy_per_quantisation(overview)
    plot_accuracy_per_task(overview)


def plot_accuracy_per_model(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
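    # the melt turns each task column into (Task, Accuracy) rows, one per model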
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Model name", y="Accuracy", hue="Size", data=overview_melted)
    plt.title("Boxplot across tasks, per Model")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-model.png",
        bbox_inches="tight",
    )
    plt.close()


def plot_accuracy_per_quantisation(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
    # unify quantisation names: 2-bit, 3-bit, etc.
    digit_pattern = r"\d+"
    overview_melted["Quantisation"] = overview_melted["Quantisation"].apply(
        lambda x: (
            f"{re.findall(digit_pattern, x)[0]}-bit"
            # guard against missing values (NaN) as well as empty strings
            if isinstance(x, str) and x
            else "None"
        )
    )
    # set quantisation of gpt models to None
    overview_melted["Quantisation"] = overview_melted.apply(
        lambda row: (
            "None"
            if row["Model name"] in ["gpt-3.5-turbo", "gpt-4"]
            else row["Quantisation"]
        ),
        axis=1,
    )

    plt.figure(figsize=(10, 6))
    sns.boxplot(
        x="Model name", y="Accuracy", hue="Quantisation", data=overview_melted
    )
    plt.title("Boxplot across tasks, per Quantisation")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-quantisation.png",
        bbox_inches="tight",
    )
    plt.close()


def plot_accuracy_per_task(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Task", y="Accuracy", hue="Model name", data=overview_melted)
    plt.title("Boxplot across models, per Task")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-task.png",
        bbox_inches="tight",
    )
    plt.close()


if __name__ == "__main__":
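    # run the pre-build hook standalone, outside an mkdocs build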
    on_pre_build(None)
File renamed without changes.