Commit

add plotting to benchmark
slobentanzer committed Feb 7, 2024
1 parent b1e3304 commit d46e4d8
Showing 11 changed files with 224 additions and 92 deletions.
24 changes: 12 additions & 12 deletions benchmark/results/preprocessed_for_frontend/overview-model.csv
@@ -1,12 +1,12 @@
Model name,Size,Mean Accuracy,SD
gpt-4,Unknown,0.94,0.12
gpt-3.5-turbo,175,0.9,0.18
llama-2-chat,70,0.66,0.29
llama-2-chat,7,0.65,0.26
llama-2-chat,13,0.63,0.27
openhermes-2.5,7,0.6,0.37
code-llama-instruct,7,0.49,0.38
chatglm3,6,0.43,0.35
mixtral-instruct-v0.1,"46,7",0.39,0.32
code-llama-instruct,34,0.38,0.38
code-llama-instruct,13,0.36,0.38
Model name,Size,Median Accuracy,SD
gpt-3.5-turbo,175,1.0,0.18
gpt-4,Unknown,1.0,0.12
openhermes-2.5,7,0.8,0.37
llama-2-chat,13,0.75,0.27
llama-2-chat,7,0.75,0.26
llama-2-chat,70,0.75,0.29
chatglm3,6,0.5,0.35
code-llama-instruct,7,0.5,0.38
code-llama-instruct,13,0.33,0.39
code-llama-instruct,34,0.33,0.38
mixtral-instruct-v0.1,"46,7",0.29,0.32
benchmark/results/preprocessed_for_frontend/overview-quantisation.csv
@@ -1,47 +1,47 @@
Model name,Size,Version,Quantisation,Mean Accuracy,SD
gpt-4,Unknown,,,0.94,0.12
gpt-3.5-turbo,175,,,0.9,0.18
Model name,Size,Version,Quantisation,Median Accuracy,SD
gpt-3.5-turbo,175,,,1.0,0.18
gpt-4,Unknown,,,1.0,0.12
llama-2-chat,7,ggufv2,Q5_K_M,0.88,0.12
openhermes-2.5,7,ggufv2,Q8_0,0.88,0.34
llama-2-chat,70,ggufv2,Q5_K_M,0.88,0.12
llama-2-chat,13,ggufv2,Q4_K_S,0.88,0.12
llama-2-chat,7,ggufv2,Q5_K_M,0.88,0.12
llama-2-chat,13,ggufv2,Q5_0,0.75,0.25
llama-2-chat,7,ggufv2,Q6_K,0.75,0.25
openhermes-2.5,7,ggufv2,Q4_K_M,0.8,0.39
code-llama-instruct,7,ggufv2,Q4_K_M,0.75,0.43
llama-2-chat,13,ggufv2,Q5_K_M,0.75,0.25
llama-2-chat,7,ggufv2,Q3_K_M,0.75,0.25
llama-2-chat,7,ggufv2,Q4_0,0.75,0.25
llama-2-chat,7,ggufv2,Q4_1,0.75,0.25
llama-2-chat,7,ggufv2,Q4_K_S,0.75,0.25
llama-2-chat,7,ggufv2,Q5_0,0.75,0.25
llama-2-chat,70,ggufv2,Q3_K_M,0.75,0.25
llama-2-chat,13,ggufv2,Q5_0,0.75,0.25
llama-2-chat,7,ggufv2,Q6_K,0.75,0.25
llama-2-chat,13,ggufv2,Q6_K,0.75,0.25
llama-2-chat,13,ggufv2,Q4_1,0.75,0.25
llama-2-chat,13,ggufv2,Q4_0,0.75,0.25
openhermes-2.5,7,ggufv2,Q8_0,0.71,0.33
llama-2-chat,70,ggufv2,Q4_K_M,0.75,0.38
llama-2-chat,70,ggufv2,Q3_K_M,0.75,0.25
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_0,0.71,0.29
llama-2-chat,13,ggufv2,Q3_K_M,0.67,0.17
openhermes-2.5,7,ggufv2,Q4_K_M,0.62,0.39
llama-2-chat,70,ggufv2,Q4_K_M,0.6,0.38
code-llama-instruct,7,ggufv2,Q4_K_M,0.59,0.43
chatglm3,6,ggmlv3,q4_0,0.5,0.35
llama-2-chat,7,ggufv2,Q4_K_M,0.5,0.36
llama-2-chat,70,ggufv2,Q2_K,0.5,0.41
code-llama-instruct,34,ggufv2,Q2_K,0.5,0.39
openhermes-2.5,7,ggufv2,Q2_K,0.5,0.39
llama-2-chat,7,ggufv2,Q8_0,0.5,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q6_K,0.5,0.5
openhermes-2.5,7,ggufv2,Q2_K,0.47,0.39
code-llama-instruct,7,ggufv2,Q2_K,0.45,0.3
llama-2-chat,7,ggufv2,Q4_K_M,0.45,0.36
chatglm3,6,ggmlv3,q4_0,0.43,0.35
code-llama-instruct,13,ggufv2,Q8_0,0.43,0.4
llama-2-chat,7,ggufv2,Q8_0,0.42,0.36
code-llama-instruct,7,ggufv2,Q8_0,0.42,0.4
llama-2-chat,70,ggufv2,Q2_K,0.42,0.41
code-llama-instruct,34,ggufv2,Q2_K,0.41,0.39
code-llama-instruct,34,ggufv2,Q8_0,0.4,0.37
llama-2-chat,13,ggufv2,Q4_K_M,0.38,0.39
llama-2-chat,13,ggufv2,Q8_0,0.38,0.38
code-llama-instruct,13,ggufv2,Q4_K_M,0.37,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_K_M,0.36,0.35
code-llama-instruct,34,ggufv2,Q4_K_M,0.33,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q2_K,0.3,0.29
mixtral-instruct-v0.1,"46,7",ggufv2,Q8_0,0.29,0.33
code-llama-instruct,13,ggufv2,Q2_K,0.29,0.37
code-llama-instruct,13,ggufv2,Q8_0,0.5,0.4
code-llama-instruct,7,ggufv2,Q8_0,0.5,0.4
llama-2-chat,13,ggufv2,Q8_0,0.41,0.38
code-llama-instruct,7,ggufv2,Q2_K,0.38,0.3
code-llama-instruct,13,ggufv2,Q4_K_M,0.33,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q2_K,0.33,0.3
code-llama-instruct,34,ggufv2,Q8_0,0.33,0.37
llama-2-chat,13,ggufv2,Q4_K_M,0.31,0.39
mixtral-instruct-v0.1,"46,7",ggufv2,Q5_0,0.29,0.21
llama-2-chat,13,ggufv2,Q2_K,0.28,0.37
llama-2-chat,7,ggufv2,Q2_K,0.28,0.3
code-llama-instruct,34,ggufv2,Q4_K_M,0.25,0.37
mixtral-instruct-v0.1,"46,7",ggufv2,Q4_K_M,0.25,0.35
mixtral-instruct-v0.1,"46,7",ggufv2,Q3_K_M,0.25,0.25
llama-2-chat,7,ggufv2,Q2_K,0.16,0.3
code-llama-instruct,13,ggufv2,Q2_K,0.08,0.38
mixtral-instruct-v0.1,"46,7",ggufv2,Q8_0,0.08,0.34
llama-2-chat,13,ggufv2,Q2_K,0.0,0.38
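The Model name, Size, Version, and Quantisation columns above are presumably obtained by splitting the colon-separated "Full model name" strings used in overview.csv below (per the "split 'Full model name' at :" comment in hooks.py); note that the mixtral size "46_7" in the full name appears as "46,7" in the Size column. A sketch:

```python
# sketch: decomposing a full model name into the four identifier columns,
# per the splitting comment in hooks.py (assumed, not shown in this diff)
full_name = "llama-2-chat:13:ggufv2:Q2_K"
model_name, size, version, quantisation = full_name.split(":")
print(model_name, size, version, quantisation)  # llama-2-chat 13 ggufv2 Q2_K
```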
62 changes: 31 additions & 31 deletions benchmark/results/preprocessed_for_frontend/overview.csv
@@ -1,47 +1,47 @@
Full model name,property_selection,query_generation,explicit_relevance_of_single_fragments,entity_selection,relationship_selection,end_to_end_query_generation,naive_query_generation_using_schema,implicit_relevance_of_multiple_fragments,property_exists,Mean Accuracy,SD
gpt-4,0.7647058823529411,1.0,1.0,1.0,1.0,1.0,0.6875,1.0,1.0,0.9391339869281047,0.11531479999158567
gpt-3.5-turbo,0.6470588235294118,1.0,1.0,1.0,1.0,0.9375,0.5,1.0,1.0,0.8982843137254901,0.1780633459505017
Full model name,property_selection,query_generation,explicit_relevance_of_single_fragments,entity_selection,relationship_selection,end_to_end_query_generation,naive_query_generation_using_schema,implicit_relevance_of_multiple_fragments,property_exists,Median Accuracy,SD
gpt-3.5-turbo,0.6470588235294118,1.0,1.0,1.0,1.0,0.9375,0.5,1.0,1.0,1.0,0.18094519406305737
gpt-4,0.7647058823529411,1.0,1.0,1.0,1.0,1.0,0.6875,1.0,1.0,1.0,0.11691009473876164
llama-2-chat:7:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
openhermes-2.5:7:ggufv2:Q8_0,0.4117647058823529,0.875,1.0,1.0,0.5,0.0,0.625,1.0,1.0,0.875,0.3385805640416911
llama-2-chat:70:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:13:ggufv2:Q4_K_S,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:7:ggufv2:Q5_K_M,,,1.0,,,,,0.75,,0.875,0.125
llama-2-chat:13:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
openhermes-2.5:7:ggufv2:Q4_K_M,0.0,0.875,1.0,1.0,0.5,0.0,0.375,1.0,0.8,0.8,0.39496835316262996
code-llama-instruct:7:ggufv2:Q4_K_M,0.0,0.875,1.0,0.75,0.0,0.0,0.6875,1.0,1.0,0.75,0.4330628162031206
llama-2-chat:13:ggufv2:Q5_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_1,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q4_K_S,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:70:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q5_0,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:7:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q6_K,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q4_1,,,1.0,,,,,0.5,,0.75,0.25
llama-2-chat:13:ggufv2:Q4_0,,,1.0,,,,,0.5,,0.75,0.25
openhermes-2.5:7:ggufv2:Q8_0,0.4117647058823529,0.875,1.0,1.0,0.5,0.0,0.625,1.0,1.0,0.7124183006535948,0.33465432531278183
llama-2-chat:70:ggufv2:Q4_K_M,0.0,0.75,1.0,0.75,0.5,0.0,0.4375,1.0,1.0,0.75,0.38011557087578274
llama-2-chat:70:ggufv2:Q3_K_M,,,1.0,,,,,0.5,,0.75,0.25
mixtral-instruct-v0.1:46_7:ggufv2:Q4_0,,,0.4166666666666667,,,,,1.0,,0.7083333333333334,0.2916666666666667
llama-2-chat:13:ggufv2:Q3_K_M,,,0.8333333333333334,,,,,0.5,,0.6666666666666667,0.16666666666666669
openhermes-2.5:7:ggufv2:Q4_K_M,0.0,0.875,1.0,1.0,0.5,0.0,0.375,1.0,0.8,0.6166666666666667,0.3906902723243681
llama-2-chat:70:ggufv2:Q4_K_M,0.0,0.75,1.0,0.75,0.5,0.0,0.4375,1.0,1.0,0.6041666666666666,0.377307714089059
code-llama-instruct:7:ggufv2:Q4_K_M,0.0,0.875,1.0,0.75,0.0,0.0,0.6875,1.0,1.0,0.5902777777777778,0.4301072935333895
chatglm3:6:ggmlv3:q4_0,0.7647058823529411,0.34375,0.75,0.5,0.0,0.0,0.5,1.0,0.0,0.5,0.35210878703471027
llama-2-chat:7:ggufv2:Q4_K_M,0.0,0.78125,1.0,0.75,0.0,0.0,0.3125,0.5,0.7,0.5,0.36453035329341543
llama-2-chat:70:ggufv2:Q2_K,0.0,0.75,1.0,0.0,0.0,0.0,0.5,0.5,1.0,0.5,0.40909792362111924
code-llama-instruct:34:ggufv2:Q2_K,0.0,0.8125,0.5,0.0,0.0,0.0,0.625,1.0,0.75,0.5,0.3886966290623633
openhermes-2.5:7:ggufv2:Q2_K,0.0,1.0,1.0,0.5,0.0,0.0,0.375,0.5,0.875,0.5,0.39440531887330776
llama-2-chat:7:ggufv2:Q8_0,0.0,0.8125,1.0,0.75,0.0,0.0,0.1875,0.5,0.5,0.5,0.3653860211398959
mixtral-instruct-v0.1:46_7:ggufv2:Q6_K,,,0.0,,,,,1.0,,0.5,0.5
openhermes-2.5:7:ggufv2:Q2_K,0.0,1.0,1.0,0.5,0.0,0.0,0.375,0.5,0.875,0.4722222222222222,0.39430748796051085
code-llama-instruct:7:ggufv2:Q2_K,0.1176470588235294,0.875,0.3333333333333333,0.25,0.5,0.0,0.375,0.75,0.875,0.45288671023965144,0.3029245580367224
llama-2-chat:7:ggufv2:Q4_K_M,0.0,0.78125,1.0,0.75,0.0,0.0,0.3125,0.5,0.7,0.44930555555555557,0.3641776843828698
chatglm3:6:ggmlv3:q4_0,0.7647058823529411,0.34375,0.75,0.5,0.0,0.0,0.5,1.0,0.0,0.4287173202614379,0.3513865049542796
code-llama-instruct:13:ggufv2:Q8_0,0.0,0.875,0.8333333333333334,0.0,0.0,0.0,0.625,0.5,1.0,0.42592592592592593,0.40386719421415535
llama-2-chat:7:ggufv2:Q8_0,0.0,0.8125,1.0,0.75,0.0,0.0,0.1875,0.5,0.5,0.4166666666666667,0.3644344934278313
code-llama-instruct:7:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.5,0.5,0.875,0.4166666666666667,0.4039733214513608
llama-2-chat:70:ggufv2:Q2_K,0.0,0.75,1.0,0.0,0.0,0.0,0.5,0.5,1.0,0.4166666666666667,0.408248290463863
code-llama-instruct:34:ggufv2:Q2_K,0.0,0.8125,0.5,0.0,0.0,0.0,0.625,1.0,0.75,0.4097222222222222,0.38764682602646844
code-llama-instruct:34:ggufv2:Q8_0,0.0,0.875,0.3333333333333333,0.25,0.0,0.0,0.375,0.75,1.0,0.39814814814814814,0.36752378309239514
llama-2-chat:13:ggufv2:Q4_K_M,0.0,0.875,1.0,0.0,0.0,0.0,0.3125,0.5,0.75,0.3819444444444444,0.3891368257256083
llama-2-chat:13:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.40625,0.5,0.625,0.3784722222222222,0.377435506713922
code-llama-instruct:13:ggufv2:Q4_K_M,0.0,0.875,0.3333333333333333,0.0,0.0,0.0,0.625,0.5,1.0,0.37037037037037035,0.3775348710717613
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,0.0,0.8125,0.1666666666666666,0.25,0.0,0.0,0.5,1.0,0.5,0.3587962962962963,0.3483007939593339
code-llama-instruct:34:ggufv2:Q4_K_M,0.0,0.875,0.5,0.0,0.0,0.0,0.375,0.25,1.0,0.3333333333333333,0.3679900360969936
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,0.0,0.65625,0.3333333333333333,0.0,0.0,0.0,0.59375,0.75,0.375,0.3009259259259259,0.29485053088372465
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,0.0,0.8125,0.0833333333333333,0.0,0.0,0.0,0.375,0.75,0.625,0.29398148148148145,0.33049200551064345
code-llama-instruct:13:ggufv2:Q2_K,0.0,0.875,0.0833333333333333,0.0,0.0,0.0,0.4375,0.25,1.0,0.29398148148148145,0.3724766497316793
code-llama-instruct:13:ggufv2:Q8_0,0.0,0.875,0.8333333333333334,0.0,0.0,0.0,0.625,0.5,1.0,0.5,0.4045459274389095
code-llama-instruct:7:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.5,0.5,0.875,0.5,0.4048319267163706
llama-2-chat:13:ggufv2:Q8_0,0.0,0.875,1.0,0.0,0.0,0.0,0.40625,0.5,0.625,0.40625,0.37753770966914313
code-llama-instruct:7:ggufv2:Q2_K,0.1176470588235294,0.875,0.3333333333333333,0.25,0.5,0.0,0.375,0.75,0.875,0.375,0.3039242040788116
code-llama-instruct:13:ggufv2:Q4_K_M,0.0,0.875,0.3333333333333333,0.0,0.0,0.0,0.625,0.5,1.0,0.3333333333333333,0.3777164982978897
mixtral-instruct-v0.1:46_7:ggufv2:Q2_K,0.0,0.65625,0.3333333333333333,0.0,0.0,0.0,0.59375,0.75,0.375,0.3333333333333333,0.29502857415494727
code-llama-instruct:34:ggufv2:Q8_0,0.0,0.875,0.3333333333333333,0.25,0.0,0.0,0.375,0.75,1.0,0.3333333333333333,0.36809486163283217
llama-2-chat:13:ggufv2:Q4_K_M,0.0,0.875,1.0,0.0,0.0,0.0,0.3125,0.5,0.75,0.3125,0.3897559777889522
mixtral-instruct-v0.1:46_7:ggufv2:Q5_0,,,0.0833333333333333,,,,,0.5,,0.29166666666666663,0.20833333333333334
llama-2-chat:13:ggufv2:Q2_K,0.0,0.1875,0.8333333333333334,0.0,0.0,0.0,0.46875,1.0,0.0,0.2766203703703704,0.3742633940614961
llama-2-chat:7:ggufv2:Q2_K,0.0,0.625,0.8333333333333334,0.0,0.0,0.0,0.15625,0.5,0.375,0.2766203703703704,0.3002663712272593
code-llama-instruct:34:ggufv2:Q4_K_M,0.0,0.875,0.5,0.0,0.0,0.0,0.375,0.25,1.0,0.25,0.3689323936863109
mixtral-instruct-v0.1:46_7:ggufv2:Q4_K_M,0.0,0.8125,0.1666666666666666,0.25,0.0,0.0,0.5,1.0,0.5,0.25,0.34999586637770663
mixtral-instruct-v0.1:46_7:ggufv2:Q3_K_M,,,0.0,,,,,0.5,,0.25,0.25
llama-2-chat:7:ggufv2:Q2_K,0.0,0.625,0.8333333333333334,0.0,0.0,0.0,0.15625,0.5,0.375,0.15625,0.3026694505500948
code-llama-instruct:13:ggufv2:Q2_K,0.0,0.875,0.0833333333333333,0.0,0.0,0.0,0.4375,0.25,1.0,0.0833333333333333,0.37838620327274153
mixtral-instruct-v0.1:46_7:ggufv2:Q8_0,0.0,0.8125,0.0833333333333333,0.0,0.0,0.0,0.375,0.75,0.625,0.0833333333333333,0.33713829497443865
llama-2-chat:13:ggufv2:Q2_K,0.0,0.1875,0.8333333333333334,0.0,0.0,0.0,0.46875,1.0,0.0,0.0,0.3843500631778468
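Each row's Median Accuracy is the row-wise median over its scored task columns; empty tasks are NaN and are skipped. For example, llama-2-chat:7:ggufv2:Q5_K_M above has scores for only two tasks, so its median is their midpoint:

```python
import pandas as pd

# row-wise median over task columns, as in overview.median(axis=1) in hooks.py;
# NaN (unscored) tasks are skipped
row = pd.Series(
    {
        "explicit_relevance_of_single_fragments": 1.0,
        "implicit_relevance_of_multiple_fragments": 0.75,
    }
)
print(row.median())  # 0.875, matching the overview.csv row above
```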
4 changes: 4 additions & 0 deletions docs/benchmark.md
@@ -7,13 +7,17 @@ Click the column names to reorder.

{{ read_csv('benchmark/results/preprocessed_for_frontend/overview-model.csv', colalign=("left","right")) }}

![Boxplot Model](boxplot-per-model.png)

## Scores per quantisation

Table sorted by median score in descending order.
Click the column names to reorder.

{{ read_csv('benchmark/results/preprocessed_for_frontend/overview-quantisation.csv', colalign=("left","right")) }}

![Boxplot Quantisation](boxplot-per-quantisation.png)

## Scores of all tasks

Wide table; you may need to scroll horizontally to see all columns.
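The `read_csv` call in these templates is presumably the macro provided by a mkdocs table plugin (e.g. mkdocs-table-reader-plugin), which inlines the CSV as a table. A rough standalone equivalent of that rendering step, as an illustration only:

```python
import pandas as pd

# roughly what the read_csv macro renders (assumption: it tabulates the CSV);
# to_markdown forwards colalign to tabulate (requires the tabulate package)
df = pd.read_csv("benchmark/results/preprocessed_for_frontend/overview-model.csv")
print(df.to_markdown(index=False, colalign=("left", "right")))
```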
Binary file added docs/boxplot-per-model.png
Binary file added docs/boxplot-per-quantisation.png
Binary file added docs/boxplot-per-task.png
126 changes: 116 additions & 10 deletions docs/scripts/hooks.py
@@ -1,5 +1,10 @@
import os
import re
import seaborn as sns
import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

import pandas as pd

@@ -121,9 +126,9 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
subtask_result = subtask_result[file_name_without_extension]
subtask_results.append(subtask_result)
overview = pd.concat(subtask_results, axis=1)
overview["Mean Accuracy"] = overview.mean(axis=1)
overview["Median Accuracy"] = overview.median(axis=1)
overview["SD"] = overview.std(axis=1)
overview = overview.sort_values(by="Mean Accuracy", ascending=False)
overview = overview.sort_values(by="Median Accuracy", ascending=False)
# split "Full model name" at : to get Model name, size, version, and quantisation
overview.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview.csv",
@@ -158,15 +163,17 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
"Size",
"Version",
"Quantisation",
"Mean Accuracy",
"Median Accuracy",
"SD",
]
]
# round median accuracy and SD to 2 decimal places
overview_per_quantisation["Mean Accuracy"] = overview_per_quantisation[
"Mean Accuracy"
overview_per_quantisation.loc[:, "Median Accuracy"] = (
overview_per_quantisation["Median Accuracy"].round(2)
)
overview_per_quantisation.loc[:, "SD"] = overview_per_quantisation[
"SD"
].round(2)
overview_per_quantisation["SD"] = overview_per_quantisation["SD"].round(2)
overview_per_quantisation.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview-quantisation.csv",
index=False,
@@ -178,24 +185,123 @@ def create_overview_table(result_files_path: str, result_file_names: list[str]):
["Model name", "Size"]
).agg(
{
"Mean Accuracy": "mean",
"Median Accuracy": "median",
"SD": "mean",
}
)
# round median accuracy and SD to 2 decimal places
overview_per_size["Mean Accuracy"] = overview_per_size[
"Mean Accuracy"
overview_per_size["Median Accuracy"] = overview_per_size[
"Median Accuracy"
].round(2)
overview_per_size["SD"] = overview_per_size["SD"].round(2)
# sort by median, descending
overview_per_size = overview_per_size.sort_values(
by="Mean Accuracy", ascending=False
by="Median Accuracy", ascending=False
)
overview_per_size.to_csv(
f"{result_files_path}preprocessed_for_frontend/overview-model.csv",
index=True,
)

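    # render the boxplot PNGs into docs/ for embedding in the benchmark page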
    plot_accuracy_per_model(overview)
    plot_accuracy_per_quantisation(overview)
    plot_accuracy_per_task(overview)


def plot_accuracy_per_model(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
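    # the melt turns each task column into (Task, Accuracy) rows, one per model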
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Model name", y="Accuracy", hue="Size", data=overview_melted)
    plt.title("Boxplot across tasks, per Model")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-model.png",
        bbox_inches="tight",
    )
    plt.close()


def plot_accuracy_per_quantisation(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
    # unify quantisation names: 2-bit, 3-bit, etc.
    digit_pattern = r"\d+"
    overview_melted["Quantisation"] = overview_melted["Quantisation"].apply(
        lambda x: (
            f"{re.findall(digit_pattern, x)[0]}-bit"
            # guard against missing values (NaN) as well as empty strings
            if isinstance(x, str) and x
            else "None"
        )
    )
    # set quantisation of gpt models to None
    overview_melted["Quantisation"] = overview_melted.apply(
        lambda row: (
            "None"
            if row["Model name"] in ["gpt-3.5-turbo", "gpt-4"]
            else row["Quantisation"]
        ),
        axis=1,
    )

    plt.figure(figsize=(10, 6))
    sns.boxplot(
        x="Model name", y="Accuracy", hue="Quantisation", data=overview_melted
    )
    plt.title("Boxplot across tasks, per Quantisation")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-quantisation.png",
        bbox_inches="tight",
    )
    plt.close()


def plot_accuracy_per_task(overview) -> None:
    sns.set_theme(style="whitegrid")
    overview_melted = overview.melt(
        id_vars=[
            "Full model name",
            "Model name",
            "Size",
            "Version",
            "Quantisation",
            "Median Accuracy",
            "SD",
        ],
        var_name="Task",
        value_name="Accuracy",
    )
    plt.figure(figsize=(10, 6))
    sns.boxplot(x="Task", y="Accuracy", hue="Model name", data=overview_melted)
    plt.title("Boxplot across models, per Task")
    plt.xticks(rotation=45)
    plt.savefig(
        "docs/boxplot-per-task.png",
        bbox_inches="tight",
    )
    plt.close()


if __name__ == "__main__":
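    # run the pre-build hook standalone, outside an mkdocs build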
    on_pre_build(None)
File renamed without changes.