From 0fc1b7c2e49d606e24e8ba41a87b268527c79d86 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Wed, 5 Feb 2025 17:26:40 +0800 Subject: [PATCH 1/3] update .gitignore --- .gitignore | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index cb7f9d46..8d72e4df 100644 --- a/.gitignore +++ b/.gitignore @@ -169,4 +169,28 @@ demo.ipynb *json .vscode *.swp -GPT4o_MINI/ \ No newline at end of file +GPT4o_MINI/ + +2weiyun* +script.py +Gemini* +Claude3-5V* +GLM4V* +GPT4o* +GPT4V* +mmmu_debug +bailingMM +BailingMM* +SenseChat* +Step* +DoubaoVL +arch +BlueLM* +mmb_* +Reka* +Taiyi +TeleMM +apple.jpg +assets/LOGO.png +api_list.txt +vlmeval/gemini_tmp.py \ No newline at end of file From 9cfdac6a2c3be271573a0e469e8e72ea28c41b57 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Thu, 6 Feb 2025 13:52:24 +0800 Subject: [PATCH 2/3] update CIRCULAR implementation --- vlmeval/tools.py | 165 ++++++++++++++++++++++++++--------------------- 1 file changed, 91 insertions(+), 74 deletions(-) diff --git a/vlmeval/tools.py b/vlmeval/tools.py index 023ea48b..a7569209 100644 --- a/vlmeval/tools.py +++ b/vlmeval/tools.py @@ -1,4 +1,5 @@ import sys +from collections import deque from vlmeval.dataset import SUPPORTED_DATASETS from vlmeval.config import * from vlmeval.smp import * @@ -157,88 +158,104 @@ def MISSING(lvl): def CIRCULAR(inp): + def proc_str(s): + chs = set(s) + chs = [x for x in chs if x not in string.ascii_letters and x != ' '] + for ch in chs: + s = s.replace(ch, ' ') + return s + + def abnormal_entry(line): + choices = {k: line[k] for k in string.ascii_uppercase if k in line and not pd.isna(line[k])} + for k in choices: + s = proc_str(choices[k]).split() + hit_words = [x for x in s if x in choices] + hit_words = set(hit_words) + if len(hit_words) > 1: + return True + return False + assert inp.endswith('.tsv') data = load(inp) OFFSET = 1e6 while max(data['index']) >= OFFSET: OFFSET *= 10 - - assert 'E' not in data, 'Currently build_circular only works for up to 4-choice questions' - data_2c = data[pd.isna(data['C'])] - data_3c = data[~pd.isna(data['C']) & pd.isna(data['D'])] - data_4c = data[~pd.isna(data['D'])] - map_2c = [('AB', 'BA')] - map_3c = [('ABC', 'BCA'), ('ABC', 'CAB')] - map_4c = [('ABCD', 'BCDA'), ('ABCD', 'CDAB'), ('ABCD', 'DABC')] - - def okn(o, n=4): - ostr = o.replace(',', ' ') - osplits = ostr.split() - if sum([c in osplits for c in string.ascii_uppercase[:n - 1]]) == n - 1: - return False - olower = o.lower() - olower = olower.replace(',', ' ') - olower_splits = olower.split() - if 'all' in olower_splits or 'none' in olower_splits: - return False - return True - - yay4, nay4 = [], [] - lt4 = len(data_4c) - for i in range(lt4): - if okn(data_4c.iloc[i]['D'], 4): - yay4.append(i) + n_opt = 2 + for i, ch in enumerate(string.ascii_uppercase): + if ch in data: + n_opt = ord(ch) - ord('A') + 1 else: - nay4.append(i) - data_4c_y = data_4c.iloc[yay4] - data_4c_n = data_4c.iloc[nay4] - data_3c = pd.concat([data_4c_n, data_3c]) - - yay3, nay3 = [], [] - lt3 = len(data_3c) - for i in range(lt3): - if okn(data_3c.iloc[i]['C'], 3): - yay3.append(i) + for j in range(i + 1, 26): + assert string.ascii_uppercase[j] not in data + groups = defaultdict(list) + for i in range(len(data)): + item = data.iloc[i] + this_n_opt = 0 + for j, ch in enumerate(string.ascii_uppercase[:n_opt]): + if not pd.isna(item[ch]): + this_n_opt = j + 1 + else: + for k in range(j + 1, n_opt): + assert pd.isna(item[string.ascii_uppercase[k]]), (k, item) + assert this_n_opt >= 2 or this_n_opt == 0 + flag = abnormal_entry(item) + if flag or this_n_opt == 0: + groups['abnormal'].append(item) + elif ord(item['answer']) - ord('A') + 1 > this_n_opt: + groups['abnormal'].append(item) else: - nay3.append(i) - data_3c_y = data_3c.iloc[yay3] - data_3c_n = data_3c.iloc[nay3] - data_2c = pd.concat([data_3c_n, data_2c]) - - def remap(data_in, tup, off): - off = int(off) - data = data_in.copy() - char_map = {k: v for k, v in zip(*tup)} - idx = data.pop('index') - answer = data.pop('answer') - answer_new = [char_map[x] if x in char_map else x for x in answer] - data['answer'] = answer_new - options = {} - for c in char_map: - options[char_map[c]] = data.pop(c) - for c in options: - data[c] = options[c] - data.pop('image') - data['image'] = idx - idx = [x + off for x in idx] - data['index'] = idx - return data - - data_all = pd.concat([ - data_2c, - data_3c_y, - data_4c_y, - remap(data_2c, map_2c[0], OFFSET), - remap(data_3c_y, map_3c[0], OFFSET), - remap(data_4c_y, map_4c[0], OFFSET), - remap(data_3c_y, map_3c[1], OFFSET * 2), - remap(data_4c_y, map_4c[1], OFFSET * 2), - remap(data_4c_y, map_4c[2], OFFSET * 3), - ]) - - tgt_file = inp.replace('.tsv', '_CIRC.tsv') + groups[this_n_opt].append(item) + for k in groups: + groups[k] = pd.concat(groups[k], axis=1).T + print(f'{k if k == "abnormal" else str(k) + "-choice"} records: {len(groups[k])}') + + data_all = [] + + for k in groups: + if k == 'abnormal': + warnings.warn( + f"{len(groups['abnormal'])} abnormal entries detected. The problems can be: " + "1. Choice labels found in some choice contents; 2. No choices found for this question; " + "3. The answer is not a valid choice. Will not apply circular to those samples." + ) + abdata = groups['abnormal'] + abdata['g_index'] = abdata['index'] + data_all.append(abdata) + else: + cir_data = [] + assert isinstance(k, int) and k >= 2 + labels = string.ascii_uppercase[:k] + rotates = [labels] + dq = deque(labels) + for i in range(k - 1): + dq.rotate(1) + rotates.append(list(dq)) + for i, rot in enumerate(rotates): + if i == 0: + data = groups[k].copy() + data['g_index'] = data['index'] + cir_data.append(data) + else: + try: + data = groups[k].copy() + data['index'] = [x + OFFSET * i for x in data['index']] + data['g_index'] = [x % OFFSET for x in data['index']] + c_map = {k: v for k, v in zip(rotates[0], rot)} + data['answer'] = [c_map[x] for x in data['answer']] + for s, t in c_map.items(): + data[t] = groups[k][s] + cir_data.append(data) + except: + print(set(data['answer'])) + raise NotImplementedError + data_all.append(pd.concat(cir_data)) + data_all = pd.concat(data_all) + data_all['index'] = [int(x) for x in data_all['index']] + data_all['g_index'] = [int(x) for x in data_all['g_index']] + + tgt_file = inp.replace('.tsv', '_circular.tsv') dump(data_all, tgt_file) - print(f'The circularized data is saved to {tgt_file}') + print(f'Processed data are saved to {tgt_file}: {len(load(inp))} raw records, {len(data_all)} circularized records.') assert osp.exists(tgt_file) print(f'The MD5 for the circularized data is {md5(tgt_file)}') From fd7a88a36e7876fe55a82c871cb5be55a0e3d340 Mon Sep 17 00:00:00 2001 From: kennymckormick Date: Thu, 6 Feb 2025 14:13:31 +0800 Subject: [PATCH 3/3] [Refine] Update CircularEval --- vlmeval/dataset/utils/multiple_choice.py | 15 +++++++---- vlmeval/smp/vlm.py | 33 ------------------------ 2 files changed, 10 insertions(+), 38 deletions(-) diff --git a/vlmeval/dataset/utils/multiple_choice.py b/vlmeval/dataset/utils/multiple_choice.py index 32c060b8..ce0d29c1 100644 --- a/vlmeval/dataset/utils/multiple_choice.py +++ b/vlmeval/dataset/utils/multiple_choice.py @@ -406,18 +406,23 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): for idx in list(meta['index']) + list(data['index']): assert istype(idx, int) + if 'g_index' not in data: + data['g_index'] = [int(x % 1e6) for x in data['index']] # Only keep those lines in the meta data data = data[data['index'].isin(answer_map)] data['GT'] = [answer_map[idx] for idx in data['index']] - data_main = data[data['index'] < int(1e6)] - + + data['tmp_flag'] = [x == y for x, y in zip(data['index'], data['g_index'])] + data_main = data[data['tmp_flag']] + data_main.pop('tmp_flag') + data_groups = [] for i in range(len(data_main)): # Dealing with the normal part idx = data_main.iloc[i]['index'] if idx not in result: - sub_data = data[data['index'] % int(1e6) == idx] + sub_data = data[data['g_index'] == idx] data_groups.append(sub_data) if len(data_groups): @@ -425,13 +430,13 @@ def mcq_circular_eval(model, data, meta, nproc, result_file, dataset_name=None): remain = [] for dg, pf in zip(data_groups, prefetched): if pf is not None: - result[dg.iloc[0]['index'] % 1e6] = pf + result[dg.iloc[0]['g_index']] = pf else: remain.append(dg) dump(result, result_file) tups = [dict(model=model, sub_data=x, dataset_name=dataset_name) for x in remain] - keys = [x.iloc[0]['index'] % 1e6 for x in remain] + keys = [x.iloc[0]['g_index'] for x in remain] if len(tups) == 0: pass diff --git a/vlmeval/smp/vlm.py b/vlmeval/smp/vlm.py index f3dcdd7b..be14e31c 100644 --- a/vlmeval/smp/vlm.py +++ b/vlmeval/smp/vlm.py @@ -144,36 +144,3 @@ def gpt_key_set(): def apiok(wrapper): s = wrapper.generate('Hello!') return wrapper.fail_msg not in s - - -def circular_pred(df, extract_func=None): - if extract_func is None: - extract_func = lambda x: x # noqa: E731 - df = df.sort_values('index') - from vlmeval.utils import can_infer_option - - shift = int(1e6) - - choices = [extract_func(x) for x in df['prediction']] - pred_map = {i: c for i, c in zip(df['index'], choices)} - flag_map = {i: True for i in pred_map if i < 1e6} - valid_map = {i: True for i in pred_map if i < 1e6} - for i in df['index']: - if i >= shift and pred_map[i] and pred_map[i - shift]: - if pred_map[i] not in list( - string.ascii_uppercase - ) or pred_map[ # noqa: W504 - i - shift - ] not in list( - string.ascii_uppercase - ): - - valid_map[i % shift] = False - continue - if (ord(pred_map[i]) - ord(pred_map[i - shift])) % 4 == 1: - continue - else: - flag_map[i % shift] = False - flag_map = {k: v for k, v in flag_map.items() if valid_map[k]} - flags = list(flag_map.values()) - return np.mean(flags)