
Adds SpokeN-100-English #2342


Open · wants to merge 4 commits into base: maeb
2 changes: 1 addition & 1 deletion docs/create_tasks_table.py
@@ -79,7 +79,7 @@ def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str:
if lang in PROGRAMMING_LANGS:
lang = "code"
if table_dict.get(lang) is None:
-table_dict[lang] = {k: 0 for k in sorted(get_args(TASK_TYPE))}
+table_dict[lang] = dict.fromkeys(sorted(get_args(TASK_TYPE)), 0)
table_dict[lang][task.metadata.type] += 1

## Wrangle for polars
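Note for reviewers: the refactor applied throughout this PR swaps `{k: v for k in keys}` comprehensions for `dict.fromkeys(keys, v)`. The two are equivalent whenever the shared value is immutable, which holds for every hunk here (the values are 0, 0.0, False, None, 1, or a string literal). A minimal sketch of the equivalence and the one caveat to watch for:

```python
# Demonstration of why the dict-comprehension -> dict.fromkeys refactor
# is behavior-preserving for immutable values.

labels = ["Classification", "Clustering", "Retrieval"]

# The two forms produce identical dicts when the value is immutable.
assert {k: 0 for k in labels} == dict.fromkeys(labels, 0)

# With the value omitted, fromkeys defaults to None, matching the
# `{split: None for split in splits}` pattern replaced below.
assert dict.fromkeys(labels) == {k: None for k in labels}

# Caveat: every key shares the *same* value object, so the refactor is
# only safe for immutable values. A mutable default would alias:
shared = dict.fromkeys(labels, [])
shared["Classification"].append("x")
assert shared["Retrieval"] == ["x"]  # all three keys see the mutation
```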
1 change: 1 addition & 0 deletions mteb/abstasks/TaskMetadata.py
@@ -66,6 +66,7 @@
"Speaker Count Identification",
"Spoken Digit Classification",
"Gender Clustering",
"Vocal Sound Classification",
"Rendered semantic textual similarity",
"Sentiment Analysis",
"Intent Classification",
2 changes: 1 addition & 1 deletion mteb/abstasks/stratification.py
@@ -216,7 +216,7 @@ def _prepare_stratification(self, y: np.ndarray) -> tuple:
[self.percentage_per_fold[i] * self.n_samples for i in range(self.n_splits)]
)
rows = sp.lil_matrix(y).rows
-rows_used = {i: False for i in range(self.n_samples)}
+rows_used = dict.fromkeys(range(self.n_samples), False)
all_combinations = []
per_row_combinations = [[] for i in range(self.n_samples)]
samples_with_combination = {}
2 changes: 1 addition & 1 deletion mteb/evaluation/evaluators/RetrievalEvaluator.py
@@ -261,7 +261,7 @@ def search_cross_encoder(
logging.info(
f"previous_results is None. Using all the documents to rerank: {len(corpus)}"
)
-q_results = {doc_id: 0.0 for doc_id in corpus.keys()}
+q_results = dict.fromkeys(corpus.keys(), 0.0)
else:
q_results = self.previous_results[qid]
# take the top-k only
2 changes: 1 addition & 1 deletion mteb/leaderboard/table.py
@@ -200,7 +200,7 @@ def scores_to_tables(
joint_table_style = (
joint_table.style.format(
{
-**{column: "{:.2f}" for column in score_columns},
+**dict.fromkeys(score_columns, "{:.2f}"),
"Rank (Borda)": "{:.0f}",
"Zero-shot": format_zero_shot,
},
3 changes: 1 addition & 2 deletions mteb/task_aggregation.py
@@ -109,8 +109,7 @@ def borda_count(
results = results.to_legacy_dict()
n_candidates = sum(len(revs) for revs in results.values())
candidate_scores = {
-model: {revision: 0.0 for revision in revisions}
-for model, revisions in results.items()
+model: dict.fromkeys(revisions, 0.0) for model, revisions in results.items()
}

tasks = defaultdict(list) # {task_name: [(model, revision, score), ...]}
1 change: 1 addition & 0 deletions mteb/tasks/Audio/AudioClassification/__init__.py
@@ -10,6 +10,7 @@
from .eng.MridinghamStroke import *
from .eng.MridinghamTonic import *
from .eng.NSynth import *
+from .eng.SpokeN import *
from .eng.SpokenQAforIC import *
from .eng.VoxCelebSA import *
from .eng.VoxLingua107Top10 import *
47 changes: 47 additions & 0 deletions mteb/tasks/Audio/AudioClassification/eng/SpokeN.py
@@ -0,0 +1,47 @@
from __future__ import annotations

from mteb.abstasks.Audio.AbsTaskAudioClassification import (
AbsTaskAudioClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class SpokeNEnglishClassification(AbsTaskAudioClassification):
metadata = TaskMetadata(
name="SpokeNEnglish",
description="Classification of spoken English numbers (0 to 99) from the SpokeN-100 benchmark.",
reference="https://zenodo.org/records/10810044",
dataset={
"path": "mteb/SpokeN-100-English",
"revision": "afbff14d927de14412d8124502313ea6d9d140e0",
},
type="AudioClassification",
category="a2t",
eval_splits=["train"],
# Review comment (Member): I think this task should be with cross validation
eval_langs=["eng-Latn"],
main_score="accuracy",
date=("2024-01-01", "2024-01-01"),
domains=["Spoken"],
task_subtypes=["Vocal Sound Classification"],
license="cc-by-sa-4.0",
annotations_creators="LM-generated",
dialect=[],
modalities=["audio"],
sample_creation="found",
bibtex_citation="""@misc{groh2024spoken100crosslingualbenchmarkingdataset,
title={SpokeN-100: A Cross-Lingual Benchmarking Dataset for The Classification of Spoken Numbers in Different Languages},
author={René Groh and Nina Goes and Andreas M. Kist},
year={2024},
eprint={2403.09753},
archivePrefix={arXiv},
primaryClass={cs.SD},
url={https://arxiv.org/abs/2403.09753},
}""",
descriptive_stats={
"n_samples": {"train": 3200},
},
)

audio_column_name: str = "audio"
label_column_name: str = "label"
samples_per_label: int = 32
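For completeness, a hedged sketch of how the new task might be invoked once merged. It assumes the maeb branch exposes audio tasks through the same entry points mteb uses for text tasks (`mteb.get_tasks`, `mteb.get_model`, `MTEB.run`); the model name is a placeholder, not a recommendation:

```python
# Sketch only: assumes audio tasks run through the standard mteb workflow.
import mteb

tasks = mteb.get_tasks(tasks=["SpokeNEnglish"])  # name from TaskMetadata above
model = mteb.get_model("some/audio-embedding-model")  # placeholder model name
evaluation = mteb.MTEB(tasks=tasks)
results = evaluation.run(model, output_folder="results")
```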
@@ -24,9 +24,9 @@
def _load_wit_data(
path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None
):
-corpus = {lang: {split: None for split in splits} for lang in langs}
-queries = {lang: {split: None for split in splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in splits} for lang in langs}
+corpus = {lang: dict.fromkeys(splits) for lang in langs}
+queries = {lang: dict.fromkeys(splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

split = "test"

@@ -21,9 +21,9 @@
def _load_xflickrco_data(
path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None
):
-corpus = {lang: {split: None for split in splits} for lang in langs}
-queries = {lang: {split: None for split in splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in splits} for lang in langs}
+corpus = {lang: dict.fromkeys(splits) for lang in langs}
+queries = {lang: dict.fromkeys(splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

split = "test"

@@ -49,9 +49,9 @@
def _load_xm3600_data(
path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None
):
-corpus = {lang: {split: None for split in splits} for lang in langs}
-queries = {lang: {split: None for split in splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in splits} for lang in langs}
+corpus = {lang: dict.fromkeys(splits) for lang in langs}
+queries = {lang: dict.fromkeys(splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

split = "test"

4 changes: 1 addition & 3 deletions mteb/tasks/Retrieval/dan/TwitterHjerneRetrieval.py
@@ -87,9 +87,7 @@ def dataset_transform(self) -> None:
answer_id = str(text2id[a])
answer_ids.append(answer_id)

-self.relevant_docs[split][query_id] = {
-answer_id: 1 for answer_id in answer_ids
-}
+self.relevant_docs[split][query_id] = dict.fromkeys(answer_ids, 1)


def answers_to_list(example: dict) -> dict:
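The `dict.fromkeys(answer_ids, 1)` call above builds the per-query relevance judgments; mteb retrieval qrels are nested mappings of the form `{split: {query_id: {doc_id: relevance}}}`. A small illustration with invented IDs:

```python
# Illustrative qrels shape produced by the line above (IDs invented).
answer_ids = ["d1", "d7", "d9"]
relevant_docs = {"train": {"q42": dict.fromkeys(answer_ids, 1)}}
assert relevant_docs["train"]["q42"] == {"d1": 1, "d7": 1, "d9": 1}
```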
3 changes: 2 additions & 1 deletion mteb/tasks/Retrieval/deu/GerDaLIRRetrieval.py
@@ -72,7 +72,8 @@ def load_data(self, **kwargs):
self.corpus = {self._EVAL_SPLIT: {row["_id"]: row for row in corpus_rows}}
self.relevant_docs = {
self._EVAL_SPLIT: {
row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows
row["_id"]: dict.fromkeys(row["text"].split(" "), 1)
for row in qrels_rows
}
}

2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py
@@ -82,7 +82,7 @@ def load_data(self, **kwargs):
existing_docs=all_docs,
)
corpus.update(neg_docs)
-relevant_docs[q_id] = {k: 1 for k in pos_docs}
+relevant_docs[q_id] = dict.fromkeys(pos_docs, 1)
corpus = {
key: doc.get("title", "") + " " + doc["text"] for key, doc in corpus.items()
}
8 changes: 3 additions & 5 deletions mteb/tasks/Retrieval/eng/BrightRetrieval.py
@@ -75,11 +75,9 @@ def load_bright_data(
cache_dir: str = None,
revision: str = None,
):
-corpus = {domain: {split: None for split in eval_splits} for domain in DOMAINS}
-queries = {domain: {split: None for split in eval_splits} for domain in DOMAINS}
-relevant_docs = {
-domain: {split: None for split in eval_splits} for domain in DOMAINS
-}
+corpus = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS}
+queries = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS}
+relevant_docs = {domain: dict.fromkeys(eval_splits) for domain in DOMAINS}

for domain in domains:
domain_corpus = datasets.load_dataset(
12 changes: 3 additions & 9 deletions mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py
@@ -120,15 +120,9 @@ def load_data(self, **kwargs):
cache_dir = kwargs.get("cache_dir", None)

# Iterate over splits and languages
-corpus = {
-language: {split: None for split in eval_splits} for language in languages
-}
-queries = {
-language: {split: None for split in eval_splits} for language in languages
-}
-relevant_docs = {
-language: {split: None for split in eval_splits} for language in languages
-}
+corpus = {language: dict.fromkeys(eval_splits) for language in languages}
+queries = {language: dict.fromkeys(eval_splits) for language in languages}
+relevant_docs = {language: dict.fromkeys(eval_splits) for language in languages}
for split in eval_splits:
# Since this is a cross-lingual dataset, the corpus and the relevant documents do not depend on the language
split_corpus = self._load_corpus(split=split, cache_dir=cache_dir)
12 changes: 6 additions & 6 deletions mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py
@@ -34,9 +34,9 @@
def _load_miracl_data(
path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None
):
-corpus = {lang: {split: None for split in splits} for lang in langs}
-queries = {lang: {split: None for split in splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in splits} for lang in langs}
+corpus = {lang: dict.fromkeys(splits) for lang in langs}
+queries = {lang: dict.fromkeys(splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

split = _EVAL_SPLIT

@@ -156,9 +156,9 @@ def load_data(self, **kwargs):
def _load_miracl_data_hard_negatives(
path: str, langs: list, splits: str, cache_dir: str = None, revision: str = None
):
-corpus = {lang: {split: None for split in splits} for lang in langs}
-queries = {lang: {split: None for split in splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in splits} for lang in langs}
+corpus = {lang: dict.fromkeys(splits) for lang in langs}
+queries = {lang: dict.fromkeys(splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

split = _EVAL_SPLIT

6 changes: 3 additions & 3 deletions mteb/tasks/Retrieval/multilingual/MultiLongDocRetrieval.py
@@ -32,9 +32,9 @@ def load_mldr_data(
cache_dir: str = None,
revision: str = None,
):
-corpus = {lang: {split: None for split in eval_splits} for lang in langs}
-queries = {lang: {split: None for split in eval_splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs}
+corpus = {lang: dict.fromkeys(eval_splits) for lang in langs}
+queries = {lang: dict.fromkeys(eval_splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs}

for lang in langs:
lang_corpus = datasets.load_dataset(
12 changes: 6 additions & 6 deletions mteb/tasks/Retrieval/multilingual/NeuCLIR2022Retrieval.py
@@ -24,9 +24,9 @@ def load_neuclir_data(
cache_dir: str | None = None,
revision: str | None = None,
):
-corpus = {lang: {split: None for split in eval_splits} for lang in langs}
-queries = {lang: {split: None for split in eval_splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs}
+corpus = {lang: dict.fromkeys(eval_splits) for lang in langs}
+queries = {lang: dict.fromkeys(eval_splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs}

for lang in langs:
lang_corpus = datasets.load_dataset(
@@ -110,9 +110,9 @@ def load_neuclir_data_hard_negatives(
revision: str | None = None,
):
split = "test"
-corpus = {lang: {split: None for split in eval_splits} for lang in langs}
-queries = {lang: {split: None for split in eval_splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs}
+corpus = {lang: dict.fromkeys(eval_splits) for lang in langs}
+queries = {lang: dict.fromkeys(eval_splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs}

for lang in langs:
corpus_identifier = f"corpus-{lang}"
12 changes: 6 additions & 6 deletions mteb/tasks/Retrieval/multilingual/NeuCLIR2023Retrieval.py
@@ -24,9 +24,9 @@ def load_neuclir_data(
cache_dir: str | None = None,
revision: str | None = None,
):
-corpus = {lang: {split: None for split in eval_splits} for lang in langs}
-queries = {lang: {split: None for split in eval_splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs}
+corpus = {lang: dict.fromkeys(eval_splits) for lang in langs}
+queries = {lang: dict.fromkeys(eval_splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs}

for lang in langs:
lang_corpus = datasets.load_dataset(
@@ -111,9 +111,9 @@ def load_neuclir_data_hard_negatives(
revision: str | None = None,
):
split = "test"
-corpus = {lang: {split: None for split in eval_splits} for lang in langs}
-queries = {lang: {split: None for split in eval_splits} for lang in langs}
-relevant_docs = {lang: {split: None for split in eval_splits} for lang in langs}
+corpus = {lang: dict.fromkeys(eval_splits) for lang in langs}
+queries = {lang: dict.fromkeys(eval_splits) for lang in langs}
+relevant_docs = {lang: dict.fromkeys(eval_splits) for lang in langs}

for lang in langs:
corpus_identifier = f"corpus-{lang}"
2 changes: 1 addition & 1 deletion mteb/tasks/Retrieval/multilingual/XMarketRetrieval.py
@@ -54,7 +54,7 @@ def _load_xmarket_data(
corpus[lang][split] = {row["_id"]: row for row in corpus_rows}
queries[lang][split] = {row["_id"]: row["text"] for row in query_rows}
relevant_docs[lang][split] = {
row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows
row["_id"]: dict.fromkeys(row["text"].split(" "), 1) for row in qrels_rows
}

corpus = datasets.DatasetDict(corpus)
3 changes: 2 additions & 1 deletion mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2P.py
@@ -82,7 +82,8 @@ def load_data(self, **kwargs):
self.corpus = {"test": {row["_id"]: row for row in corpus_rows}}
self.relevant_docs = {
"test": {
row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows
row["_id"]: dict.fromkeys(row["text"].split(" "), 1)
for row in qrels_rows
}
}

3 changes: 2 additions & 1 deletion mteb/tasks/Retrieval/spa/SpanishPassageRetrievalS2S.py
@@ -79,7 +79,8 @@ def load_data(self, **kwargs):
self.corpus = {"test": {row["_id"]: row for row in corpus_rows}}
self.relevant_docs = {
"test": {
row["_id"]: {v: 1 for v in row["text"].split(" ")} for row in qrels_rows
row["_id"]: dict.fromkeys(row["text"].split(" "), 1)
for row in qrels_rows
}
}

2 changes: 1 addition & 1 deletion tests/test_reproducible_workflow.py
@@ -55,7 +55,7 @@ def test_validate_task_to_prompt_name(task_name: str | mteb.AbsTask):
else:
task_names = [task_name]

-model_prompts = {task_name: "prompt_name" for task_name in task_names}
+model_prompts = dict.fromkeys(task_names, "prompt_name")
model_prompts |= {task_name + "-query": "prompt_name" for task_name in task_names}
model_prompts |= {task_name + "-passage": "prompt_name" for task_name in task_names}
model_prompts |= {
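The test seeds a base mapping with `dict.fromkeys` and then extends it with the in-place dict merge operator `|=` (Python 3.9+). A minimal sketch of that seed-then-merge pattern, with invented task names:

```python
# Minimal sketch of the seed-then-merge pattern used in the test
# (task names invented for illustration).
task_names = ["TaskA", "TaskB"]
model_prompts = dict.fromkeys(task_names, "prompt_name")
model_prompts |= {name + "-query": "prompt_name" for name in task_names}
model_prompts |= {name + "-passage": "prompt_name" for name in task_names}
assert model_prompts["TaskA-query"] == "prompt_name"
```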