Skip to content

Commit

Permalink
refactor: update TaskMetadata (#1076)
Browse files Browse the repository at this point in the history
* update TaskMetadata.py

* update _add_stats.grit

* update _add_stats.grit

* update 415 files

* update _add_stats.grit and TaskMetadata.py

* update _add_stats.grit

* update 86 files

* update 6 files update 341 files

* delete 1 file and update 416 files

* misc.
  • Loading branch information
MartinBernstorff committed Jul 12, 2024
1 parent 86eb18c commit 57c1c12
Show file tree
Hide file tree
Showing 417 changed files with 9,580 additions and 9,097 deletions.
14 changes: 6 additions & 8 deletions docs/adding_a_dataset.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ class SciDocsReranking(AbsTaskReranking):
reference="https://allenai.org/data/scidocs",
type="Reranking",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_langs=["eng-Latn"],
main_score="map",
Expand All @@ -38,16 +39,13 @@ class SciDocsReranking(AbsTaskReranking):
"revision": "d3c5e1fc0b855ab6097bf1cda04dd73947d7caab",
}
date=("2000-01-01", "2020-12-31"), # best guess
form="written",
domains=["Academic", "Non-fiction"],
domains=["Academic", "Non-fiction", "Written"],
task_subtypes=["Scientific Reranking"],
license="cc-by-4.0",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
text_creation="found",
n_samples={"test": 19599},
avg_character_length={"test": 69.0},
sample_creation="found",
descriptive_stats={"n_samples": {"test": 19599}, "avg_character_length": {"test": 69.0}},
bibtex_citation="""
@inproceedings{cohan-etal-2020-specter,
title = "{SPECTER}: Document-level Representation Learning using Citation-informed Transformers",
Expand Down Expand Up @@ -94,6 +92,7 @@ class VGClustering(AbsTaskClustering):
reference="https://huggingface.co/datasets/navjordj/VG_summarization",
type="Clustering",
category="p2p",
modalities=["text"],
eval_splits=["test"],
eval_langs=["nob-Latn"],
main_score="v_measure",
Expand All @@ -102,11 +101,10 @@ class VGClustering(AbsTaskClustering):
"revision": "d4c5a8ba10ae71224752c727094ac4c46947fa29",
},
date=("2012-01-01", "2020-01-01"),
form="written",
form="Written",
domains=["Academic", "Non-fiction"],
task_subtypes=["Scientific Reranking"],
license="cc-by-nc",
socioeconomic_status="high",
annotations_creators="derived",
dialect=[],
sample_creation="found",
Expand Down
1 change: 1 addition & 0 deletions docs/mmteb/points/1076.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"GitHub": "MartinBernstorff", "Bug fixes": 6}
1 change: 0 additions & 1 deletion mteb/abstasks/AbsTaskPairClassification.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from __future__ import annotations

import logging
from collections import defaultdict

from datasets import Dataset

Expand Down
31 changes: 12 additions & 19 deletions mteb/abstasks/TaskMetadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
from datetime import date
from typing import Any, List, Mapping, Union
from typing import Any, Dict, List, Mapping, Optional, Union

from pydantic import AnyUrl, BaseModel, BeforeValidator, TypeAdapter, field_validator
from typing_extensions import Annotated, Literal
Expand Down Expand Up @@ -57,10 +57,12 @@
"Spoken",
"Subtitles",
"Web",
"Written",
"Programming",
None,
]

TEXT_CREATION_METHOD = Literal[
SAMPLE_CREATION_METHOD = Literal[
"found",
"created",
"machine-translated",
Expand All @@ -71,13 +73,6 @@
"LM-generated and verified",
]

SOCIOECONOMIC_STATUS = Literal[
"high",
"medium",
"low",
"mixed",
]

TASK_TYPE = Literal[
"BitextMining",
"Classification",
Expand Down Expand Up @@ -134,6 +129,9 @@
"shell",
]

METRIC_NAME = str
METRIC_VALUE = Union[int, float, Dict[str, Any]]

logger = logging.getLogger(__name__)


Expand All @@ -155,16 +153,15 @@ class TaskMetadata(BaseModel):
huggingface dataset contain different languages).
main_score: The main score used for evaluation.
date: The date when the data was collected. Specified as a tuple of two dates.
form: The form of the data. Either "spoken", "written".
domains: The domains of the data. These include "Non-fiction", "Social", "Fiction", "News", "Academic", "Blog", "Encyclopaedic",
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken". A dataset can belong to multiple domains.
"Government", "Legal", "Medical", "Poetry", "Religious", "Reviews", "Web", "Spoken", "Written". A dataset can belong to multiple domains.
task_subtypes: The subtypes of the task. E.g. includes "Sentiment/Hate speech", "Thematic Clustering". Feel free to update the list as needed.
license: The license of the data.
socioeconomic_status: The socioeconomic status of the data. Includes "high", "medium", "low", "mixed".
annotations_creators: The type of the annotators. Includes "expert-annotated" (annotated by experts), "human-annotated" (annotated e.g. by
mturkers), "derived" (derived from structure in the data).
dialect: The dialect of the data, if applicable. Ideally specified as a BCP-47 language tag. Empty list if no dialects are present.
text_creation: The method of text creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and
sample_creation: The method of sample creation. Includes "found", "created", "machine-translated", "machine-translated and verified", and
"machine-translated and localized".
bibtex_citation: The BibTeX citation for the dataset. Should be an empty string if no citation is available.
n_samples: The number of samples in the dataset. This should only be for the splits evaluated on. For retrieval tasks, this should be the
Expand All @@ -178,6 +175,7 @@ class TaskMetadata(BaseModel):
name: str
description: str
type: TASK_TYPE
modalities: list[Literal["text"]]
category: TASK_CATEGORY
reference: STR_URL | None # URL to documentation, e.g. published paper

Expand All @@ -186,22 +184,17 @@ class TaskMetadata(BaseModel):
main_score: str # Might want a literal here

date: tuple[STR_DATE, STR_DATE] | None # When the data was collected
form: list[Literal["spoken", "written"]] | None
domains: list[TASK_DOMAIN] | None
task_subtypes: list[TASK_SUBTYPE] | None
license: str | None

socioeconomic_status: SOCIOECONOMIC_STATUS | None
annotations_creators: ANNOTATOR_TYPE | None
dialect: list[str] | None

text_creation: TEXT_CREATION_METHOD | None
sample_creation: SAMPLE_CREATION_METHOD | None
bibtex_citation: str | None

n_samples: dict[SPLIT_NAME, int] | None
avg_character_length: (
Union[dict[SPLIT_NAME, float], dict[SPLIT_NAME, dict[str, Any]]] | None
)
descriptive_stats: dict[METRIC_NAME, Optional[dict[SPLIT_NAME, METRIC_VALUE]]]

@field_validator("dataset")
def _check_dataset_path_is_specified(cls, dataset):
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/dan/BornholmskBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,17 @@ class BornholmBitextMining(AbsTaskBitextMining):
reference="https://aclanthology.org/W19-6138/",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_langs=["dan-Latn"],
main_score="f1",
date=("2019-01-01", "2019-12-31"),
form=["written"],
domains=["Web", "Social", "Fiction"],
domains=["Web", "Social", "Fiction", "Written"],
license="CC-BY-4.0",
task_subtypes=["Dialect pairing"],
socioeconomic_status="mixed",
annotations_creators="expert-annotated",
dialect=["da-dan-bornholm"],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{derczynskiBornholmskNaturalLanguage2019,
title = {Bornholmsk natural language processing: Resources and tools},
Expand All @@ -42,8 +41,10 @@ class BornholmBitextMining(AbsTaskBitextMining):
file = {Available Version (via Google Scholar):/Users/au554730/Zotero/storage/FBQ73ZYN/Derczynski and Kjeldsen - 2019 - Bornholmsk natural language processing Resources .pdf:application/pdf},
}
""",
avg_character_length={"test": 89.7},
n_samples={"test": 500},
descriptive_stats={
"n_samples": {"test": 500},
"avg_character_length": {"test": 89.7},
},
)

def dataset_transform(self):
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/kat/TbilisiCityHallBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,23 @@ class TbilisiCityHallBitextMining(AbsTaskBitextMining, MultilingualTask):
description="Parallel news titles from the Tbilisi City Hall website (https://tbilisi.gov.ge/).",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=[_EVAL_SPLIT],
eval_langs=_EVAL_LANGS,
main_score="f1",
domains=["News"],
text_creation="created",
n_samples={_EVAL_SPLIT: 1820},
domains=["News", "Written"],
sample_creation="created",
reference="https://huggingface.co/datasets/jupyterjazz/tbilisi-city-hall-titles",
date=("2024-05-02", "2024-05-03"),
form=["written"],
task_subtypes=[],
license="Not specified",
socioeconomic_status="mixed",
annotations_creators="derived",
dialect=[],
bibtex_citation="",
avg_character_length={_EVAL_SPLIT: 78},
descriptive_stats={
"n_samples": {_EVAL_SPLIT: 1820},
"avg_character_length": {_EVAL_SPLIT: 78},
},
)

def load_data(self, **kwargs) -> None:
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,17 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLITS,
eval_langs=_LANGUAGES,
main_score="f1",
date=("2017-01-01", "2018-12-31"),
form=["written"],
domains=[],
domains=["Written"],
task_subtypes=[],
license="unknown",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="human-translated",
sample_creation="human-translated",
bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview,
title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora",
author = "Zweigenbaum, Pierre and
Expand All @@ -57,8 +56,10 @@ class BUCCBitextMining(AbsTaskBitextMining, MultilingualTask):
pages = "60--67",
abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.",
}""",
n_samples={"test": 641684},
avg_character_length={"test": 101.3},
descriptive_stats={
"n_samples": {"test": 641684},
"avg_character_length": {"test": 101.3},
},
)

def dataset_transform(self):
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BUCCBitextMiningFast.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,17 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask):
reference="https://comparable.limsi.fr/bucc2018/bucc2018-task.html",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLITS,
eval_langs=_LANGUAGES,
main_score="f1",
date=("2017-01-01", "2018-12-31"),
form=["written"],
domains=[],
domains=["Written"],
task_subtypes=[],
license="unknown",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="human-translated",
sample_creation="human-translated",
bibtex_citation="""@inproceedings{zweigenbaum-etal-2017-overview,
title = "Overview of the Second {BUCC} Shared Task: Spotting Parallel Sentences in Comparable Corpora",
author = "Zweigenbaum, Pierre and
Expand All @@ -57,6 +56,8 @@ class BUCCBitextMiningFast(AbsTaskBitextMining, MultilingualTask):
pages = "60--67",
abstract = "This paper presents the BUCC 2017 shared task on parallel sentence extraction from comparable corpora. It recalls the design of the datasets, presents their final construction and statistics and the methods used to evaluate system results. 13 runs were submitted to the shared task by 4 teams, covering three of the four proposed language pairs: French-English (7 runs), German-English (3 runs), and Chinese-English (3 runs). The best F-scores as measured against the gold standard were 0.84 (German-English), 0.80 (French-English), and 0.43 (Chinese-English). Because of the design of the dataset, in which not all gold parallel sentence pairs are known, these are only minimum values. We examined manually a small sample of the false negative sentence pairs for the most precise French-English runs and estimated the number of parallel sentence pairs not yet in the provided gold standard. Adding them to the gold standard leads to revised estimates for the French-English F-scores of at most +1.5pt. This suggests that the BUCC 2017 datasets provide a reasonable approximate evaluation of the parallel sentence spotting task.",
}""",
n_samples={"test": 641684},
avg_character_length={"test": 101.3},
descriptive_stats={
"n_samples": {"test": 641684},
"avg_character_length": {"test": 101.3},
},
)
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/multilingual/BibleNLPBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -872,21 +872,22 @@ class BibleNLPBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://arxiv.org/abs/2304.09919",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLIT,
eval_langs=_LANGUAGES_MAPPING,
main_score="f1",
# World English Bible (WEB) first draft 1997, finished 2020
date=("1997-01-01", "2020-12-31"),
form=["written"],
domains=["Religious"],
domains=["Religious", "Written"],
task_subtypes=[],
license="CC-BY-SA-4.0",
socioeconomic_status="medium",
annotations_creators="expert-annotated",
dialect=[],
text_creation="created",
n_samples={"train": _N},
avg_character_length={"train": 120},
sample_creation="created",
descriptive_stats={
"n_samples": {"train": _N},
"avg_character_length": {"train": 120},
},
bibtex_citation="""@article{akerman2023ebible,
title={The eBible Corpus: Data and Model Benchmarks for Bible Translation for Low-Resource Languages},
author={Akerman, Vesa and Baines, David and Daspit, Damien and Hermjakob, Ulf and Jang, Taeho and Leong, Colin and Martin, Michael and Mathew, Joel and Robie, Jonathan and Schwarting, Marcus},
Expand Down
10 changes: 4 additions & 6 deletions mteb/tasks/BitextMining/multilingual/DiaBLaBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,21 +19,20 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://inria.hal.science/hal-03021633",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=["test"],
eval_langs={
"fr-en": ["fra-Latn", "eng-Latn"],
"en-fr": ["eng-Latn", "fra-Latn"],
},
main_score="f1",
date=("2016-01-01", "2017-12-31"),
form=["written"],
domains=["Social"],
domains=["Social", "Written"],
task_subtypes=[],
license="CC BY-NC-SA 4.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{gonzalez2019diabla,
title={DiaBLa: A Corpus of Bilingual Spontaneous Written Dialogues for Machine Translation},
Expand All @@ -43,8 +42,7 @@ class DiaBLaBitextMining(AbsTaskBitextMining, MultilingualTask):
year={2019}
}
""",
n_samples={},
avg_character_length={},
descriptive_stats={"n_samples": {}, "avg_character_length": {}},
)

def load_data(self, **kwargs):
Expand Down
13 changes: 7 additions & 6 deletions mteb/tasks/BitextMining/multilingual/FloresBitextMining.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,18 +248,17 @@ class FloresBitextMining(AbsTaskBitextMining, MultilingualTask):
reference="https://huggingface.co/datasets/facebook/flores",
type="BitextMining",
category="s2s",
modalities=["text"],
eval_splits=_SPLIT,
eval_langs=_LANGUAGES_MAPPING,
main_score="f1",
date=("2022-01-01", "2022-12-31"),
form=["written"],
domains=["Non-fiction", "Encyclopaedic"],
domains=["Non-fiction", "Encyclopaedic", "Written"],
task_subtypes=[],
license="CC BY-SA 4.0",
socioeconomic_status="mixed",
annotations_creators="human-annotated",
dialect=[],
text_creation="created",
sample_creation="created",
bibtex_citation="""
@inproceedings{goyal2022flores,
title={The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation},
Expand All @@ -269,8 +268,10 @@ class FloresBitextMining(AbsTaskBitextMining, MultilingualTask):
year={2022}
}
""",
n_samples={"dev": 997, "devtest": 1012},
avg_character_length={},
descriptive_stats={
"n_samples": {"dev": 997, "devtest": 1012},
"avg_character_length": {},
},
)

def load_data(self, **kwargs: Any) -> None:
Expand Down
Loading

0 comments on commit 57c1c12

Please sign in to comment.