[v2] Update v2 again (#1864)
* fix: Add reported annotation and re-added public_training_data (#1846)

* fix: Add additional dataset annotations

* fix: re-added public training data

* update voyage annotations

* 1.29.11

Automatically generated by python-semantic-release

* fix: Leaderboard Refinements (#1849)

* Added better descriptions to benchmarks and removed beta tags

* Fixed zero-shot filtering on app loading

* Added zero-shot definition in an accordion

* NaN values are now filled with blanks

* Added type hints to filter_models

* 1.29.12

Automatically generated by python-semantic-release

* fix: Fixed leaderboard search bar (#1852)

Fixed leaderboard search bar

* 1.29.13

Automatically generated by python-semantic-release

* fix: Hotfixed public_training_data type annotation (#1857)

Fixed public_training_data flag type to include boolean, as this is how all models are annotated
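
For context, a minimal sketch of what this annotation change means in practice is shown below. It is an illustration only: the dataclass is a stand-in for mteb's actual ModelMeta, and the model names and URL are placeholders, not entries from the repository.

```python
from __future__ import annotations

from dataclasses import dataclass


# Illustrative stand-in for mteb's ModelMeta; not the library's real definition.
@dataclass
class ModelMetaSketch:
    name: str
    # After this fix the flag accepts a link to the released training data,
    # a plain boolean, or None when the information is unknown.
    public_training_data: str | bool | None = None


# Link form, used when the training data is publicly released somewhere.
linked = ModelMetaSketch(
    name="example-org/example-embedding-model",
    public_training_data="https://example.org/training-data",  # placeholder URL
)

# Boolean form, used by models that only state whether the data is public.
boolean = ModelMetaSketch(
    name="example-org/legacy-embedding-model",
    public_training_data=False,
)
```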

* fix: Fix zeta alpha mistral (#1736)

* fix zeta alpha mistral

* update use_instructions

* update training datasets

* Update mteb/models/e5_instruct.py

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* update float

* Update mteb/models/e5_instruct.py

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>

* Add more annotations (#1833)

* apply additions from #1794

* add annotations for rumodels

* add nomic training data

* fix metadata

* update rest of model meta

* fix bge reranker

* 1.29.14

Automatically generated by python-semantic-release

* fix: Adding missing model meta (#1856)

* Added CDE models

* Added bge-en-icl

* Updated CDE to bge_full_data

* Fixed public_training_data flag type to include boolean, as this is how all models are annotated

* Added public training data link instead of bool to CDE and BGE

* Added GME models

* Changed Torch to PyTorch

* Added metadata on LENS models

* Added ember_v1

* Added metadata for amazon titan

* Removed GME implementation

* 1.29.15

Automatically generated by python-semantic-release

* fix: Added correct training data annotation to LENS (#1859)

Added correct training data annotation to LENS
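
For readers unfamiliar with these annotations, a hedged sketch of a training data annotation is shown below. The mapping format (MTEB task name to training splits) is an assumption based on how other models in the repository are annotated, and the entries are placeholders rather than the verified LENS training sets.

```python
# Placeholder sketch of a training data annotation; not the actual LENS entry.
# Keys are MTEB task names, values are the splits the model was trained on.
training_datasets_sketch: dict[str, list[str]] = {
    "MSMARCO": ["train"],
    "NQ": ["train"],
    "HotpotQA": ["train"],
}
```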

* 1.29.16

Automatically generated by python-semantic-release

* lint

* fix meta

* fix meta

* fix empty model meta

* lint

---------

Co-authored-by: Kenneth Enevoldsen <[email protected]>
Co-authored-by: github-actions <[email protected]>
Co-authored-by: Márton Kardos <[email protected]>
4 people authored Jan 24, 2025
1 parent 6da8a13 commit f1d418c
Showing 47 changed files with 1,264 additions and 385 deletions.
69 changes: 48 additions & 21 deletions mteb/benchmarks/benchmarks.py
@@ -71,7 +71,7 @@ def load_results(


MTEB_EN = Benchmark(
name="MTEB(eng, beta)",
name="MTEB(eng)",
tasks=MTEBTasks(
get_tasks(
tasks=[
@@ -128,7 +128,13 @@ def load_results(
get_task("STS22.v2", eval_splits=["test"], hf_subsets=["en"]),
),
),
description="English benchmarks from MTEB",
description="""The new English Massive Text Embedding Benchmark.
This benchmark was created to account for the fact that many models have now been finetuned
on tasks in the original MTEB, and it contains tasks that are less frequently used for model training.
This way, the new benchmark and leaderboard can give our users a more realistic expectation of models' generalization performance.
The original MTEB leaderboard is available under the [MTEB(eng, classic)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%2C+classic%29) tab.
""",
citation="",
contacts=["KennethEnevoldsen", "Muennighoff"],
)
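
Since the beta suffix is dropped here, downstream code that selects this benchmark by name needs the new string. Below is a minimal usage sketch, assuming the public mteb API (get_benchmark, get_model, MTEB) and an arbitrary example model.

```python
import mteb

# Select the renamed benchmark; the name no longer carries the ", beta" suffix.
benchmark = mteb.get_benchmark("MTEB(eng)")

# Any supported model works here; this one is only an example.
model = mteb.get_model("sentence-transformers/all-MiniLM-L6-v2")

# Run the full benchmark and write results to disk.
evaluation = mteb.MTEB(tasks=benchmark)
results = evaluation.run(model, output_folder="results")
```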
@@ -216,7 +222,12 @@ def load_results(
get_task("STS22", eval_splits=["test"], hf_subsets=["en"]),
)
),
description="The original English benchmark by Muennighoff et al., (2023).",
description="""The original English benchmark by Muennighoff et al., (2023).
This page is an adaptation of the [old MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard).
> We recommend that you use [MTEB(eng)](http://mteb-leaderboard-2-demo.hf.space/?benchmark_name=MTEB%28eng%29) instead,
as many models have been tuned on MTEB(eng, classic) datasets, and MTEB(eng) might give a more accurate representation of models' generalization performance.
""",
citation="""@inproceedings{muennighoff-etal-2023-mteb,
title = "{MTEB}: Massive Text Embedding Benchmark",
author = "Muennighoff, Niklas and
@@ -275,7 +286,7 @@ def load_results(
"STS22",
],
),
description="Main Russian benchmarks from MTEB",
description="A Russian version of the Massive Text Embedding Benchmark with a number of novel Russian tasks in all task categories of the original MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
citation="""@misc{snegirev2024russianfocusedembeddersexplorationrumteb,
title={The Russian-focused embedders' exploration: ruMTEB benchmark and Russian embedding model design},
@@ -324,8 +335,8 @@ def load_results(
"LegalQuAD",
]
),
description="Legal benchmarks from MTEB.",
reference="https://aclanthology.org/2023.eacl-main.148/",
description="A benchmark of retrieval tasks in the legal domain.",
reference=None,
citation=None,
)

@@ -365,7 +376,10 @@ def load_results(
"Tatoeba",
]
),
description="BitextMining benchmark from MINERS",
description="""Bitext Mining texts from the MINERS benchmark, a benchmark designed to evaluate the
ability of multilingual LMs in semantic retrieval tasks,
including bitext mining and classification via retrieval-augmented contexts.
""",
reference="https://arxiv.org/pdf/2406.07424",
citation="""
@article{winata2024miners,
@@ -533,7 +547,7 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["fr"]),)
),
description="Main French benchmarks from MTEB",
description="MTEB-French, a French expansion of the original benchmark with high-quality native French datasets.",
reference="https://arxiv.org/abs/2405.20468",
citation="""@misc{ciancone2024mtebfrenchresourcesfrenchsentence,
title={MTEB-French: Resources for French Sentence Embedding Evaluation and Analysis},
@@ -581,7 +595,7 @@ def load_results(
"STS22",
],
),
description="Main German benchmarks from MTEB",
description="A benchmark for text-embedding performance in German.",
reference="https://arxiv.org/html/2401.02709v1",
citation="""@misc{wehrli2024germantextembeddingclustering,
title={German Text Embedding Clustering Benchmark},
@@ -613,7 +627,7 @@ def load_results(
"KorSTS",
],
),
description="Main Korean benchmarks from MTEB",
description="A benchmark and leaderboard for evaluation of text embedding in Korean.",
reference=None,
citation=None,
)
@@ -650,7 +664,11 @@ def load_results(
)
+ (get_task("STS22", eval_splits=["test"], hf_subsets=["pl"]),),
),
description="Main Polish benchmarks from MTEB",
description="""Polish Massive Text Embedding Benchmark (PL-MTEB), a comprehensive benchmark for text embeddings in Polish. The PL-MTEB consists of 28 diverse NLP
tasks from 5 task types, with tasks adapted from datasets previously used by the Polish
NLP community. In addition, a new PLSC (Polish Library of Science Corpus) dataset was created
consisting of titles and abstracts of scientific publications in Polish, which was used as the basis for
two novel clustering tasks.""", # Rephrased from the abstract
reference="https://arxiv.org/abs/2405.10138",
citation="""@article{poswiata2024plmteb,
title={PL-MTEB: Polish Massive Text Embedding Benchmark},
@@ -695,14 +713,14 @@ def load_results(
"typescript",
],
),
description="Main code benchmarks from MTEB",
description="A massive code embedding benchmark covering retrieval tasks in a miriad of popular programming languages.",
reference=None,
citation=None,
)


MTEB_multilingual = Benchmark(
name="MTEB(Multilingual, beta)",
name="MTEB(Multilingual)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -840,7 +858,7 @@ def load_results(
"MIRACLRetrievalHardNegatives",
],
),
description="The Multilingual benchmarks from MMTEB. Currently under development.",
description="A large-scale multilingual expansion of MTEB, driven mainly by highly-curated community contributions covering 250+ languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -875,7 +893,7 @@ def load_results(
"ESCIReranking",
],
),
description="Main Japanese benchmarks from MTEB",
description="JMTEB is a benchmark for evaluating Japanese text embedding models.",
reference="https://github.com/sbintuitions/JMTEB",
citation=None,
)
@@ -915,7 +933,7 @@ def load_results(
]

MTEB_INDIC = Benchmark(
name="MTEB(Indic, beta)",
name="MTEB(Indic)",
tasks=get_tasks(
tasks=[
# Bitext
@@ -952,7 +970,7 @@ def load_results(
languages=indic_languages,
exclusive_language_filter=True,
),
description="Main Indic benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on Indic languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1003,7 +1021,7 @@ def load_results(
]

MTEB_EU = Benchmark(
name="MTEB(Europe, beta)",
name="MTEB(Europe)",
tasks=get_tasks(
tasks=[
"BornholmBitextMining",
@@ -1084,7 +1102,7 @@ def load_results(
languages=eu_languages,
exclusive_language_filter=True,
),
description="Main European benchmark from MMTEB",
description="A regional geopolitical text embedding benchmark targetting embedding performance on European languages.",
reference=None,
citation=None,
contacts=["KennethEnevoldsen", "isaac-chung"],
@@ -1102,7 +1120,10 @@ def load_results(
"LEMBWikimQARetrieval",
],
),
description="The main benchmark for evaluating long document retrieval.",
description="""LongEmbed is a benchmark oriented at exploring models' performance on long-context retrieval.
The benchmark comprises two synthetic tasks and four carefully chosen real-world tasks,
featuring documents of varying length and dispersed target information.
""", # Pieced together from paper abstract.
reference="https://arxiv.org/abs/2404.12096v2",
citation="""@article{zhu2024longembed,
title={LongEmbed: Extending Embedding Models for Long Context Retrieval},
@@ -1117,7 +1138,13 @@ def load_results(
tasks=get_tasks(
tasks=["BrightRetrieval"],
),
description="A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.",
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
benchmark that requires intensive reasoning to retrieve relevant documents with
a dataset consisting of 1,384 real-world queries spanning diverse domains, such as
economics, psychology, mathematics, and coding. These queries are drawn from
naturally occurring and carefully curated human data.
""",
reference="https://brightbenchmark.github.io/",
citation="""@article{su2024bright,
title={Bright: A realistic and challenging benchmark for reasoning-intensive retrieval},