
Commit 692bd26

fix: Adding missing model meta (#1856)
* Added CDE models
* Added bge-en-icl
* Updated CDE to bge_full_data
* Fixed public_training_data flag type to include boolean, as this is how all models are annotated
* Added public training data link instead of bool to CDE and BGE
* Added GME models
* Changed Torch to PyTorch
* Added metadata on LENS models
* Added ember_v1
* Added metadata for amazon titan
* Removed GME implementation
1 parent fde446d commit 692bd26

File tree

6 files changed: +292, −0 lines changed

mteb/models/bge_models.py

Lines changed: 82 additions & 0 deletions
@@ -4,6 +4,8 @@
 
 from mteb.model_meta import ModelMeta, sentence_transformers_loader
 
+from .e5_instruct import E5_MISTRAL_TRAINING_DATA
+
 model_prompts = {"query": "Represent this sentence for searching relevant passages: "}
 model_prompts_zh = {"query": "为这个句子生成表示以用于检索相关文章:"}
 
@@ -496,3 +498,83 @@
     public_training_data=None,
     training_datasets=None,  # not disclosed
 )
+
+# Contents of cfli/bge-full-data
+bge_full_data = {
+    # source: https://arxiv.org/pdf/2409.15700
+    # Charles Goodhart is turning back and forth
+    # in his grave as I'm annotating this
+    # |Retrieval|
+    # ELI5
+    # SQuAD
+    # TriviaQA
+    # QuoraDuplicateQuestions
+    "HotpotQA": ["train"],
+    "FEVER": ["train"],
+    "MSMARCO": ["train"],
+    "NQ": ["train"],
+    "ArguAna": ["train"],
+    "FiQA2018": ["train"],
+    # |Reranking|
+    "SciDocsReranking": ["train"],
+    "StackOverflowDupQuestions": ["train"],
+    # |Classification|
+    "AmazonReviewsClassification": ["train"],
+    "AmazonCounterfactualClassification": ["train"],
+    "Banking77Classification": ["train"],
+    "EmotionClassification": ["train"],
+    "TweetSentimentExtractionClassification": ["train"],
+    "MTOPIntentClassification": ["train"],
+    "ImdbClassification": ["train"],
+    "ToxicConversationsClassification": ["train"],
+    # |Clustering|
+    "ArxivClusteringS2S": ["train"],
+    "ArxivClusteringP2P": ["train"],
+    "BiorxivClusteringS2S": ["train"],
+    "BiorxivClusteringP2P": ["train"],
+    "MedrxivClusteringS2S": ["train"],
+    "MedrxivClusteringP2P": ["train"],
+    "BiorxivClusteringS2S.v2": ["train"],
+    "BiorxivClusteringP2P.v2": ["train"],
+    "MedrxivClusteringS2S.v2": ["train"],
+    "MedrxivClusteringP2P.v2": ["train"],
+    "RedditClusteringP2P": ["train"],
+    "RedditClustering": ["train"],
+    "RedditClustering.v2": ["train"],
+    "TwentyNewsgroupsClustering": ["train"],
+    "TwentyNewsgroupsClustering.v2": ["train"],
+    # |STS|
+    "STS22": ["train"],
+    "STS22.v2": ["train"],
+    "STSBenchmark": ["train"],
+}
+
+bge_en_icl = ModelMeta(
+    loader=partial(
+        sentence_transformers_loader,
+        model_name="BAAI/bge-en-icl",
+        revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
+    ),
+    name="BAAI/bge-en-icl",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="971c7e1445cc86656ca0bd85ed770b8675a40bb5",
+    release_date="2024-07-25",  # initial commit of hf model
+    n_parameters=int(7.11 * 1e9),
+    embed_dim=4096,
+    license="apache-2.0",
+    max_tokens=32768,
+    reference="https://huggingface.co/BAAI/bge-en-icl",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers", "PyTorch"],
+    use_instructions=False,
+    public_training_code="https://github.com/FlagOpen/FlagEmbedding",
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+    training_datasets={
+        **E5_MISTRAL_TRAINING_DATA,
+        **bge_full_data,
+    },
+    adapted_from="intfloat/e5-mistral-7b-instruct",
+)
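For context, once this entry is registered it becomes discoverable through mteb's metadata lookup. A minimal sketch, assuming mteb.get_model_meta is the registry accessor (hypothetical usage, not part of this commit):

import mteb

# Fetch the newly added metadata entry by model name.
meta = mteb.get_model_meta("BAAI/bge-en-icl")
print(meta.revision)              # pinned HF revision
print(meta.public_training_data)  # link to cfli/bge-full-data
# training_datasets merges the E5-Mistral annotations with bge_full_data,
# so tasks the model trained on can be cross-checked:
print("HotpotQA" in meta.training_datasets)  # True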

mteb/models/cde_models.py

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+from __future__ import annotations
+
+import logging
+
+from mteb.model_meta import ModelMeta
+
+from .bge_models import bge_full_data
+
+logger = logging.getLogger(__name__)
+
+
+cde_small_v1 = ModelMeta(
+    loader=None,  # I will leave this at None for now
+    name="jxm/cde-small-v1",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="8d5736163718a8b65cd787b75ed61020d18bad3c",
+    release_date="2024-09-24",
+    n_parameters=int(281 * 1e6),  # Though the second-stage model is only 140M
+    max_tokens=512,
+    embed_dim=768,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers"],
+    reference="https://huggingface.co/jxm/cde-small-v1",
+    use_instructions=True,
+    adapted_from="nomic-ai/nomic-bert-2048",
+    superseded_by="jxm/cde-small-v2",
+    training_datasets=bge_full_data,
+    public_training_code="https://github.com/jxmorris12/cde",
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+)
+
+cde_small_v2 = ModelMeta(
+    loader=None,  # I will leave this at None for now
+    name="jxm/cde-small-v2",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="a7e5882ad52c27ea2831fc8258f24379c25cb459",
+    release_date="2025-01-13",
+    n_parameters=int(306 * 1e6),  # Though the second-stage model is only 140M
+    max_tokens=512,
+    embed_dim=768,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["Sentence Transformers"],
+    reference="https://huggingface.co/jxm/cde-small-v2",
+    use_instructions=True,
+    adapted_from="answerdotai/ModernBERT-base",
+    superseded_by=None,
+    training_datasets=bge_full_data,
+    public_training_code="https://github.com/jxmorris12/cde",
+    public_training_data="https://huggingface.co/datasets/cfli/bge-full-data",
+)
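These annotations are what make leakage bookkeeping possible: a benchmark run can cross-check its task list against the model's declared training data. A toy illustration (eval_tasks is a made-up list, not part of this commit):

from mteb.models.cde_models import cde_small_v1

eval_tasks = ["HotpotQA", "SciFact", "STSBenchmark"]
# Flag tasks whose train split appears in the model's training annotations.
leaked = [t for t in eval_tasks if t in (cde_small_v1.training_datasets or {})]
print(leaked)  # ['HotpotQA', 'STSBenchmark']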

mteb/models/gme_models.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+from __future__ import annotations
+
+import logging
+from functools import partial
+
+from mteb.model_meta import ModelMeta
+
+logger = logging.getLogger(__name__)
+
+
+gme_qwen2_vl_2b_instruct = ModelMeta(
+    loader=None,
+    name="Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="cfeb66885b598de483cc04eb08c7d9da534d7afe",
+    release_date="2024-12-21",
+    n_parameters=int(2.21 * 1e9),
+    max_tokens=32768,
+    embed_dim=1536,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-2B-Instruct",
+    use_instructions=True,
+    adapted_from=None,
+    superseded_by=None,
+    training_datasets={
+        # Only annotating text data for now
+        # source: https://arxiv.org/pdf/2412.16855
+        "MSMARCO": ["train"],
+        "MSMARCO.v2": ["train"],
+    },
+    public_training_code=None,
+    public_training_data=None,
+)
+
+gme_qwen2_vl_7b_instruct = ModelMeta(
+    loader=None,
+    name="Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
+    languages=["eng_Latn"],
+    open_weights=True,
+    revision="d42eca5a540526cfa982a349724b24b25c12a95e",
+    release_date="2024-12-21",
+    n_parameters=int(8.29 * 1e9),
+    max_tokens=32768,
+    embed_dim=3584,
+    license="mit",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    reference="https://huggingface.co/Alibaba-NLP/gme-Qwen2-VL-7B-Instruct",
+    use_instructions=True,
+    adapted_from=None,
+    superseded_by=None,
+    training_datasets={
+        # Only annotating text data for now
+        # source: https://arxiv.org/pdf/2412.16855
+        "MSMARCO": ["train"],
+        "MSMARCO.v2": ["train"],
+    },
+    public_training_code=None,
+    public_training_data=None,
+)
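Note that the 2B and 7B entries repeat an identical training_datasets literal. The bge_models.py change above shows the alternative used elsewhere in this commit: hoisting the annotation into a shared module-level dict. A sketch of the same refactor here (gme_training_data is a hypothetical name, not in the commit):

gme_training_data = {
    # Only annotating text data for now
    # source: https://arxiv.org/pdf/2412.16855
    "MSMARCO": ["train"],
    "MSMARCO.v2": ["train"],
}

# Both entries could then share the annotation:
#   gme_qwen2_vl_2b_instruct = ModelMeta(..., training_datasets=gme_training_data)
#   gme_qwen2_vl_7b_instruct = ModelMeta(..., training_datasets=gme_training_data)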

mteb/models/lens_models.py

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+from __future__ import annotations
+
+from functools import partial
+
+import torch
+
+from mteb.encoder_interface import PromptType
+from mteb.model_meta import ModelMeta, sentence_transformers_loader
+from mteb.models.instruct_wrapper import instruct_wrapper
+
+lens_d4000 = ModelMeta(
+    loader=None,  # TODO: implement this in the future
+    name="yibinlei/LENS-d4000",
+    languages=None,
+    open_weights=True,
+    revision="e473b33364e6c48a324796fd1411d3b93670c6fe",
+    release_date="2025-01-17",
+    n_parameters=int(7.11 * 1e9),
+    embed_dim=4000,
+    license="apache-2.0",
+    reference="https://huggingface.co/yibinlei/LENS-d4000",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    max_tokens=32768,
+)
+
+lens_d8000 = ModelMeta(
+    loader=None,  # TODO: implement this in the future
+    name="yibinlei/LENS-d8000",
+    languages=None,
+    open_weights=True,
+    revision="a0b87bd91cb27b6f2f0b0fe22c28026da1d464ef",
+    release_date="2025-01-17",
+    n_parameters=int(7.11 * 1e9),
+    embed_dim=8000,
+    license="apache-2.0",
+    reference="https://huggingface.co/yibinlei/LENS-d8000",
+    similarity_fn_name="cosine",
+    framework=["PyTorch"],
+    use_instructions=True,
+    public_training_code=None,
+    public_training_data=None,
+    training_datasets=None,
+    max_tokens=32768,
+)
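Both LENS entries leave loader=None with a TODO even though the sentence-transformers loader is already imported. If the models turn out to load through the standard interface, the loader would presumably mirror the bge_en_icl pattern above; an untested sketch under that assumption:

    loader=partial(
        sentence_transformers_loader,
        model_name="yibinlei/LENS-d4000",
        revision="e473b33364e6c48a324796fd1411d3b93670c6fe",
    ),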

mteb/models/misc_models.py

Lines changed: 38 additions & 0 deletions
@@ -1737,3 +1737,41 @@
     training_datasets=None,  # They "scraped" things from the internet, we don't know, could be leakage
     superseded_by=None,
 )
+ember_v1 = ModelMeta(
+    name="llmrails/ember-v1",
+    revision="5e5ce5904901f6ce1c353a95020f17f09e5d021d",
+    release_date="2023-10-10",
+    languages=["eng_Latn"],
+    n_parameters=int(335 * 1e6),
+    max_tokens=512,
+    embed_dim=1024,
+    license="mit",
+    open_weights=True,
+    public_training_code=None,
+    public_training_data=None,
+    framework=["PyTorch", "Sentence Transformers"],
+    reference="https://huggingface.co/llmrails/ember-v1",
+    similarity_fn_name="cosine",
+    use_instructions=None,
+    training_datasets=None,
+    superseded_by=None,
+)
+amazon_titan_text_embeddings_v2 = ModelMeta(
+    name="amazon/Titan-text-embeddings-v2",
+    revision="1",
+    release_date="2024-04-30",
+    languages=["eng_Latn"],
+    n_parameters=None,
+    max_tokens=None,
+    embed_dim=None,
+    license="proprietary",
+    open_weights=False,
+    public_training_code=None,
+    public_training_data=None,
+    framework=[],
+    reference="https://huggingface.co/amazon/Titan-text-embeddings-v2",
+    similarity_fn_name="cosine",
+    use_instructions=False,
+    training_datasets=None,
+    superseded_by=None,
+)

mteb/models/overview.py

Lines changed: 6 additions & 0 deletions
@@ -15,17 +15,20 @@
     arctic_models,
     bge_models,
     bm25,
+    cde_models,
     cohere_models,
     colbert_models,
     e5_instruct,
     e5_models,
+    gme_models,
     google_models,
     gritlm_models,
     gte_models,
     ibm_granite_models,
     inf_models,
     jasper_models,
     jina_models,
+    lens_models,
     linq_models,
     llm2vec_models,
     misc_models,
@@ -56,6 +59,7 @@
     arctic_models,
     bge_models,
     bm25,
+    cde_models,
     cohere_models,
     colbert_models,
     e5_instruct,
@@ -64,9 +68,11 @@
     google_models,
     gritlm_models,
     gte_models,
+    gme_models,
     ibm_granite_models,
     inf_models,
     jina_models,
+    lens_models,
     linq_models,
     llm2vec_models,
     mxbai_models,
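Each new module appears in both the import list and the model_modules list because the registry is built by scanning those modules for ModelMeta instances; a module missing from the second list is silently never registered. A sketch of that scan (collect_model_metas is a hypothetical name; mteb's actual implementation may differ):

from types import ModuleType

from mteb.model_meta import ModelMeta

def collect_model_metas(modules: list[ModuleType]) -> dict[str, ModelMeta]:
    # Gather every ModelMeta defined at module level, keyed by model name.
    registry: dict[str, ModelMeta] = {}
    for module in modules:
        for obj in vars(module).values():
            if isinstance(obj, ModelMeta):
                registry[obj.name] = obj
    return registry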
