[dev to main] v1.3.3 #90

Open · wants to merge 28 commits into main from dev

Changes from all commits (28 commits)
e68a27f
Allow encoding kwargs setting in SentenceBert embedder
lsz05 Sep 27, 2024
82d37a3
Modify base embedder
lsz05 Sep 27, 2024
9d46dd6
Add jinaai/jina-embeddings-v3 to leaderboard
lsz05 Oct 1, 2024
7ae5eea
Remake leaderboard
lsz05 Oct 1, 2024
821342c
Fix model name in leaderboard
lsz05 Oct 1, 2024
c357ebd
update save config file
akiFQC Nov 21, 2024
3a08ca8
Merge pull request #81 from sbintuitions/leaderboard/jina-embeddings-v3
akiFQC Nov 21, 2024
c10189f
format
akiFQC Nov 21, 2024
ccadd5d
Merge pull request #84 from sbintuitions/feature/save_config
lsz05 Nov 21, 2024
de65799
fix: mkdir before config saving
akiFQC Nov 22, 2024
2bd068c
add test
akiFQC Nov 22, 2024
90b3ff2
fix typo
akiFQC Nov 22, 2024
b4d0df2
Merge pull request #85 from sbintuitions/feature/save_config
lsz05 Nov 22, 2024
94010c7
Add docstring for encode_kwargs
lsz05 Nov 27, 2024
0ded6ad
Merge pull request #80 from sbintuitions/feature/encode_args
lsz05 Nov 27, 2024
ba13fca
version bump-up to v1.3.3
lsz05 Nov 27, 2024
ee34da4
turn prompt_length from int into tensor
hppRC Dec 6, 2024
e4149e9
fix: made DPSbertEmbedder work with prompts.
hppRC Dec 9, 2024
97f5502
Merge https://github.com/sbintuitions/JMTEB into fix/handle_prompt_le…
hppRC Dec 9, 2024
672e8ba
fix: refactor
hppRC Dec 9, 2024
6428c19
fix: argument name
hppRC Dec 9, 2024
f177e60
:+1: changed the import source of `models` and added comments.
hppRC Dec 9, 2024
1d3d846
:+1: suppress a lint error
hppRC Dec 9, 2024
06dbef6
Merge pull request #87 from sbintuitions/fix/handle_prompt_length_for…
hppRC Dec 9, 2024
b9adab9
Merge pull request #86 from sbintuitions/v1.3.3
akiFQC Dec 10, 2024
997c275
add score of sbintuitions/sarashina-embedding-v1-1b
akiFQC Dec 11, 2024
0a3db10
update leaderboard
akiFQC Dec 11, 2024
fb39ed8
Merge pull request #89 from sbintuitions/feature/add_sarashina-embedd…
lsz05 Dec 11, 2024
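
Several of the commits above (e68a27f, 94010c7) add an `encode_kwargs` setting to the SentenceBert embedder, so callers can tune encoding from config instead of code. Below is a minimal sketch of the idea, assuming the embedder simply forwards these keyword arguments to `SentenceTransformer.encode`; the wrapper class is illustrative, not JMTEB's actual implementation:

```python
from sentence_transformers import SentenceTransformer

# Illustrative wrapper; JMTEB's real SentenceBertEmbedder differs in detail.
class SentenceBertEmbedder:
    def __init__(self, model_name: str, encode_kwargs: dict | None = None):
        self.model = SentenceTransformer(model_name)
        # Extra keyword arguments forwarded verbatim to encode().
        self.encode_kwargs = encode_kwargs or {}

    def encode(self, texts: list[str]):
        return self.model.encode(texts, **self.encode_kwargs)

# Batch size and normalization now come from configuration, not code changes.
embedder = SentenceBertEmbedder(
    "intfloat/multilingual-e5-base",
    encode_kwargs={"batch_size": 64, "normalize_embeddings": True},
)
vectors = embedder.encode(["質問文の例", "別の例文"])
```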
62 changes: 62 additions & 0 deletions docs/results/jinaai/jina-embeddings-v3/summary.json
@@ -0,0 +1,62 @@
{
"Classification": {
"amazon_counterfactual_classification": {
"macro_f1": 0.7882733929438857
},
"amazon_review_classification": {
"macro_f1": 0.5933239824757218
},
"massive_intent_classification": {
"macro_f1": 0.7765343277120157
},
"massive_scenario_classification": {
"macro_f1": 0.8974174944345525
}
},
"Reranking": {
"esci": {
"ndcg@10": 0.9271464336251287
}
},
"Retrieval": {
"jagovfaqs_22k": {
"ndcg@10": 0.7149884473155108
},
"jaqket": {
"ndcg@10": 0.46484206025698144
},
"mrtydi": {
"ndcg@10": 0.4544765083850943
},
"nlp_journal_abs_intro": {
"ndcg@10": 0.9843205562446103
},
"nlp_journal_title_abs": {
"ndcg@10": 0.9561509620323349
},
"nlp_journal_title_intro": {
"ndcg@10": 0.9385000684351988
}
},
"STS": {
"jsick": {
"spearman": 0.781637470000662
},
"jsts": {
"spearman": 0.8193234425217734
}
},
"Clustering": {
"livedoor_news": {
"v_measure_score": 0.5472248713636514
},
"mewsc16": {
"v_measure_score": 0.4818974386694296
}
},
"PairClassification": {
"paws_x_ja": {
"binary_f1": 0.623716814159292
}
}
}
62 changes: 62 additions & 0 deletions docs/results/sbintuitions/sarashina-embedding-v1-1b/summary.json
@@ -0,0 +1,62 @@
{
"Classification": {
"amazon_counterfactual_classification": {
"macro_f1": 0.7910202863961814
},
"amazon_review_classification": {
"macro_f1": 0.614759364446128
},
"massive_intent_classification": {
"macro_f1": 0.8225880728874561
},
"massive_scenario_classification": {
"macro_f1": 0.9065030576701741
}
},
"Reranking": {
"esci": {
"ndcg@10": 0.9374394712541568
}
},
"Retrieval": {
"jagovfaqs_22k": {
"ndcg@10": 0.7168374490004555
},
"jaqket": {
"ndcg@10": 0.7279485535689915
},
"mrtydi": {
"ndcg@10": 0.41952210141116814
},
"nlp_journal_abs_intro": {
"ndcg@10": 0.9394095717236127
},
"nlp_journal_title_abs": {
"ndcg@10": 0.9695624263086593
},
"nlp_journal_title_intro": {
"ndcg@10": 0.8832876426024624
}
},
"STS": {
"jsick": {
"spearman": 0.8022484725822061
},
"jsts": {
"spearman": 0.851980317221987
}
},
"Clustering": {
"livedoor_news": {
"v_measure_score": 0.5641831341687762
},
"mewsc16": {
"v_measure_score": 0.5129216698739159
}
},
"PairClassification": {
"paws_x_ja": {
"binary_f1": 0.62
}
}
}
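
Both summary.json files above share one layout: task → dataset → a single metric value. The leaderboard's per-task columns appear to be plain means of those dataset scores scaled to 0-100; for example, the six jina-embeddings-v3 Retrieval values average to 75.22, which matches the Retrieval table in the diff below. A minimal sketch of that aggregation (the rounding and file path are assumptions):

```python
import json
from statistics import mean

# Aggregate a JMTEB summary.json into per-task leaderboard-style scores.
with open("docs/results/jinaai/jina-embeddings-v3/summary.json") as f:
    summary = json.load(f)

for task, datasets in summary.items():
    # Each dataset entry holds a single {metric_name: value} pair.
    scores = [next(iter(metrics.values())) for metrics in datasets.values()]
    print(f"{task}: {100 * mean(scores):.2f}")
# Retrieval prints 75.22, matching the leaderboard row for this model.
```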
28 changes: 21 additions & 7 deletions leaderboard.md
@@ -7,7 +7,9 @@ The summary shows the average scores within each task. The average score is the

| Model | Avg. | Retrieval | STS | Classification | Reranking | Clustering | PairClassification |
|:----------------------------------------------|:----------|:------------|:----------|:-----------------|:------------|:-------------|:---------------------|
| OpenAI/text-embedding-3-large | **74.05** | **74.48** | 82.52 | **77.58** | **93.58** | 53.32 | 62.35 |
| sbintuitions/sarashina-embedding-v1-1b | **75.50** | **77.61** | 82.71 | **78.37** | **93.74** | 53.86 | 62.00 |
| OpenAI/text-embedding-3-large | 74.05 | 74.48 | 82.52 | 77.58 | 93.58 | 53.32 | 62.35 |
| jinaai/jina-embeddings-v3 | 73.44 | 75.22 | 80.05 | 76.39 | 92.71 | 51.46 | 62.37 |
| cl-nagoya/ruri-large | 73.31 | 73.02 | 83.13 | 77.43 | 92.99 | 51.82 | 62.29 |
| pkshatech/GLuCoSE-base-ja-v2 | 72.23 | 73.36 | 82.96 | 74.21 | 93.01 | 48.65 | 62.37 |
| pkshatech/RoSEtta-base-ja | 72.04 | 73.21 | 81.39 | 72.41 | 92.69 | 53.23 | 61.74 |
@@ -38,11 +40,13 @@ The summary shows the average scores within each task. The average score is the
## Retrieval
| Model | Avg. | jagovfaqs_22k<br>(ndcg@10) | jaqket<br>(ndcg@10) | mrtydi<br>(ndcg@10) | nlp_journal_abs_intro<br>(ndcg@10) | nlp_journal_title_abs<br>(ndcg@10) | nlp_journal_title_intro<br>(ndcg@10) |
|:----------------------------------------------|:----------|:-----------------------------|:----------------------|:----------------------|:-------------------------------------|:-------------------------------------|:---------------------------------------|
| OpenAI/text-embedding-3-large | **74.48** | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** |
| pkshatech/GLuCoSE-base-ja-v2 | 73.36 | 69.79 | **67.29** | 41.86 | 90.29 | 95.11 | 75.80 |
| sbintuitions/sarashina-embedding-v1-1b | **77.61** | 71.68 | **72.79** | 41.95 | 93.94 | 96.96 | 88.33 |
| jinaai/jina-embeddings-v3 | 75.22 | 71.50 | 46.48 | **45.45** | 98.43 | 95.62 | 93.85 |
| OpenAI/text-embedding-3-large | 74.48 | 72.41 | 48.21 | 34.88 | **99.33** | 96.55 | **95.47** |
| pkshatech/GLuCoSE-base-ja-v2 | 73.36 | 69.79 | 67.29 | 41.86 | 90.29 | 95.11 | 75.80 |
| pkshatech/RoSEtta-base-ja | 73.21 | 65.96 | 65.33 | 36.73 | 95.54 | 94.08 | 81.63 |
| cl-nagoya/ruri-large | 73.02 | **76.68** | 61.74 | 38.03 | 87.12 | 96.58 | 77.97 |
| intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | **43.63** | 86.00 | 94.70 | 72.48 |
| intfloat/multilingual-e5-large | 70.98 | 70.30 | 58.78 | 43.63 | 86.00 | 94.70 | 72.48 |
| cl-nagoya/ruri-base | 69.82 | 74.56 | 50.12 | 35.45 | 86.89 | 96.57 | 75.31 |
| cl-nagoya/ruri-small | 69.41 | 73.65 | 48.44 | 33.43 | 87.69 | **97.17** | 76.09 |
| intfloat/multilingual-e5-base | 68.21 | 65.34 | 50.67 | 38.38 | 87.10 | 94.73 | 73.05 |
@@ -70,15 +74,17 @@ The summary shows the average scores within each task. The average score is the
| Model | Avg. | jsick<br>(spearman) | jsts<br>(spearman) |
|:----------------------------------------------|:----------|:----------------------|:---------------------|
| cl-nagoya/sup-simcse-ja-large | **83.18** | 83.80 | 82.57 |
| cl-nagoya/ruri-large | 83.13 | 82.00 | **84.26** |
| cl-nagoya/ruri-large | 83.13 | 82.00 | 84.26 |
| pkshatech/GLuCoSE-base-ja-v2 | 82.96 | **84.96** | 80.96 |
| cl-nagoya/ruri-base | 82.87 | 82.32 | 83.43 |
| cl-nagoya/ruri-small | 82.79 | 83.44 | 82.13 |
| sbintuitions/sarashina-embedding-v1-1b | 82.71 | 80.22 | **85.20** |
| OpenAI/text-embedding-3-large | 82.52 | 81.27 | 83.77 |
| cl-nagoya/sup-simcse-ja-base | 82.05 | 82.83 | 81.27 |
| pkshatech/RoSEtta-base-ja | 81.39 | 83.83 | 78.95 |
| cl-nagoya/unsup-simcse-ja-large | 80.56 | 80.15 | 80.98 |
| intfloat/multilingual-e5-small | 80.07 | 81.50 | 78.65 |
| jinaai/jina-embeddings-v3 | 80.05 | 78.16 | 81.93 |
| intfloat/multilingual-e5-base | 79.84 | 81.28 | 78.39 |
| intfloat/multilingual-e5-large | 79.70 | 78.40 | 80.99 |
| OpenAI/text-embedding-3-small | 79.46 | 80.83 | 78.08 |
@@ -100,10 +106,12 @@ The summary shows the average scores within each task. The average score is the
## Classification
| Model | Avg. | amazon_counterfactual<br>(macro_f1) | amazon_review<br>(macro_f1) | massive_intent<br>(macro_f1) | massive_scenario<br>(macro_f1) |
|:----------------------------------------------|:----------|:--------------------------------------|:------------------------------|:-------------------------------|:---------------------------------|
| OpenAI/text-embedding-3-large | **77.58** | 77.90 | **60.44** | 80.91 | **91.08** |
| sbintuitions/sarashina-embedding-v1-1b | **78.37** | 79.10 | **61.48** | 82.26 | 90.65 |
| OpenAI/text-embedding-3-large | 77.58 | 77.90 | 60.44 | 80.91 | **91.08** |
| cl-nagoya/ruri-large | 77.43 | 80.81 | 56.80 | **82.56** | 89.56 |
| pkshatech/GLuCoSE-base-ja | 76.82 | **82.44** | 58.07 | 78.85 | 87.94 |
| oshizo/sbert-jsnli-luke-japanese-base-lite | 76.61 | 79.95 | 57.48 | 80.26 | 88.75 |
| jinaai/jina-embeddings-v3 | 76.39 | 78.83 | 59.33 | 77.65 | 89.74 |
| cl-nagoya/ruri-small | 76.22 | 79.92 | 55.61 | 81.49 | 87.88 |
| cl-nagoya/ruri-base | 75.58 | 76.66 | 55.76 | 81.41 | 88.49 |
| cl-nagoya/unsup-simcse-ja-large | 74.66 | 76.79 | 55.37 | 79.13 | 87.36 |
@@ -131,7 +139,8 @@ The summary shows the average scores within each task. The average score is the
## Reranking
| Model | Avg. | esci<br>(ndcg@10) |
|:----------------------------------------------|:----------|:--------------------|
| OpenAI/text-embedding-3-large | **93.58** | **93.58** |
| sbintuitions/sarashina-embedding-v1-1b | **93.74** | **93.74** |
| OpenAI/text-embedding-3-large | 93.58 | 93.58 |
| OpenAI/text-embedding-ada-002 | 93.04 | 93.04 |
| intfloat/multilingual-e5-small | 93.03 | 93.03 |
| pkshatech/GLuCoSE-base-ja-v2 | 93.01 | 93.01 |
@@ -141,6 +150,7 @@ The summary shows the average scores within each task. The average score is the
| OpenAI/text-embedding-3-small | 92.92 | 92.92 |
| cl-nagoya/ruri-base | 92.91 | 92.91 |
| intfloat/multilingual-e5-base | 92.85 | 92.85 |
| jinaai/jina-embeddings-v3 | 92.71 | 92.71 |
| pkshatech/RoSEtta-base-ja | 92.69 | 92.69 |
| pkshatech/GLuCoSE-base-ja | 91.90 | 91.90 |
| cl-nagoya/sup-simcse-ja-base | 91.83 | 91.83 |
@@ -163,10 +173,12 @@ The summary shows the average scores within each task. The average score is the
| Model | Avg. | livedoor_news<br>(v_measure_score) | mewsc16<br>(v_measure_score) |
|:----------------------------------------------|:----------|:-------------------------------------|:-------------------------------|
| cl-nagoya/ruri-base | **54.16** | 54.27 | **54.04** |
| sbintuitions/sarashina-embedding-v1-1b | 53.86 | 56.42 | 51.29 |
| OpenAI/text-embedding-3-large | 53.32 | 57.09 | 49.55 |
| pkshatech/RoSEtta-base-ja | 53.23 | **58.62** | 47.85 |
| cl-nagoya/ruri-large | 51.82 | 51.39 | 52.25 |
| cl-nagoya/sup-simcse-ja-base | 51.79 | 52.67 | 50.91 |
| jinaai/jina-embeddings-v3 | 51.46 | 54.72 | 48.19 |
| intfloat/multilingual-e5-large | 51.24 | 57.13 | 45.34 |
| cl-nagoya/ruri-small | 51.19 | 50.96 | 51.41 |
| OpenAI/text-embedding-3-small | 51.06 | 54.57 | 47.55 |
@@ -204,6 +216,7 @@ The summary shows the average scores within each task. The average score is the
| cl-nagoya/ruri-base | 62.38 | 62.38 |
| oshizo/sbert-jsnli-luke-japanese-base-lite | 62.38 | 62.38 |
| MU-Kindai/Japanese-DiffCSE-BERT-base | 62.38 | 62.38 |
| jinaai/jina-embeddings-v3 | 62.37 | 62.37 |
| pkshatech/GLuCoSE-base-ja-v2 | 62.37 | 62.37 |
| MU-Kindai/Japanese-SimCSE-BERT-base-sup | 62.37 | 62.37 |
| MU-Kindai/Japanese-SimCSE-BERT-large-sup | 62.35 | 62.35 |
@@ -219,5 +232,6 @@ The summary shows the average scores within each task. The average score is the
| intfloat/multilingual-e5-small | 62.19 | 62.19 |
| intfloat/multilingual-e5-large | 62.15 | 62.15 |
| cl-nagoya/ruri-small | 62.11 | 62.11 |
| sbintuitions/sarashina-embedding-v1-1b | 62.00 | 62.00 |
| pkshatech/RoSEtta-base-ja | 61.74 | 61.74 |

2 changes: 1 addition & 1 deletion pyproject.toml
@@ -12,7 +12,7 @@ description = "The evaluation scripts for JMTEB (Japanese Massive Text Embedding
name = "JMTEB"
packages = [{from = "src", include = "jmteb"}]
readme = "README.md"
version = "1.3.2"
version = "1.3.3"

[tool.poetry.dependencies]
python = ">=3.10,<4.0"
12 changes: 12 additions & 0 deletions src/jmteb/__main__.py
@@ -97,6 +97,18 @@ def main(
if len(args.evaluators) == 0:
raise ValueError("No evaluator is selected. Please check the config file or the command line arguments.")

# save config as yaml
if args.save_dir:
Path(args.save_dir).mkdir(parents=True, exist_ok=True)
parser.save(
args,
Path(args.save_dir) / "jmteb_config.yaml",
format="yaml",
overwrite=True,
multifile=False,
skip_check=True,
)

args = parser.instantiate_classes(args)
if isinstance(args.evaluators, str):
raise ValueError(
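
The new block writes the fully resolved configuration to `<save_dir>/jmteb_config.yaml` via jsonargparse's `parser.save`, creating the directory first (the mkdir fix from de65799). A practical upshot is that a finished run can be replayed by pointing the same parser at the saved file; the sketch below uses a simplified parser, not JMTEB's actual argument setup:

```python
from jsonargparse import ArgumentParser

# Simplified stand-in for JMTEB's parser, which defines many more options.
parser = ArgumentParser()
parser.add_argument("--save_dir", type=str, default=None)
parser.add_argument("--eval_data_path", type=str, default="data")

# Re-parse the YAML that a previous run saved alongside its results.
args = parser.parse_path("results/jmteb_config.yaml")
print(args.save_dir, args.eval_data_path)
```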