From d274e42952b359597322fbf696fc3da4e4ff7f08 Mon Sep 17 00:00:00 2001
From: Chandra Sekhar Gupta <38103118+guptha23@users.noreply.github.com>
Date: Thu, 28 Dec 2023 12:48:00 +0530
Subject: [PATCH] updating readme files for evaluation. (#2926)

Co-authored-by: Chandra Sekhar Gupta Aravpalli
---
 .../system/evaluation/fill-mask/README.md     |  3 +-
 .../evaluation/question-answering/README.md   | 39 ++++++++++++-------
 .../system/evaluation/summarization/README.md |  1 -
 .../evaluation/text-classification/README.md  |  1 -
 .../evaluation/text-generation/README.md      |  1 -
 .../evaluation/token-classification/README.md |  1 -
 .../system/evaluation/translation/README.md   |  1 -
 7 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/sdk/python/foundation-models/system/evaluation/fill-mask/README.md b/sdk/python/foundation-models/system/evaluation/fill-mask/README.md
index 5609ee1a3b8..e07414310d4 100644
--- a/sdk/python/foundation-models/system/evaluation/fill-mask/README.md
+++ b/sdk/python/foundation-models/system/evaluation/fill-mask/README.md
@@ -8,8 +8,7 @@
 | model_id | Model used for calculating Perplexity. Perplexity can only be calculated for causal language models. | str | "gpt2", "bert-base-uncased" |
 | batch_size | The batch size to run texts through the model | int | 16 |
 | add_start_token | Boolean flag to add the start token to the texts so the perplexity can include the probability of the first word | boolean | true, false |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
 
-* perplexities
\ No newline at end of file
+* perplexity
\ No newline at end of file
diff --git a/sdk/python/foundation-models/system/evaluation/question-answering/README.md b/sdk/python/foundation-models/system/evaluation/question-answering/README.md
index e6020e1d504..d21b4e23596 100644
--- a/sdk/python/foundation-models/system/evaluation/question-answering/README.md
+++ b/sdk/python/foundation-models/system/evaluation/question-answering/README.md
@@ -2,19 +2,32 @@
 
 ### List of supported keyword arguments:
 
-| Keyword Argument   | Description                                                                      | Type      | Sample                      |
-|:------------------:|:-------------------------------------------------------------------------------|-----------|-----------------------------|
-| metrics            | List for subset of metrics to be computed. All supported metrics listed below.  | list      | ["exact_match", "f1_score"] |
-| tokenizer          | Tokenizer object to perform tokenization on provided input text                 | boolean   | false, true                 |
-| regexes_to_ignore  | List of regex to ignore in our input data points                                | list      | ["$[A-Z]+"]                 |
-| ignore_case        | Boolean flag to indicate whether we need to ignore case                         | boolean   | false, true                 |
-| ignore_punctuation | Boolean flag to indicate whether we need to ignore punctuation                  | boolean   | false, true                 |
-| ignore_numbers     | Boolean flag to indicate whether we need to ignore numbers                      | boolean   | false, true                 |
-| custom_dimensions  | Used to report telemetry data (can later be used to perform PII scrubbing)      | dict      |                             |
+| Keyword Argument      | Description                                                                      | Type            | Sample                      |
+|:---------------------:|:-------------------------------------------------------------------------------|-----------------|-----------------------------|
+| metrics               | List for subset of metrics to be computed. All supported metrics listed below.  | list            | ["exact_match", "f1_score"] |
+| tokenizer             | Tokenizer object to perform tokenization on provided input text                 | python function | --                          |
+| regexes_to_ignore     | List of regex to ignore in our input data points                                | list            | ["$[A-Z]+"]                 |
+| ignore_case           | Boolean flag to indicate whether we need to ignore case                         | boolean         | false                       |
+| ignore_punctuation    | Boolean flag to indicate whether we need to ignore punctuation                  | boolean         | false                       |
+| ignore_numbers        | Boolean flag to indicate whether we need to ignore numbers                      | boolean         | false                       |
+| lang                  | String of two letters indicating the language of the sentences, in ISO 639-1 format. (default="en") | string | "en" |
+| model_type            | String specifying which model to use, according to the BERT specification. (default="microsoft/deberta-large") | string | "microsoft/deberta-large" |
+| idf                   | Boolean flag to use idf weights during computation of BERT score. (default=False) | boolean       | false                       |
+| rescale_with_baseline | Boolean flag to rescale BERTScore with the pre-computed baseline. (default=True) | boolean        | true                        |
 
 ### List of supported metrics:
 
-* rouge1
-* rouge2
-* rougeLsum
-* rougeL
\ No newline at end of file
+- ada_similarity
+- bertscore
+- exact_match
+- f1_score
+- gpt_coherence
+- gpt_fluency
+- gpt_groundedness
+- gpt_relevance
+- gpt_similarity
+- llm_coherence
+- llm_fluency
+- llm_groundedness
+- llm_relevance
+- llm_similarity
\ No newline at end of file
diff --git a/sdk/python/foundation-models/system/evaluation/summarization/README.md b/sdk/python/foundation-models/system/evaluation/summarization/README.md
index 4973b2a713e..92d8a19a373 100644
--- a/sdk/python/foundation-models/system/evaluation/summarization/README.md
+++ b/sdk/python/foundation-models/system/evaluation/summarization/README.md
@@ -7,7 +7,6 @@
 | metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["rouge1", "rouge2", "rougeL", "rougeLsum"] |
 | aggregator | Boolean flag to indicate if need to aggregate rouge scores for individual data points | boolean | true, false |
 | stemmer | Boolean flag to indicate whether to use Porter Stemmer for suffixes | boolean | true, false |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
 
diff --git a/sdk/python/foundation-models/system/evaluation/text-classification/README.md b/sdk/python/foundation-models/system/evaluation/text-classification/README.md
index c444319df02..5c571186e4c 100644
--- a/sdk/python/foundation-models/system/evaluation/text-classification/README.md
+++ b/sdk/python/foundation-models/system/evaluation/text-classification/README.md
@@ -14,7 +14,6 @@
 | multilabel | Boolean variable that computes multilabel metrics when set to True | boolean | false (Should be false for single label classification) |
 | positive_label | Label to be treated as positive label | int/str | 0, "CONTRADICTION" |
 | confidence_metrics | List of metrics to compute confidence intervals | list | ["accuracy", "f1_score_micro"] |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
 
diff --git a/sdk/python/foundation-models/system/evaluation/text-generation/README.md b/sdk/python/foundation-models/system/evaluation/text-generation/README.md
index 2f679472775..8c87edd8e72 100644
--- a/sdk/python/foundation-models/system/evaluation/text-generation/README.md
+++ b/sdk/python/foundation-models/system/evaluation/text-generation/README.md
@@ -9,7 +9,6 @@
 | smoothing | Boolean flag to indicate if bleu score needs to be smoothened | boolean | false, true |
 | aggregator | Boolean flag to indicate if need to aggregate rouge scores for individual data points | boolean | true, false |
 | stemmer | Boolean flag to indicate whether to use Porter Stemmer for suffixes | boolean | true, false |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
 
diff --git a/sdk/python/foundation-models/system/evaluation/token-classification/README.md b/sdk/python/foundation-models/system/evaluation/token-classification/README.md
index 6f82df11eb8..4d301412032 100644
--- a/sdk/python/foundation-models/system/evaluation/token-classification/README.md
+++ b/sdk/python/foundation-models/system/evaluation/token-classification/README.md
@@ -6,7 +6,6 @@
 |:------------------------:|:-------------------------------------------------------------------------------|-----------|---------------------------------------------------------------|
 | metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["accuracy", "f1_score_macro", "f1_score_micro"] |
 | labels_list | List for supported labels for tokens | list | ["B-PER", "I-PER", "O", "B-LOC", "I-LOC", "B-MISC", "I-MISC"] |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
 
diff --git a/sdk/python/foundation-models/system/evaluation/translation/README.md b/sdk/python/foundation-models/system/evaluation/translation/README.md
index 6c6c0383e86..fcc56d6eccb 100644
--- a/sdk/python/foundation-models/system/evaluation/translation/README.md
+++ b/sdk/python/foundation-models/system/evaluation/translation/README.md
@@ -7,7 +7,6 @@
 | metrics | List for subset of metrics to be computed. All supported metrics listed below. | list | ["bleu_1", "bleu_2", "bleu_3", "bleu_4"] |
 | tokenizer | Tokenizer object to perform tokenization on provided input text | | |
 | smoothing | Boolean flag to indicate if bleu score needs to be smoothened | boolean | false, true |
-| custom_dimensions | Used to report telemetry data (can later be used to perform PII scrubbing) | dict | |
 
 ### List of supported metrics:
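
To illustrate how the question-answering keyword arguments updated above (`regexes_to_ignore`, `ignore_case`, `ignore_punctuation`, `ignore_numbers`) interact with a metric such as `exact_match`, here is a minimal standalone sketch. It is not the azureml-metrics implementation; the function names and the order of normalization steps are assumptions for illustration only.

```python
import re
import string


def normalize(text, regexes_to_ignore=None, ignore_case=False,
              ignore_punctuation=False, ignore_numbers=False):
    """Illustrative text normalization mirroring the ignore_* keyword arguments."""
    # Strip any user-supplied regex patterns first.
    for pattern in regexes_to_ignore or []:
        text = re.sub(pattern, "", text)
    if ignore_case:
        text = text.lower()
    if ignore_punctuation:
        text = text.translate(str.maketrans("", "", string.punctuation))
    if ignore_numbers:
        text = text.translate(str.maketrans("", "", string.digits))
    return text.strip()


def exact_match(predictions, references, **kwargs):
    """Fraction of predictions equal to their reference after normalization."""
    matches = sum(
        normalize(p, **kwargs) == normalize(r, **kwargs)
        for p, r in zip(predictions, references)
    )
    return matches / len(predictions)
```

For example, `exact_match(["Paris!"], ["paris"], ignore_case=True, ignore_punctuation=True)` returns `1.0`, while with both flags left at their default of `false` the pair would not match.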