
feat: add configurable param logical_operator (OR/AND) to factual knowledge (#307)

* Added metric to factual knowledge + unit/integration tests

cr: https://code.amazon.com/reviews/CR-135854933

* fixed changes from PR comments

* Deleted metrics.py and restored code in util.py

* added factual knowledge metrics to constants.py

* added factual knowledge metrics to be included in binary score

* updated score descriptions for factual knowledge

* feat: add configurable param logical_operator (OR/AND) to factual knowledge

* fixed changes from PR comments

* added warning and fixed typo

* modified warnings and fixed invalid config tests for factual_knowledge
kirupang-code authored Jul 16, 2024
1 parent e65c10e commit 5e70ca7
Showing 3 changed files with 240 additions and 23 deletions.
70 changes: 58 additions & 12 deletions src/fmeval/eval_algorithms/factual_knowledge.py
@@ -86,16 +86,26 @@ def __init__(
model_output_key: str = DatasetColumns.MODEL_OUTPUT.value.name,
output_keys: List[str] = SCORE_NAMES,
target_output_delimiter: Optional[str] = "<OR>",
logical_operator: str = "OR",
):
"""FactualKnowledgeScores initializer.
:param target_output_key: The record key corresponding to the target output.
:param model_output_key: The record key corresponding to the model output.
:param output_keys: The keys corresponding to the factual knowledge scores that
will be added to the input record.
:param target_output_delimiter: See the docstring in `FactualKnowledgeConfig`.
:param target_output_delimiter: This delimiter is used to combine all possible target outputs into
a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
then the target output text will be "UK<OR>England". This can be useful to account for multiple
valid target outputs or to ensure that multiple target outputs are contained in the model output
(which can be configured using the logical_operator).
:param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator
is "OR" (the default behavior), at least one of the possible target outputs (separated by the
target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical
operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in
the model output in order for the answer to be correct.
"""
super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter)
super().__init__(target_output_key, model_output_key, output_keys, target_output_delimiter, logical_operator)
self.register_input_output_keys(
input_keys=[target_output_key, model_output_key],
output_keys=output_keys,
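
For reference, a minimal usage sketch of the transform above with the new parameter (not part of the committed diff). The literal record keys "target_output" and "model_output" are assumptions standing in for DatasetColumns.TARGET_OUTPUT.value.name and DatasetColumns.MODEL_OUTPUT.value.name, and the score keys added to the record come from SCORE_NAMES, so the exact output may differ.

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeScores

# "AND" requires every delimited target to appear in the model output.
scores_transform = FactualKnowledgeScores(
    target_output_delimiter="<AND>",
    logical_operator="AND",
)
record = {
    "target_output": "Red<AND>Yellow<AND>Blue",  # assumed default column name
    "model_output": "The primary colors are red, yellow, and blue.",  # assumed default column name
}
print(scores_transform(record))  # the transform adds its score keys to the input record
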
@@ -104,6 +114,7 @@ def __init__(
self.model_output_key = model_output_key
self.output_keys = output_keys
self.target_output_delimiter = target_output_delimiter
self.logical_operator = logical_operator

@validate_call
def __call__(self, record: Dict[str, Any]) -> Dict[str, Any]:
@@ -139,25 +150,48 @@ def _get_score(
`FactualKnowledge` for more details on what these numerical values represent.
"""
possible_targets = target_output.split(self.target_output_delimiter)
return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
if self.logical_operator == "OR":
return max([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])
else: # self.logical_operator is "AND"
# checks that every target is in model_output, otherwise returns 0.0
return min([score_fn(model_output, target, **fn_kwargs) for target in possible_targets])


@dataclass(frozen=True)
class FactualKnowledgeConfig(EvalAlgorithmConfig):
"""Configures the factual knowledge evaluation algorithm.
:param target_output_delimiter: There can be multiple valid target outputs for a given question.
This delimiter is used to combine all possible target outputs into a single string.
For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>", then the
target output text will be "UK<OR>England".
:param target_output_delimiter: This delimiter is used to combine all possible target outputs into
a single string. For example, if valid answers are ["UK", "England"] and the delimiter is "<OR>",
then the target output text will be "UK<OR>England". This can be useful to account for multiple
valid target outputs or to ensure that multiple target outputs are contained in the model output
(which can be configured using the logical_operator).
:param logical_operator: The logical operator can be set to "OR" (default) or "AND". When the logical operator
is "OR" (the default behavior), at least one of the possible target outputs (separated by the
target_output_delimiter) must be contained in the model output for the answer to be correct. When the logical
operator is "AND", ALL possible target outputs (separated by the target_output_delimiter) must be contained in
the model output in order for the answer to be correct.
"""

target_output_delimiter: Optional[str] = "<OR>"
logical_operator: str = "OR"

def __post_init__(self):
if self.target_output_delimiter == "":
raise EvalAlgorithmClientError(
"Empty target_output_delimiter is provided. Please either provide a non-empty string, or set it to None."
"Empty target_output_delimiter is provided. Please either provide a non-empty string, "
"or set it to None."
)
if self.logical_operator not in ["OR", "AND"]:
raise EvalAlgorithmClientError(
'Invalid logical_operator is provided. The only valid inputs are strings "OR" and "AND".'
)
if self.target_output_delimiter in ["<OR>", "<AND>"] and self.target_output_delimiter != "<{}>".format(
self.logical_operator
):
logger.warning(
f"The target_output_delimiter `{self.target_output_delimiter}` and logical_operator"
f" `{self.logical_operator}` are not consistent."
)
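
For illustration (not part of the committed diff), a self-contained sketch of the OR/AND aggregation that _get_score performs above. The real implementation normalizes the strings and delegates to its configured score functions, so the substring check below is a simplifying assumption.

def inclusion_score(model_output: str, target: str) -> float:
    # Simplified stand-in for the real score functions: case-insensitive substring match.
    return 1.0 if target.lower() in model_output.lower() else 0.0

def aggregate(model_output: str, target_output: str, delimiter: str, logical_operator: str) -> float:
    scores = [inclusion_score(model_output, target) for target in target_output.split(delimiter)]
    # "OR": any one target suffices (max); "AND": every target must appear (min).
    return max(scores) if logical_operator == "OR" else min(scores)

assert aggregate("Berlin is the capital of Germany.", "Germany<OR>Berlin", "<OR>", "OR") == 1.0
assert aggregate("Red and blue.", "Red<AND>Yellow<AND>Blue", "<AND>", "AND") == 0.0
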


@@ -178,9 +212,16 @@ class FactualKnowledge(EvalAlgorithmInterface):
in the model output after both strings are normalized.
Inspired by HELM: https://github.com/stanford-crfm/helm/blob/62f817eb695a31e8389e3f7be30609d3f0871837/src/helm/benchmark/metrics/basic_metrics.py#L144
If there is more than one correct target answer, answers are seperated by the `target_output_delimiter` which can be
configured inside the `FactualKnowledgeConfig`. It defaults to `<OR>`, i.e, the target answer in this example could
be Germany<OR>Berlin (since Berlin is its own federal state).
If there is more than one correct target answer, the `logical_operator` can be set to "OR" (the default) and
answers are separated by the `target_output_delimiter`, both of which are configured inside the
`FactualKnowledgeConfig`. The `target_output_delimiter` defaults to `<OR>`, i.e., the target answer in this
example could be Germany<OR>Berlin (since Berlin is its own federal state).

If there are multiple correct target answers that must all be included in the model output,
the `logical_operator` can be set to "AND". For example, consider the prompt 'What are the three primary colors?'.
The target answer would be Red<AND>Yellow<AND>Blue (note that the target_output_delimiter could be anything,
but it is "<AND>" here for consistency with the logical_operator value). Red, yellow, and blue must
all be contained in the model generation for the answer to be correct under this configuration.
"""

eval_name = EvalAlgorithm.FACTUAL_KNOWLEDGE.value
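
A hedged end-to-end sketch of the "AND" behavior described in the docstring above (not part of the committed diff). The import path and the evaluate_sample signature match this file, but the exact contents of the returned EvalScore objects are an assumption.

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledge, FactualKnowledgeConfig

# "AND": every delimited target must appear in the model output.
config = FactualKnowledgeConfig(target_output_delimiter="<AND>", logical_operator="AND")
eval_algo = FactualKnowledge(config)

scores = eval_algo.evaluate_sample(
    target_output="Red<AND>Yellow<AND>Blue",
    model_output="The three primary colors are red, yellow, and blue.",
)
print(scores)  # expected to be a passing score, since red, yellow, and blue all appear
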
@@ -192,7 +233,12 @@ def __init__(self, eval_algorithm_config: FactualKnowledgeConfig = FactualKnowle
"""
super().__init__(eval_algorithm_config)
self.pipeline = TransformPipeline(
[FactualKnowledgeScores(target_output_delimiter=eval_algorithm_config.target_output_delimiter)]
[
FactualKnowledgeScores(
target_output_delimiter=eval_algorithm_config.target_output_delimiter,
logical_operator=eval_algorithm_config.logical_operator,
)
]
)

def evaluate_sample(self, target_output: str, model_output: str) -> List[EvalScore]: # type: ignore[override]
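
A short sketch of the new __post_init__ validation shown above (not part of the committed diff). The exception's import path fmeval.exceptions is an assumption.

from fmeval.eval_algorithms.factual_knowledge import FactualKnowledgeConfig
from fmeval.exceptions import EvalAlgorithmClientError  # assumed import path

# Mismatched delimiter and operator: accepted, but the new warning is logged.
FactualKnowledgeConfig(target_output_delimiter="<OR>", logical_operator="AND")

# Anything other than "OR"/"AND" is rejected.
try:
    FactualKnowledgeConfig(target_output_delimiter="<OR>", logical_operator="XOR")
except EvalAlgorithmClientError as err:
    print(err)  # Invalid logical_operator is provided...
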
2 changes: 1 addition & 1 deletion test/integration/test_factual_knowledge.py
@@ -19,7 +19,7 @@
ABS_TOL = 1e-4
os.environ["PARALLELIZATION_FACTOR"] = "2"

config = FactualKnowledgeConfig("<OR>")
config = FactualKnowledgeConfig("<OR>", "OR")
eval_algo = FactualKnowledge(config)

logger = logging.getLogger(__name__)
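
For readability, the positional arguments in the updated test configuration above can equivalently be written with keywords (a suggestion, not part of the committed test):

config = FactualKnowledgeConfig(target_output_delimiter="<OR>", logical_operator="OR")
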
