From 9e172f2b5db26d9cf7ef96384f6af2077e213b13 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Wed, 21 Aug 2024 16:17:17 -0700 Subject: [PATCH 1/5] add custom types to docstrings --- .../studio/trustworthy_language_model.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 468751b8..91f334ce 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -22,9 +22,9 @@ from cleanlab_studio.errors import ValidationError from cleanlab_studio.internal.api import api from cleanlab_studio.internal.constants import ( + _TLM_DEFAULT_MODEL, _TLM_MAX_RETRIES, _VALID_TLM_QUALITY_PRESETS, - _TLM_DEFAULT_MODEL, ) from cleanlab_studio.internal.tlm.concurrency import TlmRateHandler from cleanlab_studio.internal.tlm.validation import ( @@ -655,8 +655,19 @@ class TLMScore(TypedDict): TLMScoreResponse = Union[float, TLMScore] +""" +TLMScoreResponse represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object containing both the trustworthiness score and log dictionary keys. +""" + TLMBatchScoreResponse = Union[List[float], List[TLMScore]] +""" +TLMBatchScoreResponse represents a TLM response that can be either a list of floats or a list of TLMScore objects. The list will have the be length as the input list of prompts, response pairs. +""" + TLMOptionalBatchScoreResponse = Union[List[Optional[float]], List[Optional[TLMScore]]] +""" +TLMOptionalBatchScoreResponse represents a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of TLMScore objects or None (if the call to the TLM failed). The list will have the be length as the input list of prompts, response pairs. 
+""" class TLMOptions(TypedDict): From cf49d8292ae2fc5dd3634d43667f905a1d4c2838 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Wed, 21 Aug 2024 16:19:56 -0700 Subject: [PATCH 2/5] up version --- cleanlab_studio/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab_studio/version.py b/cleanlab_studio/version.py index a643b309..6252ad31 100644 --- a/cleanlab_studio/version.py +++ b/cleanlab_studio/version.py @@ -1,7 +1,7 @@ # Note to developers: # Consider if backend's MIN_CLI_VERSION needs updating when pushing any changes to this file. -__version__ = "2.2.1" +__version__ = "2.2.2" SCHEMA_VERSION = "0.2.0" MIN_SCHEMA_VERSION = "0.1.0" From 7043a346cc7fdf94cf9b154c3b5692e6b1de2e77 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Wed, 21 Aug 2024 16:45:26 -0700 Subject: [PATCH 3/5] add to docstring --- .../studio/trustworthy_language_model.py | 26 +++++++------------ 1 file changed, 9 insertions(+), 17 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 91f334ce..f6be442c 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -192,7 +192,7 @@ async def _batch_get_trustworthiness_score( capture_exceptions (bool): if should return None in place of the response for any errors or timeout processing some inputs Returns: - Union[TLMBatchScoreResponse, TLMOptionalBatchScoreResponse]: TLM trustworthiness score for each prompt (in supplied order) + Union[TLMBatchScoreResponse, TLMOptionalBatchScoreResponse]: TLM trustworthiness score for each prompt (in supplied order). """ if capture_exceptions: per_query_timeout, per_batch_timeout = self._timeout, None @@ -437,8 +437,10 @@ def get_trustworthiness_score( response (str | Sequence[str]): existing response (or list of responses) associated with the input prompts. These can be from any LLM or human-written responses. 
Returns: - float | List[float]: float or list of floats (if multiple prompt-responses were provided) corresponding - to the TLM's trustworthiness score. + TLMScoreResponse | TLMBatchScoreResponse: **TLMScoreResponse** represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object containing both the trustworthiness score and log dictionary keys. + + **TLMBatchScoreResponse** (if multiple prompt-responses were provided) represents a TLM response that can be either a list of floats or a list of TLMScore objects. The list will have the be length as the input list of prompts, response pairs. + The score quantifies how confident TLM is that the given response is good for the given prompt. If running on many prompt-response pairs simultaneously: this method will raise an exception if any TLM errors or timeouts occur. @@ -493,7 +495,7 @@ def try_get_trustworthiness_score( prompt (Sequence[str]): list of prompts for the TLM to evaluate response (Sequence[str]): list of existing responses corresponding to the input prompts (from any LLM or human-written) Returns: - List[float]: list of floats corresponding to the TLM's trustworthiness score. + TLMOptionalBatchScoreResponse: a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of TLMScore objects or None (if the call to the TLM failed). The list will have the be length as the input list of prompts, response pairs. The floats correspond to the TLM's trustworthiness score. The score quantifies how confident TLM is that the given response is good for the given prompt. The returned list will always have the same length as the input list. 
In case of TLM error or timeout on any prompt-response pair, @@ -537,8 +539,9 @@ async def get_trustworthiness_score_async( prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate response (str | Sequence[str]): response (or list of responses) corresponding to the input prompts Returns: - float | List[float]: float or list of floats (if multiple prompt-responses were provided) corresponding - to the TLM's trustworthiness score. + TLMScoreResponse | float | List[float]: **TLMScoreResponse** represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object + containing both the trustworthiness score and log dictionary keys, + or float or list of floats (if multiple prompt-responses were provided) corresponding to the TLM's trustworthiness score. The score quantifies how confident TLM is that the given response is good for the given prompt. This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified). """ @@ -655,19 +658,8 @@ class TLMScore(TypedDict): TLMScoreResponse = Union[float, TLMScore] -""" -TLMScoreResponse represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object containing both the trustworthiness score and log dictionary keys. -""" - TLMBatchScoreResponse = Union[List[float], List[TLMScore]] -""" -TLMBatchScoreResponse represents a TLM response that can be either a list of floats or a list of TLMScore objects. The list will have the be length as the input list of prompts, response pairs. -""" - TLMOptionalBatchScoreResponse = Union[List[Optional[float]], List[Optional[TLMScore]]] -""" -TLMOptionalBatchScoreResponse represents a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of TLMScore objects or None (if the call to the TLM failed). The list will have the be length as the input list of prompts, response pairs. 
-""" class TLMOptions(TypedDict): From 9d103c2b3c0779a0405e961a0a25c951096bbc2c Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 30 Aug 2024 16:03:53 -0700 Subject: [PATCH 4/5] update type aliases --- .../studio/trustworthy_language_model.py | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index 7453445c..ab858b21 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -4,6 +4,14 @@ **This module is not meant to be imported and used directly.** Instead, use [`Studio.TLM()`](/reference/python/studio/#method-tlm) to instantiate a [TLM](#class-tlm) object, and then you can use the methods like [`prompt()`](#method-prompt) and [`get_trustworthiness_score()`](#method-get_trustworthiness_score) documented on this page. The [Trustworthy Language Model tutorial](/tutorials/tlm/) further explains TLM and its use cases. + +### Type Aliases + +Type aliases returned by the TLM module. + +- `TLMScoreResponse = Union[float, TLMScore]`: a single TLM response that can be either float, representing the trustworthiness score or a [TLMScore](#class-tlmscore) object containing both the trustworthiness score and log dictionary keys. +- `TLMBatchScoreResponse = Union[List[float], List[TLMScore]]`: a TLM response that can be either a list of floats or a list of [TLMScore](#class-tlmscore) objects containing both the trustworthiness score and log dictionary keys. The list will have the be length as the input list of prompts, response pairs. +- `TLMOptionalBatchScoreResponse = Union[List[Optional[float]], List[Optional[TLMScore]]]`: a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of [TLMScore](#class-tlmscore) objects containing both the trustworthiness score and log dictionary keys or None (if the call to the TLM failed). 
The list will have the be length as the input list of prompts, response pairs. """ from __future__ import annotations @@ -437,9 +445,9 @@ def get_trustworthiness_score( response (str | Sequence[str]): existing response (or list of responses) associated with the input prompts. These can be from any LLM or human-written responses. Returns: - TLMScoreResponse | TLMBatchScoreResponse: **TLMScoreResponse** represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object containing both the trustworthiness score and log dictionary keys. + TLMScoreResponse | TLMBatchScoreResponse: If a single prompt/response pair was passed in, method returns either a float (representing the output trustworthiness score) or a TLMScore object containing both the trustworthiness score and log dictionary keys. See the documentation for [TLMScoreResponse](#type-aliases) for more details. - **TLMBatchScoreResponse** (if multiple prompt-responses were provided) represents a TLM response that can be either a list of floats or a list of TLMScore objects. The list will have the be length as the input list of prompts, response pairs. + If a list of prompt/responses was passed in, method returns a list of floats representing the trustworthiness score or a list of TLMScore objects each containing both the trustworthiness score and log dictionary keys for each prompt-response pair passed in. See the documentation for [TLMBatchScoreResponse](#type-aliases) for more details. The score quantifies how confident TLM is that the given response is good for the given prompt. 
If running on many prompt-response pairs simultaneously: @@ -495,7 +503,10 @@ def try_get_trustworthiness_score( prompt (Sequence[str]): list of prompts for the TLM to evaluate response (Sequence[str]): list of existing responses corresponding to the input prompts (from any LLM or human-written) Returns: - TLMOptionalBatchScoreResponse: a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of TLMScore objects or None (if the call to the TLM failed). The list will have the be length as the input list of prompts, response pairs. The floats correspond to the TLM's trustworthiness score. + TLMOptionalBatchScoreResponse: This method only accepts lists of prompts and responses, so it always returns a list with one entry per prompt-response pair. + + Each entry is either a float (the trustworthiness score) or a TLMScore object containing both the trustworthiness score and log dictionary keys. For any TLM call that failed, the returned list contains None in place of that pair's result. See the documentation for [TLMOptionalBatchScoreResponse](#type-aliases) for more details. + The score quantifies how confident TLM is that the given response is good for the given prompt. The returned list will always have the same length as the input list. In case of TLM error or timeout on any prompt-response pair, @@ -526,7 +537,7 @@ async def get_trustworthiness_score_async( prompt: Union[str, Sequence[str]], response: Union[str, Sequence[str]], **kwargs: Any, - ) -> Union[TLMScoreResponse, List[float], List[TLMScore]]: + ) -> Union[TLMBatchScoreResponse, TLMScoreResponse]: """Asynchronously gets trustworthiness score for prompt-response pairs. 
This method is similar to the [`get_trustworthiness_score()`](#method-get_trustworthiness_score) method but operates asynchronously, allowing for non-blocking concurrent operations. @@ -539,9 +550,9 @@ async def get_trustworthiness_score_async( prompt (str | Sequence[str]): prompt (or list of prompts) for the TLM to evaluate response (str | Sequence[str]): response (or list of responses) corresponding to the input prompts Returns: - TLMScoreResponse | float | List[float]: **TLMScoreResponse** represents a single TLM response that can be either float, representing the trustworthiness score or a TLMScore object - containing both the trustworthiness score and log dictionary keys, - or float or list of floats (if multiple prompt-responses were provided) corresponding to the TLM's trustworthiness score. + TLMScoreResponse | TLMBatchScoreResponse: If a single prompt/response pair was passed in, method returns either a float (representing the output trustworthiness score) or a TLMScore object containing both the trustworthiness score and log dictionary keys. See the documentation for [TLMScoreResponse](#type-aliases) for more details. + + If a list of prompt/responses was passed in, method returns a list of floats representing the trustworthiness score or a list of TLMScore objects each containing both the trustworthiness score and log dictionary keys for each prompt-response pair passed in. See the documentation for [TLMBatchScoreResponse](#type-aliases) for more details. The score quantifies how confident TLM is that the given response is good for the given prompt. This method will raise an exception if any errors occur or if you hit a timeout (given a timeout is specified). """ @@ -650,7 +661,12 @@ class TLMResponse(TypedDict): class TLMScore(TypedDict): """A typed dict containing the trustworthiness score and additional logs from the Trustworthy Language Model. - This dictionary is similar to TLMResponse, except it does not contain the response key. 
+ Attributes: + trustworthiness_score (float, optional): score between 0-1 corresponding to the trustworthiness of the response. + A higher score indicates a higher confidence that the response is correct/trustworthy. The trustworthiness score + is omitted if TLM is run with quality preset "base". + + log (dict, optional): additional logs and metadata returned from the LLM call only if the `log` key was specified in TLMOptions. """ trustworthiness_score: Optional[float] From 6d6e5402fd87f47b4a1a81da6e8b664dc78b68f1 Mon Sep 17 00:00:00 2001 From: Ulyana Date: Fri, 30 Aug 2024 16:51:03 -0700 Subject: [PATCH 5/5] address pr comments Co-authored-by: Jay Zhang --- cleanlab_studio/studio/trustworthy_language_model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab_studio/studio/trustworthy_language_model.py b/cleanlab_studio/studio/trustworthy_language_model.py index ab858b21..dbbd1a9d 100644 --- a/cleanlab_studio/studio/trustworthy_language_model.py +++ b/cleanlab_studio/studio/trustworthy_language_model.py @@ -10,8 +10,8 @@ Type aliases returned by the TLM module. - `TLMScoreResponse = Union[float, TLMScore]`: a single TLM response that can be either float, representing the trustworthiness score or a [TLMScore](#class-tlmscore) object containing both the trustworthiness score and log dictionary keys. -- `TLMBatchScoreResponse = Union[List[float], List[TLMScore]]`: a TLM response that can be either a list of floats or a list of [TLMScore](#class-tlmscore) objects containing both the trustworthiness score and log dictionary keys. The list will have the be length as the input list of prompts, response pairs. -- `TLMOptionalBatchScoreResponse = Union[List[Optional[float]], List[Optional[TLMScore]]]`: a TLM response that can be either a list of floats or None (if the call to the TLM failed) or a list of [TLMScore](#class-tlmscore) objects containing both the trustworthiness score and log dictionary keys or None (if the call to the TLM failed). 
The list will have the be length as the input list of prompts, response pairs. +- `TLMBatchScoreResponse = Union[List[float], List[TLMScore]]`: a TLM response that is either a list of floats or a list of [TLMScore](#class-tlmscore) objects, each containing both the trustworthiness score and log dictionary keys. The list has the same length as the input list of prompt-response pairs. +- `TLMOptionalBatchScoreResponse = Union[List[Optional[float]], List[Optional[TLMScore]]]`: a TLM response that is either a list of floats or a list of [TLMScore](#class-tlmscore) objects (each containing both the trustworthiness score and log dictionary keys), where any entry is None if the call to the TLM failed for that prompt-response pair. The list has the same length as the input list of prompt-response pairs. """ from __future__ import annotations