kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from typing import TYPE_CHECKING, Any, Dict, List, Union
  4
  5from pydantic import BaseModel, Field, model_validator
  6from typing_extensions import Self
  7
  8from kiln_ai.datamodel.basemodel import (
  9    ID_TYPE,
 10    NAME_FIELD,
 11    KilnParentedModel,
 12    KilnParentModel,
 13)
 14from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 15from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 16from kiln_ai.datamodel.json_schema import string_to_json_key
 17from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 18
 19if TYPE_CHECKING:
 20    from kiln_ai.datamodel.task import Task
 21
 22EvalScores = Dict[str, float]
 23
 24
 25class EvalTemplateId(str, Enum):
 26    """
 27    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
 28    """
 29
 30    kiln_requirements = "kiln_requirements"
 31    toxicity = "toxicity"
 32    bias = "bias"
 33    maliciousness = "maliciousness"
 34    factual_correctness = "factual_correctness"
 35    jailbreak = "jailbreak"
 36
 37
 38class EvalConfigType(str, Enum):
 39    g_eval = "g_eval"
 40    llm_as_judge = "llm_as_judge"
 41
 42
 43class EvalOutputScore(BaseModel):
 44    """
 45    A definition of a score that an evaluator will produce.
 46
 47    Very similar to TaskRequirement, but conceptually different, so it is kept as a separate model.
 48    """
 49
 50    name: str = Field(
 51        description="The name of the score. Will be provided to the model, so use a descriptive name. Should align with the task's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
 52    )
 53    instruction: str | None = Field(
 54        default=None,
 55        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
 56    )
 57    type: TaskOutputRatingType = Field(
 58        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
 59    )
 60
 61    def json_key(self) -> str:
 62        """
 63        The JSON key for the score, used when running the evaluator with an LLM and JSON output is needed.
 64
 65        For example, "Overall Rating" -> "overall_rating"
 66        """
 67        return string_to_json_key(self.name)
 68
 69    @model_validator(mode="after")
 70    def validate_type(self) -> Self:
 71        if self.type == TaskOutputRatingType.custom:
 72            raise ValueError(
 73                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
 74            )
 75        return self
 76
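
Example — a minimal sketch (not part of the module source above) that defines a score and derives its JSON key; the score name and instruction are arbitrary illustrations.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

# A five_star score; the name and instruction are shown to the evaluator model.
overall = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the output.",
    type=TaskOutputRatingType.five_star,
)

print(overall.json_key())  # "Overall Rating" -> "overall_rating"

# TaskOutputRatingType.custom is rejected by validate_type and raises a ValueError.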
 77
 78class EvalRun(KilnParentedModel):
 79    """
 80    The results of running an eval on a single dataset item.
 81
 82    This is a child of an EvalConfig, which specifies how the scores were generated.
 83
 84    Eval runs can be one of 2 types:
 85    1) eval_config_eval=False: evaluating a task run (a method of running the task). We take the task input from the dataset item, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
 86    2) eval_config_eval=True: evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model are the input/output of the dataset item.
 87    """
 88
 89    dataset_id: ID_TYPE = Field(
 90        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
 91    )
 92    task_run_config_id: ID_TYPE | None = Field(
 93        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
 94    )
 95    eval_config_eval: bool = Field(
 96        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
 97        default=False,
 98    )
 99    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
100    input: str = Field(
101        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
102    )
103    output: str = Field(
104        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
105    )
106    intermediate_outputs: Dict[str, str] | None = Field(
107        default=None,
108        description="The intermediate outputs of the task (for example, eval thinking).",
109    )
110    scores: EvalScores = Field(
111        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
112    )
113
114    def parent_eval_config(self) -> Union["EvalConfig", None]:
115        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
116            raise ValueError("parent must be an EvalConfig")
117        return self.parent  # type: ignore
118
119    @model_validator(mode="after")
120    def validate_eval_run_types(self) -> Self:
121        if self.eval_config_eval and self.task_run_config_id is not None:
122            raise ValueError(
123                "task_run_config_id must be None if eval_config_eval is true"
124            )
125        if not self.eval_config_eval and self.task_run_config_id is None:
126            raise ValueError(
127                "task_run_config_id must be set if eval_config_eval is false"
128            )
129        return self
130
131    @model_validator(mode="after")
132    def validate_scores(self) -> Self:
133        # We're checking the scores have the expected keys from the grand-parent eval
134        if self.scores is None or len(self.scores) == 0:
135            raise ValueError("scores are required, and must have at least one score.")
136
137        parent_eval_config = self.parent_eval_config()
138        eval = parent_eval_config.parent_eval() if parent_eval_config else None
139        if not eval:
140            # Can't validate without the grand-parent eval, allow it to be validated later
141            return self
142
143        output_score_keys = [score.json_key() for score in eval.output_scores]
144        if set(output_score_keys) != set(self.scores.keys()):
145            raise ValueError(
146                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
147            )
148
149        # Check that each score is expected in this eval and the correct type
150        for output_score in eval.output_scores:
151            match output_score.type:
152                case TaskOutputRatingType.five_star:
153                    five_star_score = self.scores[output_score.json_key()]
154                    if (
155                        not isinstance(five_star_score, float)
156                        or five_star_score < 1.0
157                        or five_star_score > 5.0
158                    ):
159                        raise ValueError(
160                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
161                        )
162                case TaskOutputRatingType.pass_fail:
163                    pass_fail_score = self.scores[output_score.json_key()]
164                    if (
165                        not isinstance(pass_fail_score, float)
166                        or pass_fail_score < 0.0
167                        or pass_fail_score > 1.0
168                    ):
169                        raise ValueError(
170                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
171                        )
172                case TaskOutputRatingType.pass_fail_critical:
173                    pass_fail_critical_score = self.scores[output_score.json_key()]
174                    if (
175                        not isinstance(pass_fail_critical_score, float)
176                        or pass_fail_critical_score < -1.0
177                        or pass_fail_critical_score > 1.0
178                    ):
179                        raise ValueError(
180                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
181                        )
182                case TaskOutputRatingType.custom:
183                    raise ValueError(
184                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
185                    )
186                case _:
187                    # Catch missing cases
188                    raise_exhaustive_enum_error(output_score.type)
189        return self
190
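
Example — a hedged sketch (not part of the module source above) of the two EvalRun shapes described in the docstring, assuming the model can be built in memory before being attached to a parent EvalConfig (score keys and ranges are only fully validated once the grand-parent Eval is reachable). The IDs, inputs, outputs, and score names are hypothetical.

from kiln_ai.datamodel.eval import EvalRun

# Type 1: evaluating a task run config. eval_config_eval defaults to False,
# so task_run_config_id must be set.
task_run_result = EvalRun(
    dataset_id="111111111111",          # hypothetical dataset item ID
    task_run_config_id="222222222222",  # hypothetical TaskRunConfig ID
    input='{"question": "What is 2+2?"}',
    output='{"answer": "4"}',
    scores={
        "overall_rating": 4.0,  # five_star scores must be 1.0-5.0 inclusive
        "toxicity": 1.0,        # pass_fail: 0.0-1.0; pass_fail_critical: -1.0-1.0
    },
)

# Type 2: evaluating the eval config itself against an existing dataset item,
# so task_run_config_id must be None.
config_eval_result = EvalRun(
    dataset_id="111111111111",
    task_run_config_id=None,
    eval_config_eval=True,
    input='{"question": "What is 2+2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 5.0, "toxicity": 1.0},
)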
191
192class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
193    """
194    A configuration for running an eval. This includes anything needed to run the eval on a dataset, such as the prompt, model, thresholds, etc.
195
196    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
197    """
198
199    name: str = NAME_FIELD
200    model_name: str = Field(
201        description="The name of the model to use for this eval config. ",
202    )
203    model_provider: str = Field(
204        description="The provider of the model to use for this eval config.",
205    )
206    config_type: EvalConfigType = Field(
207        default=EvalConfigType.g_eval,
208        description="This is used to determine the type of eval to run.",
209    )
210    properties: dict[str, Any] = Field(
211        default={},
212        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
213    )
214
215    def parent_eval(self) -> Union["Eval", None]:
216        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
217            raise ValueError("parent must be an Eval")
218        return self.parent  # type: ignore
219
220    def runs(self, readonly: bool = False) -> list[EvalRun]:
221        return super().runs(readonly=readonly)  # type: ignore
222
223    @model_validator(mode="after")
224    def validate_properties(self) -> Self:
225        if (
226            self.config_type == EvalConfigType.g_eval
227            or self.config_type == EvalConfigType.llm_as_judge
228        ):
229            if "eval_steps" not in self.properties or not isinstance(
230                self.properties["eval_steps"], list
231            ):
232                raise ValueError("eval_steps is required and must be a list for g_eval and llm_as_judge configs")
233            if "task_description" in self.properties and not isinstance(
234                self.properties["task_description"], str
235            ):
236                raise ValueError(
237                    "task_description is optional, but if provided must be a string"
238                )
239            return self
240        else:
241            raise ValueError(f"Invalid eval config type: {self.config_type}")
242
243    @model_validator(mode="after")
244    def validate_json_serializable(self) -> "EvalConfig":
245        try:
246            # This will raise a TypeError if the dict contains non-JSON-serializable objects
247            json.dumps(self.properties)
248        except TypeError as e:
249            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
250        return self
251
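
Example — a sketch (not part of the module source above) of an EvalConfig that satisfies validate_properties, assuming it can be constructed standalone before being attached to a parent Eval; the name, model, provider, and eval steps are illustrative placeholders.

from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

judge_config = EvalConfig(
    name="GEval Config",                # hypothetical name
    model_name="gpt_4o",                # hypothetical evaluator model name
    model_provider="openai",            # hypothetical provider ID
    config_type=EvalConfigType.g_eval,  # or EvalConfigType.llm_as_judge
    properties={
        # "eval_steps" is required and must be a list for g_eval/llm_as_judge configs.
        "eval_steps": [
            "Check the answer is factually correct.",
            "Check the answer directly addresses the question.",
        ],
        # "task_description" is optional, but must be a string when provided.
        "task_description": "Answer short math questions.",
    },
)

# properties must also be JSON serializable (see validate_json_serializable).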
252
253class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
254    name: str = NAME_FIELD
255    description: str | None = Field(
256        default=None, description="The description of the eval"
257    )
258    template: EvalTemplateId | None = Field(
259        default=None,
260        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
261    )
262    current_config_id: ID_TYPE = Field(
263        default=None,
264        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
265    )
266    eval_set_filter_id: DatasetFilterId = Field(
267        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
268    )
269    eval_configs_filter_id: DatasetFilterId = Field(
270        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
271    )
272    output_scores: List[EvalOutputScore] = Field(
273        description="The scores this evaluator should produce."
274    )
275
276    # Workaround to return typed parent without importing Task
277    def parent_task(self) -> Union["Task", None]:
278        if self.parent is not None and self.parent.__class__.__name__ != "Task":
279            raise ValueError("parent must be a Task")
280        return self.parent  # type: ignore
281
282    def configs(self, readonly: bool = False) -> list[EvalConfig]:
283        return super().configs(readonly=readonly)  # type: ignore
284
285    @model_validator(mode="after")
286    def validate_scores(self) -> Self:
287        if self.output_scores is None or len(self.output_scores) == 0:
288            raise ValueError(
289                "output_scores are required, and must have at least one score."
290            )
291
292        # check for duplicate names (once transformed to JSON keys)
293        output_score_keys = [score.json_key() for score in self.output_scores]
294        if len(output_score_keys) != len(set(output_score_keys)):
295            raise ValueError(
296                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
297            )
298        return self
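
Example — tying it together, a sketch (not part of the module source above) of an Eval declaring its output scores and dataset filters. The filter IDs are assumptions: they must be valid DatasetFilterId values (a hypothetical "tag::" form is used here), so check kiln_ai.datamodel.dataset_filters for the identifiers your project actually supports.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore, EvalTemplateId

quality_eval = Eval(
    name="Answer Quality",  # hypothetical name
    description="Scores answers for overall quality and toxicity.",
    template=EvalTemplateId.kiln_requirements,
    # Hypothetical filter IDs; these must parse as valid DatasetFilterId values.
    eval_set_filter_id="tag::eval_set",
    eval_configs_filter_id="tag::golden",
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
        EvalOutputScore(name="Toxicity", type=TaskOutputRatingType.pass_fail),
    ],
)

# Evaluator output must use these JSON keys (see EvalRun.validate_scores):
print([score.json_key() for score in quality_eval.output_scores])
# -> ['overall_rating', 'toxicity']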