kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from typing import TYPE_CHECKING, Any, Dict, List, Union
  4
  5from pydantic import BaseModel, Field, model_validator
  6from typing_extensions import Self
  7
  8from kiln_ai.datamodel.basemodel import (
  9    ID_TYPE,
 10    FilenameString,
 11    KilnParentedModel,
 12    KilnParentModel,
 13)
 14from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 15from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 16from kiln_ai.datamodel.json_schema import string_to_json_key
 17from kiln_ai.datamodel.task_run import Usage
 18from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 19
 20if TYPE_CHECKING:
 21    from kiln_ai.datamodel.task import Task
 22
 23EvalScores = Dict[str, float]
 24
 25
 26class EvalTemplateId(str, Enum):
 27    """
 28    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
 29    """
 30
 31    kiln_requirements = "kiln_requirements"
 32    issue = "kiln_issue"
 33    toxicity = "toxicity"
 34    bias = "bias"
 35    maliciousness = "maliciousness"
 36    factual_correctness = "factual_correctness"
 37    jailbreak = "jailbreak"
 38
 39
 40class EvalConfigType(str, Enum):
 41    g_eval = "g_eval"
 42    llm_as_judge = "llm_as_judge"
 43
 44
 45class EvalOutputScore(BaseModel):
 46    """
 47    A definition of a score that an evaluator will produce.
 48
 49    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
 50    """
 51
 52    name: str = Field(
 53        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
 54    )
 55    instruction: str | None = Field(
 56        default=None,
 57        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
 58    )
 59    type: TaskOutputRatingType = Field(
 60        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
 61    )
 62
 63    def json_key(self) -> str:
 64        """
 65        The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.
 66
 67        For example, "Overall Rating" -> "overall_rating"
 68        """
 69        return string_to_json_key(self.name)
 70
 71    @model_validator(mode="after")
 72    def validate_type(self) -> Self:
 73        if self.type == TaskOutputRatingType.custom:
 74            raise ValueError(
 75                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
 76            )
 77        return self
 78
 79
 80class EvalRun(KilnParentedModel):
 81    """
 82    The results of running an eval on a single dataset item.
 83
 84    This is a child of an EvalConfig, which specifies how the scores were generated.
 85
 86    Eval runs can be one of 2 types:
 87    1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item referenced by dataset_id, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
 88    2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
 89    """
 90
 91    dataset_id: ID_TYPE = Field(
 92        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
 93    )
 94    task_run_config_id: ID_TYPE | None = Field(
 95        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
 96    )
 97    eval_config_eval: bool = Field(
 98        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
 99        default=False,
100    )
101    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
102    input: str = Field(
103        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
104    )
105    output: str = Field(
106        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
107    )
108    intermediate_outputs: Dict[str, str] | None = Field(
109        default=None,
110        description="The intermediate outputs of the task (for example, eval thinking).",
111    )
112    scores: EvalScores = Field(
113        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
114    )
115    task_run_usage: Usage | None = Field(
116        default=None,
117        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
118    )
119
120    def parent_eval_config(self) -> Union["EvalConfig", None]:
121        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
122            raise ValueError("parent must be an EvalConfig")
123        return self.parent  # type: ignore
124
125    @model_validator(mode="after")
126    def validate_eval_run_types(self) -> Self:
127        if self.eval_config_eval and self.task_run_config_id is not None:
128            raise ValueError(
129                "task_run_config_id must be None if eval_config_eval is true"
130            )
131        if not self.eval_config_eval and self.task_run_config_id is None:
132            raise ValueError(
133                "task_run_config_id must be set if eval_config_eval is false"
134            )
135        return self
136
137    @model_validator(mode="after")
138    def validate_scores(self) -> Self:
139        # We're checking the scores have the expected keys from the grand-parent eval
140        if self.scores is None or len(self.scores) == 0:
141            raise ValueError("scores are required, and must have at least one score.")
142
143        parent_eval_config = self.parent_eval_config()
144        eval = parent_eval_config.parent_eval() if parent_eval_config else None
145        if not eval:
146            # Can't validate without the grand-parent eval, allow it to be validated later
147            return self
148
149        output_score_keys = [score.json_key() for score in eval.output_scores]
150        if set(output_score_keys) != set(self.scores.keys()):
151            raise ValueError(
152                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
153            )
154
155        # Check that each score is expected in this eval and the correct type
156        for output_score in eval.output_scores:
157            match output_score.type:
158                case TaskOutputRatingType.five_star:
159                    five_star_score = self.scores[output_score.json_key()]
160                    if (
161                        not isinstance(five_star_score, float)
162                        or five_star_score < 1.0
163                        or five_star_score > 5.0
164                    ):
165                        raise ValueError(
166                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
167                        )
168                case TaskOutputRatingType.pass_fail:
169                    pass_fail_score = self.scores[output_score.json_key()]
170                    if (
171                        not isinstance(pass_fail_score, float)
172                        or pass_fail_score < 0.0
173                        or pass_fail_score > 1.0
174                    ):
175                        raise ValueError(
176                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
177                        )
178                case TaskOutputRatingType.pass_fail_critical:
179                    pass_fail_critical_score = self.scores[output_score.json_key()]
180                    if (
181                        not isinstance(pass_fail_critical_score, float)
182                        or pass_fail_critical_score < -1.0
183                        or pass_fail_critical_score > 1.0
184                    ):
185                        raise ValueError(
186                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
187                        )
188                case TaskOutputRatingType.custom:
189                    raise ValueError(
190                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
191                    )
192                case _:
193                    # Catch missing cases
194                    raise_exhaustive_enum_error(output_score.type)
195        return self
196
197
198class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
199    """
200    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
201
202    An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
203    """
204
205    name: FilenameString = Field(description="The name of the eval config.")
206    model_name: str = Field(
207        description="The name of the model to use for this eval config. ",
208    )
209    model_provider: str = Field(
210        description="The provider of the model to use for this eval config.",
211    )
212    config_type: EvalConfigType = Field(
213        default=EvalConfigType.g_eval,
214        description="This is used to determine the type of eval to run.",
215    )
216    properties: dict[str, Any] = Field(
217        default={},
218        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
219    )
220
221    def parent_eval(self) -> Union["Eval", None]:
222        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
223            raise ValueError("parent must be an Eval")
224        return self.parent  # type: ignore
225
226    def runs(self, readonly: bool = False) -> list[EvalRun]:
227        return super().runs(readonly=readonly)  # type: ignore
228
229    @model_validator(mode="after")
230    def validate_properties(self) -> Self:
231        if (
232            self.config_type == EvalConfigType.g_eval
233            or self.config_type == EvalConfigType.llm_as_judge
234        ):
235            if "eval_steps" not in self.properties or not isinstance(
236                self.properties["eval_steps"], list
237            ):
238                raise ValueError("eval_steps is required and must be a list for g_eval")
239            if "task_description" in self.properties and not isinstance(
240                self.properties["task_description"], str
241            ):
242                raise ValueError(
243                    "task_description is optional, but if provided must be a string"
244                )
245            return self
246        else:
247            raise ValueError(f"Invalid eval config type: {self.config_type}")
248
249    @model_validator(mode="after")
250    def validate_json_serializable(self) -> "EvalConfig":
251        try:
252            # This will raise a TypeError if the dict contains non-JSON-serializable objects
253            json.dumps(self.properties)
254        except TypeError as e:
255            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
256        return self
257
258
259class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
260    name: FilenameString = Field(description="The name of the eval.")
261    description: str | None = Field(
262        default=None, description="The description of the eval"
263    )
264    template: EvalTemplateId | None = Field(
265        default=None,
266        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
267    )
268    current_config_id: ID_TYPE = Field(
269        default=None,
270        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
271    )
272    current_run_config_id: ID_TYPE = Field(
273        default=None,
274        description="The id of the run config which was selected as the best run config for this eval. The run config must belong to the parent Task.",
275    )
276    eval_set_filter_id: DatasetFilterId = Field(
277        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
278    )
279    eval_configs_filter_id: DatasetFilterId = Field(
280        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
281    )
282    output_scores: List[EvalOutputScore] = Field(
283        description="The scores this evaluator should produce."
284    )
285    favourite: bool = Field(
286        default=False,
287        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
288    )
289    template_properties: dict[str, str | int | bool | float] = Field(
290        default={},
291        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
292    )
293
294    # Workaround to return typed parent without importing Task
295    def parent_task(self) -> Union["Task", None]:
296        if self.parent is not None and self.parent.__class__.__name__ != "Task":
297            raise ValueError("parent must be a Task")
298        return self.parent  # type: ignore
299
300    def configs(self, readonly: bool = False) -> list[EvalConfig]:
301        return super().configs(readonly=readonly)  # type: ignore
302
303    @model_validator(mode="after")
304    def validate_scores(self) -> Self:
305        if self.output_scores is None or len(self.output_scores) == 0:
306            raise ValueError(
307                "output_scores are required, and must have at least one score."
308            )
309
310        # check for duplicate names (once transformed to JSON keys)
311        output_score_keys = [score.json_key() for score in self.output_scores]
312        if len(output_score_keys) != len(set(output_score_keys)):
313            raise ValueError(
314                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
315            )
316        return self
317
318    @model_validator(mode="after")
319    def validate_template_properties(self) -> Self:
320        # Check for properties that are required for the issue template
321        if self.template == EvalTemplateId.issue:
322            if "issue_prompt" not in self.template_properties or not isinstance(
323                self.template_properties["issue_prompt"], str
324            ):
325                raise ValueError("issue_prompt is required for issue template")
326            if "failure_example" in self.template_properties and not isinstance(
327                self.template_properties["failure_example"], str
328            ):
329                raise ValueError(
330                    "failure_example is optional for issue template, but if provided must be a string"
331                )
332            if "pass_example" in self.template_properties and not isinstance(
333                self.template_properties["pass_example"], str
334            ):
335                raise ValueError(
336                    "pass_example is optional for issue template, but if provided must be a string"
337                )
338        return self
EvalScores = typing.Dict[str, float]
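EvalScores is a plain mapping from each output score's json_key() to a float value. A minimal sketch of the shape an evaluator is expected to return (the score names below are hypothetical):

    from kiln_ai.datamodel.eval import EvalScores

    # Keys come from EvalOutputScore.json_key(); values follow the rating type's range
    scores: EvalScores = {
        "overall_rating": 4.0,      # five_star: 1.0 to 5.0
        "factually_correct": 1.0,   # pass_fail: 0.0 to 1.0
    }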
class EvalTemplateId(builtins.str, enum.Enum):

An eval template is a pre-defined eval that can be used as a starting point for a new eval.

kiln_requirements = <EvalTemplateId.kiln_requirements: 'kiln_requirements'>
issue = <EvalTemplateId.issue: 'kiln_issue'>
toxicity = <EvalTemplateId.toxicity: 'toxicity'>
bias = <EvalTemplateId.bias: 'bias'>
maliciousness = <EvalTemplateId.maliciousness: 'maliciousness'>
factual_correctness = <EvalTemplateId.factual_correctness: 'factual_correctness'>
jailbreak = <EvalTemplateId.jailbreak: 'jailbreak'>
class EvalConfigType(builtins.str, enum.Enum):

g_eval = <EvalConfigType.g_eval: 'g_eval'>
llm_as_judge = <EvalConfigType.llm_as_judge: 'llm_as_judge'>
class EvalOutputScore(pydantic.main.BaseModel):

A definition of a score that an evaluator will produce.

Very similar to TaskRequirement, but conceptually different, so kept in a separate model.

name: str
instruction: str | None
type: TaskOutputRatingType
def json_key(self) -> str:

The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

For example, "Overall Rating" -> "overall_rating"
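A small sketch of defining a score and reading its JSON key. Construction is standard Pydantic keyword arguments; the name and instruction are illustrative:

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import EvalOutputScore

    score = EvalOutputScore(
        name="Overall Rating",
        instruction="Rate the overall quality of the answer.",
        type=TaskOutputRatingType.five_star,
    )
    print(score.json_key())  # -> "overall_rating"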

@model_validator(mode='after')
def validate_type(self) -> Self:

class EvalRun(kiln_ai.datamodel.basemodel.KilnParentedModel):

The results of running an eval on a single dataset item.

This is a child of an EvalConfig, which specifies how the scores were generated.

Eval runs can be one of 2 types:
1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item referenced by dataset_id, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.

dataset_id: Optional[str]
task_run_config_id: Optional[str]
eval_config_eval: bool
input: str
output: str
intermediate_outputs: Optional[Dict[str, str]]
scores: Dict[str, float]
task_run_usage: kiln_ai.datamodel.Usage | None
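A sketch of the two run types, constructed standalone for illustration (IDs and input/output are hypothetical; in practice EvalRuns are created as children of an EvalConfig, and score keys are only validated against the grand-parent Eval once that parent is attached):

    from kiln_ai.datamodel.eval import EvalRun

    # Type 1: evaluating a task run; task_run_config_id must be set
    task_run_eval = EvalRun(
        dataset_id="dataset-item-id",
        task_run_config_id="task-run-config-id",
        eval_config_eval=False,
        input='{"question": "What is 2 + 2?"}',
        output='{"answer": "4"}',
        scores={"overall_rating": 4.0},
    )

    # Type 2: evaluating the eval config itself; task_run_config_id must be None
    config_eval_run = EvalRun(
        dataset_id="dataset-item-id",
        task_run_config_id=None,
        eval_config_eval=True,
        input="Plain text input from the dataset item",
        output="Plain text output from the dataset item",
        scores={"overall_rating": 5.0},
    )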
def parent_eval_config(self) -> Optional[EvalConfig]:
@model_validator(mode='after')
def validate_eval_run_types(self) -> Self:
@model_validator(mode='after')
def validate_scores(self) -> Self:

class EvalConfig(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):

A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.

name: FilenameString
model_name: str
model_provider: str
config_type: EvalConfigType
properties: dict[str, typing.Any]
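A sketch of a G-Eval style config. The model name and provider strings are hypothetical; eval_steps and task_description follow the rules enforced by validate_properties (a required list and an optional string):

    from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

    config = EvalConfig(
        name="g_eval_gpt_4o",        # hypothetical config name
        model_name="gpt_4o",         # hypothetical model name
        model_provider="openai",     # hypothetical provider name
        config_type=EvalConfigType.g_eval,
        properties={
            # required for g_eval / llm_as_judge: a list of evaluation steps
            "eval_steps": [
                "Check that the answer addresses the question.",
                "Check the answer for factual errors.",
            ],
            # optional; must be a string if provided
            "task_description": "Answer customer support questions.",
        },
    )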
def parent_eval(self) -> Optional[Eval]:
def runs(self, readonly=False) -> List[EvalRun]:

@model_validator(mode='after')
def validate_properties(self) -> Self:
@model_validator(mode='after')
def validate_json_serializable(self) -> EvalConfig:

class Eval(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):

Base model for Kiln models that have a parent-child relationship. This base class is for child models.

This class provides functionality for managing hierarchical relationships between models, including parent reference handling and file system organization.

Attributes: parent (KilnBaseModel): Reference to the parent model instance. Not persisted, just in memory.

name: FilenameString
description: str | None
template: EvalTemplateId | None
current_config_id: Optional[str]
current_run_config_id: Optional[str]
eval_set_filter_id: DatasetFilterId
eval_configs_filter_id: DatasetFilterId
output_scores: List[EvalOutputScore]
favourite: bool
template_properties: dict[str, str | int | bool | float]
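A sketch of a complete Eval definition, constructed standalone for illustration (in practice an Eval is created as a child of a Task). The filter IDs assume tag-based dataset filters of the form tag::<name> exist in your project, and the score names are illustrative:

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalOutputScore

    quality_eval = Eval(
        name="answer_quality",
        description="Scores answers for overall quality and factual correctness.",
        output_scores=[
            EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
            EvalOutputScore(name="Factually Correct", type=TaskOutputRatingType.pass_fail),
        ],
        eval_set_filter_id="tag::eval_set",      # hypothetical dataset filter IDs
        eval_configs_filter_id="tag::golden",
    )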
def parent_task(self) -> Optional[kiln_ai.datamodel.Task]:
def configs(self, readonly=False) -> List[EvalConfig]:

@model_validator(mode='after')
def validate_scores(self) -> Self:
@model_validator(mode='after')
def validate_template_properties(self) -> Self:
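A sketch of the issue template's template_properties, following the rules enforced by validate_template_properties (issue_prompt required, failure_example and pass_example optional, all strings). The prompt text and filter IDs are hypothetical:

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalOutputScore, EvalTemplateId

    issue_eval = Eval(
        name="refund_policy_issue",
        template=EvalTemplateId.issue,
        template_properties={
            "issue_prompt": "The model offers refunds outside of the 30 day policy.",
            "failure_example": "Sure, I can refund that order from last year.",
            "pass_example": "Refunds are only available within 30 days of purchase.",
        },
        output_scores=[
            EvalOutputScore(name="Issue", type=TaskOutputRatingType.pass_fail),
        ],
        eval_set_filter_id="tag::eval_set",      # hypothetical dataset filter IDs
        eval_configs_filter_id="tag::golden",
    )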