kiln_ai.datamodel.eval
Module-level imports, the `EvalScores` type alias, and the `EvalConfigType` enum are shown below; the classes themselves are documented in the sections that follow.

```python
import json
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    NAME_FIELD,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.dataset_filters import DatasetFilterId
from kiln_ai.datamodel.json_schema import string_to_json_key
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

if TYPE_CHECKING:
    from kiln_ai.datamodel.task import Task

# A mapping from score JSON key to a float score value.
EvalScores = Dict[str, float]


class EvalConfigType(str, Enum):
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"
```
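For orientation, `EvalScores` is a plain dict keyed by each output score's JSON key. A minimal sketch (the key names below are hypothetical; real keys come from `EvalOutputScore.json_key()`, documented later in this module):

```python
from kiln_ai.datamodel.eval import EvalConfigType, EvalScores

# Hypothetical evaluator output: keys are score JSON keys, values are floats
# in the range required by each score's rating type.
scores: EvalScores = {"overall_rating": 4.0, "toxicity": 1.0}

# The two evaluator implementations this module currently distinguishes.
print([t.value for t in EvalConfigType])  # ['g_eval', 'llm_as_judge']
```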
```python
class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
```
An eval template is a pre-defined eval that can be used as a starting point for a new eval.
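Because the template IDs are plain string enums, a template can be looked up by value, which is convenient when the selection arrives from a UI or JSON payload. A minimal sketch:

```python
from kiln_ai.datamodel.eval import EvalTemplateId

template = EvalTemplateId("toxicity")  # look up by value
assert template is EvalTemplateId.toxicity
assert template.value == "toxicity"
```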
```python
class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different, so it is kept in a separate model.
    """

    name: str = Field(
        description="The name of the score. Will be provided to the model, so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with an LLM that needs JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self
```
A definition of a score that an evaluator will produce.
Very similar to TaskRequirement, but conceptually different, so it is kept in a separate model.
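A minimal sketch of defining scores. The names and instruction text are illustrative; the rating types come from TaskOutputRatingType, and validate_type rejects the custom type:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

overall = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",  # illustrative instruction
    type=TaskOutputRatingType.five_star,
)
toxicity = EvalOutputScore(
    name="Toxicity",
    type=TaskOutputRatingType.pass_fail,
)
# type=TaskOutputRatingType.custom would raise a pydantic ValidationError (see validate_type).
```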
```python
def json_key(self) -> str:
    """
    The JSON key for the score, used when running the evaluator with an LLM that needs JSON output.

    For example, "Overall Rating" -> "overall_rating"
    """
    return string_to_json_key(self.name)
```
The JSON key for the score, used when running the evaluator with an LLM that needs JSON output.
For example, "Overall Rating" -> "overall_rating"
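A quick usage sketch, grounded in the example above ("Overall Rating" maps to "overall_rating"); the exact normalization is implemented by string_to_json_key:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

score = EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star)
print(score.json_key())  # "overall_rating"
```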
```python
class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of two types:
    1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item referenced by dataset_id, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model are those of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (for example, eval thinking).",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self
```
The results of running an eval on a single dataset item.
This is a child of an EvalConfig, which specifies how the scores were generated.
Eval runs can be one of two types:

1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item referenced by dataset_id, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.

2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model are those of the dataset item.
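A minimal sketch of constructing an EvalRun in each mode. The IDs are hypothetical placeholder strings, and the runs are built in memory without a parent EvalConfig, so the score-key check against the grand-parent Eval is deferred (validate_scores returns early when no parent is attached):

```python
from kiln_ai.datamodel.eval import EvalRun

# Type 1: evaluating a task run (eval_config_eval defaults to False).
task_run_eval = EvalRun(
    dataset_id="dataset-item-123",        # hypothetical ID
    task_run_config_id="run-config-456",  # required when eval_config_eval is False
    input='{"question": "What is 2 + 2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 5.0},
)

# Type 2: evaluating the eval config itself against an existing dataset item.
config_eval = EvalRun(
    dataset_id="dataset-item-123",
    task_run_config_id=None,  # must be None when eval_config_eval is True
    eval_config_eval=True,
    input='{"question": "What is 2 + 2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 4.0},
)
```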
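EvalRun's validators (validate_eval_run_types and validate_scores in the class source above) run at construction time, so an inconsistent run fails fast. A sketch of the mismatch case, with the same hypothetical placeholder IDs:

```python
import pydantic

from kiln_ai.datamodel.eval import EvalRun

try:
    EvalRun(
        dataset_id="dataset-item-123",        # hypothetical ID
        task_run_config_id="run-config-456",  # set, yet eval_config_eval is True
        eval_config_eval=True,
        input="hello",
        output="world",
        scores={"overall_rating": 5.0},
    )
except pydantic.ValidationError as e:
    print(e)  # "task_run_config_id must be None if eval_config_eval is true"
```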
```python
class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset, such as the prompt, model, and thresholds.

    An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: str = NAME_FIELD
    model_name: str = Field(
        description="The name of the model to use for this eval config.",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a JSON dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
        return self
```
A configuration for running an eval. This includes anything needed to run the eval on a dataset, such as the prompt, model, and thresholds.

An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
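A minimal sketch of a g_eval config. validate_properties requires eval_steps (a list) and optionally a string task_description; the name, model name, provider ID, and step text below are hypothetical:

```python
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

config = EvalConfig(
    name="Toxicity G-Eval",   # hypothetical name (subject to NAME_FIELD constraints)
    model_name="gpt_4o",      # hypothetical model name
    model_provider="openai",  # hypothetical provider ID
    config_type=EvalConfigType.g_eval,
    properties={
        "eval_steps": [
            "Read the task input and the model output.",
            "Check the output for toxic or harmful language.",
            "Assign each score defined by the parent eval.",
        ],
        "task_description": "Answer user questions about cooking.",
    },
)
```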
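validate_json_serializable means everything in properties must survive json.dumps. A sketch of the failure mode (names are hypothetical):

```python
import pydantic

from kiln_ai.datamodel.eval import EvalConfig

try:
    EvalConfig(
        name="Bad Config",        # hypothetical name
        model_name="gpt_4o",      # hypothetical model name
        model_provider="openai",  # hypothetical provider ID
        properties={
            "eval_steps": ["Check the output."],
            "tags": {"a", "b"},  # a set is not JSON serializable
        },
    )
except pydantic.ValidationError as e:
    print(e)  # "Properties must be JSON serializable: ..."
```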
```python
class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    name: str = NAME_FIELD
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self
```
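A minimal sketch of defining an Eval with its output scores and dataset filters. The name and DatasetFilterId values are hypothetical placeholders; valid filter IDs are defined by kiln_ai.datamodel.dataset_filters, not by this module:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore, EvalTemplateId

toxicity_eval = Eval(
    name="Toxicity Eval",                  # hypothetical name
    description="Checks outputs for toxic language.",
    template=EvalTemplateId.toxicity,
    eval_set_filter_id="tag::eval_set",    # hypothetical filter ID
    eval_configs_filter_id="tag::golden",  # hypothetical filter ID
    output_scores=[
        EvalOutputScore(name="Toxicity", type=TaskOutputRatingType.pass_fail),
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
)
```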
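Eval.validate_scores (shown in the class source above) also rejects two scores that collapse to the same JSON key, assuming string_to_json_key lowercases and underscores names as the json_key docstring example indicates:

```python
import pydantic

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore

try:
    Eval(
        name="Duplicate Scores",               # hypothetical name
        eval_set_filter_id="tag::eval_set",    # hypothetical filter ID
        eval_configs_filter_id="tag::golden",  # hypothetical filter ID
        output_scores=[
            EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
            EvalOutputScore(name="overall rating", type=TaskOutputRatingType.pass_fail),
        ],
    )
except pydantic.ValidationError as e:
    print(e)  # "output_scores must have unique names (once transformed to JSON keys)"
```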