kiln_ai.datamodel.eval
Module source:

```python
import json
from enum import Enum
from typing import TYPE_CHECKING, Any, Dict, List, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.dataset_filters import DatasetFilterId
from kiln_ai.datamodel.json_schema import string_to_json_key
from kiln_ai.datamodel.task_run import Usage
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

if TYPE_CHECKING:
    from kiln_ai.datamodel.task import Task

EvalScores = Dict[str, float]


class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    issue = "kiln_issue"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"


class EvalConfigType(str, Enum):
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"


class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different keeping in a separate models.
    """

    name: str = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self


class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (example, eval thinking).",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self


class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config. ",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {str(e)}")
        return self


class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    current_run_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the a run config which was selected as the best run config for this eval. The run config must belong to the parent Task.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id."
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] = Field(
        default={},
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # Check for properties that are required for the issue template
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )
        return self
```
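Two module-level symbols, EvalScores and EvalConfigType, do not get their own entries below. A minimal sketch of how they are used (the score keys and values here are illustrative, not defined by the module):

```python
from kiln_ai.datamodel.eval import EvalConfigType, EvalScores

# EvalScores is a plain type alias (Dict[str, float]). Keys are the json_key()
# of each EvalOutputScore defined on the parent Eval; values are numeric scores.
scores: EvalScores = {"overall_rating": 4.0, "conciseness": 1.0}  # illustrative keys

# EvalConfigType selects the evaluation strategy used by an EvalConfig.
print([t.value for t in EvalConfigType])  # ['g_eval', 'llm_as_judge']
```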
class EvalTemplateId(str, Enum)

An eval template is a pre-defined eval that can be used as a starting point for a new eval.
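A small usage sketch: templates are referenced by enum member, and the stored string value is not always identical to the member name (note that issue serializes as "kiln_issue"):

```python
from kiln_ai.datamodel.eval import EvalTemplateId

template = EvalTemplateId.issue
print(template.value)              # "kiln_issue" -- member name and value differ here
print(EvalTemplateId("toxicity"))  # look up a member from its stored string value
```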
class EvalOutputScore(BaseModel)

A definition of a score that an evaluator will produce.

Very similar to TaskRequirement, but conceptually different, so it is kept in a separate model.

def json_key(self) -> str

The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

For example, "Overall Rating" -> "overall_rating".
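A minimal sketch of defining scores and deriving their JSON keys. The score names and instructions are examples, not part of the module, and the second json_key() result shows the expected snake_case conversion rather than a documented guarantee:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

overall = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",  # written for the evaluator model
    type=TaskOutputRatingType.five_star,
)
print(overall.json_key())  # "overall_rating"

grounded = EvalOutputScore(name="Grounded in Context", type=TaskOutputRatingType.pass_fail)
print(grounded.json_key())  # expected: "grounded_in_context"

# TaskOutputRatingType.custom is rejected by validate_type and raises a ValueError.
```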
class EvalRun(KilnParentedModel)

The results of running an eval on a single dataset item.

This is a child of an EvalConfig, which specifies how the scores were generated.

Eval runs can be one of two types:

1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.

2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
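A sketch of constructing one run of each type, with placeholder ids. This assumes an EvalRun can be instantiated standalone (without a parent EvalConfig), in which case the key-matching check in validate_scores is deferred; the range rules still apply per rating type (five_star 1.0-5.0, pass_fail 0.0-1.0, pass_fail_critical -1.0-1.0):

```python
from kiln_ai.datamodel.eval import EvalRun

# Type 1: evaluating a task run -- task_run_config_id must be set.
task_run_eval = EvalRun(
    dataset_id="dataset-item-id",        # placeholder id
    task_run_config_id="run-config-id",  # placeholder id
    eval_config_eval=False,
    input='{"question": "What is 2+2?"}',  # JSON for structured input
    output='{"answer": "4"}',
    scores={"overall_rating": 4.0},  # keys must match the grand-parent Eval's output scores
)

# Type 2: evaluating the eval config itself -- task_run_config_id must be None.
config_eval = EvalRun(
    dataset_id="dataset-item-id",  # placeholder id
    task_run_config_id=None,
    eval_config_eval=True,
    input="plain text input",
    output="plain text output",
    scores={"overall_rating": 5.0},
)
```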
class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun})

A configuration for running an eval. This includes anything needed to run the eval on a dataset, like the prompt, model, thresholds, etc.

An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
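A sketch of a g_eval config whose properties satisfy validate_properties: eval_steps is required and must be a list, task_description is optional but must be a string, and the whole dict must be JSON-serializable. The name, model, and step text are placeholders, and attaching the config to a parent Eval is not shown:

```python
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

config = EvalConfig(
    name="g_eval_gpt_4o",     # placeholder name
    model_name="gpt_4o",      # placeholder model name
    model_provider="openai",  # placeholder provider name
    config_type=EvalConfigType.g_eval,
    properties={
        "eval_steps": [
            "Check whether the answer addresses the question.",
            "Check the answer for factual errors.",
        ],
        "task_description": "Answer customer support questions.",  # optional
    },
)
```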
class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig})

The eval definition itself: a child of a Task, and the parent of its EvalConfigs.
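A sketch of an Eval built from the kiln_issue template, showing the template_properties that validate_template_properties checks, output scores with unique JSON keys, and the two dataset filters. All string values are placeholders; in particular the filter ids are hypothetical, and valid DatasetFilterId values are defined in kiln_ai.datamodel.dataset_filters. Attaching the eval to its parent Task is not shown:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore, EvalTemplateId

issue_eval = Eval(
    name="refund_policy_issue",            # placeholder name
    description="Catches answers that promise refunds we don't offer.",
    template=EvalTemplateId.issue,
    eval_set_filter_id="tag::eval_set",    # hypothetical filter id
    eval_configs_filter_id="tag::golden",  # hypothetical filter id
    output_scores=[
        # Score names must be unique once converted to JSON keys.
        EvalOutputScore(name="Issue", type=TaskOutputRatingType.pass_fail),
    ],
    template_properties={
        "issue_prompt": "The model promises a refund.",                    # required for the issue template
        "failure_example": "Sure, I'll refund you right away.",            # optional, must be a string
        "pass_example": "I can't issue refunds, but here is our policy.",  # optional, must be a string
    },
)
```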