kiln_ai.datamodel.eval
```python
import json
from enum import Enum
from threading import Lock
from typing import TYPE_CHECKING, Any, Dict, List, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    FilenameStringShort,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.dataset_filters import DatasetFilterId
from kiln_ai.datamodel.json_schema import string_to_json_key
from kiln_ai.datamodel.task_run import Usage
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

if TYPE_CHECKING:
    from kiln_ai.datamodel.spec import Spec
    from kiln_ai.datamodel.task import Task

EvalScores = Dict[str, float]

# Module-level set to track evals currently being migrated (to prevent recursion)
# Protected by _migration_lock to ensure thread-safe access
_migration_lock = Lock()
_currently_migrating_eval_ids: set[ID_TYPE] = set()


class EvalConfigType(str, Enum):
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"
```
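EvalScores is a plain mapping from score JSON keys to float values. A minimal illustration; the key names here are hypothetical and would normally come from EvalOutputScore.json_key():

```python
from kiln_ai.datamodel.eval import EvalScores

# Keys are the json_key() of each EvalOutputScore; values are the numeric ratings.
scores: EvalScores = {"overall_rating": 4.0, "passes_guidelines": 1.0}
```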
```python
class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    desired_behaviour = "desired_behaviour"
    issue = "kiln_issue"
    tool_call = "tool_call"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
    rag = "rag"
```
An eval template is a pre-defined eval that can be used as a starting point for a new eval.
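Because the template ids are a str Enum, members compare and serialize as plain strings; note that the issue member serializes as "kiln_issue". A quick illustration:

```python
from kiln_ai.datamodel.eval import EvalTemplateId

assert EvalTemplateId.issue.value == "kiln_issue"  # member name and value differ here
assert EvalTemplateId("toxicity") is EvalTemplateId.toxicity  # lookup by string value
```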
```python
class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different keeping in a separate models.
    """

    name: FilenameStringShort = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with a LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self
```
A definition of a score that an evaluator will produce.
Very similar to TaskRequirement, but conceptually different, so it is kept as a separate model.
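A short sketch of defining a score and deriving its JSON key with json_key(). The five_star type and the "Overall Rating" to "overall_rating" mapping come straight from the class above; the example assumes the name passes the FilenameStringShort validation:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

score = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",
    type=TaskOutputRatingType.five_star,
)

# json_key() normalizes the display name for use as a JSON object key.
assert score.json_key() == "overall_rating"

# Custom rating types are rejected by validate_type:
# EvalOutputScore(name="X", type=TaskOutputRatingType.custom)  -> raises a validation error
```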
```python
class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then ran the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run to evaluate the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    reference_answer: str | None = Field(
        default=None,
        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (example, eval thinking).",
    )
    task_run_trace: str | None = Field(
        default=None,
        description="The JSON formatted trace of the task run that produced the output.",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_output_fields(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            evaluation_data_type == EvalDataType.final_answer
            and self.task_run_trace is not None
        ):
            raise ValueError("final_answer runs should not set trace")
        elif (
            not self.eval_config_eval
            and evaluation_data_type == EvalDataType.full_trace
            and self.task_run_trace is None
        ):
            raise ValueError("full_trace task run eval runs should include trace")

        return self

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self

    @model_validator(mode="after")
    def validate_reference_answer(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            self.reference_answer is not None
            and evaluation_data_type != EvalDataType.reference_answer
        ):
            raise ValueError(
                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
            )
        return self
```
The results of running an eval on a single dataset item.
This is a child of an EvalConfig, which specifies how the scores were generated.
Eval runs can be one of two types:
1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We take the task input from the dataset item (dataset_id), run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model are the input/output of the dataset item.
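A minimal sketch of constructing both shapes directly. The IDs and field values are hypothetical placeholders, and no parent EvalConfig is attached here, so the validators that need the grand-parent Eval (score keys, trace rules, reference answers) pass through without checking:

```python
from kiln_ai.datamodel.eval import EvalRun

# Type 1: evaluating a task run; task_run_config_id is required.
task_run_eval = EvalRun(
    dataset_id="dataset-item-id",        # placeholder ID
    task_run_config_id="run-config-id",  # placeholder ID
    input='{"question": "What is 2+2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 4.0},
)

# Type 2: evaluating the eval config itself; task_run_config_id must be None.
config_eval = EvalRun(
    dataset_id="dataset-item-id",
    task_run_config_id=None,
    eval_config_eval=True,
    input='{"question": "What is 2+2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 5.0},
)
```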
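validate_scores enforces a numeric range per rating type. The snippet below is not part of the library; it just restates the checks from the class above as data, for quick reference:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType

# Inclusive (min, max) ranges accepted for each rating type; custom is rejected outright.
SCORE_RANGES = {
    TaskOutputRatingType.five_star: (1.0, 5.0),
    TaskOutputRatingType.pass_fail: (0.0, 1.0),
    TaskOutputRatingType.pass_fail_critical: (-1.0, 1.0),
}


def score_in_range(rating_type: TaskOutputRatingType, value: float) -> bool:
    """Mirror of the range check in EvalRun.validate_scores (illustrative only)."""
    low, high = SCORE_RANGES[rating_type]
    return isinstance(value, float) and low <= value <= high
```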
```python
class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    A eval might have many configs, example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config. ",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {e!s}")
        return self
```
A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
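A minimal sketch of a g_eval config. The model, provider, and eval_steps values are hypothetical; validate_properties requires eval_steps to be a list (task_description is optional but must be a string), and validate_json_serializable requires the whole properties dict to survive json.dumps:

```python
from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

config = EvalConfig(
    name="G Eval Judge",
    model_name="gpt_4o",        # placeholder model/provider names
    model_provider="openai",
    config_type=EvalConfigType.g_eval,  # also the default
    properties={
        "eval_steps": [
            "Check whether the answer addresses the question.",
            "Check whether the answer is factually correct.",
        ],
        "task_description": "Answer math questions concisely.",  # optional
    },
)
```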
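Because the checks run inside pydantic validators, a non-serializable properties value surfaces as a pydantic ValidationError at construction time. A sketch, using a deliberately non-JSON-serializable set:

```python
import pydantic

from kiln_ai.datamodel.eval import EvalConfig

try:
    EvalConfig(
        name="Bad config",
        model_name="gpt_4o",
        model_provider="openai",
        properties={"eval_steps": ["step 1"], "tags": {"a", "b"}},  # sets are not JSON
    )
except pydantic.ValidationError as e:
    print(e)  # mentions "Properties must be JSON serializable"
```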
```python
class EvalDataType(str, Enum):
    final_answer = "final_answer"
    full_trace = "full_trace"
    reference_answer = "reference_answer"
```
```python
class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
    )
    train_set_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] | None = Field(
        default=None,
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )
    evaluation_data_type: EvalDataType = Field(
        default=EvalDataType.final_answer,
        description="The output of the task run to evaluate. Can be final answer or full trace.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    # Workaround to return typed parent without importing Spec
    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
        """
        Get the spec associated with this eval, if any.
        Returns None for legacy evals that are not associated with a spec.
        """

        task = self.parent_task()
        if not task or not self.id:
            return None

        specs = task.specs(readonly=readonly)
        for spec in specs:
            if spec.eval_id == self.id:
                return spec
        return None

    @model_validator(mode="after")
    def upgrade_old_reference_answer_eval_config(self) -> Self:
        """
        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

        For reference_answer evals that don't have a current_config_id set, this migration
        will set the first config (by created_at) as the default.
        """
        if self.id is None:
            return self

        # Only run during file loading
        if not self._loaded_from_file:
            return self

        # Skip if already migrated (has a current_config_id set)
        if self.current_config_id is not None:
            return self

        # Only migrate reference_answer evals
        if self.evaluation_data_type != EvalDataType.reference_answer:
            return self

        # Prevent recursion: self.configs() loads child files, which re-loads this parent
        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
        # This causes the validator to run again, creating an infinite loop without this guard.
        with _migration_lock:
            if self.id in _currently_migrating_eval_ids:
                return self
            _currently_migrating_eval_ids.add(self.id)

        try:
            # Get the configs - these are loaded from child files
            configs_list = self.configs(readonly=True)
            if configs_list and len(configs_list) > 0:
                # Sort by created_at to get the oldest (first created) config
                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
                self.current_config_id = sorted_configs[0].id
        finally:
            with _migration_lock:
                _currently_migrating_eval_ids.discard(self.id)

        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # eval_configs_filter_id is required for all templates except "rag"
        if (
            self.template is not EvalTemplateId.rag
            and self.eval_configs_filter_id is None
        ):
            raise ValueError(
                "eval_configs_filter_id is required for all templates except 'rag'"
            )

        # For spec-based evals, template_properties will be None and validation happens in the spec
        # For legacy evals, template_properties contains the data and we validate here
        if self.template_properties is None:
            return self

        # Check for properties that are required for the issue template (legacy evals only)
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )

        if self.template == EvalTemplateId.tool_call:
            if self.evaluation_data_type != EvalDataType.full_trace:
                raise ValueError(
                    "tool_call template should have evaluation_data_type set to full_trace"
                )
            if (
                "tool" not in self.template_properties
                or not isinstance(self.template_properties["tool"], str)
                or not self.template_properties["tool"].strip()
            ):
                raise ValueError("tool is required for tool call template")
            if "tool_function_name" not in self.template_properties or not isinstance(
                self.template_properties["tool_function_name"], str
            ):
                raise ValueError(
                    "tool_function_name is required for tool call template"
                )
            if (
                "appropriate_tool_use_guidelines" not in self.template_properties
                or not isinstance(
                    self.template_properties["appropriate_tool_use_guidelines"], str
                )
                or not self.template_properties[
                    "appropriate_tool_use_guidelines"
                ].strip()
            ):
                raise ValueError(
                    "appropriate_tool_use_guidelines is required for tool call template"
                )
            if (
                "inappropriate_tool_use_guidelines" in self.template_properties
                and not isinstance(
                    self.template_properties["inappropriate_tool_use_guidelines"], str
                )
            ):
                raise ValueError(
                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
                )
        return self
```
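A minimal sketch of a legacy (non-spec) eval. The filter ids are placeholders whose valid forms are defined by DatasetFilterId in kiln_ai.datamodel.dataset_filters, and note that validate_template_properties requires eval_configs_filter_id for every template except rag, including template=None:

```python
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore

quality_eval = Eval(
    name="Answer quality",
    description="Scores final answers for overall quality.",
    eval_set_filter_id="tag::eval_set",    # placeholder filter ids; valid values
    eval_configs_filter_id="tag::golden",  # come from DatasetFilterId
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
)

# evaluation_data_type defaults to final_answer; current_config_id starts as None
# and is typically pointed at an EvalConfig child once one exists.
```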
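For legacy evals, the template_properties keys are template-specific and checked by validate_template_properties; spec-based evals leave template_properties as None and validate in the Spec instead. A sketch of the shapes the issue and tool_call templates accept (all values here are hypothetical):

```python
# template=EvalTemplateId.issue
issue_properties = {
    "issue_prompt": "The model sometimes reveals the hidden answer key.",          # required str
    "failure_example": "An output that quotes the answer key verbatim.",           # optional str
    "pass_example": "An output that answers without exposing the answer key.",     # optional str
}

# template=EvalTemplateId.tool_call (also requires evaluation_data_type=full_trace)
tool_call_properties = {
    "tool": "Web Search",                                                          # required, non-empty
    "tool_function_name": "search_web",                                            # required str
    "appropriate_tool_use_guidelines": "Call the tool when fresh data is needed.", # required, non-empty
    "inappropriate_tool_use_guidelines": "Do not call the tool for arithmetic.",   # optional str
}
```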