kiln_ai.datamodel.eval
import json
from enum import Enum
from threading import Lock
from typing import TYPE_CHECKING, Any, Dict, List, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    FilenameStringShort,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.dataset_filters import DatasetFilterId
from kiln_ai.datamodel.json_schema import string_to_json_key
from kiln_ai.datamodel.task_run import Usage
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

if TYPE_CHECKING:
    from kiln_ai.datamodel.spec import Spec
    from kiln_ai.datamodel.task import Task

EvalScores = Dict[str, float]

# Module-level set to track evals currently being migrated (to prevent recursion)
# Protected by _migration_lock to ensure thread-safe access
_migration_lock = Lock()
_currently_migrating_eval_ids: set[ID_TYPE] = set()


class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    desired_behaviour = "desired_behaviour"
    issue = "kiln_issue"
    tool_call = "tool_call"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
    rag = "rag"


class EvalConfigType(str, Enum):
    """The type of eval configuration, determining how scores are generated."""

    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"


class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
    """

    name: FilenameStringShort = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self


class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item (dataset_id), run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run is evaluating the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    reference_answer: str | None = Field(
        default=None,
        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (for example, eval thinking).",
    )
    task_run_trace: str | None = Field(
        default=None,
        description="The JSON formatted trace of the task run that produced the output.",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_output_fields(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            evaluation_data_type == EvalDataType.final_answer
            and self.task_run_trace is not None
        ):
            raise ValueError("final_answer runs should not set trace")
        elif (
            not self.eval_config_eval
            and evaluation_data_type == EvalDataType.full_trace
            and self.task_run_trace is None
        ):
            raise ValueError("full_trace task run eval runs should include trace")

        return self

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self

    @model_validator(mode="after")
    def validate_reference_answer(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            self.reference_answer is not None
            and evaluation_data_type != EvalDataType.reference_answer
        ):
            raise ValueError(
                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
            )
        return self


class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config.",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {e!s}")
        return self


class EvalDataType(str, Enum):
    """The type of task output data to evaluate."""

    final_answer = "final_answer"
    full_trace = "full_trace"
    reference_answer = "reference_answer"


class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    """An evaluator definition that specifies what to evaluate and how scores should be produced."""

    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
    )
    train_set_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] | None = Field(
        default=None,
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )
    evaluation_data_type: EvalDataType = Field(
        default=EvalDataType.final_answer,
        description="The output of the task run to evaluate. Can be final answer, full trace, or reference answer.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    # Workaround to return typed parent without importing Spec
    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
        """
        Get the spec associated with this eval, if any.
        Returns None for legacy evals that are not associated with a spec.
        """

        task = self.parent_task()
        if not task or not self.id:
            return None

        specs = task.specs(readonly=readonly)
        for spec in specs:
            if spec.eval_id == self.id:
                return spec
        return None

    @model_validator(mode="after")
    def upgrade_old_reference_answer_eval_config(self) -> Self:
        """
        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

        For reference_answer evals that don't have a current_config_id set, this migration
        will set the first config (by created_at) as the default.
        """
        if self.id is None:
            return self

        # Only run during file loading
        if not self._loaded_from_file:
            return self

        # Skip if already migrated (has a current_config_id set)
        if self.current_config_id is not None:
            return self

        # Only migrate reference_answer evals
        if self.evaluation_data_type != EvalDataType.reference_answer:
            return self

        # Prevent recursion: self.configs() loads child files, which re-loads this parent
        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
        # This causes the validator to run again, creating an infinite loop without this guard.
        with _migration_lock:
            if self.id in _currently_migrating_eval_ids:
                return self
            _currently_migrating_eval_ids.add(self.id)

        try:
            # Get the configs - these are loaded from child files
            configs_list = self.configs(readonly=True)
            if configs_list and len(configs_list) > 0:
                # Sort by created_at to get the oldest (first created) config
                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
                self.current_config_id = sorted_configs[0].id
        finally:
            with _migration_lock:
                _currently_migrating_eval_ids.discard(self.id)

        return self

    @model_validator(mode="after")
    def migrate_train_set_filter_id(self) -> Self:
        """
        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

        Generates a tag-based filter ID from the eval name following the convention
        used by spec-based evals (e.g., "train_{name_slug}").
        """
        if self.id is None:
            return self

        if not self._loaded_from_file:
            return self

        if self.train_set_filter_id is not None:
            return self

        tag_suffix = self.name.lower().replace(" ", "_")
        self.train_set_filter_id = f"tag::train_{tag_suffix}"
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # eval_configs_filter_id is required for all templates except "rag"
        if (
            self.template is not EvalTemplateId.rag
            and self.eval_configs_filter_id is None
        ):
            raise ValueError(
                "eval_configs_filter_id is required for all templates except 'rag'"
            )

        # For spec-based evals, template_properties will be None and validation happens in the spec
        # For legacy evals, template_properties contains the data and we validate here
        if self.template_properties is None:
            return self

        # Check for properties that are required for the issue template (legacy evals only)
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )

        if self.template == EvalTemplateId.tool_call:
            if self.evaluation_data_type != EvalDataType.full_trace:
                raise ValueError(
                    "tool_call template should have evaluation_data_type set to full_trace"
                )
            if (
                "tool" not in self.template_properties
                or not isinstance(self.template_properties["tool"], str)
                or not self.template_properties["tool"].strip()
            ):
                raise ValueError("tool is required for tool call template")
            if "tool_function_name" not in self.template_properties or not isinstance(
                self.template_properties["tool_function_name"], str
            ):
                raise ValueError(
                    "tool_function_name is required for tool call template"
                )
            if (
                "appropriate_tool_use_guidelines" not in self.template_properties
                or not isinstance(
                    self.template_properties["appropriate_tool_use_guidelines"], str
                )
                or not self.template_properties[
                    "appropriate_tool_use_guidelines"
                ].strip()
            ):
                raise ValueError(
                    "appropriate_tool_use_guidelines is required for tool call template"
                )
            if (
                "inappropriate_tool_use_guidelines" in self.template_properties
                and not isinstance(
                    self.template_properties["inappropriate_tool_use_guidelines"], str
                )
            ):
                raise ValueError(
                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
                )
        return self
class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    desired_behaviour = "desired_behaviour"
    issue = "kiln_issue"
    tool_call = "tool_call"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
    rag = "rag"
An eval template is a pre-defined eval that can be used as a starting point for a new eval.
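Because EvalTemplateId is a string-valued enum, members compare and serialize as plain strings. A minimal sketch; note that a member's name and value can differ:

from kiln_ai.datamodel.eval import EvalTemplateId

# String-valued enum: members compare equal to their string values.
assert EvalTemplateId.toxicity == "toxicity"
# The member name and serialized value can differ: `issue` maps to "kiln_issue".
assert EvalTemplateId.issue.value == "kiln_issue"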
class EvalConfigType(str, Enum):
    """The type of eval configuration, determining how scores are generated."""

    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"
The type of eval configuration, determining how scores are generated.
class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
    """

    name: FilenameStringShort = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self
A definition of a score that an evaluator will produce.
Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
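A minimal sketch of defining a score; the name and instruction values are hypothetical:

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

# Hypothetical score; TaskOutputRatingType.custom would be rejected by validate_type.
score = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",
    type=TaskOutputRatingType.five_star,
)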
def json_key(self) -> str:
    """
    The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

    For example, "Overall Rating" -> "overall_rating"
    """
    return string_to_json_key(self.name)
The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.
For example, "Overall Rating" -> "overall_rating"
class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item (dataset_id), run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run is evaluating the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    reference_answer: str | None = Field(
        default=None,
        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (for example, eval thinking).",
    )
    task_run_trace: str | None = Field(
        default=None,
        description="The JSON formatted trace of the task run that produced the output.",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_output_fields(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            evaluation_data_type == EvalDataType.final_answer
            and self.task_run_trace is not None
        ):
            raise ValueError("final_answer runs should not set trace")
        elif (
            not self.eval_config_eval
            and evaluation_data_type == EvalDataType.full_trace
            and self.task_run_trace is None
        ):
            raise ValueError("full_trace task run eval runs should include trace")

        return self

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self

    @model_validator(mode="after")
    def validate_reference_answer(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            self.reference_answer is not None
            and evaluation_data_type != EvalDataType.reference_answer
        ):
            raise ValueError(
                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
            )
        return self
The results of running an eval on a single dataset item.
This is a child of an EvalConfig, which specifies how the scores were generated.
Eval runs can be one of 2 types:

1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from the dataset item (dataset_id), run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
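A minimal sketch of a type 1 (task run) eval run, with hypothetical IDs and values. With no parent EvalConfig attached, the score keys are not yet checked against an Eval's output scores:

from kiln_ai.datamodel.eval import EvalRun

run = EvalRun(
    dataset_id="123456789012",        # hypothetical dataset item ID
    task_run_config_id="210987654321",  # required because eval_config_eval=False
    eval_config_eval=False,
    input="What is the capital of France?",
    output="Paris",
    scores={"overall_rating": 4.0},
)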
@model_validator(mode="after")
def validate_output_fields(self) -> Self:
    parent_eval_config = self.parent_eval_config()
    parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
    if not parent_eval:
        return self

    evaluation_data_type = parent_eval.evaluation_data_type
    if (
        evaluation_data_type == EvalDataType.final_answer
        and self.task_run_trace is not None
    ):
        raise ValueError("final_answer runs should not set trace")
    elif (
        not self.eval_config_eval
        and evaluation_data_type == EvalDataType.full_trace
        and self.task_run_trace is None
    ):
        raise ValueError("full_trace task run eval runs should include trace")

    return self
@model_validator(mode="after")
def validate_eval_run_types(self) -> Self:
    if self.eval_config_eval and self.task_run_config_id is not None:
        raise ValueError(
            "task_run_config_id must be None if eval_config_eval is true"
        )
    if not self.eval_config_eval and self.task_run_config_id is None:
        raise ValueError(
            "task_run_config_id must be set if eval_config_eval is false"
        )
    return self
@model_validator(mode="after")
def validate_scores(self) -> Self:
    # We're checking the scores have the expected keys from the grand-parent eval
    if self.scores is None or len(self.scores) == 0:
        raise ValueError("scores are required, and must have at least one score.")

    parent_eval_config = self.parent_eval_config()
    eval = parent_eval_config.parent_eval() if parent_eval_config else None
    if not eval:
        # Can't validate without the grand-parent eval, allow it to be validated later
        return self

    output_score_keys = [score.json_key() for score in eval.output_scores]
    if set(output_score_keys) != set(self.scores.keys()):
        raise ValueError(
            f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
        )

    # Check that each score is expected in this eval and the correct type
    for output_score in eval.output_scores:
        match output_score.type:
            case TaskOutputRatingType.five_star:
                five_star_score = self.scores[output_score.json_key()]
                if (
                    not isinstance(five_star_score, float)
                    or five_star_score < 1.0
                    or five_star_score > 5.0
                ):
                    raise ValueError(
                        f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                    )
            case TaskOutputRatingType.pass_fail:
                pass_fail_score = self.scores[output_score.json_key()]
                if (
                    not isinstance(pass_fail_score, float)
                    or pass_fail_score < 0.0
                    or pass_fail_score > 1.0
                ):
                    raise ValueError(
                        f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                    )
            case TaskOutputRatingType.pass_fail_critical:
                pass_fail_critical_score = self.scores[output_score.json_key()]
                if (
                    not isinstance(pass_fail_critical_score, float)
                    or pass_fail_critical_score < -1.0
                    or pass_fail_critical_score > 1.0
                ):
                    raise ValueError(
                        f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                    )
            case TaskOutputRatingType.custom:
                raise ValueError(
                    f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                )
            case _:
                # Catch missing cases
                raise_exhaustive_enum_error(output_score.type)
    return self
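For reference, the ranges enforced above, as a sketch with hypothetical keys (keys must match the parent Eval's output score JSON keys):

scores = {
    "overall_rating": 4.0,  # five_star: float in [1.0, 5.0]
    "passed": 1.0,          # pass_fail: float in [0.0, 1.0]
    "safety": -1.0,         # pass_fail_critical: float in [-1.0, 1.0]
}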
@model_validator(mode="after")
def validate_reference_answer(self) -> Self:
    parent_eval_config = self.parent_eval_config()
    parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
    if not parent_eval:
        # Can't validate without the grand-parent eval, allow it to be validated later
        return self

    evaluation_data_type = parent_eval.evaluation_data_type
    if (
        self.reference_answer is not None
        and evaluation_data_type != EvalDataType.reference_answer
    ):
        raise ValueError(
            f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
        )
    return self
class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config.",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {e!s}")
        return self
A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
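A minimal sketch; the name, model, and provider identifiers are hypothetical. Both g_eval and llm_as_judge require an "eval_steps" list in properties:

from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

config = EvalConfig(
    name="G-Eval Judge",   # hypothetical
    model_name="gpt_4_1",  # hypothetical model/provider identifiers
    model_provider="openai",
    config_type=EvalConfigType.g_eval,
    properties={
        "eval_steps": [
            "Check the answer for factual accuracy.",
            "Check the answer matches the requested tone.",
        ],
        "task_description": "Answer geography questions.",  # optional, must be a string
    },
)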
@model_validator(mode="after")
def validate_properties(self) -> Self:
    if (
        self.config_type == EvalConfigType.g_eval
        or self.config_type == EvalConfigType.llm_as_judge
    ):
        if "eval_steps" not in self.properties or not isinstance(
            self.properties["eval_steps"], list
        ):
            raise ValueError("eval_steps is required and must be a list for g_eval")
        if "task_description" in self.properties and not isinstance(
            self.properties["task_description"], str
        ):
            raise ValueError(
                "task_description is optional, but if provided must be a string"
            )
        return self
    else:
        raise ValueError(f"Invalid eval config type: {self.config_type}")
@model_validator(mode="after")
def validate_json_serializable(self) -> "EvalConfig":
    try:
        # This will raise a TypeError if the dict contains non-JSON-serializable objects
        json.dumps(self.properties)
    except TypeError as e:
        raise ValueError(f"Properties must be JSON serializable: {e!s}")
    return self
class EvalDataType(str, Enum):
    """The type of task output data to evaluate."""

    final_answer = "final_answer"
    full_trace = "full_trace"
    reference_answer = "reference_answer"
The type of task output data to evaluate.
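How each value interacts with EvalRun fields, per the validators above; a sketch:

from kiln_ai.datamodel.eval import EvalDataType

# final_answer: judge only the task's final output; runs must not set task_run_trace.
# full_trace: judge the whole run; task-run eval runs must include task_run_trace.
# reference_answer: judge against EvalRun.reference_answer, which only these evals may set.
data_type = EvalDataType.final_answer  # the default for Eval.evaluation_data_type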
class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    """An evaluator definition that specifies what to evaluate and how scores should be produced."""

    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
    )
    train_set_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] | None = Field(
        default=None,
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )
    evaluation_data_type: EvalDataType = Field(
        default=EvalDataType.final_answer,
        description="The output of the task run to evaluate. Can be final answer, full trace, or reference answer.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    # Workaround to return typed parent without importing Spec
    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
        """
        Get the spec associated with this eval, if any.
        Returns None for legacy evals that are not associated with a spec.
        """

        task = self.parent_task()
        if not task or not self.id:
            return None

        specs = task.specs(readonly=readonly)
        for spec in specs:
            if spec.eval_id == self.id:
                return spec
        return None

    @model_validator(mode="after")
    def upgrade_old_reference_answer_eval_config(self) -> Self:
        """
        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

        For reference_answer evals that don't have a current_config_id set, this migration
        will set the first config (by created_at) as the default.
        """
        if self.id is None:
            return self

        # Only run during file loading
        if not self._loaded_from_file:
            return self

        # Skip if already migrated (has a current_config_id set)
        if self.current_config_id is not None:
            return self

        # Only migrate reference_answer evals
        if self.evaluation_data_type != EvalDataType.reference_answer:
            return self

        # Prevent recursion: self.configs() loads child files, which re-loads this parent
        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
        # This causes the validator to run again, creating an infinite loop without this guard.
        with _migration_lock:
            if self.id in _currently_migrating_eval_ids:
                return self
            _currently_migrating_eval_ids.add(self.id)

        try:
            # Get the configs - these are loaded from child files
            configs_list = self.configs(readonly=True)
            if configs_list and len(configs_list) > 0:
                # Sort by created_at to get the oldest (first created) config
                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
                self.current_config_id = sorted_configs[0].id
        finally:
            with _migration_lock:
                _currently_migrating_eval_ids.discard(self.id)

        return self

    @model_validator(mode="after")
    def migrate_train_set_filter_id(self) -> Self:
        """
        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

        Generates a tag-based filter ID from the eval name following the convention
        used by spec-based evals (e.g., "train_{name_slug}").
        """
        if self.id is None:
            return self

        if not self._loaded_from_file:
            return self

        if self.train_set_filter_id is not None:
            return self

        tag_suffix = self.name.lower().replace(" ", "_")
        self.train_set_filter_id = f"tag::train_{tag_suffix}"
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # eval_configs_filter_id is required for all templates except "rag"
        if (
            self.template is not EvalTemplateId.rag
            and self.eval_configs_filter_id is None
        ):
            raise ValueError(
                "eval_configs_filter_id is required for all templates except 'rag'"
            )

        # For spec-based evals, template_properties will be None and validation happens in the spec
        # For legacy evals, template_properties contains the data and we validate here
        if self.template_properties is None:
            return self

        # Check for properties that are required for the issue template (legacy evals only)
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )

        if self.template == EvalTemplateId.tool_call:
            if self.evaluation_data_type != EvalDataType.full_trace:
                raise ValueError(
                    "tool_call template should have evaluation_data_type set to full_trace"
                )
            if (
                "tool" not in self.template_properties
                or not isinstance(self.template_properties["tool"], str)
                or not self.template_properties["tool"].strip()
            ):
                raise ValueError("tool is required for tool call template")
            if "tool_function_name" not in self.template_properties or not isinstance(
                self.template_properties["tool_function_name"], str
            ):
                raise ValueError(
                    "tool_function_name is required for tool call template"
                )
            if (
                "appropriate_tool_use_guidelines" not in self.template_properties
                or not isinstance(
                    self.template_properties["appropriate_tool_use_guidelines"], str
                )
                or not self.template_properties[
                    "appropriate_tool_use_guidelines"
                ].strip()
            ):
                raise ValueError(
                    "appropriate_tool_use_guidelines is required for tool call template"
                )
            if (
                "inappropriate_tool_use_guidelines" in self.template_properties
                and not isinstance(
                    self.template_properties["inappropriate_tool_use_guidelines"], str
                )
            ):
                raise ValueError(
                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
                )
        return self
An evaluator definition that specifies what to evaluate and how scores should be produced.
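A minimal sketch with hypothetical names and tag-based filter IDs. eval_configs_filter_id is provided because it is required whenever the template is not 'rag' (see validate_template_properties below):

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalOutputScore

evaluator = Eval(
    name="Answer Quality",               # hypothetical
    eval_set_filter_id="tag::eval_set",  # hypothetical tag-based filters
    eval_configs_filter_id="tag::golden",
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
)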
def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
    """
    Get the spec associated with this eval, if any.
    Returns None for legacy evals that are not associated with a spec.
    """

    task = self.parent_task()
    if not task or not self.id:
        return None

    specs = task.specs(readonly=readonly)
    for spec in specs:
        if spec.eval_id == self.id:
            return spec
    return None
Get the spec associated with this eval, if any. Returns None for legacy evals that are not associated with a spec.
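Hypothetical usage, continuing the sketch above; without a parent Task (or for legacy evals) it returns None:

spec = evaluator.associated_spec(readonly=True)
if spec is None:
    print("Legacy eval: no associated spec")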
@model_validator(mode="after")
def upgrade_old_reference_answer_eval_config(self) -> Self:
    """
    Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

    For reference_answer evals that don't have a current_config_id set, this migration
    will set the first config (by created_at) as the default.
    """
    if self.id is None:
        return self

    # Only run during file loading
    if not self._loaded_from_file:
        return self

    # Skip if already migrated (has a current_config_id set)
    if self.current_config_id is not None:
        return self

    # Only migrate reference_answer evals
    if self.evaluation_data_type != EvalDataType.reference_answer:
        return self

    # Prevent recursion: self.configs() loads child files, which re-loads this parent
    # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
    # This causes the validator to run again, creating an infinite loop without this guard.
    with _migration_lock:
        if self.id in _currently_migrating_eval_ids:
            return self
        _currently_migrating_eval_ids.add(self.id)

    try:
        # Get the configs - these are loaded from child files
        configs_list = self.configs(readonly=True)
        if configs_list and len(configs_list) > 0:
            # Sort by created_at to get the oldest (first created) config
            sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
            self.current_config_id = sorted_configs[0].id
    finally:
        with _migration_lock:
            _currently_migrating_eval_ids.discard(self.id)

    return self
Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
For reference_answer evals that don't have a current_config_id set, this migration will set the first config (by created_at) as the default.
@model_validator(mode="after")
def migrate_train_set_filter_id(self) -> Self:
    """
    Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

    Generates a tag-based filter ID from the eval name following the convention
    used by spec-based evals (e.g., "train_{name_slug}").
    """
    if self.id is None:
        return self

    if not self._loaded_from_file:
        return self

    if self.train_set_filter_id is not None:
        return self

    tag_suffix = self.name.lower().replace(" ", "_")
    self.train_set_filter_id = f"tag::train_{tag_suffix}"
    return self
Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.
Generates a tag-based filter ID from the eval name following the convention used by spec-based evals (e.g., "train_{name_slug}").
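A worked example of the slug rule in the code above (lowercase, spaces replaced with underscores):

name = "Answer Quality"
tag_suffix = name.lower().replace(" ", "_")
assert f"tag::train_{tag_suffix}" == "tag::train_answer_quality"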
@model_validator(mode="after")
def validate_scores(self) -> Self:
    if self.output_scores is None or len(self.output_scores) == 0:
        raise ValueError(
            "output_scores are required, and must have at least one score."
        )

    # check for duplicate names (once transformed to JSON keys)
    output_score_keys = [score.json_key() for score in self.output_scores]
    if len(output_score_keys) != len(set(output_score_keys)):
        raise ValueError(
            f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
        )
    return self
@model_validator(mode="after")
def validate_template_properties(self) -> Self:
    # eval_configs_filter_id is required for all templates except "rag"
    if (
        self.template is not EvalTemplateId.rag
        and self.eval_configs_filter_id is None
    ):
        raise ValueError(
            "eval_configs_filter_id is required for all templates except 'rag'"
        )

    # For spec-based evals, template_properties will be None and validation happens in the spec
    # For legacy evals, template_properties contains the data and we validate here
    if self.template_properties is None:
        return self

    # Check for properties that are required for the issue template (legacy evals only)
    if self.template == EvalTemplateId.issue:
        if "issue_prompt" not in self.template_properties or not isinstance(
            self.template_properties["issue_prompt"], str
        ):
            raise ValueError("issue_prompt is required for issue template")
        if "failure_example" in self.template_properties and not isinstance(
            self.template_properties["failure_example"], str
        ):
            raise ValueError(
                "failure_example is optional for issue template, but if provided must be a string"
            )
        if "pass_example" in self.template_properties and not isinstance(
            self.template_properties["pass_example"], str
        ):
            raise ValueError(
                "pass_example is optional for issue template, but if provided must be a string"
            )

    if self.template == EvalTemplateId.tool_call:
        if self.evaluation_data_type != EvalDataType.full_trace:
            raise ValueError(
                "tool_call template should have evaluation_data_type set to full_trace"
            )
        if (
            "tool" not in self.template_properties
            or not isinstance(self.template_properties["tool"], str)
            or not self.template_properties["tool"].strip()
        ):
            raise ValueError("tool is required for tool call template")
        if "tool_function_name" not in self.template_properties or not isinstance(
            self.template_properties["tool_function_name"], str
        ):
            raise ValueError(
                "tool_function_name is required for tool call template"
            )
        if (
            "appropriate_tool_use_guidelines" not in self.template_properties
            or not isinstance(
                self.template_properties["appropriate_tool_use_guidelines"], str
            )
            or not self.template_properties[
                "appropriate_tool_use_guidelines"
            ].strip()
        ):
            raise ValueError(
                "appropriate_tool_use_guidelines is required for tool call template"
            )
        if (
            "inappropriate_tool_use_guidelines" in self.template_properties
            and not isinstance(
                self.template_properties["inappropriate_tool_use_guidelines"], str
            )
        ):
            raise ValueError(
                "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
            )
    return self