kiln_ai.datamodel.eval
import json
from enum import Enum
from threading import Lock
from typing import TYPE_CHECKING, Any, Dict, List, Union

from pydantic import BaseModel, Field, model_validator
from typing_extensions import Self

from kiln_ai.datamodel.basemodel import (
    ID_TYPE,
    FilenameString,
    KilnParentedModel,
    KilnParentModel,
)
from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.dataset_filters import DatasetFilterId
from kiln_ai.datamodel.json_schema import string_to_json_key
from kiln_ai.datamodel.task_run import Usage
from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error

if TYPE_CHECKING:
    from kiln_ai.datamodel.task import Task

EvalScores = Dict[str, float]

# Module-level set to track evals currently being migrated (to prevent recursion)
# Protected by _migration_lock to ensure thread-safe access
_migration_lock = Lock()
_currently_migrating_eval_ids: set[ID_TYPE] = set()


class EvalTemplateId(str, Enum):
    """
    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
    """

    kiln_requirements = "kiln_requirements"
    issue = "kiln_issue"
    tool_call = "tool_call"
    toxicity = "toxicity"
    bias = "bias"
    maliciousness = "maliciousness"
    factual_correctness = "factual_correctness"
    jailbreak = "jailbreak"
    rag = "rag"


class EvalConfigType(str, Enum):
    g_eval = "g_eval"
    llm_as_judge = "llm_as_judge"


class EvalOutputScore(BaseModel):
    """
    A definition of a score that an evaluator will produce.

    Very similar to TaskRequirement, but conceptually different, so kept in a separate model.
    """

    name: str = Field(
        description="The name of the score. Will be provided to the model so use a descriptive name. Should align to the model's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
    )
    instruction: str | None = Field(
        default=None,
        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
    )
    type: TaskOutputRatingType = Field(
        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical')."
    )

    def json_key(self) -> str:
        """
        The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

        For example, "Overall Rating" -> "overall_rating"
        """
        return string_to_json_key(self.name)

    @model_validator(mode="after")
    def validate_type(self) -> Self:
        if self.type == TaskOutputRatingType.custom:
            raise ValueError(
                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
            )
        return self


class EvalRun(KilnParentedModel):
    """
    The results of running an eval on a single dataset item.

    This is a child of an EvalConfig, which specifies how the scores were generated.

    Eval runs can be one of 2 types:
    1) eval_config_eval=False: we were evaluating a task run (a method of running the task). We get the task input from the dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
    2) eval_config_eval=True: we were evaluating an eval config (a method of evaluating the task). We used the existing dataset item input/output, and ran the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
    """

    dataset_id: ID_TYPE = Field(
        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
    )
    task_run_config_id: ID_TYPE | None = Field(
        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
    )
    eval_config_eval: bool = Field(
        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
        default=False,
    )
    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
    input: str = Field(
        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
    )
    output: str = Field(
        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
    )
    reference_answer: str | None = Field(
        default=None,
        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
    )
    intermediate_outputs: Dict[str, str] | None = Field(
        default=None,
        description="The intermediate outputs of the task (for example, eval thinking).",
    )
    task_run_trace: str | None = Field(
        default=None,
        description="The JSON formatted trace of the task run that produced the output.",
    )
    scores: EvalScores = Field(
        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
    )
    task_run_usage: Usage | None = Field(
        default=None,
        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
    )

    def parent_eval_config(self) -> Union["EvalConfig", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
            raise ValueError("parent must be an EvalConfig")
        return self.parent  # type: ignore

    @model_validator(mode="after")
    def validate_output_fields(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            evaluation_data_type == EvalDataType.final_answer
            and self.task_run_trace is not None
        ):
            raise ValueError("final_answer runs should not set trace")
        elif (
            not self.eval_config_eval
            and evaluation_data_type == EvalDataType.full_trace
            and self.task_run_trace is None
        ):
            raise ValueError("full_trace task run eval runs should include trace")

        return self

    @model_validator(mode="after")
    def validate_eval_run_types(self) -> Self:
        if self.eval_config_eval and self.task_run_config_id is not None:
            raise ValueError(
                "task_run_config_id must be None if eval_config_eval is true"
            )
        if not self.eval_config_eval and self.task_run_config_id is None:
            raise ValueError(
                "task_run_config_id must be set if eval_config_eval is false"
            )
        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        # We're checking the scores have the expected keys from the grand-parent eval
        if self.scores is None or len(self.scores) == 0:
            raise ValueError("scores are required, and must have at least one score.")

        parent_eval_config = self.parent_eval_config()
        eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        output_score_keys = [score.json_key() for score in eval.output_scores]
        if set(output_score_keys) != set(self.scores.keys()):
            raise ValueError(
                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
            )

        # Check that each score is expected in this eval and the correct type
        for output_score in eval.output_scores:
            match output_score.type:
                case TaskOutputRatingType.five_star:
                    five_star_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(five_star_score, float)
                        or five_star_score < 1.0
                        or five_star_score > 5.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
                        )
                case TaskOutputRatingType.pass_fail:
                    pass_fail_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_score, float)
                        or pass_fail_score < 0.0
                        or pass_fail_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
                        )
                case TaskOutputRatingType.pass_fail_critical:
                    pass_fail_critical_score = self.scores[output_score.json_key()]
                    if (
                        not isinstance(pass_fail_critical_score, float)
                        or pass_fail_critical_score < -1.0
                        or pass_fail_critical_score > 1.0
                    ):
                        raise ValueError(
                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
                        )
                case TaskOutputRatingType.custom:
                    raise ValueError(
                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
                    )
                case _:
                    # Catch missing cases
                    raise_exhaustive_enum_error(output_score.type)
        return self

    @model_validator(mode="after")
    def validate_reference_answer(self) -> Self:
        parent_eval_config = self.parent_eval_config()
        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
        if not parent_eval:
            # Can't validate without the grand-parent eval, allow it to be validated later
            return self

        evaluation_data_type = parent_eval.evaluation_data_type
        if (
            self.reference_answer is not None
            and evaluation_data_type != EvalDataType.reference_answer
        ):
            raise ValueError(
                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
            )
        return self


class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
    """
    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

    An eval might have many configs, for example running the same eval with 2 different models. Comparing eval results is only valid within the scope of the same config.
    """

    name: FilenameString = Field(description="The name of the eval config.")
    model_name: str = Field(
        description="The name of the model to use for this eval config.",
    )
    model_provider: str = Field(
        description="The provider of the model to use for this eval config.",
    )
    config_type: EvalConfigType = Field(
        default=EvalConfigType.g_eval,
        description="This is used to determine the type of eval to run.",
    )
    properties: dict[str, Any] = Field(
        default={},
        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
    )

    def parent_eval(self) -> Union["Eval", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
            raise ValueError("parent must be an Eval")
        return self.parent  # type: ignore

    def runs(self, readonly: bool = False) -> list[EvalRun]:
        return super().runs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def validate_properties(self) -> Self:
        if (
            self.config_type == EvalConfigType.g_eval
            or self.config_type == EvalConfigType.llm_as_judge
        ):
            if "eval_steps" not in self.properties or not isinstance(
                self.properties["eval_steps"], list
            ):
                raise ValueError("eval_steps is required and must be a list for g_eval")
            if "task_description" in self.properties and not isinstance(
                self.properties["task_description"], str
            ):
                raise ValueError(
                    "task_description is optional, but if provided must be a string"
                )
            return self
        else:
            raise ValueError(f"Invalid eval config type: {self.config_type}")

    @model_validator(mode="after")
    def validate_json_serializable(self) -> "EvalConfig":
        try:
            # This will raise a TypeError if the dict contains non-JSON-serializable objects
            json.dumps(self.properties)
        except TypeError as e:
            raise ValueError(f"Properties must be JSON serializable: {e!s}")
        return self


class EvalDataType(str, Enum):
    final_answer = "final_answer"
    full_trace = "full_trace"
    reference_answer = "reference_answer"


class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
    name: FilenameString = Field(description="The name of the eval.")
    description: str | None = Field(
        default=None, description="The description of the eval"
    )
    template: EvalTemplateId | None = Field(
        default=None,
        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
    )
    current_config_id: ID_TYPE = Field(
        default=None,
        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
    )
    eval_set_filter_id: DatasetFilterId = Field(
        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id."
    )
    eval_configs_filter_id: DatasetFilterId | None = Field(
        default=None,
        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
    )
    output_scores: List[EvalOutputScore] = Field(
        description="The scores this evaluator should produce."
    )
    favourite: bool = Field(
        default=False,
        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
    )
    template_properties: dict[str, str | int | bool | float] = Field(
        default={},
        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
    )
    evaluation_data_type: EvalDataType = Field(
        default=EvalDataType.final_answer,
        description="The output of the task run to evaluate. Can be final answer or full trace.",
    )

    # Workaround to return typed parent without importing Task
    def parent_task(self) -> Union["Task", None]:
        if self.parent is not None and self.parent.__class__.__name__ != "Task":
            raise ValueError("parent must be a Task")
        return self.parent  # type: ignore

    def configs(self, readonly: bool = False) -> list[EvalConfig]:
        return super().configs(readonly=readonly)  # type: ignore

    @model_validator(mode="after")
    def upgrade_old_reference_answer_eval_config(self) -> Self:
        """
        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

        For reference_answer evals that don't have a current_config_id set, this migration
        will set the first config (by created_at) as the default.
        """
        if self.id is None:
            return self

        # Only run during file loading
        if not self._loaded_from_file:
            return self

        # Skip if already migrated (has a current_config_id set)
        if self.current_config_id is not None:
            return self

        # Only migrate reference_answer evals
        if self.evaluation_data_type != EvalDataType.reference_answer:
            return self

        # Prevent recursion: self.configs() loads child files, which re-loads this parent
        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
        # This causes the validator to run again, creating an infinite loop without this guard.
        with _migration_lock:
            if self.id in _currently_migrating_eval_ids:
                return self
            _currently_migrating_eval_ids.add(self.id)

        try:
            # Get the configs - these are loaded from child files
            configs_list = self.configs(readonly=True)
            if configs_list and len(configs_list) > 0:
                # Sort by created_at to get the oldest (first created) config
                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
                self.current_config_id = sorted_configs[0].id
        finally:
            with _migration_lock:
                _currently_migrating_eval_ids.discard(self.id)

        return self

    @model_validator(mode="after")
    def validate_scores(self) -> Self:
        if self.output_scores is None or len(self.output_scores) == 0:
            raise ValueError(
                "output_scores are required, and must have at least one score."
            )

        # check for duplicate names (once transformed to JSON keys)
        output_score_keys = [score.json_key() for score in self.output_scores]
        if len(output_score_keys) != len(set(output_score_keys)):
            raise ValueError(
                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
            )
        return self

    @model_validator(mode="after")
    def validate_template_properties(self) -> Self:
        # eval_configs_filter_id is required for all templates except "rag"
        if (
            self.template is not EvalTemplateId.rag
            and self.eval_configs_filter_id is None
        ):
            raise ValueError(
                "eval_configs_filter_id is required for all templates except 'rag'"
            )

        # Check for properties that are required for the issue template
        if self.template == EvalTemplateId.issue:
            if "issue_prompt" not in self.template_properties or not isinstance(
                self.template_properties["issue_prompt"], str
            ):
                raise ValueError("issue_prompt is required for issue template")
            if "failure_example" in self.template_properties and not isinstance(
                self.template_properties["failure_example"], str
            ):
                raise ValueError(
                    "failure_example is optional for issue template, but if provided must be a string"
                )
            if "pass_example" in self.template_properties and not isinstance(
                self.template_properties["pass_example"], str
            ):
                raise ValueError(
                    "pass_example is optional for issue template, but if provided must be a string"
                )

        if self.template == EvalTemplateId.tool_call:
            if self.evaluation_data_type != EvalDataType.full_trace:
                raise ValueError(
                    "tool_call template should have evaluation_data_type set to full_trace"
                )
            if (
                "tool" not in self.template_properties
                or not isinstance(self.template_properties["tool"], str)
                or not self.template_properties["tool"].strip()
            ):
                raise ValueError("tool is required for tool call template")
            if "tool_function_name" not in self.template_properties or not isinstance(
                self.template_properties["tool_function_name"], str
            ):
                raise ValueError(
                    "tool_function_name is required for tool call template"
                )
            if (
                "appropriate_tool_use_guidelines" not in self.template_properties
                or not isinstance(
                    self.template_properties["appropriate_tool_use_guidelines"], str
                )
                or not self.template_properties[
                    "appropriate_tool_use_guidelines"
                ].strip()
            ):
                raise ValueError(
                    "appropriate_tool_use_guidelines is required for tool call template"
                )
            if (
                "inappropriate_tool_use_guidelines" in self.template_properties
                and not isinstance(
                    self.template_properties["inappropriate_tool_use_guidelines"], str
                )
            ):
                raise ValueError(
                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
                )
        return self
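A minimal usage sketch for EvalOutputScore, assuming kiln_ai is importable; the score names and instructions below are illustrative, not taken from the Kiln docs. It shows json_key() turning a display name into the JSON field the judge must emit, and that the custom rating type is rejected by validate_type.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import EvalOutputScore

# Illustrative score definitions.
overall = EvalOutputScore(
    name="Overall Rating",
    instruction="Rate the overall quality of the answer.",
    type=TaskOutputRatingType.five_star,
)
toxicity = EvalOutputScore(
    name="Toxicity",
    instruction="Fail if the answer contains any toxic content.",
    type=TaskOutputRatingType.pass_fail,
)

print(overall.json_key())   # "overall_rating"
print(toxicity.json_key())  # "toxicity"

# A 'custom' rating type raises a ValueError in validate_type:
# EvalOutputScore(name="Nuance", type=TaskOutputRatingType.custom)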
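A sketch of the two EvalRun modes described in the EvalRun docstring, assuming the Kiln base model accepts unparented construction with defaults for id/created_at. The IDs, input/output strings, and score values are hypothetical placeholders. Because no parent EvalConfig is attached, only the standalone checks apply here (for example the task_run_config_id / eval_config_eval consistency check); score keys are validated against the grand-parent Eval's output_scores once the run is parented.

from kiln_ai.datamodel.eval import EvalRun

# Mode 1 (eval_config_eval=False, the default): the task was run with a
# TaskRunConfig and its output evaluated, so task_run_config_id must be set.
task_run_eval = EvalRun(
    dataset_id="111111111111",          # hypothetical dataset item ID
    task_run_config_id="222222222222",  # hypothetical TaskRunConfig ID
    input='{"question": "What is 2 + 2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 5.0},
)

# Mode 2 (eval_config_eval=True): the eval config itself was evaluated against an
# existing dataset item's input/output, so task_run_config_id must be None.
config_eval = EvalRun(
    dataset_id="111111111111",
    task_run_config_id=None,
    eval_config_eval=True,
    input='{"question": "What is 2 + 2?"}',
    output='{"answer": "4"}',
    scores={"overall_rating": 4.0},
)

# Setting both eval_config_eval=True and a task_run_config_id raises a ValueError.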
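EvalRun.validate_scores also checks that each score is a float within the range implied by its rating type. A small standalone helper mirroring those ranges, purely for illustration (not part of the Kiln API):

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType

# Inclusive ranges enforced by EvalRun.validate_scores, per rating type.
# The 'custom' type is rejected outright, so it has no range here.
SCORE_RANGES = {
    TaskOutputRatingType.five_star: (1.0, 5.0),
    TaskOutputRatingType.pass_fail: (0.0, 1.0),
    TaskOutputRatingType.pass_fail_critical: (-1.0, 1.0),
}

def score_in_range(rating_type: TaskOutputRatingType, value: float) -> bool:
    """True if value is a float inside the allowed range for this rating type."""
    low, high = SCORE_RANGES[rating_type]
    return isinstance(value, float) and low <= value <= high

assert score_in_range(TaskOutputRatingType.five_star, 4.0)
assert not score_in_range(TaskOutputRatingType.pass_fail, 1.5)
assert score_in_range(TaskOutputRatingType.pass_fail_critical, -1.0)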
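A sketch of an EvalConfig for the g_eval / llm_as_judge config types, which require an eval_steps list (and optionally a task_description string) in properties. The name, model name, and provider strings are illustrative; substitute values valid for your setup.

from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

judge_config = EvalConfig(
    name="g_eval_example",      # illustrative name
    model_name="gpt_4o",        # illustrative model name string
    model_provider="openai",    # illustrative provider string
    config_type=EvalConfigType.g_eval,
    properties={
        "eval_steps": [
            "Read the task input and the model's output.",
            "Check the output against each output score's instruction.",
            "Return one value per score, using the requested JSON keys.",
        ],
        "task_description": "Answer arithmetic questions concisely.",  # optional
    },
)

# properties must stay JSON-serializable; e.g. a set() value fails validation.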
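A sketch of a complete Eval definition. The dataset filter IDs below are hypothetical placeholders; valid DatasetFilterId values are defined in kiln_ai.datamodel.dataset_filters. Note that eval_configs_filter_id is required for every template except rag, including untemplated evals.

from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
from kiln_ai.datamodel.eval import Eval, EvalDataType, EvalOutputScore

overall_quality = Eval(
    name="overall_quality",
    description="Scores final answers for overall quality.",
    output_scores=[
        EvalOutputScore(name="Overall Rating", type=TaskOutputRatingType.five_star),
    ],
    # Placeholder filter IDs -- substitute real DatasetFilterId values.
    eval_set_filter_id="tag::eval_set",
    eval_configs_filter_id="tag::golden",
    evaluation_data_type=EvalDataType.final_answer,
)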
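The issue and tool_call templates require specific template_properties keys, enforced by Eval.validate_template_properties. The dictionaries below sketch the accepted shapes; every string value is illustrative.

# Issue template: issue_prompt is required; failure_example / pass_example are
# optional but must be strings if present.
issue_template_properties = {
    "issue_prompt": "The model sometimes replies in the wrong language.",
    "failure_example": "Q: Hello?  A: Bonjour !",
    "pass_example": "Q: Hello?  A: Hello! How can I help?",
}

# Tool-call template: tool, tool_function_name, and appropriate_tool_use_guidelines
# are required (non-empty); inappropriate_tool_use_guidelines is optional.
# The eval must also set evaluation_data_type=EvalDataType.full_trace.
tool_call_template_properties = {
    "tool": "Weather lookup",
    "tool_function_name": "get_weather",
    "appropriate_tool_use_guidelines": "Call the tool whenever the user asks about current weather.",
    "inappropriate_tool_use_guidelines": "Do not call the tool for historical climate questions.",
}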