kiln_ai.datamodel.eval

  1import json
  2from enum import Enum
  3from threading import Lock
  4from typing import TYPE_CHECKING, Any, Dict, List, Union
  5
  6from pydantic import BaseModel, Field, model_validator
  7from typing_extensions import Self
  8
  9from kiln_ai.datamodel.basemodel import (
 10    ID_TYPE,
 11    FilenameString,
 12    FilenameStringShort,
 13    KilnParentedModel,
 14    KilnParentModel,
 15)
 16from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
 17from kiln_ai.datamodel.dataset_filters import DatasetFilterId
 18from kiln_ai.datamodel.json_schema import string_to_json_key
 19from kiln_ai.datamodel.task_run import Usage
 20from kiln_ai.utils.exhaustive_error import raise_exhaustive_enum_error
 21
 22if TYPE_CHECKING:
 23    from kiln_ai.datamodel.spec import Spec
 24    from kiln_ai.datamodel.task import Task
 25
 26EvalScores = Dict[str, float]
 27
 28# Module-level set to track evals currently being migrated (to prevent recursion)
 29# Protected by _migration_lock to ensure thread-safe access
 30_migration_lock = Lock()
 31_currently_migrating_eval_ids: set[ID_TYPE] = set()
 32
 33
 34class EvalTemplateId(str, Enum):
 35    """
 36    An eval template is a pre-defined eval that can be used as a starting point for a new eval.
 37    """
 38
 39    kiln_requirements = "kiln_requirements"
 40    desired_behaviour = "desired_behaviour"
 41    issue = "kiln_issue"
 42    tool_call = "tool_call"
 43    toxicity = "toxicity"
 44    bias = "bias"
 45    maliciousness = "maliciousness"
 46    factual_correctness = "factual_correctness"
 47    jailbreak = "jailbreak"
 48    rag = "rag"
 49
 50
 51class EvalConfigType(str, Enum):
 52    """The type of eval configuration, determining how scores are generated."""
 53
 54    g_eval = "g_eval"
 55    llm_as_judge = "llm_as_judge"
 56
 57
 58class EvalOutputScore(BaseModel):
 59    """
 60    A definition of a score that an evaluator will produce.
 61
 62    Very similar to TaskRequirement, but conceptually distinct, so kept as a separate model.
 63    """
 64
 65    name: FilenameStringShort = Field(
 66        description="The name of the score. Will be provided to the model, so use a descriptive name. Should align with the task's TaskRequirement name if you want to use human evals to evaluate the evaluator's performance."
 67    )
 68    instruction: str | None = Field(
 69        default=None,
 70        description="A description of the score, used to help the model understand the goal of the score. Will be provided to evaluator models, so should be written for the model, not the team/user.",
 71    )
 72    type: TaskOutputRatingType = Field(
 73        description="The type of rating to use ('five_star', 'pass_fail', 'pass_fail_critical').",
 74    )
 75
 76    def json_key(self) -> str:
 77        """
 78        The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.
 79
 80        For example, "Overall Rating" -> "overall_rating"
 81        """
 82        return string_to_json_key(self.name)
 83
 84    @model_validator(mode="after")
 85    def validate_type(self) -> Self:
 86        if self.type == TaskOutputRatingType.custom:
 87            raise ValueError(
 88                f"Custom scores are not supported in evaluators. Score '{self.name}' was set to a custom score."
 89            )
 90        return self
 91
 92
 93class EvalRun(KilnParentedModel):
 94    """
 95    The results of running an eval on a single dataset item.
 96
 97    This is a child of an EvalConfig, which specifies how the scores were generated.
 98
 99    Eval runs can be one of two types:
100    1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
101    2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.
102    """
103
104    dataset_id: ID_TYPE = Field(
105        description="The ID of the dataset item that was used for this run. Must belong to the same Task as the grand-parent eval of this EvalRun."
106    )
107    task_run_config_id: ID_TYPE | None = Field(
108        description="The ID of the TaskRunConfig that was run, if this eval run was based on a task run. Must belong to the same Task as this eval. Can be None if this eval run is based on an eval config."
109    )
110    eval_config_eval: bool = Field(
111        description="Whether this eval run evaluates the parent eval config (evaluating the config using an existing dataset item). If true, task_run_config_id must be None, as we're not running the task.",
112        default=False,
113    )
114    # These two may duplicate the dataset_id.input/output, but we're denormalizing intentionally.
115    input: str = Field(
116        description="The input to the task. JSON formatted for structured input, plaintext for unstructured input."
117    )
118    output: str = Field(
119        description="The output of the task. JSON formatted for structured output, plaintext for unstructured output."
120    )
121    reference_answer: str | None = Field(
122        default=None,
123        description="The reference answer for the input. JSON formatted for structured reference answer, plaintext for unstructured reference answer. Used for reference answer evals.",
124    )
125    intermediate_outputs: Dict[str, str] | None = Field(
126        default=None,
127        description="The intermediate outputs of the task (for example, eval thinking).",
128    )
129    task_run_trace: str | None = Field(
130        default=None,
131        description="The JSON formatted trace of the task run that produced the output.",
132    )
133    scores: EvalScores = Field(
134        description="The output scores of the evaluator (aligning to those required by the grand-parent Eval this object is a child of)."
135    )
136    task_run_usage: Usage | None = Field(
137        default=None,
138        description="The usage of the task run that produced this eval run output (not the usage by the evaluation model).",
139    )
140
141    def parent_eval_config(self) -> Union["EvalConfig", None]:
142        if self.parent is not None and self.parent.__class__.__name__ != "EvalConfig":
143            raise ValueError("parent must be an EvalConfig")
144        return self.parent  # type: ignore
145
146    @model_validator(mode="after")
147    def validate_output_fields(self) -> Self:
148        parent_eval_config = self.parent_eval_config()
149        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
150        if not parent_eval:
151            return self
152
153        evaluation_data_type = parent_eval.evaluation_data_type
154        if (
155            evaluation_data_type == EvalDataType.final_answer
156            and self.task_run_trace is not None
157        ):
158            raise ValueError("final_answer runs should not set trace")
159        elif (
160            not self.eval_config_eval
161            and evaluation_data_type == EvalDataType.full_trace
162            and self.task_run_trace is None
163        ):
164            raise ValueError("full_trace task run eval runs should include trace")
165
166        return self
167
168    @model_validator(mode="after")
169    def validate_eval_run_types(self) -> Self:
170        if self.eval_config_eval and self.task_run_config_id is not None:
171            raise ValueError(
172                "task_run_config_id must be None if eval_config_eval is true"
173            )
174        if not self.eval_config_eval and self.task_run_config_id is None:
175            raise ValueError(
176                "task_run_config_id must be set if eval_config_eval is false"
177            )
178        return self
179
180    @model_validator(mode="after")
181    def validate_scores(self) -> Self:
182        # We're checking the scores have the expected keys from the grand-parent eval
183        if self.scores is None or len(self.scores) == 0:
184            raise ValueError("scores are required, and must have at least one score.")
185
186        parent_eval_config = self.parent_eval_config()
187        eval = parent_eval_config.parent_eval() if parent_eval_config else None
188        if not eval:
189            # Can't validate without the grand-parent eval, allow it to be validated later
190            return self
191
192        output_score_keys = [score.json_key() for score in eval.output_scores]
193        if set(output_score_keys) != set(self.scores.keys()):
194            raise ValueError(
195                f"The scores produced by the evaluator must match the scores expected by the eval. Got: [{', '.join(self.scores.keys())}] and expected: [{', '.join(output_score_keys)}]"
196            )
197
198        # Check that each score is expected in this eval and the correct type
199        for output_score in eval.output_scores:
200            match output_score.type:
201                case TaskOutputRatingType.five_star:
202                    five_star_score = self.scores[output_score.json_key()]
203                    if (
204                        not isinstance(five_star_score, float)
205                        or five_star_score < 1.0
206                        or five_star_score > 5.0
207                    ):
208                        raise ValueError(
209                            f"Score {output_score.name} is a five_star rating and must be a float between 1.0 and 5.0 inclusive. Got: {five_star_score}"
210                        )
211                case TaskOutputRatingType.pass_fail:
212                    pass_fail_score = self.scores[output_score.json_key()]
213                    if (
214                        not isinstance(pass_fail_score, float)
215                        or pass_fail_score < 0.0
216                        or pass_fail_score > 1.0
217                    ):
218                        raise ValueError(
219                            f"Score {output_score.name} is a pass_fail rating and must be a float between 0.0 and 1.0 inclusive. Got: {pass_fail_score}"
220                        )
221                case TaskOutputRatingType.pass_fail_critical:
222                    pass_fail_critical_score = self.scores[output_score.json_key()]
223                    if (
224                        not isinstance(pass_fail_critical_score, float)
225                        or pass_fail_critical_score < -1.0
226                        or pass_fail_critical_score > 1.0
227                    ):
228                        raise ValueError(
229                            f"Score {output_score.name} is a pass_fail_critical rating and must be a float between -1.0 and 1.0 inclusive. Got: {pass_fail_critical_score}"
230                        )
231                case TaskOutputRatingType.custom:
232                    raise ValueError(
233                        f"Custom scores are not supported in evaluators. '{output_score.name}' was set to a custom score."
234                    )
235                case _:
236                    # Catch missing cases
237                    raise_exhaustive_enum_error(output_score.type)
238        return self
239
240    @model_validator(mode="after")
241    def validate_reference_answer(self) -> Self:
242        parent_eval_config = self.parent_eval_config()
243        parent_eval = parent_eval_config.parent_eval() if parent_eval_config else None
244        if not parent_eval:
245            # Can't validate without the grand-parent eval, allow it to be validated later
246            return self
247
248        evaluation_data_type = parent_eval.evaluation_data_type
249        if (
250            self.reference_answer is not None
251            and evaluation_data_type != EvalDataType.reference_answer
252        ):
253            raise ValueError(
254                f"reference_answer is only valid for reference answer evals. Got: {evaluation_data_type.value}"
255            )
256        return self
257
258
259class EvalConfig(KilnParentedModel, KilnParentModel, parent_of={"runs": EvalRun}):
260    """
261    A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.
262
263    An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.
264    """
265
266    name: FilenameString = Field(description="The name of the eval config.")
267    model_name: str = Field(
268        description="The name of the model to use for this eval config.",
269    )
270    model_provider: str = Field(
271        description="The provider of the model to use for this eval config.",
272    )
273    config_type: EvalConfigType = Field(
274        default=EvalConfigType.g_eval,
275        description="This is used to determine the type of eval to run.",
276    )
277    properties: dict[str, Any] = Field(
278        default={},
279        description="Properties to be used to execute the eval config. This is config_type specific and should serialize to a json dict.",
280    )
281
282    def parent_eval(self) -> Union["Eval", None]:
283        if self.parent is not None and self.parent.__class__.__name__ != "Eval":
284            raise ValueError("parent must be an Eval")
285        return self.parent  # type: ignore
286
287    def runs(self, readonly: bool = False) -> list[EvalRun]:
288        return super().runs(readonly=readonly)  # type: ignore
289
290    @model_validator(mode="after")
291    def validate_properties(self) -> Self:
292        if (
293            self.config_type == EvalConfigType.g_eval
294            or self.config_type == EvalConfigType.llm_as_judge
295        ):
296            if "eval_steps" not in self.properties or not isinstance(
297                self.properties["eval_steps"], list
298            ):
299                raise ValueError("eval_steps is required and must be a list for g_eval and llm_as_judge configs")
300            if "task_description" in self.properties and not isinstance(
301                self.properties["task_description"], str
302            ):
303                raise ValueError(
304                    "task_description is optional, but if provided must be a string"
305                )
306            return self
307        else:
308            raise ValueError(f"Invalid eval config type: {self.config_type}")
309
310    @model_validator(mode="after")
311    def validate_json_serializable(self) -> "EvalConfig":
312        try:
313            # This will raise a TypeError if the dict contains non-JSON-serializable objects
314            json.dumps(self.properties)
315        except TypeError as e:
316            raise ValueError(f"Properties must be JSON serializable: {e!s}")
317        return self
318
319
320class EvalDataType(str, Enum):
321    """The type of task output data to evaluate."""
322
323    final_answer = "final_answer"
324    full_trace = "full_trace"
325    reference_answer = "reference_answer"
326
327
328class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
329    """An evaluator definition that specifies what to evaluate and how scores should be produced."""
330
331    name: FilenameString = Field(description="The name of the eval.")
332    description: str | None = Field(
333        default=None, description="The description of the eval"
334    )
335    template: EvalTemplateId | None = Field(
336        default=None,
337        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
338    )
339    current_config_id: ID_TYPE = Field(
340        default=None,
341        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
342    )
343    eval_set_filter_id: DatasetFilterId = Field(
344        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
345    )
346    eval_configs_filter_id: DatasetFilterId | None = Field(
347        default=None,
348        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
349    )
350    train_set_filter_id: DatasetFilterId | None = Field(
351        default=None,
352        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
353    )
354    output_scores: List[EvalOutputScore] = Field(
355        description="The scores this evaluator should produce."
356    )
357    favourite: bool = Field(
358        default=False,
359        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
360    )
361    template_properties: dict[str, str | int | bool | float] | None = Field(
362        default=None,
363        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
364    )
365    evaluation_data_type: EvalDataType = Field(
366        default=EvalDataType.final_answer,
367        description="The output of the task run to evaluate. Can be final answer, full trace, or reference answer.",
368    )
369
370    # Workaround to return typed parent without importing Task
371    def parent_task(self) -> Union["Task", None]:
372        if self.parent is not None and self.parent.__class__.__name__ != "Task":
373            raise ValueError("parent must be a Task")
374        return self.parent  # type: ignore
375
376    def configs(self, readonly: bool = False) -> list[EvalConfig]:
377        return super().configs(readonly=readonly)  # type: ignore
378
379    # Workaround to return typed parent without importing Spec
380    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
381        """
382        Get the spec associated with this eval, if any.
383        Returns None for legacy evals that are not associated with a spec.
384        """
385
386        task = self.parent_task()
387        if not task or not self.id:
388            return None
389
390        specs = task.specs(readonly=readonly)
391        for spec in specs:
392            if spec.eval_id == self.id:
393                return spec
394        return None
395
396    @model_validator(mode="after")
397    def upgrade_old_reference_answer_eval_config(self) -> Self:
398        """
399        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
400
401        For reference_answer evals that don't have a current_config_id set, this migration
402        will set the first config (by created_at) as the default.
403        """
404        if self.id is None:
405            return self
406
407        # Only run during file loading
408        if not self._loaded_from_file:
409            return self
410
411        # Skip if already migrated (has a current_config_id set)
412        if self.current_config_id is not None:
413            return self
414
415        # Only migrate reference_answer evals
416        if self.evaluation_data_type != EvalDataType.reference_answer:
417            return self
418
419        # Prevent recursion: self.configs() loads child files, which re-loads this parent
420        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
421        # This causes the validator to run again, creating an infinite loop without this guard.
422        with _migration_lock:
423            if self.id in _currently_migrating_eval_ids:
424                return self
425            _currently_migrating_eval_ids.add(self.id)
426
427        try:
428            # Get the configs - these are loaded from child files
429            configs_list = self.configs(readonly=True)
430            if configs_list and len(configs_list) > 0:
431                # Sort by created_at to get the oldest (first created) config
432                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
433                self.current_config_id = sorted_configs[0].id
434        finally:
435            with _migration_lock:
436                _currently_migrating_eval_ids.discard(self.id)
437
438        return self
439
440    @model_validator(mode="after")
441    def migrate_train_set_filter_id(self) -> Self:
442        """
443        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.
444
445        Generates a tag-based filter ID from the eval name following the convention
446        used by spec-based evals (e.g., "train_{name_slug}").
447        """
448        if self.id is None:
449            return self
450
451        if not self._loaded_from_file:
452            return self
453
454        if self.train_set_filter_id is not None:
455            return self
456
457        tag_suffix = self.name.lower().replace(" ", "_")
458        self.train_set_filter_id = f"tag::train_{tag_suffix}"
459        return self
460
461    @model_validator(mode="after")
462    def validate_scores(self) -> Self:
463        if self.output_scores is None or len(self.output_scores) == 0:
464            raise ValueError(
465                "output_scores are required, and must have at least one score."
466            )
467
468        # check for duplicate names (once transformed to JSON keys)
469        output_score_keys = [score.json_key() for score in self.output_scores]
470        if len(output_score_keys) != len(set(output_score_keys)):
471            raise ValueError(
472                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
473            )
474        return self
475
476    @model_validator(mode="after")
477    def validate_template_properties(self) -> Self:
478        # eval_configs_filter_id is required for all templates except "rag"
479        if (
480            self.template is not EvalTemplateId.rag
481            and self.eval_configs_filter_id is None
482        ):
483            raise ValueError(
484                "eval_configs_filter_id is required for all templates except 'rag'"
485            )
486
487        # For spec-based evals, template_properties will be None and validation happens in the spec
488        # For legacy evals, template_properties contains the data and we validate here
489        if self.template_properties is None:
490            return self
491
492        # Check for properties that are required for the issue template (legacy evals only)
493        if self.template == EvalTemplateId.issue:
494            if "issue_prompt" not in self.template_properties or not isinstance(
495                self.template_properties["issue_prompt"], str
496            ):
497                raise ValueError("issue_prompt is required for issue template")
498            if "failure_example" in self.template_properties and not isinstance(
499                self.template_properties["failure_example"], str
500            ):
501                raise ValueError(
502                    "failure_example is optional for issue template, but if provided must be a string"
503                )
504            if "pass_example" in self.template_properties and not isinstance(
505                self.template_properties["pass_example"], str
506            ):
507                raise ValueError(
508                    "pass_example is optional for issue template, but if provided must be a string"
509                )
510
511        if self.template == EvalTemplateId.tool_call:
512            if self.evaluation_data_type != EvalDataType.full_trace:
513                raise ValueError(
514                    "tool_call template should have evaluation_data_type set to full_trace"
515                )
516            if (
517                "tool" not in self.template_properties
518                or not isinstance(self.template_properties["tool"], str)
519                or not self.template_properties["tool"].strip()
520            ):
521                raise ValueError("tool is required for tool call template")
522            if "tool_function_name" not in self.template_properties or not isinstance(
523                self.template_properties["tool_function_name"], str
524            ):
525                raise ValueError(
526                    "tool_function_name is required for tool call template"
527                )
528            if (
529                "appropriate_tool_use_guidelines" not in self.template_properties
530                or not isinstance(
531                    self.template_properties["appropriate_tool_use_guidelines"], str
532                )
533                or not self.template_properties[
534                    "appropriate_tool_use_guidelines"
535                ].strip()
536            ):
537                raise ValueError(
538                    "appropriate_tool_use_guidelines is required for tool call template"
539                )
540            if (
541                "inappropriate_tool_use_guidelines" in self.template_properties
542                and not isinstance(
543                    self.template_properties["inappropriate_tool_use_guidelines"], str
544                )
545            ):
546                raise ValueError(
547                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
548                )
549        return self
EvalScores = typing.Dict[str, float]
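
A minimal sketch of the alias in use; the keys below are hypothetical, and in practice must match the JSON keys of the parent Eval's output_scores:

    from kiln_ai.datamodel.eval import EvalScores

    scores: EvalScores = {"overall_rating": 4.0, "toxicity": 1.0}
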
class EvalTemplateId(builtins.str, enum.Enum):

An eval template is a pre-defined eval that can be used as a starting point for a new eval.

kiln_requirements = <EvalTemplateId.kiln_requirements: 'kiln_requirements'>
desired_behaviour = <EvalTemplateId.desired_behaviour: 'desired_behaviour'>
issue = <EvalTemplateId.issue: 'kiln_issue'>
tool_call = <EvalTemplateId.tool_call: 'tool_call'>
toxicity = <EvalTemplateId.toxicity: 'toxicity'>
bias = <EvalTemplateId.bias: 'bias'>
maliciousness = <EvalTemplateId.maliciousness: 'maliciousness'>
factual_correctness = <EvalTemplateId.factual_correctness: 'factual_correctness'>
jailbreak = <EvalTemplateId.jailbreak: 'jailbreak'>
rag = <EvalTemplateId.rag: 'rag'>
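
A template only seeds suggested eval steps and output scores when creating a new eval. A small sketch; note that a member's name and its serialized value can differ:

    from kiln_ai.datamodel.eval import EvalTemplateId

    template = EvalTemplateId.issue
    print(template.value)  # "kiln_issue" -- the serialized form differs from the member name
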
class EvalConfigType(builtins.str, enum.Enum):

The type of eval configuration, determining how scores are generated.

g_eval = <EvalConfigType.g_eval: 'g_eval'>
llm_as_judge = <EvalConfigType.llm_as_judge: 'llm_as_judge'>
class EvalOutputScore(pydantic.main.BaseModel):

A definition of a score that an evaluator will produce.

Very similar to TaskRequirement, but conceptually distinct, so kept as a separate model.

name: FilenameStringShort
instruction: str | None
type: TaskOutputRatingType
def json_key(self) -> str:

The JSON key for the score, used when running the evaluator with an LLM and we need JSON output.

For example, "Overall Rating" -> "overall_rating"

@model_validator(mode='after')
def validate_type(self) -> Self:

class EvalRun(kiln_ai.datamodel.basemodel.KilnParentedModel):

The results of running an eval on a single dataset item.

This is a child of an EvalConfig, which specifies how the scores were generated.

Eval runs can be one of two types:
1) eval_config_eval=False: we are evaluating a task run (a method of running the task). We get the task input from dataset_id.input, run the task with the task_run_config, then run the evaluator on that output. task_run_config_id must be set. The output saved in this model is the output of the task run.
2) eval_config_eval=True: we are evaluating an eval config (a method of evaluating the task). We use the existing dataset item's input/output and run the evaluator on it. task_run_config_id must be None. The input/output saved in this model is the input/output of the dataset item.

dataset_id: Optional[str]
task_run_config_id: Optional[str]
eval_config_eval: bool
input: str
output: str
reference_answer: str | None
intermediate_outputs: Optional[Dict[str, str]]
task_run_trace: str | None
scores: Dict[str, float]
task_run_usage: kiln_ai.datamodel.Usage | None
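
A minimal sketch of recording a task-run eval result. All IDs are placeholders; once attached to a parent EvalConfig, the score keys must match the grand-parent Eval's output_scores:

    from kiln_ai.datamodel.eval import EvalRun

    run = EvalRun(
        dataset_id="dataset-item-123",        # placeholder dataset item ID
        task_run_config_id="run-config-456",  # required because eval_config_eval=False
        eval_config_eval=False,
        input="What is the capital of France?",
        output="Paris",
        scores={"overall_rating": 4.0},       # five_star scores must be floats in [1.0, 5.0]
    )
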
def parent_eval_config(self) -> Optional[EvalConfig]:
@model_validator(mode='after')
def validate_output_fields(self) -> Self:
@model_validator(mode='after')
def validate_eval_run_types(self) -> Self:
@model_validator(mode='after')
def validate_scores(self) -> Self:
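
This validator (source lines 181-239 above) pins each rating type to a numeric range. A hypothetical summary; the pass/fail semantics noted in the comments are an assumption, not stated in the source:

    # five_star:          float in [1.0, 5.0]
    # pass_fail:          float in [0.0, 1.0] (assumed: 1.0 = pass, 0.0 = fail)
    # pass_fail_critical: float in [-1.0, 1.0] (assumed: -1.0 = critical failure)
    valid_scores = {
        "overall_rating": 4.5,
        "safety": 1.0,
        "compliance": -1.0,
    }
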
@model_validator(mode='after')
def validate_reference_answer(self) -> Self:

class EvalConfig(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):

A configuration for running an eval. This includes anything needed to run the eval on a dataset like the prompt, model, thresholds, etc.

An eval might have many configs, for example running the same eval with two different models. Comparing eval results is only valid within the scope of the same config.

name: FilenameString
model_name: str
model_provider: str
config_type: EvalConfigType
properties: dict[str, typing.Any]
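
A minimal sketch of a judge config. The model and provider names are placeholders; eval_steps is required for both g_eval and llm_as_judge configs:

    from kiln_ai.datamodel.eval import EvalConfig, EvalConfigType

    config = EvalConfig(
        name="Judge Config",      # placeholder name
        model_name="gpt-4",       # placeholder model
        model_provider="openai",  # placeholder provider
        config_type=EvalConfigType.llm_as_judge,
        properties={
            "eval_steps": [
                "Is the answer relevant to the question?",
                "Is the answer factually correct?",
            ],
            "task_description": "Answer geography questions.",  # optional, must be a string
        },
    )
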
def parent_eval(self) -> Optional[Eval]:
def runs(self, readonly=False) -> List[EvalRun]:

@model_validator(mode='after')
def validate_properties(self) -> Self:
@model_validator(mode='after')
def validate_json_serializable(self) -> EvalConfig:

class EvalDataType(builtins.str, enum.Enum):

The type of task output data to evaluate.

final_answer = <EvalDataType.final_answer: 'final_answer'>
full_trace = <EvalDataType.full_trace: 'full_trace'>
reference_answer = <EvalDataType.reference_answer: 'reference_answer'>
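
The chosen data type constrains EvalRun fields (see validate_output_fields and validate_reference_answer above). A hypothetical illustration of the pairing:

    from kiln_ai.datamodel.eval import EvalDataType

    # final_answer:     eval runs must leave task_run_trace as None
    # full_trace:       task-run eval runs must set task_run_trace
    # reference_answer: eval runs may set reference_answer
    data_type = EvalDataType.full_trace
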
class Eval(kiln_ai.datamodel.basemodel.KilnParentedModel, kiln_ai.datamodel.basemodel.KilnParentModel):
329class Eval(KilnParentedModel, KilnParentModel, parent_of={"configs": EvalConfig}):
330    """An evaluator definition that specifies what to evaluate and how scores should be produced."""
331
332    name: FilenameString = Field(description="The name of the eval.")
333    description: str | None = Field(
334        default=None, description="The description of the eval"
335    )
336    template: EvalTemplateId | None = Field(
337        default=None,
338        description="The template selected when creating this eval. Useful for suggesting eval steps and output scores.",
339    )
340    current_config_id: ID_TYPE = Field(
341        default=None,
342        description="The id of the current config to use for this eval. This can be changed over time to run the same eval with different configs.",
343    )
344    eval_set_filter_id: DatasetFilterId = Field(
345        description="The id of the dataset filter which defines which dataset items are included when running this eval. Should be mutually exclusive with eval_configs_filter_id and train_set_filter_id."
346    )
347    eval_configs_filter_id: DatasetFilterId | None = Field(
348        default=None,
349        description="The id of the dataset filter which defines which dataset items are included when comparing the quality of the eval configs under this eval. Should consist of dataset items with ratings. Should be mutually exclusive with eval_set_filter_id.",
350    )
351    train_set_filter_id: DatasetFilterId | None = Field(
352        default=None,
353        description="The id of the dataset filter which defines which dataset items are included in the training set for fine-tuning. Should be mutually exclusive with eval_set_filter_id.",
354    )
355    output_scores: List[EvalOutputScore] = Field(
356        description="The scores this evaluator should produce."
357    )
358    favourite: bool = Field(
359        default=False,
360        description="Whether this eval is a favourite of the user. Rendered as a star icon in the UI.",
361    )
362    template_properties: dict[str, str | int | bool | float] | None = Field(
363        default=None,
364        description="Properties to be used to execute the eval. This is template_type specific and should serialize to a json dict.",
365    )
366    evaluation_data_type: EvalDataType = Field(
367        default=EvalDataType.final_answer,
368        description="The output of the task run to evaluate. Can be final answer or full trace.",
369    )
370
371    # Workaround to return typed parent without importing Task
372    def parent_task(self) -> Union["Task", None]:
373        if self.parent is not None and self.parent.__class__.__name__ != "Task":
374            raise ValueError("parent must be a Task")
375        return self.parent  # type: ignore
376
377    def configs(self, readonly: bool = False) -> list[EvalConfig]:
378        return super().configs(readonly=readonly)  # type: ignore
379
380    # Workaround to return typed parent without importing Spec
381    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
382        """
383        Get the spec associated with this eval, if any.
384        Returns None for legacy evals that are not associated with a spec.
385        """
386
387        task = self.parent_task()
388        if not task or not self.id:
389            return None
390
391        specs = task.specs(readonly=readonly)
392        for spec in specs:
393            if spec.eval_id == self.id:
394                return spec
395        return None
396
397    @model_validator(mode="after")
398    def upgrade_old_reference_answer_eval_config(self) -> Self:
399        """
400        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
401
402        For reference_answer evals that don't have a current_config_id set, this migration
403        will set the first config (by created_at) as the default.
404        """
405        if self.id is None:
406            return self
407
408        # Only run during file loading
409        if not self._loaded_from_file:
410            return self
411
412        # Skip if already migrated (has a current_config_id set)
413        if self.current_config_id is not None:
414            return self
415
416        # Only migrate reference_answer evals
417        if self.evaluation_data_type != EvalDataType.reference_answer:
418            return self
419
420        # Prevent recursion: self.configs() loads child files, which re-loads this parent
421        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
422        # This causes the validator to run again, creating an infinite loop without this guard.
423        with _migration_lock:
424            if self.id in _currently_migrating_eval_ids:
425                return self
426            _currently_migrating_eval_ids.add(self.id)
427
428        try:
429            # Get the configs - these are loaded from child files
430            configs_list = self.configs(readonly=True)
431            if configs_list and len(configs_list) > 0:
432                # Sort by created_at to get the oldest (first created) config
433                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
434                self.current_config_id = sorted_configs[0].id
435        finally:
436            with _migration_lock:
437                _currently_migrating_eval_ids.discard(self.id)
438
439        return self
440
441    @model_validator(mode="after")
442    def migrate_train_set_filter_id(self) -> Self:
443        """
444        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.
445
446        Generates a tag-based filter ID from the eval name following the convention
447        used by spec-based evals (e.g., "train_{name_slug}").
448        """
449        if self.id is None:
450            return self
451
452        if not self._loaded_from_file:
453            return self
454
455        if self.train_set_filter_id is not None:
456            return self
457
458        tag_suffix = self.name.lower().replace(" ", "_")
459        self.train_set_filter_id = f"tag::train_{tag_suffix}"
460        return self
461
462    @model_validator(mode="after")
463    def validate_scores(self) -> Self:
464        if self.output_scores is None or len(self.output_scores) == 0:
465            raise ValueError(
466                "output_scores are required, and must have at least one score."
467            )
468
469        # check for duplicate names (once transformed to JSON keys)
470        output_score_keys = [score.json_key() for score in self.output_scores]
471        if len(output_score_keys) != len(set(output_score_keys)):
472            raise ValueError(
473                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
474            )
475        return self
476
477    @model_validator(mode="after")
478    def validate_template_properties(self) -> Self:
479        # eval_configs_filter_id is required for all templates except "rag"
480        if (
481            self.template is not EvalTemplateId.rag
482            and self.eval_configs_filter_id is None
483        ):
484            raise ValueError(
485                "eval_configs_filter_id is required for all templates except 'rag'"
486            )
487
488        # For spec-based evals, template_properties will be None and validation happens in the spec
489        # For legacy evals, template_properties contains the data and we validate here
490        if self.template_properties is None:
491            return self
492
493        # Check for properties that are required for the issue template (legacy evals only)
494        if self.template == EvalTemplateId.issue:
495            if "issue_prompt" not in self.template_properties or not isinstance(
496                self.template_properties["issue_prompt"], str
497            ):
498                raise ValueError("issue_prompt is required for issue template")
499            if "failure_example" in self.template_properties and not isinstance(
500                self.template_properties["failure_example"], str
501            ):
502                raise ValueError(
503                    "failure_example is optional for issue template, but if provided must be a string"
504                )
505            if "pass_example" in self.template_properties and not isinstance(
506                self.template_properties["pass_example"], str
507            ):
508                raise ValueError(
509                    "pass_example is optional for issue template, but if provided must be a string"
510                )
511
512        if self.template == EvalTemplateId.tool_call:
513            if self.evaluation_data_type != EvalDataType.full_trace:
514                raise ValueError(
515                    "tool_call template should have evaluation_data_type set to full_trace"
516                )
517            if (
518                "tool" not in self.template_properties
519                or not isinstance(self.template_properties["tool"], str)
520                or not self.template_properties["tool"].strip()
521            ):
522                raise ValueError("tool is required for tool call template")
523            if "tool_function_name" not in self.template_properties or not isinstance(
524                self.template_properties["tool_function_name"], str
525            ):
526                raise ValueError(
527                    "tool_function_name is required for tool call template"
528                )
529            if (
530                "appropriate_tool_use_guidelines" not in self.template_properties
531                or not isinstance(
532                    self.template_properties["appropriate_tool_use_guidelines"], str
533                )
534                or not self.template_properties[
535                    "appropriate_tool_use_guidelines"
536                ].strip()
537            ):
538                raise ValueError(
539                    "appropriate_tool_use_guidelines is required for tool call template"
540                )
541            if (
542                "inappropriate_tool_use_guidelines" in self.template_properties
543                and not isinstance(
544                    self.template_properties["inappropriate_tool_use_guidelines"], str
545                )
546            ):
547                raise ValueError(
548                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
549                )
550        return self

An evaluator definition that specifies what to evaluate and how scores should be produced.

name: FilenameString
description: str | None
template: EvalTemplateId | None
current_config_id: ID_TYPE
eval_set_filter_id: DatasetFilterId
eval_configs_filter_id: DatasetFilterId | None
train_set_filter_id: DatasetFilterId | None
output_scores: List[EvalOutputScore]
favourite: bool
template_properties: dict[str, str | int | bool | float] | None
evaluation_data_type: EvalDataType
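
A hedged construction sketch for a minimal in-memory Eval (names, descriptions, and the tag filters are illustrative; eval_configs_filter_id is included because validate_template_properties requires it for every template except 'rag', and the "tag::" filter syntax mirrors the convention used by migrate_train_set_filter_id below):

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import Eval, EvalOutputScore

    tone_eval = Eval(
        name="Tone Check",
        description="Checks that answers stay on-brand.",
        eval_set_filter_id="tag::eval_set",      # illustrative tag-based filter
        eval_configs_filter_id="tag::golden",    # required unless template is 'rag'
        output_scores=[
            EvalOutputScore(
                name="Overall Tone",
                instruction="Rate how well the answer matches the brand voice.",
                type=TaskOutputRatingType.five_star,
            ),
        ],
    )

The two migration validators below skip an instance built this way, since they only run for models loaded from file.
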
def parent_task(self) -> Optional[kiln_ai.datamodel.Task]:
372    def parent_task(self) -> Union["Task", None]:
373        if self.parent is not None and self.parent.__class__.__name__ != "Task":
374            raise ValueError("parent must be a Task")
375        return self.parent  # type: ignore
def configs(self, readonly=False) -> List[EvalConfig]:
743        def child_method(self, readonly: bool = False) -> list[child_class]:  # type: ignore[invalid-type-form]
744            return child_class.all_children_of_parent_path(self.path, readonly=readonly)

Returns this eval's EvalConfig children, loaded from the child files under the eval's path.

def associated_spec(self, readonly: bool = False) -> Optional[kiln_ai.datamodel.spec.Spec]:
381    def associated_spec(self, readonly: bool = False) -> Union["Spec", None]:
382        """
383        Get the spec associated with this eval, if any.
384        Returns None for legacy evals that are not associated with a spec.
385        """
386
387        task = self.parent_task()
388        if not task or not self.id:
389            return None
390
391        specs = task.specs(readonly=readonly)
392        for spec in specs:
393            if spec.eval_id == self.id:
394                return spec
395        return None

Get the spec associated with this eval, if any. Returns None for legacy evals that are not associated with a spec.
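
Assuming Spec exposes a name field like other Kiln models, a small illustrative helper over this lookup might read:

    from kiln_ai.datamodel.eval import Eval

    def spec_label(ev: Eval) -> str:
        """Label an eval by its associated spec, falling back for legacy evals."""
        spec = ev.associated_spec(readonly=True)
        return spec.name if spec is not None else "(legacy eval, no spec)"
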

@model_validator(mode='after')
def upgrade_old_reference_answer_eval_config(self) -> Self:
397    @model_validator(mode="after")
398    def upgrade_old_reference_answer_eval_config(self) -> Self:
399        """
400        Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.
401
402        For reference_answer evals that don't have a current_config_id set, this migration
403        will set the first config (by created_at) as the default.
404        """
405        if self.id is None:
406            return self
407
408        # Only run during file loading
409        if not self._loaded_from_file:
410            return self
411
412        # Skip if already migrated (has a current_config_id set)
413        if self.current_config_id is not None:
414            return self
415
416        # Only migrate reference_answer evals
417        if self.evaluation_data_type != EvalDataType.reference_answer:
418            return self
419
420        # Prevent recursion: self.configs() loads child files, which re-loads this parent
421        # (see basemodel.py where we iterate_children_paths_of_parent_path calls load_from_file)
422        # This causes the validator to run again, creating an infinite loop without this guard.
423        with _migration_lock:
424            if self.id in _currently_migrating_eval_ids:
425                return self
426            _currently_migrating_eval_ids.add(self.id)
427
428        try:
429            # Get the configs - these are loaded from child files
430            configs_list = self.configs(readonly=True)
431            if configs_list and len(configs_list) > 0:
432                # Sort by created_at to get the oldest (first created) config
433                sorted_configs = sorted(configs_list, key=lambda c: c.created_at)
434                self.current_config_id = sorted_configs[0].id
435        finally:
436            with _migration_lock:
437                _currently_migrating_eval_ids.discard(self.id)
438
439        return self

Migration: Set the first judge config as the default for existing reference answer evals that don't have a current_config_id set.

For reference_answer evals that don't have a current_config_id set, this migration will set the first config (by created_at) as the default.

@model_validator(mode='after')
def migrate_train_set_filter_id(self) -> Self:
441    @model_validator(mode="after")
442    def migrate_train_set_filter_id(self) -> Self:
443        """
444        Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.
445
446        Generates a tag-based filter ID from the eval name following the convention
447        used by spec-based evals (e.g., "train_{name_slug}").
448        """
449        if self.id is None:
450            return self
451
452        if not self._loaded_from_file:
453            return self
454
455        if self.train_set_filter_id is not None:
456            return self
457
458        tag_suffix = self.name.lower().replace(" ", "_")
459        self.train_set_filter_id = f"tag::train_{tag_suffix}"
460        return self

Migration: Auto-create a train_set_filter_id for legacy evals that don't have one.

Generates a tag-based filter ID from the eval name following the convention used by spec-based evals (e.g., "train_{name_slug}").
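
Concretely, the generated ID is just the lowercased, space-to-underscore name behind a "tag::train_" prefix. A standalone sketch of the same transform (the helper name is illustrative):

    def train_filter_id(eval_name: str) -> str:
        # Mirrors the migration's derivation of the tag-based filter ID.
        return f"tag::train_{eval_name.lower().replace(' ', '_')}"

    assert train_filter_id("Tone Check") == "tag::train_tone_check"
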

@model_validator(mode='after')
def validate_scores(self) -> Self:
462    @model_validator(mode="after")
463    def validate_scores(self) -> Self:
464        if self.output_scores is None or len(self.output_scores) == 0:
465            raise ValueError(
466                "output_scores are required, and must have at least one score."
467            )
468
469        # check for duplicate names (once transformed to JSON keys)
470        output_score_keys = [score.json_key() for score in self.output_scores]
471        if len(output_score_keys) != len(set(output_score_keys)):
472            raise ValueError(
473                f"output_scores must have unique names (once transformed to JSON keys). Got: [{', '.join(output_score_keys)}]"
474            )
475        return self
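
Note that two visually distinct names can still collide, because uniqueness is checked on json_key() output rather than raw names. A hedged sketch of the failure mode (score names are illustrative):

    from kiln_ai.datamodel.datamodel_enums import TaskOutputRatingType
    from kiln_ai.datamodel.eval import EvalOutputScore

    a = EvalOutputScore(name="Overall Tone", type=TaskOutputRatingType.pass_fail)
    b = EvalOutputScore(name="overall tone", type=TaskOutputRatingType.pass_fail)

    # Both normalize to the same JSON key, so an Eval listing both would fail
    # this validator even though the display names differ.
    assert a.json_key() == b.json_key() == "overall_tone"
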
@model_validator(mode='after')
def validate_template_properties(self) -> Self:
477    @model_validator(mode="after")
478    def validate_template_properties(self) -> Self:
479        # eval_configs_filter_id is required for all templates except "rag"
480        if (
481            self.template is not EvalTemplateId.rag
482            and self.eval_configs_filter_id is None
483        ):
484            raise ValueError(
485                "eval_configs_filter_id is required for all templates except 'rag'"
486            )
487
488        # For spec-based evals, template_properties will be None and validation happens in the spec
489        # For legacy evals, template_properties contains the data and we validate here
490        if self.template_properties is None:
491            return self
492
493        # Check for properties that are required for the issue template (legacy evals only)
494        if self.template == EvalTemplateId.issue:
495            if "issue_prompt" not in self.template_properties or not isinstance(
496                self.template_properties["issue_prompt"], str
497            ):
498                raise ValueError("issue_prompt is required for issue template")
499            if "failure_example" in self.template_properties and not isinstance(
500                self.template_properties["failure_example"], str
501            ):
502                raise ValueError(
503                    "failure_example is optional for issue template, but if provided must be a string"
504                )
505            if "pass_example" in self.template_properties and not isinstance(
506                self.template_properties["pass_example"], str
507            ):
508                raise ValueError(
509                    "pass_example is optional for issue template, but if provided must be a string"
510                )
511
512        if self.template == EvalTemplateId.tool_call:
513            if self.evaluation_data_type != EvalDataType.full_trace:
514                raise ValueError(
515                    "tool_call template should have evaluation_data_type set to full_trace"
516                )
517            if (
518                "tool" not in self.template_properties
519                or not isinstance(self.template_properties["tool"], str)
520                or not self.template_properties["tool"].strip()
521            ):
522                raise ValueError("tool is required for tool call template")
523            if "tool_function_name" not in self.template_properties or not isinstance(
524                self.template_properties["tool_function_name"], str
525            ):
526                raise ValueError(
527                    "tool_function_name is required for tool call template"
528                )
529            if (
530                "appropriate_tool_use_guidelines" not in self.template_properties
531                or not isinstance(
532                    self.template_properties["appropriate_tool_use_guidelines"], str
533                )
534                or not self.template_properties[
535                    "appropriate_tool_use_guidelines"
536                ].strip()
537            ):
538                raise ValueError(
539                    "appropriate_tool_use_guidelines is required for tool call template"
540                )
541            if (
542                "inappropriate_tool_use_guidelines" in self.template_properties
543                and not isinstance(
544                    self.template_properties["inappropriate_tool_use_guidelines"], str
545                )
546            ):
547                raise ValueError(
548                    "inappropriate_tool_use_guidelines is optional for tool call template, but if provided must be a string"
549                )
550        return self
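
Taken together, the tool_call branch implies a minimal shape for template_properties along these lines (the keys come from the checks above; all values are illustrative, and evaluation_data_type must be full_trace for the eval to validate):

    tool_call_template_properties = {
        # Required, non-empty strings:
        "tool": "Weather lookup",
        "appropriate_tool_use_guidelines": "Call when the user asks about current weather.",
        # Required string:
        "tool_function_name": "get_weather",
        # Optional, but must be a string when present:
        "inappropriate_tool_use_guidelines": "Do not call for historical climate questions.",
    }
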
def relationship_name() -> str:
761        def relationship_name_method() -> str:
762            return relationship_name

Returns the relationship name under which this child type is attached to its parent.

def parent_type() -> Type[kiln_ai.datamodel.basemodel.KilnParentModel]:
754        def parent_class_method() -> Type[KilnParentModel]:
755            return cls

Returns the KilnParentModel subclass that parents this relationship.

model_config = {'validate_assignment': True}

Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].
